llama.cpp/convert_hf_to_gguf.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations

import ast
import logging
import argparse
import contextlib
import json
import os
import re
import sys
from enum import IntEnum
from pathlib import Path
from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
from itertools import chain
from transformers import AutoConfig

import math
import numpy as np
import torch

if TYPE_CHECKING:
    from torch import Tensor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
from gguf.vocab import MistralTokenizerType, MistralVocab

try:
    from mistral_common.tokens.tokenizers.base import TokenizerVersion  # pyright: ignore[reportMissingImports]
    from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD  # pyright: ignore[reportMissingImports]
    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # pyright: ignore[reportMissingImports]
    from mistral_common.tokens.tokenizers.sentencepiece import (  # pyright: ignore[reportMissingImports]
        SentencePieceTokenizer,
    )

    _mistral_common_installed = True
    _mistral_import_error_msg = ""
except ImportError:
    _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
    _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)

    _mistral_common_installed = False
    TokenizerVersion = None
    Tekkenizer = None
    SentencePieceTokenizer = None
    _mistral_import_error_msg = (
        "Mistral format requires `mistral-common` to be installed. Please run "
        "`pip install mistral-common[image,audio]` to install it."
    )


logger = logging.getLogger("hf-to-gguf")


###### MODEL DEFINITIONS ######

class SentencePieceTokenTypes(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6


class ModelType(IntEnum):
    TEXT = 1
    MMPROJ = 2


AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")

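# ModelBase is the common machinery shared by all converters in this file: it
# indexes the source checkpoint into lazily-evaluated tensors, optionally
# dequantizes pre-quantized weights, and streams everything into a
# gguf.GGUFWriter. Concrete architectures subclass TextModel or MmprojModel and
# are looked up by their HF architecture string via ModelBase.register() and
# ModelBase.from_model_architecture(). A rough usage sketch (illustrative only;
# the actual wiring is normally done by this script's command-line entry point,
# which is outside this excerpt):
#
#   model_class = ModelBase.from_model_architecture("LlamaForCausalLM")
#   model = model_class(Path("path/to/hf-model"), gguf.LlamaFileType.MOSTLY_F16, Path("out/"))
#   model.write()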
class ModelBase:
    _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = {
        ModelType.TEXT: {},
        ModelType.MMPROJ: {},
    }

    dir_model: Path
    ftype: gguf.LlamaFileType
    fname_out: Path
    is_big_endian: bool
    endianess: gguf.GGUFEndian
    use_temp_file: bool
    lazy: bool
    dry_run: bool
    hparams: dict[str, Any]
    model_tensors: dict[str, Callable[[], Tensor]]
    gguf_writer: gguf.GGUFWriter
    model_name: str | None
    metadata_override: Path | None
    dir_model_card: Path
    remote_hf_model_id: str | None

    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH

    # subclasses should initialize this!
    block_count: int
    tensor_map: gguf.TensorNameMap

    # Mistral format specifics
    is_mistral_format: bool = False
    disable_mistral_community_chat_template: bool = False
    sentence_transformers_dense_modules: bool = False

    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                 use_temp_file: bool = False, eager: bool = False,
                 metadata_override: Path | None = None, model_name: str | None = None,
                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
                 disable_mistral_community_chat_template: bool = False,
                 sentence_transformers_dense_modules: bool = False):
        if type(self) is ModelBase or \
                type(self) is TextModel or \
                type(self) is MmprojModel:
            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

        if self.is_mistral_format and not _mistral_common_installed:
            raise ImportError(_mistral_import_error_msg)

        self.dir_model = dir_model
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.use_temp_file = use_temp_file
        self.lazy = not eager or (remote_hf_model_id is not None)
        self.dry_run = dry_run
        self.remote_hf_model_id = remote_hf_model_id
        self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
        self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
        self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
        self.metadata_override = metadata_override
        self.model_name = model_name
        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py

        # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
        # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
        if self.ftype == gguf.LlamaFileType.GUESSED:
            for _, tensor in self.get_tensors():
                if tensor.dim() < 2:
                    continue

                if tensor.dtype == torch.bfloat16:
                    self.ftype = gguf.LlamaFileType.MOSTLY_BF16
                    logger.info("heuristics detected bfloat16 tensor dtype, setting --outtype bf16")
                    break
                elif tensor.dtype == torch.float16:
                    self.ftype = gguf.LlamaFileType.MOSTLY_F16
                    logger.info("heuristics detected float16 tensor dtype, setting --outtype f16")
                    break
            else:
                self.ftype = gguf.LlamaFileType.MOSTLY_F16
                logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16")

        # Configure GGUF Writer
        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
                                           split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

        # Mistral specific
        self.disable_mistral_community_chat_template = disable_mistral_community_chat_template

    @classmethod
    def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path:
        stem, suffix = path.stem, path.suffix
        new_name = f"{prefix}{stem}{suffix}"
        return path.with_name(new_name)

    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
        key = next((k for k in keys if k in self.hparams), None)
        if key is not None:
            return self.hparams[key]
        if optional:
            return None
        raise KeyError(f"could not find any of: {keys}")

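    # index_tensors() builds a name -> closure map instead of loading weights up
    # front: each closure yields its tensor on demand (lazily when self.lazy is
    # set), which keeps memory usage low for multi-part checkpoints. Three
    # sources are handled below: remote safetensors on the Hugging Face Hub,
    # local *.safetensors parts, and legacy pytorch_model*.bin parts.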
    def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
        tensors: dict[str, Callable[[], Tensor]] = {}

        if remote_hf_model_id is not None:
            is_safetensors = True

            logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
            remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
            for name, remote_tensor in remote_tensors.items():
                tensors[name] = lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r)

            return tensors

        prefix = "model" if not self.is_mistral_format else "consolidated"
        part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")
        is_safetensors: bool = len(part_names) > 0
        if not is_safetensors:
            part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")

        tensor_names_from_index: set[str] = set()

        if not self.is_mistral_format:
            index_name = "model.safetensors" if is_safetensors else "pytorch_model.bin"
            index_name += ".index.json"
            index_file = self.dir_model / index_name

            if index_file.is_file():
                logger.info(f"gguf: loading model weight map from '{index_name}'")
                with open(index_file, "r", encoding="utf-8") as f:
                    index: dict[str, Any] = json.load(f)
                    weight_map = index.get("weight_map")
                    if weight_map is None or not isinstance(weight_map, dict):
                        raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
                    tensor_names_from_index.update(weight_map.keys())
                    part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None)
                    part_names = sorted(part_dict.keys())
            else:
                weight_map = {}
        else:
            weight_map = {}

        for part_name in part_names:
            logger.info(f"gguf: indexing model part '{part_name}'")
            ctx: ContextManager[Any]
            if is_safetensors:
                ctx = cast(ContextManager[Any], gguf.utility.SafetensorsLocal(self.dir_model / part_name))
            else:
                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

            with ctx as model_part:
                assert model_part is not None

                for name in model_part.keys():
                    if is_safetensors:
                        data: gguf.utility.LocalTensor = model_part[name]
                        if self.lazy:
                            data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data)  # noqa: E731
                        else:
                            dtype = LazyTorchTensor._dtype_str_map[data.dtype]
                            data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape)  # noqa: E731
                    else:
                        data_torch: Tensor = model_part[name]
                        if self.lazy:
                            data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data)  # noqa: E731
                        else:
                            data_gen = lambda data=data_torch: data  # noqa: E731
                    tensors[name] = data_gen

        # verify tensor name presence and identify potentially missing files
        if len(tensor_names_from_index) > 0:
            tensor_names_from_parts = set(tensors.keys())
            if len(tensor_names_from_parts.symmetric_difference(tensor_names_from_index)) > 0:
                missing = sorted(tensor_names_from_index.difference(tensor_names_from_parts))
                extra = sorted(tensor_names_from_parts.difference(tensor_names_from_index))
                missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
                if len(extra) == 0 and len(missing_files) > 0:
                    raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
                                     f"Missing tensors: {missing}")
                else:
                    raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
                                     f"Missing tensors: {missing}\n"
                                     f"Extra tensors: {extra}")

        return tensors

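    # dequant_model() rewrites the tensor closures so that the rest of the
    # conversion only ever sees dequantized float weights. It keys off
    # hparams["quantization_config"], e.g. (roughly, for an fp8 checkpoint):
    #
    #   "quantization_config": {"quant_method": "fp8", "weight_block_size": [128, 128]}
    #
    # Supported quant_method values below: "bitnet", "fp8", "gptq" and
    # "compressed-tensors"; anything else raises NotImplementedError.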
    def dequant_model(self):
        tensors_to_remove: list[str] = []
        new_tensors: dict[str, Callable[[], Tensor]] = {}

        if (quant_config := self.hparams.get("quantization_config")) and isinstance(quant_config, dict):
            quant_method = quant_config.get("quant_method")

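            # BitNet packs four 2-bit values per uint8 (bit offsets 0/2/4/6);
            # subtracting 1 maps them onto the ternary set {-1, 0, +1}, and the
            # stored scale is inverted, hence the division below.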
            def dequant_bitnet(weight: Tensor, scale: Tensor) -> Tensor:
                weight = weight.view(torch.uint8)
                orig_shape = weight.shape

                shift = torch.tensor([0, 2, 4, 6], dtype=torch.uint8).reshape((4, *(1 for _ in range(len(orig_shape)))))
                data = weight.unsqueeze(0).expand((4, *orig_shape)) >> shift
                data = data & 3
                data = (data.float() - 1).reshape((orig_shape[0] * 4, *orig_shape[1:]))

                # The scale is inverted
                return data / scale.float()

            def dequant_simple(weight: Tensor, scale: Tensor, block_size: Sequence[int] | None = None) -> Tensor:
                scale = scale.float()

                if block_size is not None:
                    for i, size in enumerate(block_size):
                        scale = scale.repeat_interleave(size, i)
                    # unpad the scale (e.g. when the tensor size isn't a multiple of the block size)
                    scale = scale[tuple(slice(0, size) for size in weight.shape)]

                return weight.float() * scale

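            # GPTQ stores integer-packed weights: with the default 32-bit pack
            # dtype and e.g. bits=4, each int32 word holds pack_factor = 8 values
            # (shift amounts 0, 4, ..., 28). As unpacked here, qweight is packed
            # along dim 0 and qzeros along dim 1, g_idx selects the per-group
            # scale and zero point for every input row, and the dequantized
            # matrix is transposed at the end.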
            # ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476
            def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) -> Tensor:
                bits = quant_config["bits"]
                assert bits in (2, 3, 4, 8)
                assert qweight.dtype == qzeros.dtype
                maxq = (2 ** bits) - 1
                weight = None
                zeros = None
                pack_dtype_bits = qweight.dtype.itemsize * 8

                if bits in [2, 4, 8]:
                    pack_factor = pack_dtype_bits // bits
                    wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0)
                    if self.lazy:
                        wf = LazyTorchTensor.from_eager(wf)

                    zeros = torch.bitwise_right_shift(
                        qzeros.unsqueeze(2).expand(-1, -1, pack_factor),
                        wf.unsqueeze(0)
                    ).to(torch.int16 if bits == 8 else torch.int8)
                    zeros = torch.bitwise_and(zeros, maxq).reshape(scales.shape)

                    weight = torch.bitwise_and(
                        torch.bitwise_right_shift(
                            qweight.unsqueeze(1).expand(-1, pack_factor, -1),
                            wf.unsqueeze(-1)
                        ).to(torch.int16 if bits == 8 else torch.int8),
                        maxq
                    )
                elif bits == 3:
                    raise NotImplementedError("3-bit gptq dequantization is not yet implemented")

                assert weight is not None
                assert zeros is not None

                weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])

                # gptq_v2 doesn't need to offset zeros
                if quant_config.get("checkpoint_format", "gptq") == "gptq":
                    zeros += 1

                return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T

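            # compressed-tensors "pack-quantized" layout, as handled below:
            # num_bits-wide fields are packed into int32 words along dim 1, a
            # separate *_shape tensor records the unpadded shape, scales apply
            # per group of group_size columns, and an optional *_zero_point is
            # packed along dim 0 (otherwise a symmetric offset of
            # 2 ** (num_bits - 1) is assumed).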
            def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: Tensor | None, num_bits: int, group_size: int):
                assert w.dtype == torch.int32
                shape = tuple(shape_tensor.tolist())
                assert len(shape) == 2
                mask = (1 << num_bits) - 1

                shifts = torch.arange(0, 32 - (num_bits - 1), num_bits, dtype=torch.int32)
                if self.lazy:
                    shifts = LazyTorchTensor.from_eager(shifts)

                if zero_point is None:
                    offset = 1 << (num_bits - 1)
                else:
                    assert len(zero_point.shape) == 2
                    offset = (zero_point.unsqueeze(1) >> shifts.reshape(1, -1, 1)) & mask
                    offset = offset.reshape(-1, zero_point.shape[1])
                    # trim padding, and prepare for broadcast
                    # NOTE: the zero-point is packed along dim 0
                    offset = offset[:shape[0], :].unsqueeze(-1)

                # extract values
                # NOTE: the weights are packed along dim 1
                unpacked = (w.unsqueeze(-1) >> shifts.reshape(1, 1, -1)) & mask
                unpacked = unpacked.reshape(shape[0], -1)

                # trim padding
                unpacked = unpacked[:, :shape[1]]

                # prepare for broadcast of the scale
                unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size)
                unpacked = unpacked - offset

                return (unpacked * scale.unsqueeze(-1).float()).reshape(shape)

            if quant_method == "bitnet":
                for name in self.model_tensors.keys():
                    if name.endswith(".weight_scale"):
                        weight_name = name.removesuffix("_scale")
                        w = self.model_tensors[weight_name]
                        s = self.model_tensors[name]
                        self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s())
                        tensors_to_remove.append(name)
            elif quant_method == "fp8":
                block_size = quant_config.get("weight_block_size")
                for name in self.model_tensors.keys():
                    if name.endswith(".weight_scale_inv"):
                        weight_name = name.removesuffix("_scale_inv")
                        w = self.model_tensors[weight_name]
                        s = self.model_tensors[name]
                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
                        tensors_to_remove.append(name)
                    if name.endswith(".activation_scale"):  # unused
                        tensors_to_remove.append(name)
                    # mistral format
                    if name.endswith(".qscale_weight"):
                        weight_name = name.removesuffix("qscale_weight") + "weight"
                        w = self.model_tensors[weight_name]
                        s = self.model_tensors[name]
                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
                        tensors_to_remove.append(name)
                    if name.endswith(".qscale_act"):
                        tensors_to_remove.append(name)
            elif quant_method == "gptq":
                for name in self.model_tensors.keys():
                    if name.endswith(".qweight"):
                        base_name = name.removesuffix(".qweight")
                        g_idx = self.model_tensors[base_name + ".g_idx"]
                        qweight = self.model_tensors[base_name + ".qweight"]
                        qzeros = self.model_tensors[base_name + ".qzeros"]
                        scales = self.model_tensors[base_name + ".scales"]
                        new_tensors[base_name + ".weight"] = (
                            lambda g=g_idx, z=qzeros, w=qweight, s=scales: dequant_gptq(
                                g(), w(), z(), s()
                            )
                        )
                        tensors_to_remove += [
                            base_name + n
                            for n in (
                                ".g_idx",
                                ".qzeros",
                                ".qweight",
                                ".scales",
                            )
                        ]
            elif quant_method == "compressed-tensors":
                quant_format = quant_config["format"]
                groups = quant_config["config_groups"]
                if len(groups) > 1:
                    raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet")
                weight_config = tuple(groups.values())[0]["weights"]

                if quant_format == "float-quantized" or quant_format == "int-quantized" or quant_format == "naive-quantized":
                    block_size = weight_config.get("block_structure", None)
                    strategy = weight_config.get("strategy")
                    assert strategy == "channel" or strategy == "block"
                    assert weight_config.get("group_size") is None  # didn't find a model using this yet
                    for name in self.model_tensors.keys():
                        if name.endswith(".weight_scale"):
                            weight_name = name.removesuffix("_scale")
                            w = self.model_tensors[weight_name]
                            s = self.model_tensors[name]
                            self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size)
                            tensors_to_remove.append(name)
                elif quant_format == "pack-quantized":
                    assert weight_config.get("strategy") == "group"
                    assert weight_config.get("type", "int") == "int"
                    num_bits = weight_config.get("num_bits")
                    group_size = weight_config.get("group_size")
                    assert isinstance(num_bits, int)
                    assert isinstance(group_size, int)
                    for name in self.model_tensors.keys():
                        if name.endswith(".weight_packed"):
                            base_name = name.removesuffix("_packed")
                            w = self.model_tensors[name]
                            scale = self.model_tensors[base_name + "_scale"]
                            shape = self.model_tensors[base_name + "_shape"]
                            zero_point = self.model_tensors.get(base_name + "_zero_point", lambda: None)
                            new_tensors[base_name] = (
                                lambda w=w, scale=scale, shape=shape, zero_point=zero_point: dequant_packed(
                                    w(), scale(), shape(), zero_point(), num_bits, group_size,
                                )
                            )
                            tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")]
                            if (base_name + "_zero_point") in self.model_tensors:
                                tensors_to_remove.append(base_name + "_zero_point")
                else:
                    raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
            else:
                raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")

        for name in tensors_to_remove:
            if name in self.model_tensors:
                del self.model_tensors[name]

        for name, value in new_tensors.items():
            self.model_tensors[name] = value

    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
        for name, gen in self.model_tensors.items():
            yield name, gen()

    def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
        if key not in gguf.MODEL_TENSORS[self.model_arch]:
            raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
        name: str = gguf.TENSOR_NAMES[key]
        if "{bid}" in name:
            assert bid is not None
            name = name.format(bid=bid)
        return name + suffix

    def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
        if key not in gguf.MODEL_TENSORS[self.model_arch]:
            return False
        key_name: str = gguf.TENSOR_NAMES[key]
        if "{bid}" in key_name:
            if bid is None:
                return False
            key_name = key_name.format(bid=bid)
        else:
            if bid is not None:
                return False
        return name == (key_name + suffix)

    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
        new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
        if new_name is None:
            raise ValueError(f"Can not map tensor {name!r}")
        return new_name

    def set_gguf_parameters(self):
        raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
        return [(self.map_tensor_name(name), data_torch)]

    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
        del name, new_name, bid, n_dims  # unused

        return False

    # some models need extra generated tensors (like rope_freqs)
    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        return ()

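    # prepare_tensors() drives the per-tensor conversion: dequantize the source
    # model, let the subclass rename or split tensors via modify_tensors(), pick
    # an output quantization type (F32 for 1D/norm tensors and a fixed list of
    # sensitive tensor types, otherwise whatever --outtype requested), then
    # quantize with gguf.quants.quantize() (falling back to F16 on QuantError)
    # and hand the result to the GGUF writer.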
    def prepare_tensors(self):
        self.dequant_model()

        # Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
        if self.tensor_map.mapping:
            max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
        else:
            max_name_len = len("vision_encoder.weight,")  # Default reasonable length

        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            # use the first number-like part of the tensor name as the block id
            bid = None
            for part in name.split("."):
                if part.isdecimal():
                    bid = int(part)
                    break

            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
                # TODO: why do we squeeze here?
                # data = data_torch.squeeze().numpy()
                data = data_torch.numpy()

                n_dims = len(data.shape)
                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)

                # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
                if n_dims <= 1 or new_name.endswith("_norm.weight"):
                    data_qtype = gguf.GGMLQuantizationType.F32

                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
                # Some tensor types are always in float32
                if data_qtype is False and (
                    any(
                        self.match_model_tensor_name(new_name, key, bid)
                        for key in (
                            gguf.MODEL_TENSOR.FFN_GATE_INP,
                            gguf.MODEL_TENSOR.POS_EMBD,
                            gguf.MODEL_TENSOR.TOKEN_TYPES,
                            gguf.MODEL_TENSOR.SSM_CONV1D,
                            gguf.MODEL_TENSOR.SHORTCONV_CONV,
                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
                            gguf.MODEL_TENSOR.TIME_MIX_W1,
                            gguf.MODEL_TENSOR.TIME_MIX_W2,
                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
                            gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
                            gguf.MODEL_TENSOR.POSNET_NORM1,
                            gguf.MODEL_TENSOR.POSNET_NORM2,
                            gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
                            gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
                            gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
                            gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
                            # Kimi KDA conv weights should be F32
                            gguf.MODEL_TENSOR.SSM_CONV1D_Q,
                            gguf.MODEL_TENSOR.SSM_CONV1D_K,
                            gguf.MODEL_TENSOR.SSM_CONV1D_V,
                        )
                    )
                    or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
                ):
                    data_qtype = gguf.GGMLQuantizationType.F32

                if data_qtype is False and any(
                    self.match_model_tensor_name(new_name, key, bid)
                    for key in (
                        gguf.MODEL_TENSOR.TOKEN_EMBD,
                        gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
                        gguf.MODEL_TENSOR.OUTPUT,
                        gguf.MODEL_TENSOR.ALTUP_ROUTER,
                        gguf.MODEL_TENSOR.LAUREL_L,
                        gguf.MODEL_TENSOR.LAUREL_R,
                    )
                ):
                    if self.ftype in (
                        gguf.LlamaFileType.MOSTLY_TQ1_0,
                        gguf.LlamaFileType.MOSTLY_TQ2_0,
                    ):
                        # TODO: use Q4_K and Q6_K
                        data_qtype = gguf.GGMLQuantizationType.F16

                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                if isinstance(data_qtype, bool):
                    if self.ftype == gguf.LlamaFileType.ALL_F32:
                        data_qtype = gguf.GGMLQuantizationType.F32
                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                        data_qtype = gguf.GGMLQuantizationType.F16
                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                        data_qtype = gguf.GGMLQuantizationType.BF16
                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                        data_qtype = gguf.GGMLQuantizationType.Q8_0
                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
                    else:
                        raise ValueError(f"Unknown file type: {self.ftype.name}")

                try:
                    data = gguf.quants.quantize(data, data_qtype)
                except gguf.QuantError as e:
                    logger.warning("%s, %s", e, "falling back to F16")
                    data_qtype = gguf.GGMLQuantizationType.F16
                    data = gguf.quants.quantize(data, data_qtype)

                shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape

                # reverse shape to make it similar to the internal ggml dimension order
                shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"

                # n_dims is implicit in the shape
                logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

                self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)

    def set_type(self):
        self.gguf_writer.add_type(gguf.GGUFType.MODEL)

    def prepare_metadata(self, vocab_only: bool):

        total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()

        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)

        # If we are using HF model id, set the metadata name to the model id
        if self.remote_hf_model_id:
            self.metadata.name = self.remote_hf_model_id

        # Fallback to model directory name if metadata name is still missing
        if self.metadata.name is None:
            self.metadata.name = self.dir_model.name

        # Generate parameter weight class (useful for leaderboards) if not yet determined
        if self.metadata.size_label is None and total_params > 0:
            self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)

        self.set_type()

        logger.info("Set meta model")
        self.metadata.set_gguf_meta_model(self.gguf_writer)

        logger.info("Set model parameters")
        self.set_gguf_parameters()

        logger.info("Set model quantization version")
        self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

    def write_vocab(self):
        raise NotImplementedError("write_vocab() must be implemented in subclasses")

    def write(self):
        self.prepare_tensors()
        self.prepare_metadata(vocab_only=False)
        self.gguf_writer.write_header_to_file(path=self.fname_out)
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.write_tensors_to_file(progress=True)
        self.gguf_writer.close()

    @staticmethod
    def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
        part_names: list[str] = []
        for filename in os.listdir(dir_model):
            if filename.startswith(prefix) and filename.endswith(suffix):
                part_names.append(filename)

        part_names.sort()

        return part_names

    @staticmethod
    def load_hparams(dir_model: Path, is_mistral_format: bool):
        if is_mistral_format:
            with open(dir_model / "params.json", "r", encoding="utf-8") as f:
                config = json.load(f)
            return config

        try:
            # for security reasons, we don't allow loading remote code by default
            # if a model needs remote code, we fall back to config.json
            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
        except Exception as e:
            logger.warning(f"Failed to load model config from {dir_model}: {e}")
            logger.warning("Trying to load config.json instead")
            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
                config = json.load(f)
        if "llm_config" in config:
            # rename for InternVL
            config["text_config"] = config["llm_config"]
        if "lm_config" in config:
            # rename for GlmASR
            config["text_config"] = config["lm_config"]
        if "thinker_config" in config:
            # rename for Qwen2.5-Omni
            config["text_config"] = config["thinker_config"]["text_config"]
        if "lfm" in config:
            # rename for LFM2-Audio
            config["text_config"] = config["lfm"]
        return config

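    # register() is the decorator that populates _model_classes; converters for
    # new architectures attach themselves like this (illustrative sketch, not a
    # class defined in this excerpt):
    #
    #   @ModelBase.register("MyModelForCausalLM")
    #   class MyModel(TextModel):
    #       model_arch = gguf.MODEL_ARCH.LLAMA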
    @classmethod
    def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
        assert names

        def func(modelcls: AnyModel) -> AnyModel:
            model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT
            for name in names:
                cls._model_classes[model_type][name] = modelcls
            return modelcls
        return func

    @classmethod
    def print_registered_models(cls):
        for model_type, model_classes in cls._model_classes.items():
            logger.error(f"{model_type.name} models:")
            for name in sorted(model_classes.keys()):
                logger.error(f" - {name}")

    @classmethod
    def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]:
        try:
            return cls._model_classes[model_type][arch]
        except KeyError:
            raise NotImplementedError(f'Architecture {arch!r} not supported!') from None


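# TextModel is the base class for language-model converters. Its constructor
# flattens a nested "text_config" into the root hparams, resolves the block
# count and tensor name map, and normalizes the various RoPE parameter
# spellings into self.rope_parameters so that set_gguf_parameters() can treat
# them uniformly.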
class TextModel(ModelBase):
    model_type = ModelType.TEXT
    hf_arch: str

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if not self.is_mistral_format:
            self.hf_arch = get_model_architecture(self.hparams, self.model_type)
        else:
            self.hf_arch = ""

        if "text_config" in self.hparams:
            # move the text_config to the root level
            self.hparams = {**self.hparams, **self.hparams["text_config"]}

        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

        self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}

        rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)

        # Ensure "rope_theta" and "rope_type" are mirrored in rope_parameters
        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
            if local_rope_theta is not None:
                self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
            if "rope_theta" not in self.rope_parameters and rope_theta is not None:
                self.rope_parameters["rope_theta"] = rope_theta
            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
                self.rope_parameters["rope_type"] = rope_type

    @classmethod
    def __init_subclass__(cls):
        # can't use an abstract property, because overriding it without type errors
        # would require using decorated functions instead of simply defining the property
        if "model_arch" not in cls.__dict__:
            raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")

    def set_vocab(self):
        self._set_vocab_gpt2()

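    # prepare_metadata() also resolves the output filename: if fname_out points
    # at a directory, a default name is generated from the metadata via
    # gguf.naming_convention() (yielding something like
    # "Llama-3.2-3B-Instruct-F16.gguf", illustrative only); otherwise the given
    # name is treated as a template and the output type is filled in.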
    def prepare_metadata(self, vocab_only: bool):
        super().prepare_metadata(vocab_only=vocab_only)

        total_params = self.gguf_writer.get_total_parameter_count()[0]
        # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
        output_type: str = self.ftype.name.partition("_")[2]

        # Filename Output
        if self.fname_out.is_dir():
            # Generate default filename based on model specification and available metadata
            if not vocab_only:
                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
            else:
                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")

            # Use the default filename
            self.fname_out = self.fname_out / f"{fname_default}.gguf"
        else:
            # Output path is a custom defined templated filename
            # Note: `not is_dir()` is used because `.is_file()` will not detect
            # file template strings as it doesn't actually exist as a file

            # Process templated file name with the output ftype, useful with the "auto" ftype
            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)

        logger.info("Set model tokenizer")
        self.set_vocab()

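    # set_gguf_parameters() maps HF hyperparameters onto GGUF key/value pairs.
    # find_hparam() takes a list of candidate spellings because checkpoints are
    # inconsistent (for example, the embedding width may appear as
    # "hidden_size", "n_embd" or "dim"), and each optional value is only
    # written when present.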
831 def set_gguf_parameters(self):
832 self.gguf_writer.add_block_count(self.block_count)
833
834 if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None:
835 self.gguf_writer.add_context_length(n_ctx)
836 logger.info(f"gguf: context length = {n_ctx}")
837
838 if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None:
839 self.gguf_writer.add_embedding_length(n_embd)
840 logger.info(f"gguf: embedding length = {n_embd}")
841
842 if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
843 self.gguf_writer.add_feed_forward_length(n_ff)
844 logger.info(f"gguf: feed forward length = {n_ff}")
845
846 if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None:
847 self.gguf_writer.add_head_count(n_head)
848 logger.info(f"gguf: head count = {n_head}")
849
850 if (n_head_kv := self.find_hparam(["num_key_value_heads", "n_kv_heads"], optional=True)) is not None:
851 self.gguf_writer.add_head_count_kv(n_head_kv)
852 logger.info(f"gguf: key-value head count = {n_head_kv}")
853
854 # TODO: Handle "sliding_attention" similarly when models start implementing it
855 rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
856 if (rope_type := rope_params.get("rope_type")) is not None:
857 rope_factor = rope_params.get("factor")
858 rope_gguf_type = gguf.RopeScalingType.NONE
859 if rope_type == "linear" and rope_factor is not None:
860 rope_gguf_type = gguf.RopeScalingType.LINEAR
861 self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
862 self.gguf_writer.add_rope_scaling_factor(rope_factor)
863 elif rope_type == "yarn" and rope_factor is not None:
864 rope_gguf_type = gguf.RopeScalingType.YARN
865 self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
866 self.gguf_writer.add_rope_scaling_factor(rope_factor)
867 self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
868 if (yarn_ext_factor := rope_params.get("extrapolation_factor")) is not None:
869 self.gguf_writer.add_rope_scaling_yarn_ext_factor(yarn_ext_factor)
870 if (yarn_attn_factor := rope_params.get("attention_factor", rope_params.get("attn_factor"))) is not None:
871 self.gguf_writer.add_rope_scaling_yarn_attn_factor(yarn_attn_factor)
872 if (yarn_beta_fast := rope_params.get("beta_fast")) is not None:
873 self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_beta_fast)
874 if (yarn_beta_slow := rope_params.get("beta_slow")) is not None:
875 self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_beta_slow)
876 # self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
877 elif rope_type == "su" or rope_type == "longrope":
878 rope_gguf_type = gguf.RopeScalingType.LONGROPE
879 self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
880 elif rope_type == "dynamic":
881 # HunYuan, handled in model class
882 pass
883 elif rope_type.lower() == "llama3":
884 # Handled in generate_extra_tensors
885 pass
886 else:
887 logger.warning(f"Unknown RoPE type: {rope_type}")
888 logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}")
889
890 if "mrope_section" in self.rope_parameters:
891 mrope_section = self.rope_parameters["mrope_section"]
892 # Pad to 4 dimensions [time, height, width, extra]
893 while len(mrope_section) < 4:
894 mrope_section.append(0)
895 self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
896 logger.info(f"gguf: mrope sections: {mrope_section[:4]}")
897
898 if (rope_theta := rope_params.get("rope_theta")) is not None:
899 self.gguf_writer.add_rope_freq_base(rope_theta)
900 logger.info(f"gguf: rope theta = {rope_theta}")
901 if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None:
902 self.gguf_writer.add_rope_freq_base_swa(local_rope_theta)
903 logger.info(f"gguf: rope theta swa = {local_rope_theta}")
904 if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
905 self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
906 logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
907 if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
908 self.gguf_writer.add_layer_norm_eps(f_norm_eps)
909 logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
910 if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
911 self.gguf_writer.add_expert_count(n_experts)
912 logger.info(f"gguf: expert count = {n_experts}")
913 if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None:
914 self.gguf_writer.add_expert_used_count(n_experts_used)
915 logger.info(f"gguf: experts used count = {n_experts_used}")
916 if (n_expert_groups := self.hparams.get("n_group")) is not None:
917 self.gguf_writer.add_expert_group_count(n_expert_groups)
918 logger.info(f"gguf: expert groups count = {n_expert_groups}")
919 if (n_group_used := self.hparams.get("topk_group")) is not None:
920 self.gguf_writer.add_expert_group_used_count(n_group_used)
921 logger.info(f"gguf: expert groups used count = {n_group_used}")
922
923 if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None:
924 if score_func == "sigmoid":
925 self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
926 elif score_func == "softmax":
927 self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
928 else:
929 raise ValueError(f"Unsupported expert score gating function value: {score_func}")
930 logger.info(f"gguf: expert score gating function = {score_func}")
931
932 if (head_dim := self.hparams.get("head_dim")) is not None:
933 self.gguf_writer.add_key_length(head_dim)
934 self.gguf_writer.add_value_length(head_dim)
935
936 self.gguf_writer.add_file_type(self.ftype)
937 logger.info(f"gguf: file type = {self.ftype}")
938
939 def write_vocab(self):
940 if len(self.gguf_writer.tensors) != 1:
941 raise ValueError('Splitting the vocabulary is not supported')
942
943 self.prepare_metadata(vocab_only=True)
944 self.gguf_writer.write_header_to_file(path=self.fname_out)
945 self.gguf_writer.write_kv_data_to_file()
946 self.gguf_writer.close()
947
948 def does_token_look_special(self, token: str | bytes) -> bool:
949 if isinstance(token, (bytes, bytearray)):
950 token_text = token.decode(encoding="utf-8")
951 elif isinstance(token, memoryview):
952 token_text = token.tobytes().decode(encoding="utf-8")
953 else:
954 token_text = token
955
956 # Some models mark some added tokens which ought to be control tokens as not special.
957 # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
958 seems_special = token_text in (
959 "<pad>", # deepseek-coder
960 "<mask>", "<2mass>", "[@BOS@]", # gemma{,-2}
961 )
962
963 seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
964 seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>")) # deepseek-coder
965
966 # TODO: should these be marked as UNUSED instead? (maybe not)
967 seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">")) # gemma{,-2}
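# For instance, tokens such as "<|im_start|>" or "<unused21>" match the patterns
# above and are therefore treated as special-looking control tokens.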
968
969 return seems_special
970
971 # used for GPT-2 BPE and WordPiece vocabs
972 def get_vocab_base(self) -> tuple[list[str], list[int], str]:
973 tokens: list[str] = []
974 toktypes: list[int] = []
975
976 from transformers import AutoTokenizer
977 tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
978 vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
979 assert max(tokenizer.vocab.values()) < vocab_size
980
981 tokpre = self.get_vocab_base_pre(tokenizer)
982
983 reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
984 added_vocab = tokenizer.get_added_vocab()
985
986 added_tokens_decoder = tokenizer.added_tokens_decoder
987
988 for i in range(vocab_size):
989 if i not in reverse_vocab:
990 tokens.append(f"[PAD{i}]")
991 toktypes.append(gguf.TokenType.UNUSED)
992 else:
993 token: str = reverse_vocab[i]
994 if token in added_vocab:
995 # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
996 # To avoid unexpected issues, we make sure to normalize non-normalized tokens
997 if not added_tokens_decoder[i].normalized:
998 previous_token = token
999 token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
1000 if previous_token != token:
1001 logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
1002
1003 if added_tokens_decoder[i].special or self.does_token_look_special(token):
1004 toktypes.append(gguf.TokenType.CONTROL)
1005 else:
1006 # NOTE: this was added for Gemma.
1007 # Encoding and decoding the tokens above isn't sufficient for this case.
1008 token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
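# e.g. a user-defined token "▁Hello" (U+2581 LOWER ONE EIGHTH BLOCK) becomes " Hello" here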
1009 toktypes.append(gguf.TokenType.USER_DEFINED)
1010 else:
1011 toktypes.append(gguf.TokenType.NORMAL)
1012 tokens.append(token)
1013
1014 return tokens, toktypes, tokpre
1015
1016 # NOTE: this function is generated by convert_hf_to_gguf_update.py
1017 # do not modify it manually!
1018 # ref: https://github.com/ggml-org/llama.cpp/pull/6920
1019 # Marker: Start get_vocab_base_pre
1020 def get_vocab_base_pre(self, tokenizer) -> str:
1021 # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
1022 # is specific to the BPE pre-tokenizer used by the model
1023 # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
1024 # use in llama.cpp to implement the same pre-tokenizer
1025
1026 chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
1027
1028 chktok = tokenizer.encode(chktxt)
1029 chkhsh = sha256(str(chktok).encode()).hexdigest()
1030
1031 logger.debug(f"chktok: {chktok}")
1032 logger.debug(f"chkhsh: {chkhsh}")
1033
1034 res = None
1035
1036 # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
1037 # or pull the latest version of the model from Huggingface
1038 # don't edit the hashes manually!
1039 if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
1040 # ref: https://huggingface.co/THUDM/glm-4-9b-chat
1041 res = "chatglm-bpe"
1042 if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
1043 # ref: https://huggingface.co/THUDM/glm-4-9b-chat
1044 res = "chatglm-bpe"
1045 if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
1046 # ref: https://huggingface.co/THUDM/glm-4-9b-hf
1047 res = "glm4"
1048 if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
1049 # ref: https://huggingface.co/zai-org/GLM-4.5-Air
1050 res = "glm4"
1051 if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
1052 # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
1053 res = "minerva-7b"
1054 if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
1055 # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
1056 res = "hunyuan"
1057 if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6":
1058 # ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct
1059 res = "hunyuan-dense"
1060 if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
1061 # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
1062 res = "falcon-h1"
1063 if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
1064 # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
1065 res = "falcon-h1"
1066 if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
1067 # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
1068 res = "falcon-h1"
1069 if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
1070 # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
1071 res = "falcon-h1"
1072 if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
1073 # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
1074 res = "kimi-k2"
1075 if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
1076 # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
1077 res = "qwen2"
1078 if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
1079 # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
1080 res = "grok-2"
1081 if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
1082 # ref: https://huggingface.co/aari1995/German_Semantic_V3
1083 res = "jina-v2-de"
1084 if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267":
1085 # ref: https://huggingface.co/zai-org/GLM-4.7-Flash
1086 res = "glm4"
1087 if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
1088 # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
1089 res = "llama-bpe"
1090 if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
1091 # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
1092 res = "deepseek-llm"
1093 if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
1094 # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
1095 res = "deepseek-coder"
1096 if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
1097 # ref: https://huggingface.co/tiiuae/falcon-7b
1098 res = "falcon"
1099 if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
1100 # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
1101 res = "bert-bge"
1102 if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
1103 # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
1104 res = "falcon3"
1105 if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
1106 # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
1107 res = "bert-bge-large"
1108 if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
1109 # ref: https://huggingface.co/mosaicml/mpt-7b
1110 res = "mpt"
1111 if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
1112 # ref: https://huggingface.co/bigcode/starcoder2-3b
1113 res = "starcoder"
1114 if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
1115 # ref: https://huggingface.co/openai-community/gpt2
1116 res = "gpt-2"
1117 if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
1118 # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
1119 res = "stablelm2"
1120 if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
1121 # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
1122 res = "refact"
1123 if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
1124 # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
1125 res = "command-r"
1126 if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
1127 # ref: https://huggingface.co/Qwen/Qwen1.5-7B
1128 res = "qwen2"
1129 if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
1130 # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
1131 res = "olmo"
1132 if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
1133 # ref: https://huggingface.co/databricks/dbrx-base
1134 res = "dbrx"
1135 if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
1136 # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
1137 res = "jina-v1-en"
1138 if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
1139 # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
1140 res = "jina-v2-en"
1141 if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
1142 # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
1143 res = "jina-v2-es"
1144 if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
1145 # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
1146 res = "jina-v2-de"
1147 if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
1148 # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
1149 res = "smaug-bpe"
1150 if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
1151 # ref: https://huggingface.co/LumiOpen/Poro-34B-chat
1152 res = "poro-chat"
1153 if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
1154 # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
1155 res = "jina-v2-code"
1156 if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
1157 # ref: https://huggingface.co/LumiOpen/Viking-7B
1158 res = "viking"
1159 if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
1160 # ref: https://huggingface.co/core42/jais-13b
1161 res = "jais"
1162 if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
1163 # ref: https://huggingface.co/WisdomShell/CodeShell-7B
1164 res = "codeshell"
1165 if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
1166 # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
1167 res = "tekken"
1168 if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
1169 # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
1170 res = "smollm"
1171 if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
1172 # ref: https://huggingface.co/bigscience/bloom
1173 res = "bloom"
1174 if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
1175 # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
1176 res = "gpt3-finnish"
1177 if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
1178 # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
1179 res = "exaone"
1180 if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
1181 # ref: https://huggingface.co/microsoft/phi-2
1182 res = "phi-2"
1183 if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
1184 # ref: https://huggingface.co/facebook/chameleon-7b
1185 res = "chameleon"
1186 if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
1187 # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
1188 res = "roberta-bpe"
1189 if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
1190 # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
1191 res = "gigachat"
1192 if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
1193 # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
1194 res = "megrez"
1195 if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
1196 # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
1197 res = "deepseek-v3"
1198 if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
1199 # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
1200 res = "deepseek-r1-qwen"
1201 if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
1202 # ref: https://huggingface.co/Xenova/gpt-4o
1203 res = "gpt-4o"
1204 if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
1205 # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
1206 res = "superbpe"
1207 if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
1208 # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview
1209 res = "trillion"
1210 if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
1211 # ref: https://huggingface.co/inclusionAI/Ling-lite
1212 res = "bailingmoe"
1213 if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
1214 # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
1215 res = "llama4"
1216 if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
1217 # ref: https://huggingface.co/mistral-community/pixtral-12b
1218 res = "pixtral"
1219 if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
1220 # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
1221 res = "seed-coder"
1222 if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
1223 # ref: https://huggingface.co/skt/A.X-4.0
1224 res = "a.x-4.0"
1225 if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
1226 # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
1227 res = "midm-2.0"
1228 if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
1229 # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
1230 res = "lfm2"
1231 if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
1232 # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
1233 res = "exaone4"
1234 if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
1235 # ref: https://huggingface.co/JetBrains/Mellum-4b-base
1236 res = "mellum"
1237 if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152":
1238 # ref: https://huggingface.co/answerdotai/ModernBERT-base
1239 res = "modern-bert"
1240 if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df":
1241 # ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer
1242 res = "afmoe"
1243 if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
1244 # ref: https://huggingface.co/inclusionAI/Ling-mini-base-2.0
1245 res = "bailingmoe2"
1246 if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
1247 # ref: https://huggingface.co/ibm-granite/granite-docling-258M
1248 res = "granite-docling"
1249 if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
1250 # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
1251 res = "minimax-m2"
1252 if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
1253 # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
1254 res = "kormo"
1255 if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
1256 # ref: https://huggingface.co/tencent/Youtu-LLM-2B
1257 res = "youtu"
1258 if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91":
1259 # ref: https://huggingface.co/upstage/Solar-Open-100B
1260 res = "solar-open"
1261 if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f":
1262 # ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B
1263 res = "exaone-moe"
1264 if chkhsh == "d30d75d9059f1aa2c19359de71047b3ae408c70875e8a3ccf8c5fba56c9d8af4":
1265 # ref: https://huggingface.co/Qwen/Qwen3.5-9B-Instruct
1266 res = "qwen35"
1267
1268 if res is None:
1269 logger.warning("\n")
1270 logger.warning("**************************************************************************************")
1271 logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
1272 logger.warning("** There are 2 possible reasons for this:")
1273 logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
1274 logger.warning("** - the pre-tokenization config has changed upstream")
1275 logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
1276 logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920")
1277 logger.warning("**")
1278 logger.warning(f"** chkhsh: {chkhsh}")
1279 logger.warning("**************************************************************************************")
1280 logger.warning("\n")
1281 raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
1282
1283 logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
1284 logger.debug(f"chkhsh: {chkhsh}")
1285
1286 return res
1287 # Marker: End get_vocab_base_pre
1288
1289 def _set_vocab_none(self) -> None:
1290 self.gguf_writer.add_tokenizer_model("none")
1291
1292 def _set_vocab_gpt2(self) -> None:
1293 tokens, toktypes, tokpre = self.get_vocab_base()
1294 self.gguf_writer.add_tokenizer_model("gpt2")
1295 self.gguf_writer.add_tokenizer_pre(tokpre)
1296 self.gguf_writer.add_token_list(tokens)
1297 self.gguf_writer.add_token_types(toktypes)
1298
1299 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
1300 special_vocab.add_to_gguf(self.gguf_writer)
1301
1302 def _set_vocab_qwen(self):
1303 dir_model = self.dir_model
1304 hparams = self.hparams
1305 tokens: list[str] = []
1306 toktypes: list[int] = []
1307
1308 from transformers import AutoTokenizer
1309 tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
1310 vocab_size = hparams["vocab_size"]
1311 assert max(tokenizer.get_vocab().values()) < vocab_size
1312
1313 tokpre = self.get_vocab_base_pre(tokenizer)
1314
1315 merges = []
1316 vocab = {}
1317 mergeable_ranks = tokenizer.mergeable_ranks
1318 for token, rank in mergeable_ranks.items():
1319 vocab[QwenModel.token_bytes_to_string(token)] = rank
1320 if len(token) == 1:
1321 continue
1322 merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
1323 assert len(merged) == 2
1324 merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
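# Illustration (hypothetical ranks): if mergeable_ranks maps b"a" -> 0, b"b" -> 1
# and b"ab" -> 2, then QwenModel.bpe(mergeable_ranks, b"ab", max_rank=2) splits the
# token back into (b"a", b"b") and the recorded merge entry is "a b".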
1325
1326 # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
1327 added_vocab = tokenizer.special_tokens
1328 reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
1329
1330 for i in range(vocab_size):
1331 if i not in reverse_vocab:
1332 tokens.append(f"[PAD{i}]")
1333 toktypes.append(gguf.TokenType.UNUSED)
1334 elif reverse_vocab[i] in added_vocab:
1335 tokens.append(reverse_vocab[i])
1336 toktypes.append(gguf.TokenType.CONTROL)
1337 else:
1338 tokens.append(reverse_vocab[i])
1339 toktypes.append(gguf.TokenType.NORMAL)
1340
1341 self.gguf_writer.add_tokenizer_model("gpt2")
1342 self.gguf_writer.add_tokenizer_pre(tokpre)
1343 self.gguf_writer.add_token_list(tokens)
1344 self.gguf_writer.add_token_types(toktypes)
1345
1346 special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
1347 special_vocab.merges = merges
1348 # only add special tokens when they were not already loaded from config.json
1349 if len(special_vocab.special_token_ids) == 0:
1350 special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
1351 special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
1352 # this one is usually not in config.json anyway
1353 special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
1354 special_vocab.add_to_gguf(self.gguf_writer)
1355
1356 def _set_vocab_sentencepiece(self, add_to_gguf=True):
1357 tokens, scores, toktypes = self._create_vocab_sentencepiece()
1358
1359 self.gguf_writer.add_tokenizer_model("llama")
1360 self.gguf_writer.add_tokenizer_pre("default")
1361 self.gguf_writer.add_token_list(tokens)
1362 self.gguf_writer.add_token_scores(scores)
1363 self.gguf_writer.add_token_types(toktypes)
1364
1365 special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
1366 special_vocab.add_to_gguf(self.gguf_writer)
1367
1368 def _create_vocab_sentencepiece(self):
1369 from sentencepiece import SentencePieceProcessor
1370
1371 tokenizer_path = self.dir_model / 'tokenizer.model'
1372
1373 if not tokenizer_path.is_file():
1374 raise FileNotFoundError(f"File not found: {tokenizer_path}")
1375
1376 tokenizer = SentencePieceProcessor()
1377 tokenizer.LoadFromFile(str(tokenizer_path))
1378
1379 vocab_size = self.find_hparam([
1380 "vocab_size_per_layer_input", # gemma3n
1381 "vocab_size",
1382 ], optional=True) or tokenizer.vocab_size()
1383
1384 tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
1385 scores: list[float] = [-10000.0] * vocab_size
1386 toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
1387
1388 for token_id in range(tokenizer.vocab_size()):
1389 if token_id >= vocab_size:
1390 logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
1391 break
1392
1393 piece = tokenizer.IdToPiece(token_id)
1394 text = piece.encode("utf-8")
1395 score = tokenizer.GetScore(token_id)
1396
1397 toktype = SentencePieceTokenTypes.NORMAL
1398 if tokenizer.IsUnknown(token_id):
1399 toktype = SentencePieceTokenTypes.UNKNOWN
1400 elif tokenizer.IsControl(token_id):
1401 toktype = SentencePieceTokenTypes.CONTROL
1402 elif tokenizer.IsUnused(token_id):
1403 toktype = SentencePieceTokenTypes.UNUSED
1404 elif tokenizer.IsByte(token_id):
1405 toktype = SentencePieceTokenTypes.BYTE
1406
1407 tokens[token_id] = text
1408 scores[token_id] = score
1409 toktypes[token_id] = toktype
1410
1411 added_tokens_file = self.dir_model / 'added_tokens.json'
1412 if added_tokens_file.is_file():
1413 with open(added_tokens_file, "r", encoding="utf-8") as f:
1414 added_tokens_json = json.load(f)
1415 for key in added_tokens_json:
1416 token_id = added_tokens_json[key]
1417 if token_id >= vocab_size:
1418 logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
1419 continue
1420
1421 tokens[token_id] = key.encode("utf-8")
1422 scores[token_id] = -1000.0
1423 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
1424
1425 tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
1426 if tokenizer_config_file.is_file():
1427 with open(tokenizer_config_file, "r", encoding="utf-8") as f:
1428 tokenizer_config_json = json.load(f)
1429 added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
1430 for token_id, token_data in added_tokens_decoder.items():
1431 token_id = int(token_id)
1432 token: str = token_data["content"]
1433 if token_id >= vocab_size:
1434 logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
1435 continue
1436 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
1437 if tokens[token_id] != token.encode("utf-8"):
1438 logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
1439 if token_data.get("special") or self.does_token_look_special(token):
1440 toktypes[token_id] = SentencePieceTokenTypes.CONTROL
1441 else:
1442 token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
1443 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
1444
1445 scores[token_id] = -1000.0
1446 tokens[token_id] = token.encode("utf-8")
1447
1448 if vocab_size > len(tokens):
1449 pad_count = vocab_size - len(tokens)
1450 logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
1451 for i in range(1, pad_count + 1):
1452 tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
1453 scores.append(-1000.0)
1454 toktypes.append(SentencePieceTokenTypes.UNUSED)
1455
1456 return tokens, scores, toktypes
1457
1458 def _set_vocab_llama_hf(self):
1459 vocab = gguf.LlamaHfVocab(self.dir_model)
1460 tokens = []
1461 scores = []
1462 toktypes = []
1463
1464 for text, score, toktype in vocab.all_tokens():
1465 tokens.append(text)
1466 scores.append(score)
1467 toktypes.append(toktype)
1468
1469 assert len(tokens) == vocab.vocab_size
1470
1471 self.gguf_writer.add_tokenizer_model("llama")
1472 self.gguf_writer.add_tokenizer_pre("default")
1473 self.gguf_writer.add_token_list(tokens)
1474 self.gguf_writer.add_token_scores(scores)
1475 self.gguf_writer.add_token_types(toktypes)
1476
1477 special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
1478 special_vocab.add_to_gguf(self.gguf_writer)
1479
1480 def _set_vocab_rwkv_world(self):
1481 assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
1482 vocab_size = self.hparams.get("vocab_size", 65536)
1483
1484 tokens: list[bytes] = ['<s>'.encode("utf-8")]
1485 toktypes: list[int] = [gguf.TokenType.CONTROL]
1486
1487 with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
1488 lines = f.readlines()
1489 for line in lines:
1490 parts = line.split(' ')
1491 assert len(parts) >= 3
1492 token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
1493 token = token.encode("utf-8") if isinstance(token, str) else token
1494 assert isinstance(token, bytes)
1495 assert len(token) == token_len
1496 token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
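# e.g. repr(b"\xff") == "b'\\xff'", so the slice keeps the escaped text "\\xff";
# this lets byte sequences that are not valid UTF-8 be stored as printable strings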
1497 tokens.append(token_text.encode("utf-8"))
1498 toktypes.append(gguf.TokenType.NORMAL)
1499 remainder = vocab_size - len(tokens)
1500 assert remainder >= 0
1501 for i in range(len(tokens), vocab_size):
1502 tokens.append(f"[PAD{i}]".encode("utf-8"))
1503 toktypes.append(gguf.TokenType.UNUSED)
1504
1505 self.gguf_writer.add_tokenizer_model("rwkv")
1506 self.gguf_writer.add_token_list(tokens)
1507 self.gguf_writer.add_token_types(toktypes)
1508 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
1509 if special_vocab.chat_template is None:
1510 template_path = Path(__file__).parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja"
1511 if template_path.is_file():
1512 with open(template_path, "r", encoding="utf-8") as f:
1513 template = f.read()
1514 else:
1515 template = "rwkv-world"
1516 special_vocab.chat_template = template
1517 # hack: Add '\n\n' as the EOT token to make it chat normally
1518 special_vocab._set_special_token("eot", 261)
1519 # hack: Override these as they have already been set (incorrectly)
1520 special_vocab.special_token_ids["bos"] = 0
1521 special_vocab.special_token_ids["eos"] = 0
1522
1523 special_vocab.add_to_gguf(self.gguf_writer)
1524
1525 def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
1526 tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
1527 logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
1528 vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
1529
1530 default_pre = "mpt" if model_name == "gpt-neox" else "default"
1531
1532 field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
1533 assert field # tokenizer model
1534 self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
1535
1536 field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
1537 self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
1538
1539 field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
1540 assert field # token list
1541 self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
1542
1543 if model_name == "llama-spm":
1544 field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
1545 assert field # token scores
1546 self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
1547
1548 field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
1549 assert field # token types
1550 self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
1551
1552 if model_name != "llama-spm":
1553 field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
1554 assert field # token merges
1555 self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
1556
1557 if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
1558 self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
1559 if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
1560 self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
1561 if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
1562 self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
1563 if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
1564 self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
1565 if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
1566 self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
1567 if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
1568 self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
1569
1570 def _try_set_pooling_type(self) -> None:
1571 # get pooling path
1572 pooling_path = None
1573 module_path = self.dir_model / "modules.json"
1574 if module_path.is_file():
1575 with open(module_path, encoding="utf-8") as f:
1576 modules = json.load(f)
1577 for mod in modules:
1578 if mod["type"] == "sentence_transformers.models.Pooling":
1579 pooling_path = mod["path"]
1580 break
1581
1582 # get pooling type
1583 if pooling_path is not None:
1584 with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
1585 pooling = json.load(f)
1586 if pooling["pooling_mode_mean_tokens"]:
1587 pooling_type = gguf.PoolingType.MEAN
1588 elif pooling["pooling_mode_cls_token"]:
1589 pooling_type = gguf.PoolingType.CLS
1590 elif pooling["pooling_mode_lasttoken"]:
1591 pooling_type = gguf.PoolingType.LAST
1592 else:
1593 raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
1594 self.gguf_writer.add_pooling_type(pooling_type)
1595
1596 def _set_vocab_glmedge(self):
1597 from transformers import AutoTokenizer
1598 tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
1599 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
1600 tokens, toktypes, tokpre = self.get_vocab_base()
1601 self.gguf_writer.add_tokenizer_model("gpt2")
1602 self.gguf_writer.add_tokenizer_pre(tokpre)
1603 self.gguf_writer.add_token_list(tokens)
1604 self.gguf_writer.add_token_types(toktypes)
1605 special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
1606 special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
1607 special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
1608 special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
1609 special_vocab.add_to_gguf(self.gguf_writer)
1610
1611 def _set_vocab_interns1(self):
1612 tokens: list[str] = []
1613 toktypes: list[int] = []
1614
1615 from transformers import AutoTokenizer
1616 tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
1617 vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
1618 vocab_size = self.hparams.get("vocab_size", len(vocab))
1619 assert max(vocab.values()) < vocab_size
1620
1621 tokpre = self.get_vocab_base_pre(tokenizer)
1622
1623 reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
1624 added_vocab = tokenizer.get_added_vocab()
1625
1626 added_tokens_decoder = tokenizer.added_tokens_decoder
1627
1628 for i in range(vocab_size):
1629 if i not in reverse_vocab:
1630 tokens.append(f"[PAD{i}]")
1631 toktypes.append(gguf.TokenType.UNUSED)
1632 else:
1633 token: str = reverse_vocab[i]
1634 if token in added_vocab:
1635 # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
1636 # To avoid unexpected issues, we make sure to normalize non-normalized tokens
1637 if not added_tokens_decoder[i].normalized:
1638 previous_token = token
1639 token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
1640 if previous_token != token:
1641 logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
1642
1643 if added_tokens_decoder[i].special or self.does_token_look_special(token):
1644 toktypes.append(gguf.TokenType.CONTROL)
1645 else:
1646 toktypes.append(gguf.TokenType.USER_DEFINED)
1647 else:
1648 toktypes.append(gguf.TokenType.NORMAL)
1649 tokens.append(token)
1650
1651 self.gguf_writer.add_tokenizer_model("gpt2")
1652 self.gguf_writer.add_tokenizer_pre(tokpre)
1653 self.gguf_writer.add_token_list(tokens)
1654 self.gguf_writer.add_token_types(toktypes)
1655
1656 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
1657 special_vocab._set_special_token("bos", 151643)
1658 special_vocab.add_to_gguf(self.gguf_writer)
1659
1660 def _set_vocab_mistral(self):
1661 if not _mistral_common_installed:
1662 raise ImportError(_mistral_import_error_msg)
1663
1664 vocab = MistralVocab(self.dir_model)
1665 logger.info(
1666 f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
1667 )
1668
1669 self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
1670
1671 tokens = []
1672 scores = []
1673 toktypes = []
1674
1675 for text, score, toktype in vocab.all_tokens():
1676 tokens.append(text)
1677 scores.append(score)
1678 toktypes.append(toktype)
1679
1680 assert len(tokens) == vocab.vocab_size, (
1681 f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
1682 )
1683
1684 if vocab.tokenizer_type == MistralTokenizerType.tekken:
1685 self.gguf_writer.add_tokenizer_pre("tekken")
1686 self.gguf_writer.add_token_merges(
1687 vocab.extract_vocab_merges_from_model()
1688 )
1689
1690 logger.info(
1691 f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
1692 )
1693
1694 self.gguf_writer.add_bos_token_id(vocab.bos_id)
1695 self.gguf_writer.add_eos_token_id(vocab.eos_id)
1696 self.gguf_writer.add_unk_token_id(vocab.unk_id)
1697 self.gguf_writer.add_pad_token_id(vocab.pad_id)
1698
1699 self.gguf_writer.add_token_list(tokens)
1700 self.gguf_writer.add_token_scores(scores)
1701 self.gguf_writer.add_token_types(toktypes)
1702 self.gguf_writer.add_vocab_size(vocab.vocab_size)
1703
1704 self.gguf_writer.add_add_bos_token(True)
1705 self.gguf_writer.add_add_eos_token(False)
1706
1707 local_template_file_path = self.dir_model / "chat_template.jinja"
1708
1709 if self.is_mistral_format and local_template_file_path.is_file():
1710 # Ministral-3 and other new Mistral models come with chat templates.
1711 # ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main
1712 logger.info("Using an existing Mistral local chat template.")
1713
1714 with open(local_template_file_path, "r", encoding="utf-8") as f:
1715 template = f.read()
1716 elif not self.is_mistral_format or not self.disable_mistral_community_chat_template:
1717 template_dir = Path(__file__).parent / "models/templates/"
1718
1719 # For the Mistral format only, log that official tokenization and detokenization should go through `mistral-common`.
1720 if self.is_mistral_format:
1721 logger.info(
1722 "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
1723 "Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
1724 )
1725 template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
1726 else:
1727 logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
1728 template = None
1729
1730 if template is not None:
1731 self.gguf_writer.add_chat_template(template)
1732
1733 def _set_vocab_plamo(self):
1734 # PLaMo models use a custom tokenizer with a .jsonl file
1735 tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
1736 tokenizer_config_path = self.dir_model / "tokenizer_config.json"
1737
1738 if not tokenizer_jsonl_path.is_file():
1739 raise FileNotFoundError(f"PLaMo tokenizer file not found: {tokenizer_jsonl_path}")
1740
1741 # Load tokenizer config
1742 with open(tokenizer_config_path, "r", encoding="utf-8") as f:
1743 tokenizer_config = json.load(f)
1744
1745 # Load tokens from JSONL file (actually a list format)
1746 tokens = []
1747 scores = []
1748 toktypes = []
1749
1750 with open(tokenizer_jsonl_path, "r", encoding="utf-8") as f:
1751 for line_num, line in enumerate(f):
1752 if line.strip():
1753 token_data = json.loads(line)
1754 # Format: [token, score, type, ?, ?, ?, ?]
1755 token = token_data[0].encode("utf-8")
1756 score = float(token_data[1])
1757 token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
1758
1759 tokens.append(token)
1760 scores.append(score)
1761
1762 if token_type_str == "UNKNOWN":
1763 toktypes.append(gguf.TokenType.UNKNOWN)
1764 elif token_type_str == "CONTROL":
1765 toktypes.append(gguf.TokenType.CONTROL)
1766 elif token_type_str == "BYTE":
1767 toktypes.append(gguf.TokenType.BYTE)
1768 else:
1769 token_str = token_data[0]
1770 if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
1771 toktypes.append(gguf.TokenType.CONTROL)
1772 else:
1773 toktypes.append(gguf.TokenType.NORMAL)
1774
1775 vocab_size = self.hparams["vocab_size"]
1776 if vocab_size > len(tokens):
1777 pad_count = vocab_size - len(tokens)
1778 logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
1779 for i in range(1, pad_count + 1):
1780 tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
1781 scores.append(-1000.0)
1782 toktypes.append(gguf.TokenType.UNUSED)
1783
1784 self.gguf_writer.add_tokenizer_model("plamo2")
1785 self.gguf_writer.add_tokenizer_pre("default")
1786 self.gguf_writer.add_token_list(tokens)
1787 self.gguf_writer.add_token_scores(scores)
1788 self.gguf_writer.add_token_types(toktypes)
1789
1790 if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
1791 token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
1792 self.gguf_writer.add_bos_token_id(token_id)
1793 if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
1794 token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
1795 self.gguf_writer.add_eos_token_id(token_id)
1796 if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
1797 token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
1798 self.gguf_writer.add_pad_token_id(token_id)
1799 if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
1800 token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
1801 self.gguf_writer.add_sep_token_id(token_id)
1802 if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
1803 token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
1804 self.gguf_writer.add_unk_token_id(token_id)
1805
1806 # Add <|plamo:op|> as EOT to ensure appropriate end of generation
1807 self.gguf_writer.add_eot_token_id(4)
1808
1809 self.gguf_writer.add_add_space_prefix(False)
1810
1811
1812class MmprojModel(ModelBase):
1813 model_type = ModelType.MMPROJ
1814 model_arch = gguf.MODEL_ARCH.MMPROJ
1815 preprocessor_config: dict[str, Any]
1816 global_config: dict[str, Any]
1817
1818 n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers", "vt_num_hidden_layers"]
1819
1820 has_vision_encoder: bool = True # by default
1821 has_audio_encoder: bool = False
1822
1823 # for models having multiple encoders, we need to separate their hparams
1824 hparams_vision: dict[str, Any] | None = None
1825 hparams_audio: dict[str, Any] | None = None
1826
1827 def __init__(self, *args, **kwargs):
1828 super().__init__(*args, **kwargs)
1829
1830 if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
1831 raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
1832
1833 # get n_embd of the text model
1834 if not self.is_mistral_format:
1835 if "text_config" not in self.hparams:
1836 self.hparams["text_config"] = {}
1837 if "audio_config" not in self.hparams:
1838 self.hparams["audio_config"] = {}
1839 text_config = {**self.hparams, **self.hparams["text_config"]}
1840 self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
1841 else:
1842 text_config = {
1843 k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"]
1844 }
1845 self.n_embd_text = text_config.get("hidden_dim", 0)
1846
1847 assert self.n_embd_text > 0, "n_embd not found in hparams"
1848
1849 # move vision config to the top level, while preserving the original hparams in global_config
1850 import copy
1851 self.global_config = copy.deepcopy(self.hparams)
1852 self.hparams_vision = self.get_vision_config()
1853 self.hparams_audio = self.get_audio_config()
1854
1855 if self.hparams_vision is None and self.hparams_audio is None:
1856 raise ValueError("vision_config / audio_config not found in hparams")
1857
1858 # for compat with vision-only models
1859 self.hparams = self.hparams_vision or self.hparams_audio or self.hparams
1860
1861 # TODO @ngxson : this is a hack to support both vision and audio encoders
1862 have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
1863 self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
1864 self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
1865
1866 # load preprocessor config
1867 self.preprocessor_config = {}
1868
1869 # prefer preprocessor_config.json if possible
1870 preprocessor_config_path = self.dir_model / "preprocessor_config.json"
1871 if preprocessor_config_path.is_file():
1872 with open(preprocessor_config_path, "r", encoding="utf-8") as f:
1873 cfg = json.load(f)
1874 # move media_proc_cfg to root level for compat
1875 if "media_proc_cfg" in cfg:
1876 cfg = {
1877 **cfg,
1878 **cfg["media_proc_cfg"],
1879 }
1880 # merge configs
1881 self.preprocessor_config = {**self.preprocessor_config, **cfg}
1882
1883 # prefer processor_config.json if possible
1884 processor_config_path = self.dir_model / "processor_config.json"
1885 if processor_config_path.is_file():
1886 with open(processor_config_path, "r", encoding="utf-8") as f:
1887 cfg = json.load(f)
1888 # move image_processor to root level for compat
1889 if "image_processor" in cfg:
1890 cfg = {
1891 **cfg,
1892 **cfg["image_processor"],
1893 }
1894 # merge configs
1895 self.preprocessor_config = {**self.preprocessor_config, **cfg}
1896
1897 def get_vision_config(self) -> dict[str, Any] | None:
1898 config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
1899 return self.global_config.get(config_name)
1900
1901 def get_audio_config(self) -> dict[str, Any] | None:
1902 mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config"
1903 return self.global_config.get(mm_config_key)
1904
1905 def set_type(self):
1906 self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
1907
1908 def prepare_metadata(self, vocab_only: bool):
1909 super().prepare_metadata(vocab_only=vocab_only)
1910
1911 output_type: str = self.ftype.name.partition("_")[2]
1912
1913 if self.fname_out.is_dir():
1914 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=output_type, model_type=None)
1915 self.fname_out = self.fname_out / f"mmproj-{fname_default}.gguf"
1916 else:
1917 self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
1918
1919 def set_gguf_parameters(self):
1920 self.gguf_writer.add_file_type(self.ftype)
1921
1922 if self.has_vision_encoder:
1923 self.gguf_writer.add_clip_has_vision_encoder(True)
1924 self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
1925
1926 # vision config
1927 self.image_size = self.find_vparam(["image_size"])
1928 self.gguf_writer.add_vision_image_size(self.image_size)
1929 self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
1930 self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"]))
1931 self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"]))
1932 self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
1933 self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"]))
1934
1935 # preprocessor config
1936 image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
1937 image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
1938
1939 self.gguf_writer.add_vision_image_mean(image_mean)
1940 self.gguf_writer.add_vision_image_std(image_std)
1941
1942 if self.has_audio_encoder:
1943 self.gguf_writer.add_clip_has_audio_encoder(True)
1944 self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
1945
1946 # audio config
1947 self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"]))
1948 self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"]))
1949 self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
1950 self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
1951
1952 if not self.has_vision_encoder and not self.has_audio_encoder:
1953 raise ValueError("MmprojModel must have either vision or audio encoder")
1954
1955 def write_vocab(self):
1956 raise ValueError("MmprojModel does not support vocab writing")
1957
1958 def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
1959 assert self.hparams_vision is not None
1960 return self._find_param(self.hparams_vision, keys, optional)
1961
1962 def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
1963 assert self.hparams_audio is not None
1964 return self._find_param(self.hparams_audio, keys, optional)
1965
1966 def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
1967 key = next((k for k in keys if k in obj), None)
1968 if key is not None:
1969 return obj[key]
1970 if optional:
1971 return None
1972 raise KeyError(f"could not find any of: {keys}")
1973
1974 def tensor_force_quant(self, name, new_name, bid, n_dims):
1975 del bid, name, n_dims # unused
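# Sketch of the intent: patch-embedding / patch-merger weights are written as F32
# (or F16 when converting with an F16 output type) instead of following the
# requested quantization, presumably to preserve precision in these small tensors.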
1976 if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name:
1977 return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
1978 return False
1979
1980
1981@ModelBase.register("GPTNeoXForCausalLM")
1982class GPTNeoXModel(TextModel):
1983 model_arch = gguf.MODEL_ARCH.GPTNEOX
1984
1985 def set_gguf_parameters(self):
1986 self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
1987 self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
1988 self.gguf_writer.add_block_count(self.block_count)
1989 self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
1990 self.gguf_writer.add_rope_dimension_count(
1991 int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
1992 )
1993 self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
1994 self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
1995 self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
1996
1997 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
1998 n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
1999 n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
2000
2001 if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
2002 # Map bloom-style qkv_linear to gpt-style qkv_linear
2003 # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
2004 # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
2005 qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
2006 data_torch = torch.cat(
2007 (
2008 qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
2009 qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
2010 qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
2011 ),
2012 dim=0,
2013 )
2014 logger.info("re-format attention.linear_qkv.weight")
2015 elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
2016 qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
2017 data_torch = torch.cat(
2018 (
2019 qkv_bias[:, 0, :].reshape((n_embed,)),
2020 qkv_bias[:, 1, :].reshape((n_embed,)),
2021 qkv_bias[:, 2, :].reshape((n_embed,)),
2022 ),
2023 dim=0,
2024 )
2025 logger.info("re-format attention.linear_qkv.bias")
2026
2027 yield from super().modify_tensors(data_torch, name, bid)
2028
2029
2030@ModelBase.register("BloomForCausalLM", "BloomModel")
2031class BloomModel(TextModel):
2032 model_arch = gguf.MODEL_ARCH.BLOOM
2033
2034 def set_gguf_parameters(self):
2035 n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
2036 n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
2037 self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
2038 self.gguf_writer.add_embedding_length(n_embed)
2039 self.gguf_writer.add_feed_forward_length(4 * n_embed)
2040 self.gguf_writer.add_block_count(self.block_count)
2041 self.gguf_writer.add_head_count(n_head)
2042 self.gguf_writer.add_head_count_kv(n_head)
2043 self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
2044 self.gguf_writer.add_file_type(self.ftype)
2045
2046 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2047 n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
2048 n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
2049
2050 name = re.sub(r'transformer\.', '', name)
2051
2052 if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
2053 # Map bloom-style qkv_linear to gpt-style qkv_linear
2054 # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
2055 # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
2056 qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
2057 data_torch = torch.cat(
2058 (
2059 qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
2060 qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
2061 qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
2062 ),
2063 dim=0,
2064 )
2065 logger.info("re-format attention.linear_qkv.weight")
2066 elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
2067 qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
2068 data_torch = torch.cat(
2069 (
2070 qkv_bias[:, 0, :].reshape((n_embed,)),
2071 qkv_bias[:, 1, :].reshape((n_embed,)),
2072 qkv_bias[:, 2, :].reshape((n_embed,)),
2073 ),
2074 dim=0,
2075 )
2076 logger.info("re-format attention.linear_qkv.bias")
2077
2078 yield from super().modify_tensors(data_torch, name, bid)
2079
2080
2081@ModelBase.register("MPTForCausalLM")
2082class MPTModel(TextModel):
2083 model_arch = gguf.MODEL_ARCH.MPT
2084
2085 def set_vocab(self):
2086 try:
2087 self._set_vocab_gpt2()
2088 except Exception:
2089 # Fallback for SEA-LION model
2090 self._set_vocab_sentencepiece()
2091 self.gguf_writer.add_add_bos_token(False)
2092 self.gguf_writer.add_pad_token_id(3)
2093 self.gguf_writer.add_eos_token_id(1)
2094 self.gguf_writer.add_unk_token_id(0)
2095
2096 def set_gguf_parameters(self):
2097 self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
2098 self.gguf_writer.add_embedding_length(self.hparams["d_model"])
2099 self.gguf_writer.add_block_count(self.block_count)
2100 self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
2101 self.gguf_writer.add_head_count(self.hparams["n_heads"])
2102 if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
2103 self.gguf_writer.add_head_count_kv(kv_n_heads)
2104 self.gguf_writer.add_layer_norm_eps(1e-5)
2105 if self.hparams["attn_config"]["clip_qkv"] is not None:
2106 self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
2107 if self.hparams["attn_config"]["alibi"]:
2108 self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
2109 else:
2110 self.gguf_writer.add_max_alibi_bias(0.0)
2111
2112 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2113 if "scales" in name:
2114 new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales"))
2115 new_name = new_name.replace("scales", "act.scales")
2116 else:
2117 new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias"))
2118
2119 yield from super().modify_tensors(data_torch, new_name, bid)
2120
2121
2122@ModelBase.register("OrionForCausalLM")
2123class OrionModel(TextModel):
2124 model_arch = gguf.MODEL_ARCH.ORION
2125
2126 def set_vocab(self):
2127 self._set_vocab_sentencepiece()
2128
2129 def set_gguf_parameters(self):
2130 head_count = self.hparams["num_attention_heads"]
2131 head_count_kv = self.hparams.get("num_key_value_heads", head_count)
2132
2133 ctx_length = 0
2134 if "max_sequence_length" in self.hparams:
2135 ctx_length = self.hparams["max_sequence_length"]
2136 elif "max_position_embeddings" in self.hparams:
2137 ctx_length = self.hparams["max_position_embeddings"]
2138 elif "model_max_length" in self.hparams:
2139 ctx_length = self.hparams["model_max_length"]
2140 else:
2141            raise ValueError("gguf: cannot find ctx length parameter.")
2142
2143 self.gguf_writer.add_file_type(self.ftype)
2144 self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
2145 self.gguf_writer.add_context_length(ctx_length)
2146 self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
2147 self.gguf_writer.add_block_count(self.block_count)
2148 self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
2149 self.gguf_writer.add_head_count(head_count)
2150 self.gguf_writer.add_head_count_kv(head_count_kv)
2151 # note: config provides rms norm but it is actually layer norm
2152 # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
2153 self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
2154
2155
2156@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
2157class BaichuanModel(TextModel):
2158 model_arch = gguf.MODEL_ARCH.BAICHUAN
2159
2160 def set_vocab(self):
2161 self._set_vocab_sentencepiece()
2162
2163 def set_gguf_parameters(self):
2164 super().set_gguf_parameters()
2165
2166 self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
2167 self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
2168
2169 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2170 head_count = self.hparams["num_attention_heads"]
2171 head_count_kv = self.hparams.get("num_key_value_heads", head_count)
2172
2173 if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight":
2174 logger.info(f"Unpacking and permuting layer {bid}")
2175 yield from [
2176 (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid),
2177 self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)),
2178 (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid),
2179 self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)),
2180 (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid),
2181 self._reverse_hf_part(data_torch, 2)),
2182 ]
2183 else:
2184            yield from super().modify_tensors(data_torch, name, bid)
2185
2186 def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
2187 if n_kv_head is not None and n_head != n_kv_head:
2188 n_head //= n_kv_head
2189
2190 return (
2191 weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
2192 .swapaxes(1, 2)
2193 .reshape(weights.shape)
2194 )
2195
2196 def _reverse_hf_permute_part(
2197 self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
2198 ) -> Tensor:
2199 r = weights.shape[0] // 3
2200 return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)
2201
2202 def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
2203 r = weights.shape[0] // 3
2204 return weights[r * n_part:r * n_part + r, ...]
2205
2206
2207@ModelBase.register("XverseForCausalLM")
2208class XverseModel(TextModel):
2209 model_arch = gguf.MODEL_ARCH.XVERSE
2210
2211 def set_vocab(self):
2212 assert (self.dir_model / "tokenizer.json").is_file()
2213 dir_model = self.dir_model
2214 hparams = self.hparams
2215
2216 tokens: list[bytes] = []
2217 toktypes: list[int] = []
2218
2219 from transformers import AutoTokenizer
2220 tokenizer = AutoTokenizer.from_pretrained(dir_model)
2221 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
2222 # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
2223 # because vocab_size is the count of items, and indexes start at 0.
2224 max_vocab_index = max(tokenizer.get_vocab().values())
2225 if max_vocab_index >= vocab_size:
2226 raise ValueError("Vocabulary size exceeds expected maximum size.")
2227
2228 reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
2229 added_vocab = tokenizer.get_added_vocab()
2230
2231 for token_id in range(vocab_size):
2232 token_text = reverse_vocab[token_id].encode('utf-8')
2233            # replace "\x00" with a string of length > 0
2234 if token_text == b"\x00":
2235 toktype = gguf.TokenType.BYTE # special
2236 token_text = f"<{token_text}>".encode('utf-8')
2237 elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
2238 toktype = gguf.TokenType.BYTE # special
2239 elif reverse_vocab[token_id] in added_vocab:
2240 if tokenizer.added_tokens_decoder[token_id].special:
2241 toktype = gguf.TokenType.CONTROL
2242 else:
2243 toktype = gguf.TokenType.USER_DEFINED
2244 else:
2245 toktype = gguf.TokenType.NORMAL
2246
2247 tokens.append(token_text)
2248 toktypes.append(toktype)
2249
2250 self.gguf_writer.add_tokenizer_model("llama")
2251 self.gguf_writer.add_tokenizer_pre("default")
2252 self.gguf_writer.add_token_list(tokens)
2253 self.gguf_writer.add_token_types(toktypes)
2254
2255 special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
2256 special_vocab.add_to_gguf(self.gguf_writer)
2257
2258 def set_gguf_parameters(self):
2259 super().set_gguf_parameters()
2260
2261 self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
2262 self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
2263
2264 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2265 head_count = self.hparams["num_attention_heads"]
2266 head_count_kv = self.hparams.get("num_key_value_heads", head_count)
2267
2268 # HF models permute some of the tensors, so we need to undo that
2269 if name.endswith("q_proj.weight"):
2270 data_torch = self._reverse_hf_permute(data_torch, head_count, head_count)
2271 if name.endswith("k_proj.weight"):
2272 data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)
2273
2274 yield from super().modify_tensors(data_torch, name, bid)
2275
2276 def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
2277 if n_kv_head is not None and n_head != n_kv_head:
2278 n_head //= n_kv_head
2279
2280 return (
2281 weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
2282 .swapaxes(1, 2)
2283 .reshape(weights.shape)
2284 )
2285
2286
2287@ModelBase.register("FalconForCausalLM", "RWForCausalLM")
2288class FalconModel(TextModel):
2289 model_arch = gguf.MODEL_ARCH.FALCON
2290
2291 def set_gguf_parameters(self):
2292 n_head = self.hparams.get("num_attention_heads")
2293 if n_head is None:
2294 n_head = self.hparams["n_head"] # old name
2295
2296 n_head_kv = self.hparams.get("num_kv_heads")
2297 if n_head_kv is None:
2298 n_head_kv = self.hparams.get("n_head_kv", 1) # old name
2299
2300 self.gguf_writer.add_context_length(2048) # not in config.json
2301 self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
2302 self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
2303 self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
2304 self.gguf_writer.add_block_count(self.block_count)
2305 self.gguf_writer.add_head_count(n_head)
2306 self.gguf_writer.add_head_count_kv(n_head_kv)
2307 self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
2308 self.gguf_writer.add_file_type(self.ftype)
2309
2310 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2311 # QKV tensor transform
2312 # The original query_key_value tensor contains n_head_kv "kv groups",
2313 # each consisting of n_head/n_head_kv query weights followed by one key
2314 # and one value weight (shared by all query heads in the kv group).
2315 # This layout makes it a big pain to work with in GGML.
2316        # So we rearrange them here, so that we have n_head query weights
2317 # followed by n_head_kv key weights followed by n_head_kv value weights,
2318 # in contiguous fashion.
2319 # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
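        # Illustrative layout (hypothetical numbers): with n_head = 8 and n_head_kv = 2 the original rows are
        # grouped per kv group as [q q q q k v | q q q q k v]; after the view/reshape below they become
        # [q x 8 | k x 2 | v x 2], i.e. all query heads first, then the shared keys, then the shared values.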
2320
2321 if "query_key_value" in name:
2322 n_head = self.find_hparam(["num_attention_heads", "n_head"])
2323 n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1
2324 head_dim = self.hparams["hidden_size"] // n_head
2325
2326 qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
2327 q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
2328 k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
2329 v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
2330 data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
2331
2332 yield from super().modify_tensors(data_torch, name, bid)
2333
2334
2335@ModelBase.register("GPTBigCodeForCausalLM")
2336class StarCoderModel(TextModel):
2337 model_arch = gguf.MODEL_ARCH.STARCODER
2338
2339 def set_gguf_parameters(self):
2340 self.gguf_writer.add_context_length(self.hparams["n_positions"])
2341 self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
2342 self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
2343 self.gguf_writer.add_block_count(self.block_count)
2344 self.gguf_writer.add_head_count(self.hparams["n_head"])
2345 self.gguf_writer.add_head_count_kv(1)
2346 self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
2347 self.gguf_writer.add_file_type(self.ftype)
2348
2349
2350@ModelBase.register("GPTRefactForCausalLM")
2351class RefactModel(TextModel):
2352 model_arch = gguf.MODEL_ARCH.REFACT
2353
2354 def set_vocab(self):
2355 super().set_vocab()
2356
2357 # TODO: how to determine special FIM tokens automatically?
2358 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
2359 special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
2360 special_vocab._set_special_token("prefix", 1)
2361 special_vocab._set_special_token("suffix", 3)
2362 special_vocab._set_special_token("middle", 2)
2363 special_vocab.chat_template = None # do not add it twice
2364 special_vocab.add_to_gguf(self.gguf_writer)
2365
2366 def set_gguf_parameters(self):
2367 hidden_dim = self.hparams["n_embd"]
2368 inner_dim = 4 * hidden_dim
2369 hidden_dim = int(2 * inner_dim / 3)
2370 multiple_of = 256
2371 ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
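        # SwiGLU-style sizing: ff_dim = round_up(2/3 * 4 * n_embd, 256). As an illustrative example
        # (not Refact's actual dims), n_embd = 4096 gives int(2 * 16384 / 3) = 10922, rounded up to 11008.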
2372
2373        # Refact uses ALiBi, so this context length comes from config.json (n_positions) and may reflect the training setup.
2374 self.gguf_writer.add_context_length(self.hparams["n_positions"])
2375 self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
2376
2377 self.gguf_writer.add_feed_forward_length(ff_dim)
2378 self.gguf_writer.add_block_count(self.block_count)
2379 self.gguf_writer.add_head_count(self.hparams["n_head"])
2380 self.gguf_writer.add_head_count_kv(1)
2381 self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
2382 self.gguf_writer.add_file_type(self.ftype)
2383
2384 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2385 hidden_dim = self.hparams["n_embd"]
2386 inner_dim = 4 * hidden_dim
2387 hidden_dim = int(2 * inner_dim / 3)
2388 multiple_of = 256
2389 ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
2390 n_head = self.hparams["n_head"]
2391 n_head_kv = 1
2392 head_dim = self.hparams["n_embd"] // n_head
2393
2394 if bid is not None:
2395 if name == f"transformer.h.{bid}.attn.kv.weight":
2396 yield from super().modify_tensors(data_torch[:n_head_kv * head_dim], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid)
2397 yield from super().modify_tensors(data_torch[n_head_kv * head_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid)
2398 return
2399 if name == f"transformer.h.{bid}.attn.q.weight":
2400 yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid)
2401 return
2402 if name == f"transformer.h.{bid}.mlp.gate_up_proj.weight":
2403 yield from super().modify_tensors(data_torch[:ff_dim], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid)
2404 yield from super().modify_tensors(data_torch[ff_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid)
2405 return
2406
2407 yield from super().modify_tensors(data_torch, name, bid)
2408
2409
2410@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
2411class StableLMModel(TextModel):
2412 model_arch = gguf.MODEL_ARCH.STABLELM
2413
2414 def set_vocab(self):
2415 if (self.dir_model / "tokenizer.json").is_file():
2416 self._set_vocab_gpt2()
2417 else:
2418 # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
2419 self._set_vocab_qwen()
2420
2421 def set_gguf_parameters(self):
2422 hparams = self.hparams
2423
2424 self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
2425 self.gguf_writer.add_embedding_length(hparams["hidden_size"])
2426 self.gguf_writer.add_block_count(self.block_count)
2427 self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
2428 rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
2429 self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
2430 self.gguf_writer.add_head_count(hparams["num_attention_heads"])
2431 self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
2432 self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
2433 self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
2434 self.gguf_writer.add_file_type(self.ftype)
2435
2436 _q_norms: list[dict[str, Tensor]] | None = None
2437 _k_norms: list[dict[str, Tensor]] | None = None
2438
2439 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2440 n_head = self.hparams["num_attention_heads"]
2441 n_kv_head = self.hparams["num_key_value_heads"]
2442
2443 if name.find("q_layernorm.norms") != -1:
2444 assert bid is not None
2445
2446 if self._q_norms is None:
2447 self._q_norms = [{} for _ in range(self.block_count)]
2448
2449 self._q_norms[bid][name] = data_torch
2450
2451 if len(self._q_norms[bid]) >= n_head:
2452                yield from self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm")
2453            # nothing more to yield for this tensor yet (remaining norms arrive on later calls)
2454            return
2455
2456 if name.find("k_layernorm.norms") != -1:
2457 assert bid is not None
2458
2459 if self._k_norms is None:
2460 self._k_norms = [{} for _ in range(self.block_count)]
2461
2462 self._k_norms[bid][name] = data_torch
2463
2464 if len(self._k_norms[bid]) >= n_kv_head:
2465                yield from self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm")
2466            # nothing more to yield for this tensor yet (remaining norms arrive on later calls)
2467            return
2468
2469 yield from super().modify_tensors(data_torch, name, bid)
2470
2471 def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"):
2472 datas: list[Tensor] = []
2473 # extract the norms in order
2474 for xid in range(n_head):
2475 ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
2476 datas.append(norms[ename])
2477 del norms[ename]
2478 data_torch = torch.stack(datas, dim=0)
2479
2480 merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
2481
2482 yield from super().modify_tensors(data_torch, merged_name, bid)
2483
2484 def prepare_tensors(self):
2485 super().prepare_tensors()
2486
2487 if self._q_norms is not None or self._k_norms is not None:
2488 # flatten two `list[dict[str, Tensor]]` into a single `list[str]`
2489 norms = (
2490 [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else []
2491 ) + (
2492 [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else []
2493 )
2494 if len(norms) > 0:
2495 raise ValueError(f"Unprocessed norms: {norms}")
2496
2497
2498@ModelBase.register(
2499 "LLaMAForCausalLM",
2500 "LlamaForCausalLM",
2501 "MistralForCausalLM",
2502 "MixtralForCausalLM",
2503 "VLlama3ForCausalLM",
2504 "LlavaForConditionalGeneration",
2505 "VoxtralForConditionalGeneration",
2506 "IQuestCoderForCausalLM",
2507 "LlamaModel")
2508class LlamaModel(TextModel):
2509 model_arch = gguf.MODEL_ARCH.LLAMA
2510 undo_permute = True
2511
2512 def __init__(self, *args, **kwargs):
2513 super().__init__(*args, **kwargs)
2514 # fix for SmolVLM2, missing `num_attention_heads` in config.json
2515 if self.hf_arch == "VLlama3ForCausalLM":
2516 self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
2517 hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
2518 self.origin_hf_arch = hparams.get('architectures', [None])[0]
2519
2520 def set_vocab(self):
2521 if self.origin_hf_arch == "GlmasrModel":
2522 return self._set_vocab_glmedge()
2523
2524 if self.is_mistral_format:
2525 return self._set_vocab_mistral()
2526
2527 path_tekken_json = self.dir_model / "tekken.json"
2528 path_tokenizer_json = self.dir_model / "tokenizer.json"
2529 if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
2530            return self._set_vocab_mistral()
2531
2532 try:
2533 self._set_vocab_sentencepiece()
2534 except FileNotFoundError:
2535 try:
2536 self._set_vocab_llama_hf()
2537 except (FileNotFoundError, TypeError):
2538 # Llama 3
2539 self._set_vocab_gpt2()
2540
2541 # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
2542 if self.hparams.get("vocab_size", 32000) == 32016:
2543 special_vocab = gguf.SpecialVocab(
2544 self.dir_model, load_merges=False,
2545 special_token_types = ['prefix', 'suffix', 'middle', 'eot']
2546 )
2547 special_vocab._set_special_token("prefix", 32007)
2548 special_vocab._set_special_token("suffix", 32008)
2549 special_vocab._set_special_token("middle", 32009)
2550 special_vocab._set_special_token("eot", 32010)
2551 special_vocab.add_to_gguf(self.gguf_writer)
2552
2553 tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
2554 if tokenizer_config_file.is_file():
2555 with open(tokenizer_config_file, "r", encoding="utf-8") as f:
2556 tokenizer_config_json = json.load(f)
2557 if "add_prefix_space" in tokenizer_config_json:
2558 self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
2559
2560 # Apply to granite small models only
2561 if self.hparams.get("vocab_size", 32000) == 49152:
2562 self.gguf_writer.add_add_bos_token(False)
2563
2564 def set_gguf_parameters(self):
2565 super().set_gguf_parameters()
2566 hparams = self.hparams
2567
2568 if not self.is_mistral_format:
2569 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
2570
2571 if (rope_dim := hparams.get("head_dim")) is None:
2572 rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
2573 self.gguf_writer.add_rope_dimension_count(rope_dim)
2574
2575 @staticmethod
2576 def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
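        # Note (assumption, based on the usual HF -> GGUF conversion convention): HF checkpoints store the
        # q/k projection rows of each head as [first half | second half] for sliced rotary embeddings;
        # the reshape/swapaxes below restores the interleaved per-head ordering expected by llama.cpp.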
2577 if n_head_kv is not None and n_head != n_head_kv:
2578 n_head = n_head_kv
2579 return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
2580 .swapaxes(1, 2)
2581 .reshape(weights.shape))
2582
2583 _experts: list[dict[str, Tensor]] | None = None
2584
2585 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2586 n_head = self.find_hparam(["n_heads", "num_attention_heads"])
2587 n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])
2588
2589 vision_prefixes = [
2590 "vision_encoder.",
2591 "vision_language_adapter.",
2592 "patch_merger.",
2593 "pre_mm_projector_norm",
2594 "audio_encoder.",
2595 ]
2596
2597 is_multimodal_tensor = "vision_tower" in name \
2598 or "vision_model" in name \
2599 or "audio_tower" in name \
2600 or "model.connector" in name \
2601 or "multi_modal_projector" in name \
2602 or any(
2603 name.startswith(prefix)
2604 for prefix in vision_prefixes
2605 )
2606
2607 if is_multimodal_tensor:
2608            return # skip multimodal (vision/audio) tensors
2609 elif self.hf_arch == "LlamaModel":
2610 name = "model." + name
2611 elif name.startswith("model.text_model"):
2612 name = name.replace("text_model.", "") # for SmolVLM
2613 elif name.startswith("language_model."):
2614 name = name.replace("language_model.", "") # for the rest
2615
2616 if self.undo_permute:
2617 if name.endswith(("q_proj.weight", "q_proj.bias")):
2618 data_torch = LlamaModel.permute(data_torch, n_head, n_head)
2619 if name.endswith(("k_proj.weight", "k_proj.bias")):
2620 data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
2621
2622 # process the experts separately
2623 if name.find("block_sparse_moe.experts") != -1:
2624 n_experts = self.hparams["num_local_experts"]
2625
2626 assert bid is not None
2627
2628 if self._experts is None:
2629 self._experts = [{} for _ in range(self.block_count)]
2630
2631 self._experts[bid][name] = data_torch
2632
2633 if len(self._experts[bid]) >= n_experts * 3:
2634 # merge the experts into a single 3d tensor
2635 for wid in ["w1", "w2", "w3"]:
2636 datas: list[Tensor] = []
2637
2638 for xid in range(n_experts):
2639 ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
2640 datas.append(self._experts[bid][ename])
2641 del self._experts[bid][ename]
2642
2643 data_torch = torch.stack(datas, dim=0)
2644
2645 merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
2646
2647 yield from super().modify_tensors(data_torch, merged_name, bid)
2648 return
2649 else:
2650 return
2651
2652 yield from super().modify_tensors(data_torch, name, bid)
2653
2654 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
2655 if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
2656 if rope_params.get("rope_type", '').lower() == "llama3":
2657 base = rope_params.get("rope_theta", 10000.0)
2658 if (dim := self.hparams.get("head_dim")) is None:
2659 dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
2660 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
2661
2662 factor = rope_params.get("factor", 8.0)
2663 low_freq_factor = rope_params.get("low_freq_factor", 1.0)
2664 high_freq_factor = rope_params.get("high_freq_factor", 4.0)
2665 old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
2666
2667 low_freq_wavelen = old_context_len / low_freq_factor
2668 high_freq_wavelen = old_context_len / high_freq_factor
2669 # assert low_freq_wavelen != high_freq_wavelen # Errors for Llama4
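                # In formula form, the loop below computes, per frequency:
                #   wavelen  = 2*pi / freq
                #   factor_i = 1                                    if wavelen < high_freq_wavelen
                #   factor_i = factor                               if wavelen > low_freq_wavelen
                #   factor_i = 1 / ((1 - smooth)/factor + smooth)   otherwise,
                # where smooth = (old_context_len/wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)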
2670
2671 rope_factors = []
2672 for freq in freqs:
2673 wavelen = 2 * math.pi / freq
2674 if wavelen < high_freq_wavelen:
2675 rope_factors.append(1)
2676 elif wavelen > low_freq_wavelen:
2677 rope_factors.append(factor)
2678 else:
2679 smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
2680 rope_factors.append(1 / ((1 - smooth) / factor + smooth))
2681
2682 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
2683
2684 def prepare_tensors(self):
2685 super().prepare_tensors()
2686
2687 if self._experts is not None:
2688 # flatten `list[dict[str, Tensor]]` into `list[str]`
2689 experts = [k for d in self._experts for k in d.keys()]
2690 if len(experts) > 0:
2691 raise ValueError(f"Unprocessed experts: {experts}")
2692
2693
2694@ModelBase.register("ArceeForCausalLM")
2695class ArceeModel(LlamaModel):
2696 model_arch = gguf.MODEL_ARCH.ARCEE
2697
2698 def set_gguf_parameters(self):
2699 super().set_gguf_parameters()
2700 self._try_set_pooling_type()
2701
2702
2703@ModelBase.register("AfmoeForCausalLM")
2704class AfmoeModel(LlamaModel):
2705 model_arch = gguf.MODEL_ARCH.AFMOE
2706
2707 def set_gguf_parameters(self):
2708 super().set_gguf_parameters()
2709
2710 # MoE parameters
2711 if (n_experts := self.hparams.get("num_experts")) is not None:
2712 self.gguf_writer.add_expert_count(n_experts)
2713 if (n_shared_experts := self.hparams.get("num_shared_experts")) is not None:
2714 self.gguf_writer.add_expert_shared_count(n_shared_experts)
2715 if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
2716 self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
2717 if (n_dense_layers := self.hparams.get("num_dense_layers")) is not None:
2718 self.gguf_writer.add_leading_dense_block_count(n_dense_layers)
2719
2720 # Route normalization and scaling
2721 if (route_norm := self.hparams.get("route_norm")) is not None:
2722 self.gguf_writer.add_expert_weights_norm(route_norm)
2723 if (route_scale := self.hparams.get("route_scale")) is not None:
2724 self.gguf_writer.add_expert_weights_scale(route_scale)
2725
2726 # Sliding window attention
2727 if (sliding_window := self.hparams.get("sliding_window")) is not None:
2728 self.gguf_writer.add_sliding_window(sliding_window)
2729
2730 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2731 # Handle expert weights - they're already merged in the HF format
2732 # process the experts separately
2733 if name.find("mlp.experts") != -1:
2734 n_experts = self.hparams["num_experts"]
2735 assert bid is not None
2736
2737 if self._experts is None:
2738 self._experts = [{} for _ in range(self.block_count)]
2739
2740 self._experts[bid][name] = data_torch
2741
2742 if len(self._experts[bid]) >= n_experts * 3:
2743 # merge the experts into a single 3d tensor
2744 for w_name in ["gate_proj", "up_proj", "down_proj"]:
2745 datas: list[Tensor] = []
2746
2747 for xid in range(n_experts):
2748 ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
2749 datas.append(self._experts[bid][ename_to_retrieve])
2750 del self._experts[bid][ename_to_retrieve]
2751
2752 data_torch = torch.stack(datas, dim=0)
2753 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
2754 yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid)
2755
2756 return
2757 else:
2758 return
2759
2760 if name.endswith(".expert_bias"):
2761 name = name.replace(".expert_bias", ".expert_bias.bias")
2762
2763 yield from ModelBase.modify_tensors(self, data_torch, name, bid)
2764
2765
2766@ModelBase.register(
2767 "LlavaForConditionalGeneration", # pixtral
2768 "Mistral3ForConditionalGeneration", # mistral small 3.1
2769)
2770class LlavaVisionModel(MmprojModel):
2771 img_break_tok_id = -1
2772 use_break_tok = True
2773
2774 def __init__(self, *args, **kwargs):
2775 super().__init__(*args, **kwargs)
2776 if self.hparams.get("model_type") == "pixtral":
2777 # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
2778 self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
2779 if self.use_break_tok:
2780 self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
2781 elif self.is_mistral_format:
2782 # hparams is already vision config here so norm_eps is only defined in global_config.
2783 self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
2784 assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
2785 if self.use_break_tok:
2786 self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
2787 else:
2788 raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
2789 logger.info(f"Image break token id: {self.img_break_tok_id}")
2790
2791 def get_token_id(self, token: str) -> int:
2792 tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
2793 with open(tokenizer_config_file, "r", encoding="utf-8") as f:
2794 added_tokens_decoder = json.load(f)['added_tokens_decoder']
2795 for id_, token_data in added_tokens_decoder.items():
2796 if token_data["content"] == token:
2797 return int(id_)
2798 raise ValueError(f"Token '{token}' not found in tokenizer config.")
2799
2800 def set_gguf_parameters(self):
2801 super().set_gguf_parameters()
2802 hparams = self.hparams
2803 if hparams.get("model_type") == "pixtral":
2804 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
2805 self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
2806
2807 # hidden_act
2808 if hparams["hidden_act"] == "silu":
2809 self.gguf_writer.add_vision_use_silu(True)
2810 elif hparams["hidden_act"] == "gelu":
2811 self.gguf_writer.add_vision_use_gelu(True)
2812 else:
2813 raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
2814
2815 # spatial_merge_size
2816 if "spatial_merge_size" in self.global_config:
2817 self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"])
2818
2819 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2820 n_head = (
2821 self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"])
2822 )
2823 n_kv_head = n_head
2824
2825 valid_prefixes = (
2826 "multi_modal_projector.",
2827 "vision_tower.",
2828 "vision_encoder.",
2829 "vision_language_adapter.",
2830 "patch_merger.",
2831 "pre_mm_projector_norm",
2832 )
2833
2834 if any(name.startswith(prefix) for prefix in valid_prefixes):
2835 # process vision tensors
2836 if name.endswith(("q_proj.weight", "q_proj.bias")) and not self.is_mistral_format:
2837 data_torch = LlamaModel.permute(data_torch, n_head, n_head)
2838 if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format:
2839 data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
2840 yield from super().modify_tensors(data_torch, name, bid)
2841 return
2842
2843 embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight"
2844 if self.img_break_tok_id > 0 and embed_key in name:
2845 logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
2846 # for pixtral model, we need to extract the [IMG_BREAK] token embedding
2847 img_break_embd = data_torch[self.img_break_tok_id]
2848 name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK]
2849 yield from super().modify_tensors(img_break_embd, name, bid)
2850
2851 return # skip other tensors
2852
2853
2854@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
2855class SmolVLMModel(MmprojModel):
2856 def __init__(self, *args, **kwargs):
2857 super().__init__(*args, **kwargs)
2858 if self.hparams["model_type"] == "smolvlm_vision":
2859 # fix for SmolVLM2, missing some keys in config.json
2860 # default values are taken from transformers code
2861 self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
2862 self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
2863 self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
2864
2865 def set_gguf_parameters(self):
2866 super().set_gguf_parameters()
2867 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3)
2868 self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
2869 self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
2870 self.gguf_writer.add_vision_use_gelu(True)
2871
2872 # Add the preprocessor longest edge size
2873 preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
2874 self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
2875
2876 def tensor_force_quant(self, name, new_name, bid, n_dims):
2877 if ".embeddings." in name:
2878 return gguf.GGMLQuantizationType.F32
2879 return super().tensor_force_quant(name, new_name, bid, n_dims)
2880
2881 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2882 is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
2883
2884 if is_vision_tensor:
2885 yield from super().modify_tensors(data_torch, name, bid)
2886
2887 return # skip other tensors
2888
2889
2890@ModelBase.register(
2891 "Llama4ForConditionalGeneration",
2892 "Llama4ForCausalLM",
2893)
2894class Llama4Model(LlamaModel):
2895 model_arch = gguf.MODEL_ARCH.LLAMA4
2896 undo_permute = False
2897
2898 def __init__(self, *args, **kwargs):
2899 super().__init__(*args, **kwargs)
2900 # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this
2901 self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
2902 self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]
2903
2904 def set_vocab(self):
2905 self._set_vocab_gpt2()
2906
2907 def set_gguf_parameters(self):
2908 super().set_gguf_parameters()
2909 self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
2910 self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
2911 if "layer_types" in self.hparams:
2912 if all(lt == "full_attention" for lt in self.hparams["layer_types"]):
2913 # all layers are full attention (for MobileLLM), disable swa
2914 self.gguf_writer.add_sliding_window(0)
2915
2916 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
2917 if name.startswith("language_model."):
2918 name = name.replace("language_model.", "")
2919
2920 # split the gate_up into gate and up
2921 if "gate_up_proj" in name:
2922 name_up = name.replace("gate_up_proj", "up_proj.weight")
2923 name_gate = name.replace("gate_up_proj", "gate_proj.weight")
2924 dim_half = data_torch.shape[-1] // 2
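            # The fused tensor keeps [gate | up] concatenated along its last dimension; the transpose/split
            # below separates them (the transpose presumably restores the usual (out, in) Linear orientation,
            # as is also done for down_proj further down).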
2925 gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2)
2926 yield from super().modify_tensors(gate_proj_weight, name_gate, bid)
2927 yield from super().modify_tensors(up_proj_weight, name_up, bid)
2928 return
2929
2930 if name.endswith("down_proj"):
2931 name += ".weight"
2932 data_torch = data_torch.transpose(-1, -2)
2933
2934 if "multi_modal_projector" in name or "vision_model" in name:
2935 return
2936 yield from super().modify_tensors(data_torch, name, bid)
2937
2938
2939@ModelBase.register("Llama4ForConditionalGeneration")
2940class Llama4VisionModel(MmprojModel):
2941 def set_gguf_parameters(self):
2942 super().set_gguf_parameters()
2943 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4)
2944 self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
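        # e.g. (illustrative) pixel_shuffle_ratio = 0.5 gives a projector scale factor of int(1.0 / 0.5) = 2 below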
2945 self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
2946 assert self.hparams["hidden_act"] == "gelu"
2947 self.gguf_writer.add_vision_use_gelu(True)
2948
2949 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2950 if "multi_modal_projector" in name or "vision_model" in name:
2951 # process vision tensors
2952 if "positional_embedding_vlm" in name and ".weight" not in name:
2953 name += ".weight"
2954 if "multi_modal_projector.linear_1" in name:
2955 # despite the name with number postfix, this is a single fully connected layer
2956 yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)
2957 else:
2958 yield from super().modify_tensors(data_torch, name, bid)
2959
2960
2961@ModelBase.register(
2962 "Mistral3ForConditionalGeneration",
2963 "Ministral3ForCausalLM",
2964)
2965class Mistral3Model(LlamaModel):
2966 model_arch = gguf.MODEL_ARCH.MISTRAL3
2967
2968 def __init__(self, *args, **kwargs):
2969 super().__init__(*args, **kwargs)
2970 # for compatibility, we use LLAMA arch for older models
2971 # TODO: remove this once everyone has migrated to newer version of llama.cpp
2972 if self.hparams.get("model_type") != "ministral3":
2973 self.model_arch = gguf.MODEL_ARCH.LLAMA
2974 self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
2975 self.gguf_writer.add_architecture()
2976 self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
2977
2978 def set_gguf_parameters(self):
2979 super().set_gguf_parameters()
2980 rope_params = self.rope_parameters
2981 if self.hparams.get("model_type") == "ministral3":
2982 assert rope_params, "ministral3 must have 'rope_parameters' config"
2983 assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
2984 self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
2985 self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
2986
2987 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
2988 name = name.replace("language_model.", "")
2989 if "multi_modal_projector" in name or "vision_tower" in name:
2990 return
2991
2992 yield from super().modify_tensors(data_torch, name, bid)
2993
2994
2995@ModelBase.register("DeciLMForCausalLM")
2996class DeciModel(TextModel):
2997 model_arch = gguf.MODEL_ARCH.DECI
2998
2999 @staticmethod
3000 def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
3001 # DeciLM-specific code
3002 intermediate_size = int(2 * ffn_mult * n_embd / 3)
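        # Illustrative example (hypothetical values): ffn_mult = 2.625, n_embd = 4096
        # -> int(2 * 2.625 * 4096 / 3) = 7168, already a multiple of 256, so it is returned unchanged.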
3003 return DeciModel._find_multiple(intermediate_size, 256)
3004
3005 @staticmethod
3006 def _find_multiple(n: int, k: int) -> int:
3007 # DeciLM-specific code
3008 if n % k == 0:
3009 return n
3010 return n + k - (n % k)
3011
3012 def __init__(self, *args, **kwargs):
3013 super().__init__(*args, **kwargs)
3014
3015 if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
3016 _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
3017 assert self.block_count == len(_block_configs)
3018 self._num_kv_heads = list()
3019 self._num_heads = list()
3020 _ffn_multipliers = list()
3021 # ***linear attention layer***
3022 # if n_heads_in_group is None and replace_with_linear is True
3023 # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
3024 # ***attention-free layer***
3025 # if n_heads_in_group is None and replace_with_linear is False
3026 # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
3027 # ***normal attention-layer***
3028 # if n_heads_in_group is not None, then
3029 # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
3030 # _num_heads[il] is num_attention_head
3031 # ***dummy layer*** for nemotron 253B
3032 # if n_heads_in_group is None and ffn_mult is None
3033 # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0
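            # A hypothetical block_configs entry for a normal attention layer, for illustration only:
            #   {"attention": {"n_heads_in_group": 8, "replace_with_linear": False}, "ffn": {"ffn_mult": 2.625}}
            # with num_attention_heads = 64 this gives _num_kv_heads[il] = 64 // 8 = 8 and _num_heads[il] = 64.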
3034 for il in range(len(_block_configs)):
3035 if _block_configs[il]["attention"]["n_heads_in_group"] is None:
3036 if _block_configs[il]["attention"]["replace_with_linear"] is True:
3037 self._num_kv_heads.append(0)
3038 self._num_heads.append(self.hparams["num_attention_heads"])
3039 else:
3040 self._num_kv_heads.append(0)
3041 self._num_heads.append(0)
3042 else:
3043 self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
3044 self._num_heads.append(self.hparams["num_attention_heads"])
3045 if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer
3046 _ffn_multipliers.append(0.0)
3047 else:
3048 _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
3049 assert self.block_count == len(self._num_kv_heads)
3050 assert self.block_count == len(self._num_heads)
3051 assert self.block_count == len(_ffn_multipliers)
3052 assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
3053 assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
3054 assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
3055 self._ffn_dims: list[int] = [
3056 DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
3057 for multiplier in _ffn_multipliers
3058 ]
3059
3060 def set_vocab(self):
3061 # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
3062 # eos_token from '|eot_id|' to '|end_of_text|'
3063 if self.hparams.get("vocab_size", 128256) == 128256:
3064 tokens, toktypes, tokpre = self.get_vocab_base()
3065 self.gguf_writer.add_tokenizer_model("gpt2")
3066 self.gguf_writer.add_tokenizer_pre(tokpre)
3067 self.gguf_writer.add_token_list(tokens)
3068 self.gguf_writer.add_token_types(toktypes)
3069
3070 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
3071 special_vocab.add_to_gguf(self.gguf_writer)
3072 else:
3073 # DeciLM-7B
3074 self._set_vocab_llama_hf()
3075
3076 def set_gguf_parameters(self):
3077 if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
3078 assert self.block_count == len(self._num_kv_heads)
3079 assert self.block_count == len(self._num_heads)
3080 assert self.block_count == len(self._ffn_dims)
3081 if (rope_theta := self.rope_parameters.get("rope_theta")) is not None:
3082 self.gguf_writer.add_rope_freq_base(rope_theta)
3083 self.gguf_writer.add_head_count_kv(self._num_kv_heads)
3084 self.gguf_writer.add_head_count(self._num_heads)
3085 self.gguf_writer.add_feed_forward_length(self._ffn_dims)
3086 self.gguf_writer.add_block_count(self.block_count)
3087 self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
3088 self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
3089 self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
3090 self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
3091 self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
3092 self.gguf_writer.add_file_type(self.ftype)
3093 else: # DeciLM-7B
3094 super().set_gguf_parameters()
3095 if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
3096 self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
3097 assert self.block_count == len(self._num_kv_heads)
3098 self.gguf_writer.add_head_count_kv(self._num_kv_heads)
3099 hparams = self.hparams
3100 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
3101
3102 if (rope_dim := hparams.get("head_dim")) is None:
3103 rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
3104 self.gguf_writer.add_rope_dimension_count(rope_dim)
3105
3106 @staticmethod
3107 def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
3108 if n_head_kv is not None and n_head != n_head_kv:
3109 n_head = n_head_kv
3110 return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
3111 .swapaxes(1, 2)
3112 .reshape(weights.shape))
3113
3114 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3115 n_head = self.hparams["num_attention_heads"]
3116 if bid is not None:
3117 if "num_key_value_heads_per_layer" in self.hparams:
3118 n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
3119 elif "block_configs" in self.hparams:
3120 n_kv_head = self._num_kv_heads[bid]
3121 n_head = self._num_heads[bid]
3122 else:
3123 n_kv_head = self.hparams.get("num_key_value_heads")
3124 else:
3125 n_kv_head = self.hparams.get("num_key_value_heads")
3126
3127 if name.endswith(("q_proj.weight", "q_proj.bias")):
3128 data_torch = DeciModel.permute(data_torch, n_head, n_head)
3129 if name.endswith(("k_proj.weight", "k_proj.bias")):
3130 data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
3131 yield from super().modify_tensors(data_torch, name, bid)
3132
3133 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
3134 if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
3135 if rope_params.get("rope_type", '').lower() == "llama3":
3136 base = rope_params.get("rope_theta", 10000.0)
3137 if (dim := self.hparams.get("head_dim")) is None:
3138 dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
3139 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
3140
3141 factor = rope_params.get("factor", 8.0)
3142 low_freq_factor = rope_params.get("low_freq_factor", 1.0)
3143 high_freq_factor = rope_params.get("high_freq_factor", 4.0)
3144 old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
3145
3146 low_freq_wavelen = old_context_len / low_freq_factor
3147 high_freq_wavelen = old_context_len / high_freq_factor
3148 assert low_freq_wavelen != high_freq_wavelen
3149
3150 rope_factors = []
3151 for freq in freqs:
3152 wavelen = 2 * math.pi / freq
3153 if wavelen < high_freq_wavelen:
3154 rope_factors.append(1)
3155 elif wavelen > low_freq_wavelen:
3156 rope_factors.append(factor)
3157 else:
3158 smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
3159 rope_factors.append(1 / ((1 - smooth) / factor + smooth))
3160
3161 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
3162
3163 def prepare_tensors(self):
3164 super().prepare_tensors()
3165
3166
3167@ModelBase.register("BitnetForCausalLM")
3168class BitnetModel(TextModel):
3169 model_arch = gguf.MODEL_ARCH.BITNET
3170
3171 def set_vocab(self):
3172 self._set_vocab_sentencepiece()
3173
3174 def set_gguf_parameters(self):
3175 super().set_gguf_parameters()
3176 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
3177 self.gguf_writer.add_rope_scaling_factor(1.0)
3178
3179 def weight_quant(self, weight: Tensor) -> Tensor:
3180 dtype = weight.dtype
3181 weight = weight.float()
3182 scale = weight.abs().mean().clamp(min=1e-5)
3183 iscale = 1 / scale
3184 # TODO: multiply by the scale directly instead of inverting it twice
3185 # (this is also unnecessarily doubly inverted upstream)
3186 # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
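        # Illustrative example (made-up values): for a row [0.3, -0.9, 0.05], scale = mean(|w|) ~= 0.417;
        # w / scale ~= [0.72, -2.16, 0.12] -> round/clamp -> [1, -1, 0] -> * scale ~= [0.417, -0.417, 0.0],
        # i.e. the returned tensor only holds the ternary values {-scale, 0, +scale}.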
3187 result = (weight * iscale).round().clamp(-1, 1) / iscale
3188 return result.type(dtype)
3189
3190 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3191 new_name = self.map_tensor_name(name)
3192
3193 if any(self.match_model_tensor_name(new_name, key, bid) for key in [
3194 gguf.MODEL_TENSOR.ATTN_Q,
3195 gguf.MODEL_TENSOR.ATTN_K,
3196 gguf.MODEL_TENSOR.ATTN_V,
3197 gguf.MODEL_TENSOR.ATTN_OUT,
3198 gguf.MODEL_TENSOR.FFN_UP,
3199 gguf.MODEL_TENSOR.FFN_DOWN,
3200 gguf.MODEL_TENSOR.FFN_GATE,
3201 ]):
3202 # transform weight into 1/0/-1 (in fp32)
3203 data_torch = self.weight_quant(data_torch)
3204
3205 yield from super().modify_tensors(data_torch, name, bid)
3206
3207
3208@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM")
3209class GrokModel(TextModel):
3210 model_arch = gguf.MODEL_ARCH.GROK
3211
3212 def set_vocab(self):
3213 if (self.dir_model / 'tokenizer.model').is_file():
3214 self._set_vocab_sentencepiece()
3215 return
3216
3217 if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file():
3218 logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer')
3219 sys.exit(1)
3220
3221 self._set_vocab_gpt2()
3222
3223 def __init__(self, *args, **kwargs):
3224 super().__init__(*args, **kwargs)
3225
3226 def set_gguf_parameters(self):
3227 super().set_gguf_parameters()
3228
3229 self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0))
3230 self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0))
3231 if (final_logit_softcap := self.hparams.get("final_logit_softcapping")):
3232 self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
3233
3234 if (rope_dim := self.hparams.get("head_dim")) is None:
3235 rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
3236
3237 if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
3238 self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
3239
3240 # Treat "original" as "yarn", seems to have been a mistake
3241 if self.hparams.get("rope_type") in ("yarn", "original"):
3242 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
3243 self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"])
3244 self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"])
3245 self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"])
3246 self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"])
3247 self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"])
3248 self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"])
3249
3250 if temp_len := self.hparams.get("attn_temperature_len"):
3251 self.gguf_writer.add_attn_temperature_length(temp_len)
3252
3253 self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5))
3254 self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"])
3255 self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"])
3256
3257 _experts: list[dict[str, list[Tensor]]] | None = None
3258 _cur_expert = ""
3259
3260 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3261 deferred: list[tuple[Tensor, str, int | None]] = []
3262 is_expert = ".moe." in name or ".block_sparse_moe.experts." in name
3263
3264 if not is_expert:
3265 deferred.append((data_torch, name, bid))
3266
3267 # process the experts separately
3268 if is_expert or self._cur_expert:
3269 n_experts = self.hparams["num_local_experts"]
3270
3271 assert bid is not None
3272
3273 if self._experts is None:
3274 self._experts = [{} for _ in range(self.block_count)]
3275
3276 # concatenate split tensors
3277 if name in self._experts[bid]:
3278 self._cur_expert = name
3279 self._experts[bid][name].append(data_torch)
3280 return
3281 elif is_expert:
3282 self._cur_expert = name
3283 self._experts[bid][name] = [data_torch]
3284 return
3285 else:
3286 self._cur_expert = ""
3287
3288 for bid in range(self.block_count):
3289 if len(self._experts[bid]) >= n_experts * 3:
3290 # merge the experts into a single 3d tensor
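                    # each wid tuple is (original grok-style name part, HF block_sparse_moe name part,
                    # dim along which split shards of that tensor are re-concatenated)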
3291 for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]:
3292 datas: list[Tensor] = []
3293
3294 for xid in range(n_experts):
3295 ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight"
3296 if ename not in self._experts[bid]:
3297 ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight"
3298 tensor_list = self._experts[bid][ename]
3299 datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0])
3300 del self._experts[bid][ename]
3301
3302 data_torch = torch.stack(datas, dim=0)
3303
3304 merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight"
3305
3306 yield from super().modify_tensors(data_torch, merged_name, bid)
3307
3308 for t in deferred:
3309 yield from super().modify_tensors(*t)
3310
3311
3312@ModelBase.register("DbrxForCausalLM")
3313class DbrxModel(TextModel):
3314 model_arch = gguf.MODEL_ARCH.DBRX
3315
3316 def set_gguf_parameters(self):
3317 ffn_config = self.hparams["ffn_config"]
3318 attn_config = self.hparams["attn_config"]
3319 self.gguf_writer.add_block_count(self.block_count)
3320
3321 self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
3322 self.gguf_writer.add_embedding_length(self.hparams["d_model"])
3323 self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
3324
3325 self.gguf_writer.add_head_count(self.hparams["n_heads"])
3326 self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
3327
3328 self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
3329
3330 self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
3331
3332 self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
3333 self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
3334
3335 self.gguf_writer.add_layer_norm_eps(1e-5)
3336
3337 self.gguf_writer.add_file_type(self.ftype)
3338 logger.info(f"gguf: file type = {self.ftype}")
3339
3340 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3341 n_expert = self.hparams["ffn_config"]["moe_num_experts"]
3342 n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
3343 n_embd = self.hparams["d_model"]
3344
3345 # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
3346 # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
3347 # But llama.cpp moe graph works differently
3348 # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
3349 # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
3350 exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
3351 "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
3352 "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
3353 experts = False
3354
3355 for exp_tensor_name in exp_tensor_names.keys():
3356 if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
3357 experts = True
3358 data_torch = data_torch.view(n_expert, n_ff, n_embd)
3359 if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
3360 data_torch = data_torch.permute(*permute_tensor)
3361 break
3362
3363 # map tensor names
3364 # In MoE models the FFN tensors are typically most of the model weights
3365 # and need to be quantizable; quantization expects tensor names to end in .weight.
3366 # Every other model follows the convention of weight names ending in .weight,
3367 # but dbrx does not:
3368 # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
3369 new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
3370
3371 yield from super().modify_tensors(data_torch, new_name, bid)
3372
3373 def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
3374 del name, new_name, bid # unused
3375
3376 return n_dims > 1
3377
3378
3379@ModelBase.register("MiniCPMForCausalLM")
3380class MiniCPMModel(TextModel):
3381 model_arch = gguf.MODEL_ARCH.MINICPM
3382
3383 def set_gguf_parameters(self):
3384 super().set_gguf_parameters()
3385 embedding_scale = float(self.hparams["scale_emb"])
3386 self.gguf_writer.add_embedding_scale(embedding_scale)
3387 logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
3388 residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
3389 self.gguf_writer.add_residual_scale(residual_scale)
3390 logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
3391 logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
3392 self.gguf_writer.add_logit_scale(logit_scale)
3393 logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
3394
3395 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
3396 rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
3397
3398 rope_scaling = self.find_hparam(['rope_scaling'], True)
3399 if rope_scaling is not None:
3400 long_factors = rope_scaling.get('long_factor', None)
3401 short_factors = rope_scaling.get('short_factor', None)
3402
3403 if long_factors is None or short_factors is None:
3404 raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling.short_factor')
3405
3406 if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
3407 raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
3408
3409 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
3410 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
3411
3412 def set_vocab(self):
3413 self._set_vocab_sentencepiece()
3414
3415 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3416 n_head = self.hparams["num_attention_heads"]
3417 n_kv_head = self.hparams.get("num_key_value_heads")
3418
3419 # HF models permute some of the tensors, so we need to undo that
3420 if name.endswith("q_proj.weight"):
3421 data_torch = LlamaModel.permute(data_torch, n_head, n_head)
3422 if name.endswith("k_proj.weight"):
3423 data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
3424
3425 yield from super().modify_tensors(data_torch, name, bid)
3426
3427
3428@ModelBase.register("MiniCPM3ForCausalLM")
3429class MiniCPM3Model(TextModel):
3430 model_arch = gguf.MODEL_ARCH.MINICPM3
3431
3432 def set_gguf_parameters(self):
3433 hparams = self.hparams
3434
3435 self.gguf_writer.add_file_type(self.ftype)
3436 self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
3437 self.gguf_writer.add_embedding_length(hparams["hidden_size"])
3438 self.gguf_writer.add_block_count(self.block_count)
3439 self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
3440 self.gguf_writer.add_head_count(hparams["num_attention_heads"])
3441 self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
3442 self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
3443 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
3444 if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
3445 self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
3446 self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
3447 self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
3448 self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
3449
3450 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
3451 rope_scaling = self.find_hparam(['rope_scaling'], True)
3452 if rope_scaling is not None:
3453 rope_dims = self.hparams["qk_rope_head_dim"]
3454
3455 long_factors = rope_scaling.get('long_factor', None)
3456 short_factors = rope_scaling.get('short_factor', None)
3457
3458 if long_factors is None or short_factors is None:
3459 raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling.short_factor')
3460
3461 if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
3462 raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
3463
3464 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
3465 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
3466
3467 def set_vocab(self):
3468 self._set_vocab_sentencepiece()
3469
3470 def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
3471 if n_kv_head is not None and n_head != n_kv_head:
3472 n_head //= n_kv_head
3473
3474 return (
3475 weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
3476 .swapaxes(1, 2)
3477 .reshape(weights.shape)
3478 )
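# Added note: this undoes the head-wise interleaving that HF applies to Q/K projections for
# rotary embeddings. Illustrative example with n_head=1 and 4 rows [a, b, c, d]: the reshape
# to (1, 2, 2, ...) followed by swapaxes(1, 2) yields rows [a, c, b, d].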
3479
3480
3481@ModelBase.register("QWenLMHeadModel")
3482class QwenModel(TextModel):
3483 model_arch = gguf.MODEL_ARCH.QWEN
3484
3485 @staticmethod
3486 def token_bytes_to_string(b):
3487 from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
3488 byte_encoder = bytes_to_unicode()
3489 return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
3490
3491 @staticmethod
3492 def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
3493 parts = [bytes([b]) for b in token]
3494 while True:
3495 min_idx = None
3496 min_rank = None
3497 for i, pair in enumerate(zip(parts[:-1], parts[1:])):
3498 rank = mergeable_ranks.get(pair[0] + pair[1])
3499 if rank is not None and (min_rank is None or rank < min_rank):
3500 min_idx = i
3501 min_rank = rank
3502 if min_rank is None or (max_rank is not None and min_rank >= max_rank):
3503 break
3504 assert min_idx is not None
3505 parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
3506 return parts
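# Added note (illustrative, not from the original source): this is a standard BPE merge loop.
# For token=b"abc" with b"ab" present in mergeable_ranks, the first pass merges
# [b"a", b"b", b"c"] -> [b"ab", b"c"]; merging stops once no remaining pair has a rank
# (or the best rank is >= max_rank), and the current parts are returned.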
3507
3508 def set_vocab(self):
3509 self._set_vocab_qwen()
3510
3511
3512@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
3513class Qwen2Model(TextModel):
3514 model_arch = gguf.MODEL_ARCH.QWEN2
3515
3516 def set_vocab(self):
3517 try:
3518 self._set_vocab_sentencepiece()
3519 except FileNotFoundError:
3520 self._set_vocab_gpt2()
3521
3522 def set_gguf_parameters(self):
3523 super().set_gguf_parameters()
3524 self._try_set_pooling_type()
3525
3526 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3527 if self.hf_arch == "Qwen2Model":
3528 name = f"model.{name}" # map to Qwen2ForCausalLM tensors
3529 if "language_model." in name:
3530 name = name.replace("language_model.", "") # for InternVL
3531 if name.startswith("mlp") or name.startswith("multi_modal_projector") \
3532 or name.startswith("vision_model") or name.startswith("audio_tower") \
3533 or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
3534 # skip vision and audio tensors
3535 return
3536 yield from super().modify_tensors(data_torch, name, bid)
3537
3538
3539@ModelBase.register("DreamModel")
3540class DreamModel(TextModel):
3541 model_arch = gguf.MODEL_ARCH.DREAM
3542
3543 def get_vocab_base(self) -> tuple[list[str], list[int], str]:
3544 tokens: list[str] = []
3545 toktypes: list[int] = []
3546
3547 from transformers import AutoTokenizer
3548 tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
3549
3550 vocab_dict = tokenizer.get_vocab()
3551 vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
3552 assert max(vocab_dict.values()) < vocab_size
3553
3554 tokpre = self.get_vocab_base_pre(tokenizer)
3555
3556 reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
3557 added_vocab = tokenizer.get_added_vocab()
3558
3559 for i in range(vocab_size):
3560 if i not in reverse_vocab:
3561 tokens.append(f"[PAD{i}]")
3562 toktypes.append(gguf.TokenType.UNUSED)
3563 elif reverse_vocab[i] in added_vocab:
3564 tokens.append(reverse_vocab[i])
3565 # Check if it's a special token - treat special tokens as CONTROL tokens
3566 if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder:
3567 if tokenizer.added_tokens_decoder[i].special:
3568 toktypes.append(gguf.TokenType.CONTROL)
3569 else:
3570 toktypes.append(gguf.TokenType.USER_DEFINED)
3571 else:
3572 # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|>
3573 toktypes.append(gguf.TokenType.CONTROL)
3574 else:
3575 tokens.append(reverse_vocab[i])
3576 toktypes.append(gguf.TokenType.NORMAL)
3577
3578 return tokens, toktypes, tokpre
3579
3580 def set_vocab(self):
3581 try:
3582 self._set_vocab_sentencepiece()
3583 except FileNotFoundError:
3584 self._set_vocab_gpt2()
3585
3586 def set_gguf_parameters(self):
3587 super().set_gguf_parameters()
3588 self._try_set_pooling_type()
3589
3590 # Dream models use non-causal attention for diffusion
3591 self.gguf_writer.add_causal_attention(False)
3592
3593 # Add Dream-specific parameters
3594 mask_token_id = self.hparams.get("mask_token_id")
3595 if mask_token_id is not None:
3596 self.gguf_writer.add_mask_token_id(mask_token_id)
3597
3598 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3599 # Dream model tensors should be mapped directly since it's the base model
3600 yield from super().modify_tensors(data_torch, name, bid)
3601
3602
3603@ModelBase.register("LLaDAModelLM")
3604class LLaDAModel(TextModel):
3605 model_arch = gguf.MODEL_ARCH.LLADA
3606 undo_permute = True
3607
3608 def get_vocab_base(self) -> tuple[list[str], list[int], str]:
3609 tokens: list[str] = []
3610 toktypes: list[int] = []
3611
3612 from transformers import AutoTokenizer
3613 tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
3614
3615 vocab_dict = tokenizer.get_vocab()
3616 vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
3617 assert max(vocab_dict.values()) < vocab_size
3618
3619 tokpre = self.get_vocab_base_pre(tokenizer)
3620
3621 reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
3622 added_vocab = tokenizer.get_added_vocab()
3623
3624 for i in range(vocab_size):
3625 if i not in reverse_vocab:
3626 tokens.append(f"[PAD{i}]")
3627 toktypes.append(gguf.TokenType.UNUSED)
3628 elif reverse_vocab[i] in added_vocab:
3629 tokens.append(reverse_vocab[i])
3630 # Check if it's a special token - treat special tokens as CONTROL tokens
3631 if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder:
3632 if tokenizer.added_tokens_decoder[i].special:
3633 toktypes.append(gguf.TokenType.CONTROL)
3634 else:
3635 toktypes.append(gguf.TokenType.USER_DEFINED)
3636 else:
3637 # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|>
3638 toktypes.append(gguf.TokenType.CONTROL)
3639 else:
3640 tokens.append(reverse_vocab[i])
3641 toktypes.append(gguf.TokenType.NORMAL)
3642
3643 return tokens, toktypes, tokpre
3644
3645 def set_vocab(self):
3646 self._set_vocab_gpt2()
3647
3648 # LLaDA specific parameters
3649 self.gguf_writer.add_add_bos_token(True)
3650
3651 def set_gguf_parameters(self):
3652 super().set_gguf_parameters()
3653 self._try_set_pooling_type()
3654
3655 # Add parameters similar to LlamaModel
3656 hparams = self.hparams
3657 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
3658
3659 if (rope_dim := hparams.get("head_dim")) is None:
3660 n_heads = hparams.get("num_attention_heads", hparams.get("n_heads"))
3661 rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads
3662 self.gguf_writer.add_rope_dimension_count(rope_dim)
3663
3664 # Set context length for LLaDA
3665 context_length = self.hparams.get("max_sequence_length", 4096)
3666 self.gguf_writer.add_context_length(context_length)
3667
3668 # Set embedding length (dimension size)
3669 embedding_length = self.hparams.get("d_model", 4096)
3670 self.gguf_writer.add_embedding_length(embedding_length)
3671
3672 # Set feed forward length (MLP hidden size)
3673 feed_forward_length = self.hparams.get("mlp_hidden_size", 12288)
3674 self.gguf_writer.add_feed_forward_length(feed_forward_length)
3675
3676 # LLaDA models use non-causal attention for diffusion, similar to Dream
3677 self.gguf_writer.add_causal_attention(False)
3678
3679 # LLaDA models don't shift their logits
3680 self.gguf_writer.add_diffusion_shift_logits(False)
3681
3682 @staticmethod
3683 def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
3684 if n_head_kv is not None and n_head != n_head_kv:
3685 n_head = n_head_kv
3686 return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
3687 .swapaxes(1, 2)
3688 .reshape(weights.shape))
3689
3690 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3691 n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads"))
3692 n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads"))
3693
3694 if self.undo_permute:
3695 if name.endswith(("q_proj.weight", "q_proj.bias")):
3696 data_torch = LLaDAModel.permute(data_torch, n_head, n_head)
3697 if name.endswith(("k_proj.weight", "k_proj.bias")):
3698 data_torch = LLaDAModel.permute(data_torch, n_head, n_kv_head)
3699
3700 # LLaDA model tensors should be mapped directly since it's the base model
3701 yield from super().modify_tensors(data_torch, name, bid)
3702
3703
3704@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM")
3705class Ernie4_5Model(TextModel):
3706 model_arch = gguf.MODEL_ARCH.ERNIE4_5
3707
3708 def set_vocab(self):
3709 self._set_vocab_sentencepiece()
3710
3711 def set_gguf_parameters(self):
3712 super().set_gguf_parameters()
3713
3714 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3715 num_heads = self.hparams["num_attention_heads"]
3716 num_kv_heads = self.hparams["num_key_value_heads"]
3717 if (head_dim := self.hparams.get("head_dim")) is None:
3718 head_dim = self.hparams["hidden_size"] // num_heads
3719
3720 if "ernie." in name:
3721 name = name.replace("ernie.", "model.")
3722 # split the qkv weights
3723 # qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size]
3724 if "qkv_proj" in name:
3725 name_q = name.replace("qkv_proj.weight", "q_proj.weight")
3726 name_k = name.replace("qkv_proj.weight", "k_proj.weight")
3727 name_v = name.replace("qkv_proj.weight", "v_proj.weight")
3728 total_q_dim = num_heads * head_dim
3729 total_k_dim = num_kv_heads * head_dim
3730 total_v_dim = num_kv_heads * head_dim
3731 q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0)
3732 yield from super().modify_tensors(q_proj_weight, name_q, bid)
3733 yield from super().modify_tensors(k_proj_weight, name_k, bid)
3734 yield from super().modify_tensors(v_proj_weight, name_v, bid)
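# Illustrative sizes for the split above (hypothetical, not taken from a real config):
# with num_heads=16, num_kv_heads=4, head_dim=128 and hidden_size=2048, qkv_proj.weight has
# shape (16*128 + 2*4*128, 2048) = (3072, 2048) and is split along dim 0 into
# q (2048, 2048), k (512, 2048) and v (512, 2048).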
3735 # split the up_gate_proj into gate and up
3736 # up_gate_proj shape: [2 * intermediate_size, hidden_size]
3737 elif "up_gate_proj" in name:
3738 name_up = name.replace("up_gate_proj.weight", "up_proj.weight")
3739 name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight")
3740 dim_half = data_torch.shape[0] // 2
3741 gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0)
3742 yield from super().modify_tensors(gate_proj_weight, name_gate, bid)
3743 yield from super().modify_tensors(up_proj_weight, name_up, bid)
3744 else:
3745 yield from super().modify_tensors(data_torch, name, bid)
3746
3747
3748@ModelBase.register("Ernie4_5_MoeForCausalLM")
3749class Ernie4_5MoeModel(Ernie4_5Model):
3750 model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE
3751 _experts: list[dict[str, Tensor]] | None = None
3752
3753 def __init__(self, *args, **kwargs):
3754 super().__init__(*args, **kwargs)
3755 self._experts = [{} for _ in range(self.block_count)]
3756
3757 def set_gguf_parameters(self):
3758 super().set_gguf_parameters()
3759 self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
3760 self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
3761 self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
3762 self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
3763 if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
3764 self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
3765 if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None:
3766 self.gguf_writer.add_expert_shared_count(shared_expert_count)
3767 if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
3768 self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)
3769
3770 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3771 # Modify correction bias name as in DeepseekV2
3772 if name.endswith("e_score_correction_bias"):
3773 name = name.replace("e_score_correction_bias", "e_score_correction.bias")
3774
3775 # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2)
3776 match = re.match(r"model.mtp_block.(\d+)", name)
3777 if match:
3778 return
3779
3780 # skip all other MTP tensors for now
3781 match = re.match(r"model.mtp_emb_norm.(\d+)", name)
3782 if match:
3783 return
3784
3785 match = re.match(r"model.mtp_hidden_norm.(\d+)", name)
3786 if match:
3787 return
3788
3789 match = re.match(r"model.mtp_linear_proj.(\d+)", name)
3790 if match:
3791 return
3792
3793 # process the experts separately
3794 if name.find("mlp.experts") != -1:
3795 n_experts = self.hparams["moe_num_experts"]
3796 assert bid is not None
3797
3798 if self._experts is None:
3799 self._experts = [{} for _ in range(self.block_count)]
3800
3801 self._experts[bid][name] = data_torch
3802
3803 if len(self._experts[bid]) >= n_experts * 3:
3804 # merge the experts into a single 3d tensor
3805 for w_name in ["gate_proj", "up_proj", "down_proj"]:
3806 datas: list[Tensor] = []
3807
3808 for xid in range(n_experts):
3809 ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
3810 datas.append(self._experts[bid][ename_to_retrieve])
3811 del self._experts[bid][ename_to_retrieve]
3812
3813 data_torch = torch.stack(datas, dim=0)
3814 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
3815 yield from super().modify_tensors(data_torch, merged_name, bid)
3816 else:
3817 yield from ModelBase.modify_tensors(self, data_torch, name, bid)
3818
3819 def prepare_tensors(self):
3820 super().prepare_tensors()
3821
3822 if self._experts is not None:
3823 # flatten `list[dict[str, Tensor]]` into `list[str]`
3824 experts = [k for d in self._experts for k in d.keys()]
3825 if len(experts) > 0:
3826 raise ValueError(f"Unprocessed experts: {experts}")
3827
3828
3829@ModelBase.register(
3830 "Qwen2VLModel",
3831 "Qwen2VLForConditionalGeneration",
3832 "Qwen2_5_VLForConditionalGeneration",
3833 "Qwen2_5OmniModel",
3834)
3835class Qwen2VLModel(TextModel):
3836 model_arch = gguf.MODEL_ARCH.QWEN2VL
3837
3838 def set_gguf_parameters(self):
3839 super().set_gguf_parameters()
3840
3841 def set_vocab(self):
3842 try:
3843 self._set_vocab_sentencepiece()
3844 except FileNotFoundError:
3845 self._set_vocab_gpt2()
3846
3847 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3848 if name.startswith("thinker."):
3849 name = name.replace("thinker.", "")
3850 if name.startswith("visual") or name.startswith("audio") or \
3851 name.startswith("talker") or name.startswith("token2wav"):
3852 # skip multimodal tensors
3853 return
3854 yield from super().modify_tensors(data_torch, name, bid)
3855
3856
3857@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
3858class Qwen2VLVisionModel(MmprojModel):
3859 def __init__(self, *args, **kwargs):
3860 super().__init__(*args, **kwargs)
3861 assert self.hparams_vision is not None
3862 self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
3863 # rename config.json values
3864 self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
3865 self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
3866 if "embed_dim" in self.hparams_vision: # qwen2vl
3867 self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
3868 self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
3869
3870 def set_gguf_parameters(self):
3871 super().set_gguf_parameters()
3872 assert self.hparams_vision is not None
3873 hparams = self.hparams_vision
3874 model_type = self.global_config['model_type']
3875 if model_type == 'qwen2_vl':
3876 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
3877 elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
3878 if model_type == 'qwen2_5_omni':
3879 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
3880 else:
3881 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
3882 self.gguf_writer.add_vision_use_silu(True)
3883 # find n_wa_pattern (window attention pattern)
3884 fullatt_block_indexes = hparams.get("fullatt_block_indexes")
3885 assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
3886 n_wa_pattern = fullatt_block_indexes[0] + 1
3887 # validate n_wa_pattern
3888 for i in range(1, len(fullatt_block_indexes)):
3889 if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
3890 raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
3891 self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
3892 else:
3893 raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
3894 # default values below are taken from HF transformers code
3895 self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))
3896
3897 def tensor_force_quant(self, name, new_name, bid, n_dims):
3898 if ".position_embd." in new_name:
3899 return gguf.GGMLQuantizationType.F32
3900 return super().tensor_force_quant(name, new_name, bid, n_dims)
3901
3902 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3903 if name.startswith("visual."):
3904 # process visual tensors
3905 # split QKV tensors if needed
3906 if ".qkv." in name:
3907 if data_torch.ndim == 2: # weight
3908 c3, _ = data_torch.shape
3909 else: # bias
3910 c3 = data_torch.shape[0]
3911 assert c3 % 3 == 0
3912 c = c3 // 3
3913 wq = data_torch[:c]
3914 wk = data_torch[c: c * 2]
3915 wv = data_torch[c * 2:]
3916 yield from super().modify_tensors(wq, name.replace("qkv", "q"), bid)
3917 yield from super().modify_tensors(wk, name.replace("qkv", "k"), bid)
3918 yield from super().modify_tensors(wv, name.replace("qkv", "v"), bid)
3919 elif 'patch_embed.proj.weight' in name:
3920 # split Conv3D into Conv2Ds
3921 c1, c2, kt, kh, kw = data_torch.shape
3922 del c1, c2, kh, kw # unused
3923 assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
3924 yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...])
3925 yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...])
3926 else:
3927 yield from super().modify_tensors(data_torch, name, bid)
3928
3929
3930@ModelBase.register("Qwen2_5OmniModel")
3931class Qwen25OmniModel(Qwen2VLVisionModel):
3932 has_vision_encoder = True
3933 has_audio_encoder = True
3934
3935 def __init__(self, *args, **kwargs):
3936 super().__init__(*args, **kwargs)
3937 assert self.hparams_audio is not None
3938 self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
3939 self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
3940 self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]
3941
3942 def set_gguf_parameters(self):
3943 super().set_gguf_parameters()
3944 assert self.hparams_audio is not None
3945 self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
3946 self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
3947
3948 def get_vision_config(self) -> dict[str, Any] | None:
3949 return self.global_config["thinker_config"].get("vision_config")
3950
3951 def get_audio_config(self) -> dict[str, Any] | None:
3952 return self.global_config["thinker_config"].get("audio_config")
3953
3954 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
3955 # SinusoidsPositionEmbedding
3956 assert self.hparams_audio is not None
3957 max_timescale = 10000
3958 length = 1500
3959 channels = self.hparams_audio["hidden_size"]
3960 log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
3961 inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
3962 scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
3963 pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
3964 yield ("audio_tower.embed_positions.weight", pos_embd)
3965
3966 def tensor_force_quant(self, name, new_name, bid, n_dims):
3967 if ".conv" in name and ".weight" in name:
3968 return gguf.GGMLQuantizationType.F16
3969 return super().tensor_force_quant(name, new_name, bid, n_dims)
3970
3971 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3972 if name.startswith("thinker."):
3973 name = name.replace("thinker.", "")
3974
3975 if name.startswith("audio_tower"):
3976 # process audio tensors
3977 if "conv1.bias" in name or "conv2.bias" in name:
3978 # transpose conv1 and conv2 bias
3979 data_torch = data_torch.unsqueeze(-1)
3980 if "audio_bos_eos_token" in name:
3981 # this tensor is left unused in transformers code
3982 # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
3983 return
3984 yield from super().modify_tensors(data_torch, name, bid)
3985
3986
3987@ModelBase.register("InternVisionModel")
3988class InternVisionModel(MmprojModel):
3989 def set_gguf_parameters(self):
3990 assert self.hparams_vision is not None
3991 if isinstance(self.hparams_vision['image_size'], list):
3992 self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0]
3993 if isinstance(self.hparams_vision['patch_size'], list):
3994 self.hparams_vision['patch_size'] = self.hparams_vision['patch_size'][0]
3995 super().set_gguf_parameters()
3996
3997 hparams = self.hparams
3998 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
3999 self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
4000 # hidden_act
4001 if hparams["hidden_act"] == "silu":
4002 self.gguf_writer.add_vision_use_silu(True)
4003 elif hparams["hidden_act"] == "gelu":
4004 self.gguf_writer.add_vision_use_gelu(True)
4005 else:
4006 raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
4007 # downsample_ratio
4008 downsample_ratio = self.global_config.get("downsample_ratio")
4009 assert downsample_ratio is not None
4010 self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
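# Added note: the projector scale factor is the inverse of the downsample ratio,
# e.g. a downsample_ratio of 0.5 (an illustrative value) is stored as a scale factor of 2.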
4011
4012 def tensor_force_quant(self, name, new_name, bid, n_dims):
4013 if ".position_embd." in new_name:
4014 return gguf.GGMLQuantizationType.F32
4015 return super().tensor_force_quant(name, new_name, bid, n_dims)
4016
4017 def _mapping_interns1_name(self, name):
4018 names_map = {
4019 "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias",
4020 "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
4021 "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias",
4022 "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight",
4023 "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias",
4024 "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight",
4025 }
4026 if name in names_map:
4027 name = names_map[name]
4028 return name
4029
4030 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4031 vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector']
4032 # deal with intern-s1 special case
4033 name = self._mapping_interns1_name(name)
4034 if any([name.startswith(prefix) for prefix in vision_prefix]):
4035 # process visual tensors
4036 # correct name
4037 if name.startswith("vision_model"):
4038 name = "vision_tower." + name
4039 if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"):
4040 name += ".weight"
4041 # split QKV tensors if needed
4042 if ".qkv." in name:
4043 if data_torch.ndim == 2: # weight
4044 c3, _ = data_torch.shape
4045 else: # bias
4046 c3 = data_torch.shape[0]
4047 assert c3 % 3 == 0
4048 c = c3 // 3
4049 wq = data_torch[:c]
4050 wk = data_torch[c: c * 2]
4051 wv = data_torch[c * 2:]
4052 yield from super().modify_tensors(wq, name.replace("attn.qkv", "self_attn.q_proj"), bid)
4053 yield from super().modify_tensors(wk, name.replace("attn.qkv", "self_attn.k_proj"), bid)
4054 yield from super().modify_tensors(wv, name.replace("attn.qkv", "self_attn.v_proj"), bid)
4055 else:
4056 yield from super().modify_tensors(data_torch, name, bid)
4057
4058
4059@ModelBase.register("WavTokenizerDec")
4060class WavTokenizerDecModel(TextModel):
4061 model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
4062
4063 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4064 if \
4065 name.endswith("codebook.cluster_size") or \
4066 name.endswith("codebook.embed_avg") or \
4067 name.endswith("codebook.inited"):
4068 logger.debug(f"Skipping {name!r}")
4069 return
4070
4071 logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
4072
4073 yield from super().modify_tensors(data_torch, name, bid)
4074
4075 def set_vocab(self):
4076 self._set_vocab_none()
4077
4078 def set_gguf_parameters(self):
4079 super().set_gguf_parameters()
4080 self.gguf_writer.add_vocab_size (self.hparams["vocab_size"])
4081 self.gguf_writer.add_features_length (self.hparams["n_embd_features"])
4082 self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
4083 self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"])
4084 self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"])
4085
4086 self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
4087 self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"])
4088
4089 self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
4090 self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"])
4091
4092 self.gguf_writer.add_causal_attention(False)
4093
4094
4095@ModelBase.register("Qwen2MoeForCausalLM")
4096class Qwen2MoeModel(TextModel):
4097 model_arch = gguf.MODEL_ARCH.QWEN2MOE
4098
4099 def set_gguf_parameters(self):
4100 super().set_gguf_parameters()
4101 if (n_experts := self.hparams.get("num_experts")) is not None:
4102 self.gguf_writer.add_expert_count(n_experts)
4103 if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
4104 self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
4105 logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
4106 if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
4107 self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
4108 logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
4109
4110 _experts: list[dict[str, Tensor]] | None = None
4111
4112 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4113 # process the experts separately
4114 name = name.replace("language_model.", "") # InternVL
4115
4116 # handle aggregated expert tensors
4117 # GGUF stores dimensions reversed from PyTorch, so:
4118 # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
4119 # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp)
4120 # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down
4121 if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
4122 mapped = f"{name}.weight" if not name.endswith(".weight") else name
4123 # HF: [n_expert, n_embd, n_ff] -> GGML: {n_ff, n_embd, n_expert}
4124 yield from super().modify_tensors(data_torch, mapped, bid)
4125 return
4126
4127 if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
4128 if data_torch.ndim < 3 or data_torch.shape[-2] % 2 != 0:
4129 raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
4130 # HF: [n_expert, 2*n_ff, n_embd] -> split on dim=-2
4131 n_ff = data_torch.shape[-2] // 2
4132 gate = data_torch[..., :n_ff, :].contiguous()
4133 up = data_torch[..., n_ff:, :].contiguous()
4134 # gate/up: [n_expert, n_ff, n_embd] -> GGML: {n_embd, n_ff, n_expert}
4135 base_name = name.removesuffix(".weight").removesuffix(".gate_up_proj")
4136 mapped_gate = f"{base_name}.gate_proj.weight"
4137 mapped_up = f"{base_name}.up_proj.weight"
4138 yield from super().modify_tensors(gate, mapped_gate, bid)
4139 yield from super().modify_tensors(up, mapped_up, bid)
4140 return
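# Illustrative example for the split above (hypothetical sizes): a packed gate_up_proj of
# shape (n_expert=8, 2*n_ff=1536, n_embd=1024) is split on dim=-2 into gate and up tensors
# of shape (8, 768, 1024) each, which map to ggml ne = {1024, 768, 8}.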
4141
4142 if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"):
4143 # skip visual tensors
4144 return
4145
4146 if name.find("experts") != -1:
4147 n_experts = self.hparams["num_experts"]
4148 assert bid is not None
4149
4150 if self._experts is None:
4151 self._experts = [{} for _ in range(self.block_count)]
4152
4153 self._experts[bid][name] = data_torch
4154
4155 if len(self._experts[bid]) >= n_experts * 3:
4156 # merge the experts into a single 3d tensor
4157 for w_name in ["down_proj", "gate_proj", "up_proj"]:
4158 datas: list[Tensor] = []
4159
4160 for xid in range(n_experts):
4161 ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
4162 datas.append(self._experts[bid][ename])
4163 del self._experts[bid][ename]
4164
4165 data_torch = torch.stack(datas, dim=0)
4166
4167 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
4168
4169 yield from super().modify_tensors(data_torch, merged_name, bid)
4170 return
4171 else:
4172 return
4173
4174 yield from super().modify_tensors(data_torch, name, bid)
4175
4176 def prepare_tensors(self):
4177 super().prepare_tensors()
4178
4179 if self._experts is not None:
4180 # flatten `list[dict[str, Tensor]]` into `list[str]`
4181 experts = [k for d in self._experts for k in d.keys()]
4182 if len(experts) > 0:
4183 raise ValueError(f"Unprocessed experts: {experts}")
4184
4185
4186@ModelBase.register("Qwen3ForCausalLM")
4187class Qwen3Model(Qwen2Model):
4188 model_arch = gguf.MODEL_ARCH.QWEN3
4189
4190 # extra logic for rerank models
4191 is_rerank: bool = False
4192 is_tied_embeddings: bool = False
4193 token_false_id: int | None = None
4194 token_true_id: int | None = None
4195
4196 def __init__(self, *args, **kwargs):
4197 super().__init__(*args, **kwargs)
4198
4199 # track for intern-s1-mini
4200 hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
4201 self.origin_hf_arch = hparams.get('architectures', [None])[0]
4202
4203 # a bit hacky, but currently the only way to detect if this is a rerank model
4204 # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
4205 readme_path = self.dir_model / "README.md"
4206 readme_text = ""
4207 if readme_path.exists():
4208 with readme_path.open("r", encoding="utf-8") as f:
4209 readme_text = f.read()
4210 if "# Qwen3-Reranker" in readme_text:
4211 self._find_rerank_config()
4212
4213 def set_vocab(self):
4214 # deal with intern-s1-mini
4215 if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
4216 self._set_vocab_interns1()
4217 return
4218
4219 super().set_vocab()
4220
4221 def _find_rerank_config(self):
4222 from transformers import AutoTokenizer
4223 tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
4224
4225 self.is_rerank = True
4226 self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
4227 self.token_false_id = tokenizer.convert_tokens_to_ids("no")
4228 self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
4229 self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
4230
4231 assert self.token_false_id is not None and self.token_true_id is not None
4232
4233 def set_gguf_parameters(self):
4234 super().set_gguf_parameters()
4235 if self.is_rerank:
4236 self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
4237 self.gguf_writer.add_classifier_output_labels(["yes", "no"])
4238 self.gguf_writer.add_chat_template([{
4239 "name": "rerank",
4240 "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
4241 "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
4242 "<|im_start|>assistant\n<think>\n\n</think>\n\n"
4243 }])
4244
4245 def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
4246 # extract "yes" and "no" tokens from the output lm_head tensor
4247 false_row = data_torch[self.token_false_id]
4248 true_row = data_torch[self.token_true_id]
4249 return torch.stack([true_row, false_row], dim=0)
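# Added note: the resulting classifier head has shape (2, n_embd), with row 0 taken from the
# "yes" token and row 1 from the "no" token, matching the ["yes", "no"] output labels above.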
4250
4251 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4252 if "model.vision_" in name:
4253 # skip multimodal tensors
4254 return
4255
4256 if self.is_rerank:
4257 is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
4258 is_real_head = not self.is_tied_embeddings and "lm_head" in name
4259 if is_tied_head or is_real_head:
4260 cls_out_head = (
4261 gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
4262 self._get_cls_out_tensor(data_torch),
4263 )
4264 yield cls_out_head
4265 if is_tied_head:
4266 yield from super().modify_tensors(data_torch, name, bid)
4267 return
4268
4269 yield from super().modify_tensors(data_torch, name, bid)
4270
4271
4272@ModelBase.register("Qwen3MoeForCausalLM")
4273class Qwen3MoeModel(Qwen2MoeModel):
4274 model_arch = gguf.MODEL_ARCH.QWEN3MOE
4275
4276 def __init__(self, *args, **kwargs):
4277 super().__init__(*args, **kwargs)
4278 hparams = ModelBase.load_hparams(self.dir_model, False)
4279 self.origin_hf_arch = hparams.get('architectures', [None])[0]
4280
4281 def set_vocab(self):
4282 # deal with intern-s1
4283 if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
4284 self._set_vocab_interns1()
4285 return
4286
4287 super().set_vocab()
4288
4289
4290@ModelBase.register("Qwen3NextForCausalLM")
4291class Qwen3NextModel(Qwen2MoeModel):
4292 model_arch = gguf.MODEL_ARCH.QWEN3NEXT
4293
4294 def set_gguf_parameters(self):
4295 super().set_gguf_parameters()
4296 self.gguf_writer.add_ssm_conv_kernel(self.hparams["linear_conv_kernel_dim"])
4297 self.gguf_writer.add_ssm_state_size(self.hparams["linear_key_head_dim"])
4298 self.gguf_writer.add_ssm_group_count(self.hparams["linear_num_key_heads"])
4299 self.gguf_writer.add_ssm_time_step_rank(self.hparams["linear_num_value_heads"])
4300 self.gguf_writer.add_ssm_inner_size(self.hparams["linear_value_head_dim"] * self.hparams["linear_num_value_heads"])
4301 self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
4302 if (rope_dim := self.hparams.get("head_dim")) is None:
4303 rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
4304 self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
4305
4306 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4307 if name.startswith("mtp"):
4308 return # ignore MTP layers for now
4309 if name.endswith(".A_log"):
4310 data_torch = -torch.exp(data_torch)
4311 elif name.endswith(".dt_bias"):
4312 name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
4313 elif "conv1d" in name:
4314 data_torch = data_torch.squeeze()
4315 elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"):
4316 data_torch = data_torch + 1
4317
4318 if "in_proj_qkvz.weight" in name:
4319 # original order: [q, k, v, z] * head_count
4320 # corrected order: [q * head_count, k * head_count, v * head_count, z * head_count]
4321 head_k_dim = self.hparams["linear_key_head_dim"]
4322 head_v_dim = self.hparams["linear_value_head_dim"]
4323 num_v_heads = self.hparams["linear_num_value_heads"]
4324 num_k_heads = self.hparams["linear_num_key_heads"]
4325 hidden_size = self.hparams["hidden_size"]
4326 split_arg_list_qkvz = [
4327 head_k_dim, # q partition
4328 head_k_dim, # k partition
4329 (num_v_heads // num_k_heads * head_v_dim), # v partition
4330 (num_v_heads // num_k_heads * head_v_dim), # z partition
4331 ]
4332 # view as (n_embd, head_count, [q+k+v+z])
4333 data_torch = data_torch.permute(1, 0).contiguous()
4334 data_torch = data_torch.view(-1, num_k_heads, sum(split_arg_list_qkvz))
4335 # split into q, k, v, z
4336 q, k, v, z = torch.split(data_torch, split_arg_list_qkvz, dim=-1)
4337 # flatten dim + head_count
4338 q = q.contiguous().view(hidden_size, -1)
4339 k = k.contiguous().view(hidden_size, -1)
4340 v = v.contiguous().view(hidden_size, -1)
4341 z = z.contiguous().view(hidden_size, -1)
4342 # stack back
4343 qkv = torch.cat([q, k, v], dim=-1).permute(1, 0).contiguous()
4344 z = z.permute(1, 0).contiguous()
4345 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, ".weight"), qkv)
4346 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_GATE, bid, ".weight"), z)
4347 else:
4348 yield from super().modify_tensors(data_torch, name, bid)
4349
4350
4351@ModelBase.register("RND1")
4352class RND1Model(Qwen2MoeModel):
4353 model_arch = gguf.MODEL_ARCH.RND1
4354
4355 def set_gguf_parameters(self):
4356 super().set_gguf_parameters()
4357
4358 # RND1 specific parameters
4359 # RND1 uses bidirectional attention
4360 self.gguf_writer.add_causal_attention(False)
4361
4362 if (mask_token_id := self.hparams.get("mask_token_id")) is not None:
4363 self.gguf_writer.add_mask_token_id(mask_token_id)
4364
4365
4366@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration", "Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration")
4367class Qwen3VLVisionModel(MmprojModel):
4368 def __init__(self, *args, **kwargs):
4369 super().__init__(*args, **kwargs)
4370 assert self.hparams_vision is not None
4371 # Compute image_size if not present
4372 if "image_size" not in self.hparams_vision:
4373 # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
4374 num_pos = self.hparams_vision.get("num_position_embeddings", 2304)
4375 patch_size = self.hparams_vision.get("patch_size", 16)
4376 # num_position_embeddings = (image_size / patch_size) ** 2
4377 # So image_size = sqrt(num_position_embeddings) * patch_size
4378 image_size = int(num_pos**0.5 * patch_size)
4379 self.hparams_vision["image_size"] = image_size
4380
4381 # Rename config values for compatibility
4382 self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
4383 self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
4384
4385 self.is_deepstack_layers = [False] * int(self.hparams_vision["num_hidden_layers"] or 0)
4386 for idx in self.hparams_vision.get("deepstack_visual_indexes", []):
4387 self.is_deepstack_layers[idx] = True
4388
4389 def set_gguf_parameters(self):
4390 super().set_gguf_parameters()
4391 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
4392 self.gguf_writer.add_vision_use_gelu(True)
4393
4394 if self.hparams_vision is not None:
4395 merge_size = self.hparams_vision.get("spatial_merge_size")
4396 if merge_size is not None:
4397 self.gguf_writer.add_vision_spatial_merge_size(int(merge_size))
4398
4399 # Use text config's rms_norm_eps for vision attention layernorm eps
4400 rms_norm_eps = self.global_config.get("text_config", {}).get("rms_norm_eps", 1e-6)
4401 self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
4402
4403 if self.is_deepstack_layers:
4404 self.gguf_writer.add_vision_is_deepstack_layers(self.is_deepstack_layers)
4405
4406 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4407 assert self.hparams_vision is not None
4408 # Skip text model tensors - they go in the text model file
4409 if name.startswith("model.language_model.") or name.startswith("lm_head."):
4410 return
4411
4412 # Skip MTP tensors
4413 if name.startswith("mtp."):
4414 return
4415
4416 if name.startswith("model.visual."):
4417 name = name.replace("model.visual.", "visual.", 1)
4418
4419 if name.startswith("visual.deepstack_merger_list."):
4420 prefix, rest = name.split(".", maxsplit=3)[2:]
4421 # prefix is the layer index, convert to absolute clip layer index!
4422 idx = self.hparams_vision.get("deepstack_visual_indexes", [])[int(prefix)]
4423 target = rest
4424
4425 tensor_type: gguf.MODEL_TENSOR
4426 if target.startswith("norm."):
4427 tensor_type = gguf.MODEL_TENSOR.V_DS_NORM
4428 suffix = target.split(".", 1)[1]
4429 elif target.startswith("linear_fc1."):
4430 tensor_type = gguf.MODEL_TENSOR.V_DS_FC1
4431 suffix = target.split(".", 1)[1]
4432 elif target.startswith("linear_fc2."):
4433 tensor_type = gguf.MODEL_TENSOR.V_DS_FC2
4434 suffix = target.split(".", 1)[1]
4435 else:
4436 raise ValueError(f"Unexpected deepstack tensor: {name}")
4437
4438 new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}")
4439 yield from super().modify_tensors(data_torch, new_name, bid)
4440 return
4441
4442 if name.startswith("visual.merger."):
4443 suffix = name.split(".", 2)[2]
4444 if suffix.startswith("linear_fc"):
4445 fc_idx_str, tail = suffix.split(".", 1)
4446 fc_num = int(fc_idx_str.replace("linear_fc", ""))
4447 # Qwen3VL has linear_fc1 and linear_fc2
4448 # Map to indices 0 and 2 (matching Qwen2VL which uses indices 0 and 2)
4449 if fc_num == 1:
4450 fc_idx = 0
4451 elif fc_num == 2:
4452 fc_idx = 2
4453 else:
4454 raise ValueError(f"unexpected fc index {fc_num} in {name}")
4455 new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}")
4456 elif suffix.startswith("norm."):
4457 new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}")
4458 else:
4459 raise ValueError(f"Unexpected merger tensor: {name}")
4460 yield (new_name, data_torch)
4461 return
4462
4463 if name == "visual.patch_embed.proj.weight":
4464 # split Conv3D into Conv2Ds along temporal dimension
4465 c1, c2, kt, _, _ = data_torch.shape
4466 del c1, c2
4467 if kt != 2:
4468 raise ValueError("Current implementation only supports temporal_patch_size of 2")
4469 yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...])
4470 yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...])
4471 return
4472
4473 if name == "visual.patch_embed.proj.bias":
4474 # Include the bias - it's used by the C++ code
4475 yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch)
4476 return
4477
4478 if name.startswith("visual."):
4479 yield from super().modify_tensors(data_torch, name, bid)
4480 return
4481
4482 # Fall back to parent class for other tensors
4483 yield from super().modify_tensors(data_torch, name, bid)
4484
4485
4486@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration")
4487class Glm4VVisionModel(Qwen3VLVisionModel):
4488 def set_gguf_parameters(self):
4489 MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters
4490 assert self.hparams_vision is not None
4491 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V)
4492
4493 hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
4494 if hidden_act == "gelu":
4495 self.gguf_writer.add_vision_use_gelu(True)
4496 elif hidden_act == "silu":
4497 self.gguf_writer.add_vision_use_silu(True)
4498
4499 rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5)
4500 self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
4501
4502 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4503 if name.startswith("model.visual."):
4504 name = name.replace("model.visual.", "visual.")
4505 if name.startswith("visual.merger."):
4506 yield from ModelBase.modify_tensors(self, data_torch, name, bid)
4507 return
4508 yield from super().modify_tensors(data_torch, name, bid)
4509
4510
4511@ModelBase.register("Qwen3VLForConditionalGeneration")
4512class Qwen3VLTextModel(Qwen3Model):
4513 model_arch = gguf.MODEL_ARCH.QWEN3VL
4514
4515 def set_gguf_parameters(self):
4516 super().set_gguf_parameters()
4517
4518 # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
4519 vision_config = self.hparams.get("vision_config", {})
4520 deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
4521 self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
4522
4523 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4524 # Skip vision tensors - they go in the mmproj file
4525 if name.startswith("model.visual."):
4526 return
4527
4528 yield from super().modify_tensors(data_torch, name, bid)
4529
4530
4531@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
4532class Qwen3VLMoeTextModel(Qwen3MoeModel):
4533 model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
4534
4535 def set_gguf_parameters(self):
4536 super().set_gguf_parameters()
4537 vision_config = self.hparams.get("vision_config", {})
4538 deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
4539 self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
4540
4541 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4542 # Skip vision tensors - they go in the mmproj file
4543 if name.startswith("model.visual."):
4544 return
4545
4546 # Qwen3VL has transposed packed tensors, so we treat it differently from general Qwen2MoE packed tensors
4547 if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
4548 name = name.replace("language_model.", "")
4549 mapped = f"{name}.weight" if not name.endswith(".weight") else name
4550 permuted = data_torch.permute(0, 2, 1).contiguous()
4551 yield from ModelBase.modify_tensors(self, permuted, mapped, bid)
4552 return
4553
4554 if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
4555 name = name.replace("language_model.", "")
4556 if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0:
4557 raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
4558 split_dim = data_torch.shape[-1] // 2
4559 gate = data_torch[..., :split_dim].contiguous()
4560 up = data_torch[..., split_dim:].contiguous()
4561 # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768)
4562 # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128}
4563 # Need PyTorch: (128, 768, 2048) [reversed of GGML]
4564 # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048)
4565 base_name = name.removesuffix(".weight")
4566 base = base_name.rsplit('.', 1)[0]
4567 mapped_gate = f"{base}.gate_proj.weight"
4568 mapped_up = f"{base}.up_proj.weight"
4569 perm_gate = gate.permute(0, 2, 1).contiguous()
4570 perm_up = up.permute(0, 2, 1).contiguous()
4571 yield from ModelBase.modify_tensors(self, perm_gate, mapped_gate, bid)
4572 yield from ModelBase.modify_tensors(self, perm_up, mapped_up, bid)
4573 return
4574
4575 yield from super().modify_tensors(data_torch, name, bid)
4576
4577
4578class _LinearAttentionVReorderBase(Qwen3NextModel):
4579 model_arch = gguf.MODEL_ARCH.QWEN3NEXT # overridden by subclasses
4580 """reorders V heads from grouped to tiled order for ggml broadcast
4581
4582 see https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306
4583
4584 Linear attention may have num_k_heads < num_v_heads. The HF weights store
4585 V heads grouped by K head: [G0_v0..v{r-1}, G1_v0..v{r-1}, ...].
4586 ggml binary ops use tiled broadcast: [K0, K1, ..., K0, K1, ...].
4587 We reorder V heads to tiled order so ggml_repeat can replace the expensive
4588 interleaved repeat: [G0_v0, G1_v0, ..., G0_v1, G1_v1, ...].
4589 """
4590
4591 @staticmethod
4592 def _reorder_v_heads(tensor: Tensor, dim: int, num_k_heads: int, num_v_per_k: int, head_dim: int) -> Tensor:
4593 """Reorder V heads from grouped (by K head) to tiled order along the given dimension."""
4594 shape = list(tensor.shape)
4595 if dim < 0:
4596 dim += len(shape)
4597 new_shape = shape[:dim] + [num_k_heads, num_v_per_k, head_dim] + shape[dim + 1:]
4598 tensor = tensor.reshape(*new_shape)
4599 perm = list(range(len(new_shape)))
4600 perm[dim], perm[dim + 1] = perm[dim + 1], perm[dim]
4601 return tensor.permute(*perm).contiguous().reshape(*shape)
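# Added worked example (tiny hypothetical sizes): with num_k_heads=2, num_v_per_k=2 and
# head_dim=1, a dim-0 layout [G0v0, G0v1, G1v0, G1v1] (grouped by K head) becomes
# [G0v0, G1v0, G0v1, G1v1] (tiled), matching the class docstring above.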
4602
4603 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4604 num_k_heads = self.hparams.get("linear_num_key_heads", 0)
4605 num_v_heads = self.hparams.get("linear_num_value_heads", 0)
4606
4607 if num_k_heads > 0 and num_v_heads > 0 and num_k_heads != num_v_heads and "linear_attn." in name:
4608 head_k_dim = self.hparams["linear_key_head_dim"]
4609 head_v_dim = self.hparams["linear_value_head_dim"]
4610 num_v_per_k = num_v_heads // num_k_heads
4611
4612 if ".in_proj_qkv." in name:
4613 # QKV weight: reorder only the V rows
4614 q_dim = head_k_dim * num_k_heads
4615 k_dim = head_k_dim * num_k_heads
4616 q = data_torch[:q_dim]
4617 k = data_torch[q_dim:q_dim + k_dim]
4618 v = data_torch[q_dim + k_dim:]
4619 v = self._reorder_v_heads(v, 0, num_k_heads, num_v_per_k, head_v_dim)
4620 data_torch = torch.cat([q, k, v], dim=0)
4621
4622 elif ".in_proj_z." in name:
4623 # Z gate weight: reorder rows (num_v_heads * head_v_dim)
4624 data_torch = self._reorder_v_heads(data_torch, 0, num_k_heads, num_v_per_k, head_v_dim)
4625
4626 elif ".in_proj_b." in name or ".in_proj_a." in name:
4627 # Beta/Alpha weight: reorder rows (num_v_heads, head_dim=1)
4628 data_torch = self._reorder_v_heads(data_torch, 0, num_k_heads, num_v_per_k, 1)
4629
4630 elif ".A_log" in name or ".dt_bias" in name or ".dt_proj" in name:
4631 # A_log / dt_bias: 1D parameters with num_v_heads elements
4632 if data_torch.ndim == 1:
4633 data_torch = self._reorder_v_heads(
4634 data_torch.unsqueeze(-1), 0, num_k_heads, num_v_per_k, 1
4635 ).squeeze(-1)
4636 else:
4637 data_torch = self._reorder_v_heads(data_torch, -1, num_k_heads, num_v_per_k, 1)
4638
4639 elif ".conv1d" in name:
4640 # Conv1d kernel: reorder only the V channel portion
4641 data = data_torch.squeeze()
4642 qk_channels = head_k_dim * num_k_heads * 2
4643 qk_part = data[:qk_channels]
4644 v_part = data[qk_channels:]
4645 v_part = self._reorder_v_heads(v_part, 0, num_k_heads, num_v_per_k, head_v_dim)
4646 data_torch = torch.cat([qk_part, v_part], dim=0)
4647
4648 elif ".out_proj." in name:
4649 # Out projection weight: reorder columns (input dimension)
4650 data_torch = self._reorder_v_heads(data_torch, 1, num_k_heads, num_v_per_k, head_v_dim)
4651
4652 yield from super().modify_tensors(data_torch, name, bid)
4653
4654
4655@ModelBase.register("Qwen3_5ForConditionalGeneration")
4656class Qwen3_5TextModel(_LinearAttentionVReorderBase):
4657 model_arch = gguf.MODEL_ARCH.QWEN35
4658
4659
4660@ModelBase.register("Qwen3_5MoeForConditionalGeneration")
4661class Qwen3_5MoeTextModel(_LinearAttentionVReorderBase):
4662 model_arch = gguf.MODEL_ARCH.QWEN35MOE
4663
4664
4665@ModelBase.register("GPT2LMHeadModel")
4666class GPT2Model(TextModel):
4667 model_arch = gguf.MODEL_ARCH.GPT2
4668
4669 def set_gguf_parameters(self):
4670 self.gguf_writer.add_block_count(self.block_count)
4671 self.gguf_writer.add_context_length(self.hparams["n_ctx"])
4672 self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
4673 self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
4674 self.gguf_writer.add_head_count(self.hparams["n_head"])
4675 self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
4676 self.gguf_writer.add_file_type(self.ftype)
4677
4678 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4679 # we don't need these, so skip them entirely
4680 if name.endswith((".attn.bias", ".attn.masked_bias")):
4681 return
4683
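# GPT-2 checkpoints use HF Conv1D modules whose weights are stored transposed relative
# to nn.Linear, so these projection weights are transposed here before name mapping.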
4684 if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight")):
4685 data_torch = data_torch.transpose(1, 0)
4686
4687 new_name = self.map_tensor_name(name)
4688
4689 yield from super().modify_tensors(data_torch, new_name, bid)
4690
4691
4692@ModelBase.register("PhiForCausalLM")
4693class Phi2Model(TextModel):
4694 model_arch = gguf.MODEL_ARCH.PHI2
4695
4696 def set_gguf_parameters(self):
4697 rot_pct = self.find_hparam(["partial_rotary_factor"])
4698 n_embd = self.find_hparam(["hidden_size", "n_embd"])
4699 n_head = self.find_hparam(["num_attention_heads", "n_head"])
4700
4701 self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
4702
4703 self.gguf_writer.add_embedding_length(n_embd)
4704 self.gguf_writer.add_feed_forward_length(4 * n_embd)
4705 self.gguf_writer.add_block_count(self.block_count)
4706 self.gguf_writer.add_head_count(n_head)
4707 self.gguf_writer.add_head_count_kv(n_head)
4708 self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
4709 self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
4710 self.gguf_writer.add_file_type(self.ftype)
4711 self.gguf_writer.add_add_bos_token(False)
4712
4713
4714@ModelBase.register("Phi3ForCausalLM")
4715class Phi3MiniModel(TextModel):
4716 model_arch = gguf.MODEL_ARCH.PHI3
4717
4718 def set_vocab(self):
4719 # Phi-4 model uses GPT2Tokenizer
4720 tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
4721 if tokenizer_config_file.is_file():
4722 with open(tokenizer_config_file, "r", encoding="utf-8") as f:
4723 tokenizer_config_json = json.load(f)
4724 tokenizer_class = tokenizer_config_json['tokenizer_class']
4725 if tokenizer_class == 'GPT2Tokenizer':
4726 return self._set_vocab_gpt2()
4727
4728 from sentencepiece import SentencePieceProcessor
4729
4730 tokenizer_path = self.dir_model / 'tokenizer.model'
4731
4732 if not tokenizer_path.is_file():
4733 raise ValueError(f'Error: Missing {tokenizer_path}')
4734
4735 tokenizer = SentencePieceProcessor()
4736 tokenizer.LoadFromFile(str(tokenizer_path))
4737
4738 vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
4739
4740 tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
4741 scores: list[float] = [-10000.0] * vocab_size
4742 toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
4743
4744 for token_id in range(tokenizer.vocab_size()):
4745
4746 piece = tokenizer.IdToPiece(token_id)
4747 text = piece.encode("utf-8")
4748 score = tokenizer.GetScore(token_id)
4749
4750 toktype = SentencePieceTokenTypes.NORMAL
4751 if tokenizer.IsUnknown(token_id):
4752 toktype = SentencePieceTokenTypes.UNKNOWN
4753 elif tokenizer.IsControl(token_id):
4754 toktype = SentencePieceTokenTypes.CONTROL
4755 elif tokenizer.IsUnused(token_id):
4756 toktype = SentencePieceTokenTypes.UNUSED
4757 elif tokenizer.IsByte(token_id):
4758 toktype = SentencePieceTokenTypes.BYTE
4759
4760 tokens[token_id] = text
4761 scores[token_id] = score
4762 toktypes[token_id] = toktype
4763
4764 added_tokens_file = self.dir_model / 'added_tokens.json'
4765 if added_tokens_file.is_file():
4766 with open(added_tokens_file, "r", encoding="utf-8") as f:
4767 added_tokens_json = json.load(f)
4768
4769 for key in added_tokens_json:
4770 token_id = added_tokens_json[key]
4771 if token_id >= vocab_size:
4772 logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
4773 continue
4774
4775 tokens[token_id] = key.encode("utf-8")
4776 scores[token_id] = -1000.0
4777 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
4778
4779 tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
4780 if tokenizer_config_file.is_file():
4781 with open(tokenizer_config_file, "r", encoding="utf-8") as f:
4782 tokenizer_config_json = json.load(f)
4783 added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
4784 for token_id, foken_data in added_tokens_decoder.items():
4785 token_id = int(token_id)
4786 token = foken_data["content"].encode("utf-8")
4787 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
4788 if tokens[token_id] != token:
4789 logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
4790 tokens[token_id] = token
4791 scores[token_id] = -1000.0
4792 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
4793 if foken_data.get("special"):
4794 toktypes[token_id] = SentencePieceTokenTypes.CONTROL
4795
4796 tokenizer_file = self.dir_model / 'tokenizer.json'
4797 if tokenizer_file.is_file():
4798 with open(tokenizer_file, "r", encoding="utf-8") as f:
4799 tokenizer_json = json.load(f)
4800 added_tokens = tokenizer_json.get("added_tokens", [])
4801 for foken_data in added_tokens:
4802 token_id = int(foken_data["id"])
4803 token = foken_data["content"].encode("utf-8")
4804 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
4805 if tokens[token_id] != token:
4806 logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
4807 tokens[token_id] = token
4808 scores[token_id] = -1000.0
4809 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
4810 if foken_data.get("special"):
4811 toktypes[token_id] = SentencePieceTokenTypes.CONTROL
4812
4813 self.gguf_writer.add_tokenizer_model("llama")
4814 self.gguf_writer.add_tokenizer_pre("default")
4815 self.gguf_writer.add_token_list(tokens)
4816 self.gguf_writer.add_token_scores(scores)
4817 self.gguf_writer.add_token_types(toktypes)
4818
4819 special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
4820 special_vocab.add_to_gguf(self.gguf_writer)
4821
4822 def set_gguf_parameters(self):
4823 n_embd = self.find_hparam(["hidden_size", "n_embd"])
4824 n_head = self.find_hparam(["num_attention_heads", "n_head"])
4825 n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
4826 rms_eps = self.find_hparam(["rms_norm_eps"])
4827 max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
4828 orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
4829 rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
4830 rope_dims = int(rot_pct * n_embd) // n_head
4831
4832 self.gguf_writer.add_context_length(max_pos_embds)
4833 self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
4834 self.gguf_writer.add_embedding_length(n_embd)
4835 self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
4836 self.gguf_writer.add_block_count(self.block_count)
4837 self.gguf_writer.add_head_count(n_head)
4838 self.gguf_writer.add_head_count_kv(n_head_kv)
4839 self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
4840 self.gguf_writer.add_rope_dimension_count(rope_dims)
4841 self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters)["rope_theta"])
4842 self.gguf_writer.add_file_type(self.ftype)
4843 sliding_window = self.hparams.get("sliding_window")
4844 # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
4845 if sliding_window is None:
4846 sliding_window = 0
4847 self.gguf_writer.add_sliding_window(sliding_window)
4848
4849 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
4850 n_embd = self.find_hparam(["hidden_size", "n_embd"])
4851 n_head = self.find_hparam(["num_attention_heads", "n_head"])
4852 max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
4853 orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
4854 rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
4855 rope_dims = int(rot_pct * n_embd) // n_head
4856
4857 # write rope scaling for long context (128k) model
4858 rope_scaling = self.find_hparam(['rope_scaling'], True)
4859 if rope_scaling is None:
4860 return
4861
4862 scale = max_pos_embds / orig_max_pos_embds
4863
4864 rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
4865 if len(rope_scaling_type) == 0:
4866 raise KeyError('Missing the required key rope_scaling.type')
4867
4868 if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
4869 attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
4870 elif rope_scaling_type == 'yarn':
4871 attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
4872 else:
4873 raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
4874
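# Worked example for the 'su'/longrope case: extending 4096 -> 131072 positions gives
# scale = 32, so attn_factor = sqrt(1 + ln(32) / ln(4096)) ~= 1.19; this is written to
# the GGUF as the rope scaling attention factor below.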
4875 self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
4876
4877 long_factors = rope_scaling.get('long_factor', None)
4878 short_factors = rope_scaling.get('short_factor', None)
4879
4880 if long_factors is None or short_factors is None:
4881 raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling.short_factor')
4882
4883 if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
4884 raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.')
4885
4886 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
4887 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
4888
4889
4890@ModelBase.register("PhiMoEForCausalLM")
4891class PhiMoeModel(Phi3MiniModel):
4892 model_arch = gguf.MODEL_ARCH.PHIMOE
4893
4894 _experts: list[dict[str, Tensor]] | None = None
4895
4896 def set_gguf_parameters(self):
4897 super().set_gguf_parameters()
4898 self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
4899 self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
4900
4901 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4902 # process the experts separately
4903 if name.find("block_sparse_moe.experts") != -1:
4904 n_experts = self.hparams["num_local_experts"]
4905 assert bid is not None
4906
4907 if self._experts is None:
4908 self._experts = [{} for _ in range(self.block_count)]
4909
4910 self._experts[bid][name] = data_torch
4911
4912 if len(self._experts[bid]) >= n_experts * 3:
4913 # merge the experts into a single 3d tensor
4914 for w_name in ["w1", "w2", "w3"]:
4915 datas: list[Tensor] = []
4916
4917 for xid in range(n_experts):
4918 ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
4919 datas.append(self._experts[bid][ename])
4920 del self._experts[bid][ename]
4921
4922 data_torch = torch.stack(datas, dim=0)
4923
4924 merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
4925
4926 yield from super().modify_tensors(data_torch, merged_name, bid)
4927 return
4928 else:
4929 return
4930
4931 yield from super().modify_tensors(data_torch, name, bid)
4932
4933 def prepare_tensors(self):
4934 super().prepare_tensors()
4935
4936 if self._experts is not None:
4937 # flatten `list[dict[str, Tensor]]` into `list[str]`
4938 experts = [k for d in self._experts for k in d.keys()]
4939 if len(experts) > 0:
4940 raise ValueError(f"Unprocessed experts: {experts}")
4941
4942
4943@ModelBase.register("PlamoForCausalLM")
4944class PlamoModel(TextModel):
4945 model_arch = gguf.MODEL_ARCH.PLAMO
4946
4947 def set_vocab(self):
4948 self._set_vocab_sentencepiece()
4949
4950 def set_gguf_parameters(self):
4951 hparams = self.hparams
4952
4953 self.gguf_writer.add_context_length(4096) # not in config.json
4954 self.gguf_writer.add_embedding_length(hparams["hidden_size"])
4955 self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
4956 self.gguf_writer.add_block_count(self.block_count)
4957 self.gguf_writer.add_head_count(hparams["num_attention_heads"])
4958 self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"] is wrong
4959 self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
4960 self.gguf_writer.add_file_type(self.ftype)
4961
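# The two helpers below regroup the fixed 5120x5120 attention projections: the q weight
# is viewed as (8, 5, 128, n_embd) with its first two axes swapped, and the output
# weight gets the matching swap on its input side, so the head ordering lines up with
# how ggml_mul_mat broadcasts K/V over query heads for GQA (see modify_tensors below).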
4962 def shuffle_attn_q_weight(self, data_torch):
4963 assert data_torch.size() == (5120, 5120)
4964 data_torch = data_torch.reshape(8, 5, 128, 5120)
4965 data_torch = torch.permute(data_torch, (1, 0, 2, 3))
4966 data_torch = torch.reshape(data_torch, (5120, 5120))
4967 return data_torch
4968
4969 def shuffle_attn_output_weight(self, data_torch):
4970 assert data_torch.size() == (5120, 5120)
4971 data_torch = data_torch.reshape(5120, 8, 5, 128)
4972 data_torch = torch.permute(data_torch, (0, 2, 1, 3))
4973 data_torch = torch.reshape(data_torch, (5120, 5120))
4974 return data_torch
4975
4976 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4977 new_name = self.map_tensor_name(name)
4978
4979 # shuffle for broadcasting of gqa in ggml_mul_mat
4980 if new_name.endswith("attn_q.weight"):
4981 data_torch = self.shuffle_attn_q_weight(data_torch)
4982 elif new_name.endswith("attn_output.weight"):
4983 data_torch = self.shuffle_attn_output_weight(data_torch)
4984
4985 yield from super().modify_tensors(data_torch, name, bid)
4986
4987
4988@ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM")
4989class Plamo2Model(TextModel):
4990 model_arch = gguf.MODEL_ARCH.PLAMO2
4991
4992 def set_vocab(self):
4993 self._set_vocab_plamo()
4994
4995 def set_gguf_parameters(self):
4996 hparams = self.hparams
4997 self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
4998
4999 # Which layers are Mamba layers
5000 # PLaMo 2 uses mamba_step to indicate the pattern (e.g., 2 means every other layer)
5001 # This logic matches modeling_plamo.py's is_mamba function
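# For example, with mamba_step = 2 a layer i is a Mamba layer when (i % 2) != 1,
# i.e. the even-indexed layers; odd-indexed layers keep attention. Very shallow models
# (block_count <= mamba_step // 2) instead use attention only in the last layer.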
5002 mamba_step = hparams.get("mamba_step", 2)
5003 mamba_enabled = hparams.get("mamba_enabled", True)
5004 num_key_value_heads = []
5005 num_attention_heads = []
5006
5007 if mamba_enabled:
5008 for i in range(self.block_count):
5009 if self.block_count <= (mamba_step // 2):
5010 # use attention in last layer
5011 is_mamba = (i != self.block_count - 1)
5012 else:
5013 is_mamba = (i % mamba_step) != (mamba_step // 2)
5014 if is_mamba:
5015 num_key_value_heads.append(0)
5016 num_attention_heads.append(0)
5017 else:
5018 num_key_value_heads.append(hparams.get("num_key_value_heads", 4))
5019 num_attention_heads.append(hparams.get("num_attention_heads", 32))
5020
5021 if num_key_value_heads and num_attention_heads:
5022 self.gguf_writer.add_head_count_kv(num_key_value_heads)
5023 self.gguf_writer.add_head_count(num_attention_heads)
5024
5025 self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
5026 self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
5027 self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128))
5028 self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
5029 self.gguf_writer.add_block_count(self.block_count)
5030 self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
5031 self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("rope_theta", 10000))
5032
5033 # Mamba parameters
5034 self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
5035 self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4))
5036 self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_num_heads", 64))
5037 intermediate_size = hparams.get("mamba_num_heads", 64) * hparams.get("hidden_size_per_head", 128)
5038 self.gguf_writer.add_ssm_inner_size(intermediate_size)
5039 self.gguf_writer.add_ssm_group_count(0)
5040
5041 # MLP feed forward parameters (for attention layers)
5042 self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
5043 self.gguf_writer.add_file_type(self.ftype)
5044
5045 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
5046 if name.endswith(".A_log"):
5047 data_torch = -torch.exp(data_torch)
5048 elif name.endswith(".dt_bias"):
5049 name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
5050 elif name.endswith(".dt_norm_weight"):
5051 name = name.rpartition(".dt_norm_weight")[0] + ".dt_norm.weight"
5052 elif name.endswith(".B_norm_weight"):
5053 name = name.rpartition(".B_norm_weight")[0] + ".B_norm.weight"
5054 elif name.endswith(".C_norm_weight"):
5055 name = name.rpartition(".C_norm_weight")[0] + ".C_norm.weight"
5056 elif name.endswith(".k_weight"):
5057 name = name.rpartition(".k_weight")[0] + ".k.weight"
5058 elif name.endswith(".q_weight"):
5059 name = name.rpartition(".q_weight")[0] + ".q.weight"
5060 elif name.endswith(".conv1d.weight"):
5061 data_torch = torch.squeeze(data_torch) # drop the size-1 middle dim, e.g. (d_inner, 1, d_conv) -> (d_inner, d_conv)
5062 assert data_torch.ndim == 2
5063 elif name.endswith(".pre_mixer_norm.weight"):
5064 data_torch += 1.0
5065 elif name.endswith(".post_mixer_norm.weight"):
5066 data_torch += 1.0 / 5
5067 elif name.endswith(".pre_mlp_norm.weight"):
5068 data_torch += 1.0
5069 elif name.endswith(".post_mlp_norm.weight"):
5070 data_torch += 1.0 / (5**1.5)
5071 elif name.endswith(".norm.weight"):
5072 data_torch += 1.0
5073
5074 yield from super().modify_tensors(data_torch, name, bid)
5075
5076
5077@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM")
5078class Plamo3Model(TextModel):
5079 model_arch = gguf.MODEL_ARCH.PLAMO3
5080
5081 def set_vocab(self):
5082 self._set_vocab_plamo()
5083
5084 tokenizer_config_path = self.dir_model / "tokenizer_config.json"
5085 tokenizer_config = {}
5086
5087 if tokenizer_config_path.is_file():
5088 with open(tokenizer_config_path, encoding="utf-8") as f:
5089 tokenizer_config = json.load(f)
5090
5091 chat_template = tokenizer_config.get("chat_template")
5092 chat_template_jinja = self.dir_model / "chat_template.jinja"
5093
5094 if chat_template_jinja.is_file():
5095 with open(chat_template_jinja, encoding="utf-8") as f:
5096 chat_template = f.read()
5097
5098 if chat_template:
5099 self.gguf_writer.add_chat_template(chat_template)
5100
5101 def set_gguf_parameters(self):
5102 super().set_gguf_parameters()
5103 self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
5104 if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None:
5105 self.gguf_writer.add_sliding_window(sliding_window)
5106 self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"])
5107
5108 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
5109
5110 if name.endswith(".pre_mixer_norm.weight"):
5111 data_torch = data_torch + 1.0
5112 elif name.endswith(".post_mixer_norm.weight"):
5113 data_torch = data_torch + 1.0 / 5
5114 elif name.endswith(".pre_mlp_norm.weight"):
5115 data_torch = data_torch + 1.0
5116 elif name.endswith(".post_mlp_norm.weight"):
5117 data_torch = data_torch + 1.0 / (5**1.5)
5118 elif name.endswith((".mixer.q_norm.weight", ".mixer.k_norm.weight")):
5119 data_torch = data_torch + 1.0
5120 elif name.endswith(".norm.weight"):
5121 data_torch = data_torch + 1.0
5122
5123 yield from super().modify_tensors(data_torch, name, bid)
5124
5125
5126@ModelBase.register("CodeShellForCausalLM")
5127class CodeShellModel(TextModel):
5128 model_arch = gguf.MODEL_ARCH.CODESHELL
5129
5130 def set_gguf_parameters(self):
5131 self.gguf_writer.add_context_length(self.hparams["n_positions"])
5132 self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
5133 self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
5134 self.gguf_writer.add_block_count(self.block_count)
5135 self.gguf_writer.add_head_count(self.hparams["n_head"])
5136 self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
5137 self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
5138 self.gguf_writer.add_file_type(self.ftype)
5139 self.gguf_writer.add_rope_freq_base(10000.0)
5140 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
5141 self.gguf_writer.add_rope_scaling_factor(1.0)
5142
5143
5144@ModelBase.register("KimiLinearModel", "KimiLinearForCausalLM")
5145class KimiLinearModel(TextModel):
5146 """Kimi-Linear model with hybrid MLA+KDA architecture"""
5147 model_arch = gguf.MODEL_ARCH.KIMI_LINEAR
5148
5149 _experts: list[dict[str, Tensor]] | None = None
5150
5151 def set_vocab(self):
5152 try:
5153 self._set_vocab_gpt2()
5154 return
5155 except Exception:
5156 pass
5157
5158 from transformers import AutoTokenizer
5159 tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
5160 tokpre = self.get_vocab_base_pre(tokenizer)
5161
5162 if tokpre == "kimi-k2":
5163 # Build the merges list using an approach similar to HunYuanMoE
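# tiktoken-style tokenizers ship BPE ranks rather than explicit merges, so the merges
# are reconstructed by re-splitting every multi-byte token at its own rank: a token
# that splits into exactly two pieces contributes one merge rule.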
5164 merges = []
5165 vocab = {}
5166 mergeable_ranks = tokenizer.model._mergeable_ranks
5167 for token, rank in mergeable_ranks.items():
5168 vocab[QwenModel.token_bytes_to_string(token)] = rank
5169 if len(token) == 1:
5170 continue
5171 merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
5172 if len(merged) == 2:
5173 merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
5174 # Build token list
5175 vocab_size = self.hparams["vocab_size"]
5176 special_tokens = tokenizer.special_tokens
5177 reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
5178 tokens: list[str] = []
5179 toktypes: list[int] = []
5180
5181 for i in range(vocab_size):
5182 if i not in reverse_vocab:
5183 tokens.append(f"[PAD{i}]")
5184 toktypes.append(gguf.TokenType.UNUSED)
5185 else:
5186 token = reverse_vocab[i]
5187 tokens.append(token)
5188 if i in special_tokens.values():
5189 toktypes.append(gguf.TokenType.CONTROL)
5190 else:
5191 toktypes.append(gguf.TokenType.NORMAL)
5192
5193 self.gguf_writer.add_tokenizer_model("gpt2")
5194 self.gguf_writer.add_tokenizer_pre(tokpre)
5195 self.gguf_writer.add_token_list(tokens)
5196 self.gguf_writer.add_token_types(toktypes)
5197 self.gguf_writer.add_token_merges(merges)
5198
5199 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
5200 special_vocab.add_to_gguf(self.gguf_writer)
5201 # override eos id in config.json with tiktoken eos id
5202 self.gguf_writer.add_eos_token_id(tokenizer.eos_id)
5203 else:
5204 raise NotImplementedError(f"Kimi-Linear pre-tokenizer {tokpre!r} is not supported yet!")
5205
5206 def set_gguf_parameters(self):
5207 # note: To enable MLA KV cache, attention needs to be converted into MQA (ie: GQA with 1 group)
5208 self.hparams["num_key_value_heads"] = 1
5209
5210 super().set_gguf_parameters()
5211 self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
5212
5213 # KDA & MLA params
5214 # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
5215 linear_attn_config = self.hparams["linear_attn_config"]
5216 # n_head == 0 for KDA layers, n_head > 0 for MLA layers
5217 # the full_attn_layers list is used to distinguish the layer type
5218 _num_kv_heads = list()
5219 _full_attn_layers = linear_attn_config["full_attn_layers"]
5220 for il in range(self.hparams["num_hidden_layers"]):
5221 if il + 1 in _full_attn_layers:
5222 _num_kv_heads.append(self.hparams["num_key_value_heads"])
5223 else:
5224 _num_kv_heads.append(0)
5225 assert len(_num_kv_heads) == self.hparams["num_hidden_layers"]
5226 self.gguf_writer.add_head_count_kv(_num_kv_heads)
5227
5228 if (ssm_d_conv := linear_attn_config.get("short_conv_kernel_size")) is not None:
5229 self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv)
5230 if (kda_head_dim := linear_attn_config.get("head_dim")) is not None:
5231 self.gguf_writer.add_kda_head_dim(kda_head_dim)
5232
5233 # MLA params - use add_* methods that handle arch substitution
5234 # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv)
5235 if (q_lora_rank := self.find_hparam(["q_lora_rank", "n_lora_q"], optional=True)) is not None:
5236 self.gguf_writer.add_q_lora_rank(q_lora_rank)
5237 # To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompresses to MHA
5238 kv_lora_rank = self.find_hparam(["kv_lora_rank", "n_lora_kv"], optional=False)
5239 self.gguf_writer.add_kv_lora_rank(kv_lora_rank)
5240
5241 # MLA head dimensions
5242 # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim
5243 qk_nope_head_dim = self.hparams.get("qk_nope_head_dim")
5244 # Rotation - use qk_rope_head_dim for Kimi
5245 qk_rope_head_dim = self.find_hparam(["qk_rope_head_dim", "n_rot"], optional=False)
5246 self.gguf_writer.add_rope_dimension_count(qk_rope_head_dim)
5247 self.gguf_writer.add_key_length(kv_lora_rank + qk_rope_head_dim)
5248 v_head_dim = self.hparams.get("v_head_dim")
5249
5250 # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim
5251 if (n_embd_head_k_mla := self.find_hparam(["n_embd_head_k_mla"], optional=True)) is not None:
5252 self.gguf_writer.add_key_length_mla(n_embd_head_k_mla)
5253 elif qk_nope_head_dim is not None:
5254 n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim
5255 self.gguf_writer.add_key_length_mla(n_embd_head_k_mla)
5256
5257 # n_embd_head_v_mla = v_head_dim
5258 if (n_embd_head_v_mla := self.hparams.get("n_embd_head_v_mla")) is not None:
5259 self.gguf_writer.add_value_length_mla(n_embd_head_v_mla)
5260 elif v_head_dim is not None:
5261 self.gguf_writer.add_value_length_mla(v_head_dim)
5262
5263 # moe_intermediate_size (1024 for Kimi)
5264 self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
5265 # num_shared_experts (1 for Kimi)
5266 self.gguf_writer.add_expert_shared_count(self.hparams["num_shared_experts"])
5267 # first_k_dense_replace (1 for Kimi - first layer uses dense MLP)
5268 self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
5269 # Routed scaling factor (expert_weights_scale = 2.446 for Kimi)
5270 self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
5271
5272 def prepare_tensors(self):
5273 super().prepare_tensors()
5274 if self._experts is not None:
5275 experts = [k for d in self._experts for k in d.keys()]
5276 if len(experts) > 0:
5277 raise ValueError(f"Unprocessed experts: {experts}")
5278
5279 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
5280 logger.info(f"Processing {name}: shape before = {tuple(data_torch.shape)}")
5281
5282 # Handle KDA conv1d weights
5283 # HuggingFace/vLLM stores as [d_inner, d_conv] (2D), memory layout: conv_step changes fastest
5284 # llama.cpp expects ggml ne = [d_conv, 1, d_inner, 1], memory layout: ne[0]=d_conv changes fastest
5285 # GGUF reverses numpy shape when writing, so numpy (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1]
5286 # Memory layouts match: both have conv_step (d_conv) changing fastest
5287 if name.endswith((".q_conv1d.weight", ".k_conv1d.weight", ".v_conv1d.weight")):
5288 # HF shape: [d_inner, d_conv] e.g. [4096, 4]
5289 # Target numpy shape: (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1]
5290 if data_torch.ndim == 2:
5291 d_inner, d_conv = data_torch.shape
5292 # Reshape to (1, d_inner, 1, d_conv) - memory layout preserved (d_conv fastest)
5293 data_torch = data_torch.reshape(1, d_inner, 1, d_conv)
5294 logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]")
5295 elif data_torch.ndim == 3:
5296 # Already 3D [d_inner, 1, d_conv] from unsqueeze
5297 d_inner, _, d_conv = data_torch.shape
5298 data_torch = data_torch.reshape(1, d_inner, 1, d_conv)
5299 logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]")
5300
5301 # Kimi specific bias
5302 if name.endswith("e_score_correction_bias"):
5303 name = name.replace("e_score_correction_bias", "e_score_correction.bias")
5304
5305 # Handle A_log: HF stores it as [1, 1, num_heads, 1]
5306 # llama.cpp expects ggml ne = [1, num_heads, 1, 1]
5307 # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1]
5308 if name.endswith(".A_log"):
5309 data_torch = -torch.exp(data_torch)
5310 if name.endswith(".dt_bias"):
5311 name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
5312 logger.info("Changed dt_bias to dt_proj.bias")
5313
5314 # process the experts separately
5315 if name.find("block_sparse_moe.experts") != -1:
5316 n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=False)
5317 assert bid is not None
5318
5319 if self._experts is None:
5320 self._experts = [{} for _ in range(self.block_count)]
5321
5322 self._experts[bid][name] = data_torch
5323
5324 if len(self._experts[bid]) >= n_experts * 3:
5325 # merge the experts into a single 3d tensor
5326 # w1: gate, w2: down, w3: up
5327 for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP),
5328 ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP),
5329 ("w3", gguf.MODEL_TENSOR.FFN_UP_EXP)]:
5330 datas: list[Tensor] = []
5331 for xid in range(n_experts):
5332 ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
5333 datas.append(self._experts[bid][ename])
5334 del self._experts[bid][ename]
5335 data_torch = torch.stack(datas, dim=0)
5336 new_name = self.format_tensor_name(tname, bid)
5337 yield from super().modify_tensors(data_torch, new_name, bid)
5338 return
5339
5340 # note: MLA with the absorption optimization needs kv_b_proj split into k_b_proj and v_b_proj, with k_b_proj transposed
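# Roughly: kv_b_proj packs, per KV head, qk_nope_head_dim rows of K followed by
# v_head_dim rows of V over the kv_lora_rank input; viewing it as
# (n_head_kv, qk_nope_head_dim + v_head_dim, kv_lora_rank) lets us split out k_b_proj
# and v_b_proj, and k_b_proj is transposed so the absorbed attention matmul can use it.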
5341 if name.endswith("kv_b_proj.weight"):
5342 name_kb = name.replace("kv_b_proj", "k_b_proj")
5343 name_vb = name.replace("kv_b_proj", "v_b_proj")
5344 n_head_kv = self.hparams["num_key_value_heads"]
5345 v_head_dim = self.find_hparam(["n_embd_head_v_mla", "v_head_dim"], optional=False)
5346 qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
5347 logger.info(f"Splitting kv_b_proj with n_head_kv = {n_head_kv}")
5348 assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
5349 kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
5350 k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
5351 k_b = k_b.transpose(1, 2)
5352 yield from super().modify_tensors(k_b, name_kb, bid)
5353 yield from super().modify_tensors(v_b, name_vb, bid)
5354 return
5355
5356 yield from super().modify_tensors(data_torch, name, bid)
5357
5358
5359@ModelBase.register("InternLM2ForCausalLM")
5360class InternLM2Model(TextModel):
5361 model_arch = gguf.MODEL_ARCH.INTERNLM2
5362
5363 def set_vocab(self):
5364 # (TODO): Is there a better way?
5365 # Copied from _set_vocab_sentencepiece; the only difference is that we treat the character
5366 # \x00 specially and convert it into an emoji character to prevent it from being mistakenly
5367 # recognized as an empty string in C++.
5368 from sentencepiece import SentencePieceProcessor
5369 from sentencepiece import sentencepiece_model_pb2 as model
5370
5371 tokenizer_path = self.dir_model / 'tokenizer.model'
5372
5373 tokens: list[bytes] = []
5374 scores: list[float] = []
5375 toktypes: list[int] = []
5376
5377 if not tokenizer_path.is_file():
5378 logger.error(f'Error: Missing {tokenizer_path}')
5379 sys.exit(1)
5380
5381 sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
5382 sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
5383 add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
5384
5385 tokenizer = SentencePieceProcessor()
5386 tokenizer.LoadFromFile(str(tokenizer_path))
5387
5388 vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
5389
5390 for token_id in range(vocab_size):
5391 piece = tokenizer.IdToPiece(token_id)
5392 text = piece.encode("utf-8")
5393 score = tokenizer.GetScore(token_id)
5394 if text == b"\x00":
5395 # (TODO): fixme
5396 # Hack here and replace the \x00 characters.
5397 logger.warning(f"InternLM2: converting token '{text}' to '๐'!")
5398 text = "๐".encode("utf-8")
5399
5400 toktype = SentencePieceTokenTypes.NORMAL
5401 if tokenizer.IsUnknown(token_id):
5402 toktype = SentencePieceTokenTypes.UNKNOWN
5403 elif tokenizer.IsControl(token_id):
5404 toktype = SentencePieceTokenTypes.CONTROL
5405 elif tokenizer.IsUnused(token_id):
5406 toktype = SentencePieceTokenTypes.UNUSED
5407 elif tokenizer.IsByte(token_id):
5408 toktype = SentencePieceTokenTypes.BYTE
5409 # take care of unused raw tokens
5410 if piece.startswith('[UNUSED'):
5411 toktype = SentencePieceTokenTypes.UNUSED
5412
5413 tokens.append(text)
5414 scores.append(score)
5415 toktypes.append(toktype)
5416
5417 added_tokens_file = self.dir_model / 'added_tokens.json'
5418 if added_tokens_file.is_file():
5419 with open(added_tokens_file, "r", encoding="utf-8") as f:
5420 added_tokens_json = json.load(f)
5421
5422 for key in added_tokens_json:
5423 tokens.append(key.encode("utf-8"))
5424 scores.append(-1000.0)
5425 toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
5426
5427 chat_eos_token = '<|im_end|>'
5428 chat_eos_token_id = None
5429
5430 tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
5431 if tokenizer_config_file.is_file():
5432 with open(tokenizer_config_file, "r", encoding="utf-8") as f:
5433 tokenizer_config_json = json.load(f)
5434 added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
5435 for token_id, foken_data in added_tokens_decoder.items():
5436 token_id = int(token_id)
5437 token = foken_data["content"]
5438 if token == chat_eos_token:
5439 chat_eos_token_id = token_id
5440 token = token.encode("utf-8")
5441 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
5442 if tokens[token_id] != token:
5443 logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
5444 tokens[token_id] = token
5445 scores[token_id] = -1000.0
5446 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
5447 if foken_data.get("special"):
5448 toktypes[token_id] = SentencePieceTokenTypes.CONTROL
5449
5450 tokenizer_file = self.dir_model / 'tokenizer.json'
5451 if tokenizer_file.is_file():
5452 with open(tokenizer_file, "r", encoding="utf-8") as f:
5453 tokenizer_json = json.load(f)
5454 added_tokens = tokenizer_json.get("added_tokens", [])
5455 for foken_data in added_tokens:
5456 token_id = int(foken_data["id"])
5457 token = foken_data["content"]
5458 if token == chat_eos_token:
5459 chat_eos_token_id = token_id
5460 token = token.encode("utf-8")
5461 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
5462 if tokens[token_id] != token:
5463 logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
5464 tokens[token_id] = token
5465 scores[token_id] = -1000.0
5466 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
5467 if foken_data.get("special"):
5468 toktypes[token_id] = SentencePieceTokenTypes.CONTROL
5469
5470 self.gguf_writer.add_tokenizer_model("llama")
5471 self.gguf_writer.add_tokenizer_pre("default")
5472 self.gguf_writer.add_token_list(tokens)
5473 self.gguf_writer.add_token_scores(scores)
5474 self.gguf_writer.add_token_types(toktypes)
5475 self.gguf_writer.add_add_space_prefix(add_prefix)
5476
5477 special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
5478 old_eos = special_vocab.special_token_ids["eos"]
5479 if chat_eos_token_id is not None:
5480 # For the chat model, we replace the eos with '<|im_end|>'.
5481 # TODO: this is a hack, should be fixed
5482 # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
5483 special_vocab.special_token_ids["eos"] = chat_eos_token_id
5484 logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
5485 " in chat mode so that the conversation can end normally.")
5486
5487 special_vocab.add_to_gguf(self.gguf_writer)
5488
5489 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
5490 num_heads = self.hparams["num_attention_heads"]
5491 num_kv_heads = self.hparams["num_key_value_heads"]
5492 n_embd = self.hparams["hidden_size"]
5493 q_per_kv = num_heads // num_kv_heads
5494 head_dim = n_embd // num_heads
5495 num_groups = num_heads // q_per_kv
5496
5497 name = name.replace("language_model.", "") # InternVL
5498 if name.startswith("mlp") or name.startswith("vision_model"):
5499 # skip visual tensors
5500 return
5501
5502 if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
5503 qkv = data_torch
5504
5505 qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
5506 q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
5507
5508 # The q and k weights require an additional reshape and permute.
5509 q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
5510 k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
5511 v = v.reshape((-1, v.shape[-1]))
5512
5513 yield from super().modify_tensors(q, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid)
5514 yield from super().modify_tensors(k, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid)
5515 yield from super().modify_tensors(v, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid)
5516 else:
5517 yield from super().modify_tensors(data_torch, name, bid)
5518
5519
5520@ModelBase.register("InternLM3ForCausalLM")
5521class InternLM3Model(TextModel):
5522 model_arch = gguf.MODEL_ARCH.LLAMA
5523
5524 def set_vocab(self):
5525 tokens, scores, toktypes = self._create_vocab_sentencepiece()
5526
5527 self.gguf_writer.add_tokenizer_model("llama")
5528 self.gguf_writer.add_tokenizer_pre("default")
5529 self.gguf_writer.add_token_list(tokens)
5530 self.gguf_writer.add_token_scores(scores)
5531 self.gguf_writer.add_token_types(toktypes)
5532
5533 special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
5534
5535 tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
5536 if tokenizer_config_file.is_file():
5537 with open(tokenizer_config_file, "r", encoding="utf-8") as f:
5538 tokenizer_config_json = json.load(f)
5539 if "add_prefix_space" in tokenizer_config_json:
5540 self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
5541
5542 if "added_tokens_decoder" in tokenizer_config_json:
5543 for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
5544 if token_data.get("special"):
5545 token_id = int(token_id)
5546 token = token_data["content"]
5547 special_vocab._set_special_token(token, token_id)
5548 # update eos token
5549 if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
5550 special_vocab.special_token_ids["eos"] = token_id
5551
5552 special_vocab.add_to_gguf(self.gguf_writer)
5553
5554 def set_gguf_parameters(self):
5555 super().set_gguf_parameters()
5556 hparams = self.hparams
5557 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
5558
5559 if (rope_dim := hparams.get("head_dim")) is None:
5560 rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
5561 self.gguf_writer.add_rope_dimension_count(rope_dim)
5562
5563 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
5564 n_head = self.hparams["num_attention_heads"]
5565 n_kv_head = self.hparams.get("num_key_value_heads")
5566 name = name.replace("language_model.", "") # InternVL
5567 if name.startswith("mlp") or name.startswith("vision_model"):
5568 # skip visual tensors
5569 return
5570 if name.endswith(("q_proj.weight", "q_proj.bias")):
5571 data_torch = LlamaModel.permute(data_torch, n_head, n_head)
5572 if name.endswith(("k_proj.weight", "k_proj.bias")):
5573 data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
5574 yield from super().modify_tensors(data_torch, name, bid)
5575
5576
5577@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
5578class BertModel(TextModel):
5579 model_arch = gguf.MODEL_ARCH.BERT
5580
5581 def __init__(self, *args, **kwargs):
5582 super().__init__(*args, **kwargs)
5583 self.vocab_size = None
5584
5585 if cls_out_labels := self.hparams.get("id2label"):
5586 if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0":
5587 # Remove dummy labels added by AutoConfig
5588 cls_out_labels = None
5589 self.cls_out_labels = cls_out_labels
5590
5591 def set_gguf_parameters(self):
5592 super().set_gguf_parameters()
5593 self.gguf_writer.add_causal_attention(False)
5594 self._try_set_pooling_type()
5595
5596 if self.cls_out_labels:
5597 self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
5598
5599 def set_vocab(self):
5600 tokens, toktypes, tokpre = self.get_vocab_base()
5601 self.vocab_size = len(tokens)
5602
5603 # we need this to validate the size of the token_type embeddings
5604 # though currently we are passing all zeros to the token_type embeddings
5605 # "Sequence A" or "Sequence B"
5606 self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
5607
5608 # convert to phantom space vocab
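# e.g. the WordPiece pieces ["un", "##related"] become ["\u2581un", "related"]:
# continuation pieces drop their "##" marker and word-initial pieces gain the
# sentencepiece-style U+2581 prefix; control tokens are left untouched.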
5609 def phantom(tok, toktype):
5610 if toktype == gguf.TokenType.CONTROL:
5611 return tok
5612 if tok.startswith("##"):
5613 return tok[2:]
5614 return "\u2581" + tok
5615 assert len(tokens) == len(toktypes)
5616 tokens = list(map(phantom, tokens, toktypes))
5617
5618 # add vocab to gguf
5619 self.gguf_writer.add_tokenizer_model("bert")
5620 self.gguf_writer.add_tokenizer_pre(tokpre)
5621 self.gguf_writer.add_token_list(tokens)
5622 self.gguf_writer.add_token_types(toktypes)
5623
5624 # handle special tokens
5625 special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
5626 special_vocab.add_to_gguf(self.gguf_writer)
5627
5628 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
5629 if name.startswith("bert."):
5630 name = name[5:]
5631
5632 if name.endswith(".gamma"):
5633 name = name[:-6] + ".weight"
5634
5635 if name.endswith(".beta"):
5636 name = name[:-5] + ".bias"
5637
5638 # we are only using BERT for embeddings so we don't need the pooling layer
5639 if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
5640 return # we don't need these
5641
5642 if name.startswith("cls.predictions"):
5643 return
5644
5645 if name.startswith("cls.seq_relationship"):
5646 return
5647
5648 if self.cls_out_labels:
5649 # For BertForSequenceClassification (direct projection layer)
5650 if name == "classifier.weight":
5651 name = "classifier.out_proj.weight"
5652
5653 if name == "classifier.bias":
5654 name = "classifier.out_proj.bias"
5655
5656 yield from super().modify_tensors(data_torch, name, bid)
5657
5658 def _xlmroberta_tokenizer_init(self) -> None:
5659 # we need the pad_token_id to know how to chop down position_embd matrix
5660 if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
5661 self._position_offset = 1 + pad_token_id
5662 if "max_position_embeddings" in self.hparams:
5663 self.hparams["max_position_embeddings"] -= self._position_offset
5664 else:
5665 self._position_offset = None
5666
5667 def _xlmroberta_set_vocab(self) -> None:
5668 # to avoid TypeError: Descriptors cannot be created directly
5669 # exception when importing sentencepiece_model_pb2
5670 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
5671 from sentencepiece import SentencePieceProcessor
5672 from sentencepiece import sentencepiece_model_pb2 as model
5673
5674 tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
5675
5676 tokenizer_json = {}
5677 tokenizer_config_json = {}
5678 if not tokenizer_path.is_file():
5679 tokenizer_path = self.dir_model / 'tokenizer.json'
5680 tokenizer_config_path = self.dir_model / 'tokenizer_config.json'
5681
5682 if not tokenizer_path.is_file():
5683 raise FileNotFoundError(f"File not found: {tokenizer_path}")
5684
5685 from base64 import b64decode
5686 from transformers import AutoTokenizer
5687 tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
5688
5689 with open(tokenizer_path, "r", encoding="utf-8") as fp:
5690 tokenizer_json = json.load(fp)
5691
5692 if tokenizer_config_path.is_file():
5693 with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
5694 tokenizer_config_json = json.load(fp)
5695
5696 add_prefix = tokenizer.add_prefix_space
5697 remove_whitespaces = tokenizer.clean_up_tokenization_spaces
5698 precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
5699
5700 vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
5701 else:
5702 sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
5703 sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
5704 assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
5705
5706 add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
5707 remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
5708 precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
5709
5710 tokenizer = SentencePieceProcessor()
5711 tokenizer.LoadFromFile(str(tokenizer_path))
5712
5713 vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
5714
5715 tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
5716 scores: list[float] = [-10000.0] * vocab_size
5717 toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
5718
5719 if isinstance(tokenizer, SentencePieceProcessor):
5720 for token_id in range(tokenizer.vocab_size()):
5721 piece = tokenizer.IdToPiece(token_id)
5722 text = piece.encode("utf-8")
5723 score = tokenizer.GetScore(token_id)
5724
5725 toktype = SentencePieceTokenTypes.NORMAL
5726 if tokenizer.IsUnknown(token_id):
5727 toktype = SentencePieceTokenTypes.UNKNOWN
5728 elif tokenizer.IsControl(token_id):
5729 toktype = SentencePieceTokenTypes.CONTROL
5730 elif tokenizer.IsUnused(token_id):
5731 toktype = SentencePieceTokenTypes.UNUSED
5732 elif tokenizer.IsByte(token_id):
5733 toktype = SentencePieceTokenTypes.BYTE
5734
5735 tokens[token_id] = text
5736 scores[token_id] = score
5737 toktypes[token_id] = toktype
5738 else:
5739 added_vocab = tokenizer.get_added_vocab()
5740 unk_token = tokenizer_config_json.get("unk_token")
5741 unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
5742
5743 for token_id in range(tokenizer.vocab_size):
5744 if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
5746 text = piece.encode("utf-8")
5747 score = tokenizer_json["model"]["vocab"][token_id][1]
5748
5749 toktype = SentencePieceTokenTypes.NORMAL
5750 if token_id == unk_token_id:
5751 toktype = SentencePieceTokenTypes.UNKNOWN
5752 elif token_id in tokenizer.all_special_ids:
5753 toktype = SentencePieceTokenTypes.CONTROL
5754 elif token_id in added_vocab.values():
5755 toktype = SentencePieceTokenTypes.USER_DEFINED
5756 # No reliable way to detect this, but jina doesn't have any
5757 # elif tokenizer.IsByte(token_id):
5758 # toktype = SentencePieceTokenTypes.BYTE
5759
5760 tokens[token_id] = text
5761 scores[token_id] = score
5762 toktypes[token_id] = toktype
5763
5764 if isinstance(tokenizer, SentencePieceProcessor):
5765 # realign tokens (see HF tokenizer code)
5766 tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
5767 scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
5768 toktypes = [
5769 SentencePieceTokenTypes.CONTROL,
5770 SentencePieceTokenTypes.CONTROL,
5771 SentencePieceTokenTypes.CONTROL,
5772 SentencePieceTokenTypes.UNKNOWN,
5773 ] + toktypes[3:-1]
5774
5775 if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
5776 # Add mask token missing from sentencepiece.bpe.model
5777 tokens[250001] = b'<mask>'
5778 scores[250001] = 0.0
5779 toktypes[250001] = SentencePieceTokenTypes.CONTROL
5780
5781 self.gguf_writer.add_tokenizer_model("t5")
5782 self.gguf_writer.add_tokenizer_pre("default")
5783 self.gguf_writer.add_token_list(tokens)
5784 self.gguf_writer.add_token_scores(scores)
5785 self.gguf_writer.add_token_types(toktypes)
5786 self.gguf_writer.add_add_space_prefix(add_prefix)
5787 self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
5788 self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
5789 if precompiled_charsmap:
5790 self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
5791
5792 special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
5793 special_vocab.add_to_gguf(self.gguf_writer)
5794
5795
5796@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
5797class DistilBertModel(BertModel):
5798 model_arch = gguf.MODEL_ARCH.BERT
5799
5800 def set_gguf_parameters(self):
5801 self.gguf_writer.add_layer_norm_eps(1e-12)
5802 logger.info("gguf: layer norm epsilon = 1e-12")
5803 super().set_gguf_parameters()
5804
5805 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
5806 if name.startswith("distilbert."):
5807 name = name[11:]
5808
5809 # These layers act as MLM head, so we don't need them
5810 if name.startswith("vocab_"):
5811 return
5812
5813 yield from super().modify_tensors(data_torch, name, bid)
5814
5815
5816@ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
5817class RobertaModel(BertModel):
5818 model_arch = gguf.MODEL_ARCH.BERT
5819
5820 def __init__(self, *args, **kwargs):
5821 super().__init__(*args, **kwargs)
5822
5823 # we need the pad_token_id to know how to chop down position_embd matrix
5824 if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
5825 self._position_offset = 1 + pad_token_id
5826 if "max_position_embeddings" in self.hparams:
5827 self.hparams["max_position_embeddings"] -= self._position_offset
5828 else:
5829 self._position_offset = None
5830
5831 def set_vocab(self):
5832 """Support BPE tokenizers for roberta models"""
5833 bpe_tok_path = self.dir_model / "tokenizer.json"
5834 if bpe_tok_path.exists():
5835 self._set_vocab_gpt2()
5836
5837 # we need this to validate the size of the token_type embeddings
5838 # though currently we are passing all zeros to the token_type embeddings
5839 # "Sequence A" or "Sequence B"
5840 self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
5841
5842 else:
5843 return super().set_vocab()
5844
5845 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
5846 # if name starts with "roberta.", remove the prefix
5847 # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
5848 if name.startswith("roberta."):
5849 name = name[8:]
5850
5851 # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
5852 if name == "embeddings.position_embeddings.weight":
5853 if self._position_offset is not None:
5854 data_torch = data_torch[self._position_offset:,:]
5855
5856 yield from super().modify_tensors(data_torch, name, bid)
5857
5858
5859@ModelBase.register("NomicBertModel")
5860class NomicBertModel(BertModel):
5861 model_arch = gguf.MODEL_ARCH.BERT
5862
5863 def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
5864 hparams = kwargs.pop("hparams", None)
5865 if hparams is None:
5866 hparams = ModelBase.load_hparams(dir_model, False)
5867
5868 self.is_moe = bool(hparams.get("moe_every_n_layers"))
5869 self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT
5870
5871 super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
5872
5873 self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
5874 if self._tokenizer_is_xlmroberta:
5875 self._xlmroberta_tokenizer_init()
5876
5877 npos, mtp = self.hparams["n_positions"], self.hparams.get("max_trained_positions", 2048)
5878 if npos == 8192 and mtp == 2048:
5879 self.hparams["n_positions"] = 2048 # nomic-embed-text v1 and v1.5 are trained for 2048 tokens.
5880 elif npos == 2048 and mtp == 2048:
5881 self.hparams["n_positions"] = 512 # nomic-embed-text-v2-moe is trained for 512 tokens.
5882 else:
5883 raise ValueError(f"unrecognized parameters: n_positions={npos}, max_trained_positions={mtp}")
5884
5885 assert self.hparams["activation_function"] == ("gelu" if self.is_moe else "swiglu")
5886
5887 # this doesn't do anything in the HF version
5888 assert self.hparams["causal"] is False
5889 # no bias tensors unless MoE
5890 assert self.hparams["qkv_proj_bias"] == self.is_moe
5891 assert self.hparams["mlp_fc1_bias"] == self.is_moe
5892 assert self.hparams["mlp_fc2_bias"] == self.is_moe
5893
5894 # norm at end of layer
5895 assert self.hparams["prenorm"] is False
5896 # standard RoPE
5897 assert self.hparams["rotary_emb_fraction"] == 1.0
5898 assert self.hparams["rotary_emb_interleaved"] is False
5899 assert self.hparams["rotary_emb_scale_base"] is None
5900
5901 def set_vocab(self) -> None:
5902 if self._tokenizer_is_xlmroberta:
5903 return self._xlmroberta_set_vocab()
5904 return super().set_vocab()
5905
5906 def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
5907 # If the tensor is an experts bias tensor, skip it (yield nothing).
5908 if "mlp.experts.bias" in name:
5909 return # Explicitly return.
5910
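# The MoE expert weights are stored flattened; view them as 3D
# (num_experts, n_inner, n_embd) tensors, with w2 additionally transposed (presumably
# to match the expected ffn_down orientation), before the usual name mapping.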
5911 if "mlp.experts.mlp.w1" in name:
5912 data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
5913 name += ".weight"
5914
5915 if "mlp.experts.mlp.w2" in name:
5916 data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
5917 data_torch = data_torch.transpose(1, 2)
5918 name += ".weight"
5919
5920 yield from super().modify_tensors(data_torch, name, bid)
5921
5922 def set_gguf_parameters(self):
5923 super().set_gguf_parameters()
5924 if self.is_moe:
5925 self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
5926 self.gguf_writer.add_expert_count(self.hparams["num_experts"])
5927 self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
5928
5929 def _is_tokenizer_xlmroberta(self) -> bool:
5930 with open(self.dir_model / "tokenizer.json") as f:
5931 tokenizer_json = json.load(f)
5932 toktyp = tokenizer_json["model"]["type"]
5933 if toktyp == "Unigram":
5934 return True
5935 if toktyp == "WordPiece":
5936 return False
5937 raise ValueError(f"unknown tokenizer: {toktyp}")
5938
5939
5940@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
5941class NeoBert(BertModel):
5942 model_arch = gguf.MODEL_ARCH.NEO_BERT
5943
5944 def set_gguf_parameters(self):
5945 super().set_gguf_parameters()
5946
5947 # NeoBERT uses 2/3 of the intermediate size as feed forward length
5948 self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
5949 self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT
5950 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
5951
5952 f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT
5953 self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
5954 logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
5955
5956 self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
5957
5958 def modify_tensors(self, data_torch, name, bid):
5959 if name.startswith("decoder."):
5960 return
5961
5962 if name.startswith("model."):
5963 name = name[6:]
5964
5965 yield from super().modify_tensors(data_torch, name, bid)
5966
5967
5968@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
5969class XLMRobertaModel(BertModel):
5970 model_arch = gguf.MODEL_ARCH.BERT
5971 _lora_files = {}
5972 _lora_names = []
5973
5974 def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
5975 hparams = kwargs.pop("hparams", None)
5976 if hparams is None:
5977 hparams = ModelBase.load_hparams(dir_model, False)
5978
5979 if lora_names := hparams.get("lora_adaptations"):
5980 self._lora_names = lora_names
5981 self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3
5982
5983 super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
5984 self._xlmroberta_tokenizer_init()
5985
5986 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
5987 if self._lora_names:
5988 for name in self._lora_names:
5989 fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-")
5990 self._lora_files[name] = gguf.GGUFWriter(fname, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, dry_run=self.dry_run)
5991
5992 return super().generate_extra_tensors()
5993
5994 def set_type(self):
5995 for lora_writer in self._lora_files.values():
5996 lora_writer.add_type(gguf.GGUFType.ADAPTER)
5997 lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
5998 super().set_type()
5999
6000 def set_vocab(self):
6001 self._xlmroberta_set_vocab()
6002
6003 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6004 # if name starts with "roberta.", remove the prefix
6005 # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
6006 if name.startswith("roberta."):
6007 name = name[8:]
6008
6009 # jina-embeddings-v3
6010 if ".parametrizations." in name:
6011 name = name.replace(".parametrizations.", ".")
6012 if name.endswith(".original"):
6013 name = name[:-9]
6014
6015 # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
6016 if name == "embeddings.position_embeddings.weight":
6017 if self._position_offset is not None:
6018 data_torch = data_torch[self._position_offset:,:]
6019
6020 if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"):
6021 if name.startswith("pooler.dense"):
6022 return
6023
6024 num_loras = data_torch.size(0)
6025 assert num_loras == len(self._lora_names)
6026
6027 # Split each LoRA out into its own GGUF
6028 for i, lora_writer in enumerate(self._lora_files.values()):
6029 new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower()
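# For illustration: a hypothetical source name "...query.weight.0.lora_A" becomes
# map_tensor_name("...query.weight") + ".lora_a", i.e. something like "blk.N.attn_q.weight.lora_a".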
6030 data = data_torch[i, :, :]
6031 # Transpose/flip token_embd/types into correct shape
6032 if new_name == "token_embd.weight.lora_b":
6033 data = data.T
6034 elif new_name.startswith("token_types.weight."):
6035 new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b")
6036 lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32)
6037
6038 return
6039
6040 yield from super().modify_tensors(data_torch, name, bid)
6041
6042 def set_gguf_parameters(self):
6043 super().set_gguf_parameters()
6044
6045 # jina-embeddings-v3
6046 lora_alpha = self.hparams.get("lora_alpha")
6047 if lora_prompt_prefixes := self.hparams.get("task_instructions"):
6048 assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
6049 for lora_name, lora_writer in self._lora_files.items():
6050 lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0)
6051 lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name)
6052 if lora_prompt_prefixes:
6053 lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name])
6054
6055 def write(self):
6056 super().write()
6057 for lora_writer in self._lora_files.values():
6058 lora_writer.write_header_to_file()
6059 lora_writer.write_kv_data_to_file()
6060 lora_writer.write_tensors_to_file(progress=True)
6061 lora_writer.close()
6062
6063
6064@ModelBase.register("GemmaForCausalLM")
6065class GemmaModel(TextModel):
6066 model_arch = gguf.MODEL_ARCH.GEMMA
6067
6068 def set_vocab(self):
6069 self._set_vocab_sentencepiece()
6070
6071 # TODO: these special tokens should be exported only for the CodeGemma family
6072 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
6073 special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
6074 special_vocab._set_special_token("prefix", 67)
6075 special_vocab._set_special_token("suffix", 69)
6076 special_vocab._set_special_token("middle", 68)
6077 special_vocab._set_special_token("fsep", 70)
6078 special_vocab._set_special_token("eot", 107)
6079 special_vocab.chat_template = None # do not add it twice
6080 special_vocab.add_to_gguf(self.gguf_writer)
6081
6082 self.gguf_writer.add_add_space_prefix(False)
6083
6084 def set_gguf_parameters(self):
6085 hparams = self.hparams
6086
6087 self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
6088 self.gguf_writer.add_embedding_length(hparams["hidden_size"])
6089 self.gguf_writer.add_block_count(self.block_count)
6090 self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
6091 self.gguf_writer.add_head_count(hparams["num_attention_heads"])
6092 self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
6093 self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
6094 self.gguf_writer.add_key_length(hparams["head_dim"])
6095 self.gguf_writer.add_value_length(hparams["head_dim"])
6096 self.gguf_writer.add_file_type(self.ftype)
6097
6098 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6099 # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
6100 # To prevent errors, skip loading lm_head.weight.
6101 if name == "lm_head.weight":
6102 logger.debug(f"Skipping tensor {name!r} so that conversion can finish normally.")
6103 return
6104
6105 # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
6106 if name.endswith("norm.weight"):
6107 data_torch = data_torch + 1
6108
6109 yield from super().modify_tensors(data_torch, name, bid)
6110
6111
6112@ModelBase.register("Gemma2ForCausalLM")
6113class Gemma2Model(TextModel):
6114 model_arch = gguf.MODEL_ARCH.GEMMA2
6115
6116 def set_vocab(self):
6117 self._set_vocab_sentencepiece()
6118
6119 self.gguf_writer.add_add_space_prefix(False)
6120
6121 def set_gguf_parameters(self):
6122 hparams = self.hparams
6123
6124 self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
6125 self.gguf_writer.add_embedding_length(hparams["hidden_size"])
6126 self.gguf_writer.add_block_count(self.block_count)
6127 self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
6128 self.gguf_writer.add_head_count(hparams["num_attention_heads"])
6129 self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
6130 self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
6131 self.gguf_writer.add_key_length(hparams["head_dim"])
6132 self.gguf_writer.add_value_length(hparams["head_dim"])
6133 self.gguf_writer.add_file_type(self.ftype)
6134 self.gguf_writer.add_attn_logit_softcapping(
6135 self.hparams["attn_logit_softcapping"]
6136 )
6137 self.gguf_writer.add_final_logit_softcapping(
6138 self.hparams["final_logit_softcapping"]
6139 )
6140 self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
6141
6142 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6143 # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
6144 # To prevent errors, skip loading lm_head.weight.
6145 if name == "lm_head.weight":
6146 logger.debug(f"Skipping tensor {name!r} so that conversion can finish normally.")
6147 return
6148
6149 # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
6150 if name.endswith("norm.weight"):
6151 data_torch = data_torch + 1
6152
6153 yield from super().modify_tensors(data_torch, name, bid)
6154
6155
6156@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
6157class Gemma3Model(TextModel):
6158 model_arch = gguf.MODEL_ARCH.GEMMA3
6159 norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value
6160
6161 def set_vocab(self):
6162 if (self.dir_model / "tokenizer.model").is_file():
6163 self._set_vocab_sentencepiece()
6164 self.gguf_writer.add_add_space_prefix(False)
6165 else:
6166 self._set_vocab_gpt2()
6167
6168 def set_gguf_parameters(self):
6169 super().set_gguf_parameters()
6170 hparams = self.hparams
6171
6172 # some default values are not specified in the hparams
6173 self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
6174 self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
6175 self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
6176 self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
6177 self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
6178 self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters).get("rope_theta", 1_000_000.0)) # for global layers
6179 # attn_logit_softcapping is removed in Gemma3
6180 assert hparams.get("attn_logit_softcapping") is None
6181 if (final_logit_softcap := hparams.get("final_logit_softcapping")):
6182 self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
6183 if hparams.get("sliding_window_pattern") != 1:
6184 self.gguf_writer.add_sliding_window(hparams["sliding_window"])
6185 self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
6186
6187 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6188 if "language_model." in name:
6189 name = name.replace("language_model.", "")
6190
6191 elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
6192 or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
6193 return # skip vision tensors
6194
6195 # remove OOV (out-of-vocabulary) rows in token_embd
6196 if "embed_tokens.weight" in name:
6197 if (self.dir_model / "tokenizer.model").is_file():
6198 tokens = self._create_vocab_sentencepiece()[0]
6199 else:
6200 tokens = self.get_vocab_base()[0]
6201 data_torch = data_torch[:len(tokens)]
6202
6203 # ref code in Gemma3RMSNorm
6204 # output = output * (1.0 + self.weight.float())
6205 # note: this is not the case on gemma3n
6206 if name.endswith("norm.weight"):
6207 data_torch = data_torch + self.norm_shift
6208
6209 yield from super().modify_tensors(data_torch, name, bid)
6210
6211
6212@ModelBase.register("Gemma3TextModel")
6213class EmbeddingGemma(Gemma3Model):
6214 model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
6215 module_paths = []
6216 dense_features_dims = {}
6217
6218 def __init__(self, *args, **kwargs):
6219 super().__init__(*args, **kwargs)
6220 if self.sentence_transformers_dense_modules:
6221 # read modules.json to determine if model has Dense layers
6222 modules_file = self.dir_model / "modules.json"
6223 if modules_file.is_file():
6224 with open(modules_file, encoding="utf-8") as modules_json_file:
6225 mods = json.load(modules_json_file)
6226 for mod in mods:
6227 if mod["type"] == "sentence_transformers.models.Dense":
6228 mod_path = mod["path"]
6229 # check if model.safetensors file for Dense layer exists
6230 model_tensors_file = self.dir_model / mod_path / "model.safetensors"
6231 if model_tensors_file.is_file():
6232 self.module_paths.append(mod_path)
6233 # read config.json of the Dense layer to get in/out features
6234 mod_conf_file = self.dir_model / mod_path / "config.json"
6235 if mod_conf_file.is_file():
6236 with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file:
6237 mod_conf = json.load(mod_conf_json_file)
6238 # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights
6239 prefix = self._get_dense_prefix(mod_path)
6240 if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None:
6241 self.dense_features_dims[prefix] = (mod_conf["in_features"], mod_conf["out_features"])
6242
6243 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
6244 from safetensors.torch import load_file
6245 module_paths = list(self.module_paths)
6246 for i, module_path in enumerate(module_paths):
6247 tensors_file = self.dir_model / module_path / "model.safetensors"
6248 local_tensors = load_file(tensors_file)
6249 tensor_name = self._get_dense_prefix(module_path)
6250 for name, local_tensor in local_tensors.items():
6251 if not name.endswith(".weight"):
6252 continue
6253 orig_name = name.replace("linear", tensor_name)
6254 name = self.map_tensor_name(orig_name)
6255 yield name, local_tensor.clone()
6256
6257 @staticmethod
6258 def _get_dense_prefix(module_path) -> str:
6259 """Get the tensor name prefix for the Dense layer from module path."""
6260 tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3"
6261 return tensor_name
6262
6263 def set_gguf_parameters(self):
6264 super().set_gguf_parameters()
6265
6266 # Override the sliding window size as it gets adjusted by the Gemma3TextConfig
6267 # constructor. We want to use the value from the original model's config.json.
6268 # ref: https://github.com/huggingface/transformers/pull/40700
6269 with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
6270 config = json.load(f)
6271 orig_sliding_window = config.get("sliding_window")
6272 if orig_sliding_window is None:
6273 raise ValueError("sliding_window not found in model config - this is required for the model")
6274
6275 logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
6276 f"instead of {self.hparams['sliding_window']}")
6277 self.gguf_writer.add_sliding_window(orig_sliding_window)
6278 if self.sentence_transformers_dense_modules:
6279 for dense, dims in self.dense_features_dims.items():
6280 logger.info(f"Setting dense layer {dense} in/out features to {dims}")
6281 self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1])
6282
6283 self._try_set_pooling_type()
6284
6285
6286@ModelBase.register("Gemma3ForConditionalGeneration")
6287class Gemma3VisionModel(MmprojModel):
6288 def set_gguf_parameters(self):
6289 super().set_gguf_parameters()
6290 hparams = self.hparams
6291 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
6292 # default values below are taken from HF transformers code
6293 self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
6294 self.gguf_writer.add_vision_use_gelu(True)
6295 # calculate proj_scale_factor (used by tinygemma3 test model)
6296 image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
6297 n_per_side = int(image_seq_length ** 0.5)
6298 image_size = self.hparams["image_size"]
6299 patch_size = self.hparams["patch_size"]
6300 proj_scale_factor = (image_size // patch_size) // n_per_side
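# For illustration, assuming the released Gemma3 vision settings (image_size 896, patch_size 14,
# image_seq_length 256): n_per_side = 16 and (896 // 14) // 16 = 4, i.e. the default, so nothing is written.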
6301 if proj_scale_factor > 0 and proj_scale_factor != 4:
6302 # we only need to write this if it's not the default value
6303 # in this case, we are converting a test model
6304 self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
6305
6306 def tensor_force_quant(self, name, new_name, bid, n_dims):
6307 # related to https://github.com/ggml-org/llama.cpp/issues/13025
6308 if "input_projection" in name:
6309 return gguf.GGMLQuantizationType.F16
6310 if ".embeddings." in name:
6311 return gguf.GGMLQuantizationType.F32
6312 return super().tensor_force_quant(name, new_name, bid, n_dims)
6313
6314 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6315 if "vision_model.head." in name:
6316 return # skip redundant tensors for tinygemma3
6317
6318 if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
6319 or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
6320 # process vision tensors
6321 name = name.replace("_weight", ".weight")
6322
6323 # correct the norm value; only "soft_emb_norm" needs correcting since it is part of the Gemma projector
6324 # the other norm values belong to the SigLIP model and are already correct
6325 # ref code: Gemma3RMSNorm
6326 if "soft_emb_norm.weight" in name:
6327 logger.info(f"Correcting norm value for '{name}'")
6328 data_torch = data_torch + 1
6329
6330 yield from super().modify_tensors(data_torch, name, bid)
6331
6332 return # skip other tensors
6333
6334
6335class ConformerAudioModel(MmprojModel):
6336 _batch_norm_tensors: list[dict[str, Tensor]] | None = None
6337
6338 @staticmethod
6339 def is_audio_tensor(name: str):
6340 return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
6341
6342 def tensor_force_quant(self, name, new_name, bid, n_dims):
6343 if ConformerAudioModel.is_audio_tensor(name):
6344 if (".conv" in name or "_conv" in name) and ".weight" in name:
6345 return gguf.GGMLQuantizationType.F32
6346 return super().tensor_force_quant(name, new_name, bid, n_dims)
6347
6348 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6349 # fold running_mean, running_var and eps into weight and bias for batch_norm
6350 if "batch_norm" in name:
6351 if self._batch_norm_tensors is None:
6352 self._batch_norm_tensors = [{} for _ in range(self.block_count)]
6353 assert bid is not None
6354 self._batch_norm_tensors[bid][name] = data_torch
6355
6356 if len(self._batch_norm_tensors[bid]) < 5:
6357 return
6358
6359 weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
6360 bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
6361 running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
6362 running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
6363 eps = 1e-5 # default value
6364
6365 a = weight / torch.sqrt(running_var + eps)
6366 b = bias - running_mean * a
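# This is the usual batch-norm folding identity:
#   bn(x) = weight * (x - running_mean) / sqrt(running_var + eps) + bias = a * x + b
# with a and b as computed above, so the batch norm collapses into a single affine transform.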
6367 yield from super().modify_tensors(a, f"conformer.layers.{bid}.conv.batch_norm.weight", bid)
6368 yield from super().modify_tensors(b, f"conformer.layers.{bid}.conv.batch_norm.bias", bid)
6369 return
6370
6371 # reshape conv weights
6372 if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
6373 data_torch = data_torch[:, None, None]
6374 if "conv.depthwise_conv" in name and name.endswith(".weight"):
6375 assert data_torch.shape[1] == 1
6376 data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
6377 if "conv.pointwise_conv" in name and name.endswith(".weight"):
6378 assert data_torch.shape[2] == 1
6379 data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
6380
6381 yield from super().modify_tensors(data_torch, name, bid)
6382
6383
6384@ModelBase.register("Gemma3nForConditionalGeneration")
6385class Gemma3nVisionAudioModel(ConformerAudioModel):
6386 has_audio_encoder = True
6387 has_vision_encoder = True
6388
6389 # Double-indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py)
6390 # This is the only known model with this layout, so we keep the mapping here rather than in tensor_mapping.py
6391 block_tensor_mapping = {
6392 "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight",
6393 "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight",
6394 "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.blk.{bid}.{sid}.conv_pwl.weight",
6395 "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.blk.{bid}.{sid}.bn2.weight",
6396 "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.blk.{bid}.{sid}.dw_start.conv.weight",
6397 "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.blk.{bid}.{sid}.dw_start.bn.weight",
6398 "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.blk.{bid}.{sid}.dw_mid.conv.weight",
6399 "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.blk.{bid}.{sid}.dw_mid.bn.weight",
6400 "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.blk.{bid}.{sid}.pw_exp.conv.weight",
6401 "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.blk.{bid}.{sid}.pw_exp.bn.weight",
6402 "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.blk.{bid}.{sid}.pw_proj.conv.weight",
6403 "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.blk.{bid}.{sid}.pw_proj.bn.weight",
6404 "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.blk.{bid}.{sid}.layer_scale.gamma",
6405 "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.blk.{bid}.{sid}.attn.query.proj.weight",
6406 "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.blk.{bid}.{sid}.attn.key.proj.weight",
6407 "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.blk.{bid}.{sid}.attn.value.proj.weight",
6408 "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.blk.{bid}.{sid}.attn.output.proj.weight",
6409 "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.blk.{bid}.{sid}.attn.key.down_conv.weight",
6410 "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.blk.{bid}.{sid}.attn.key.norm.weight",
6411 "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight",
6412 "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.blk.{bid}.{sid}.attn.value.norm.weight",
6413 "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight",
6414 }
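# For illustration: a hypothetical tensor "model.vision_tower.timm_model.blocks.1.2.pw_exp.conv.weight"
# matches the "...blocks.{bid}.{sid}.pw_exp.conv.weight" template and maps to "v.blk.1.2.pw_exp.conv.weight".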
6415
6416 def __init__(self, *args, **kwargs):
6417 # Parent init will call find_hparam which now returns 0 for empty keys
6418 super().__init__(*args, **kwargs)
6419 assert self.hparams_vision is not None
6420 self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it
6421 self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4
6422 self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8)
6423
6424 # MobileNetV5 does not use image_mean/std
6425 self.preprocessor_config["image_mean"] = [0.0, 0.0, 0.0]
6426 self.preprocessor_config["image_std"] = [1.0, 1.0, 1.0]
6427 self.hparams_vision["image_size"] = self.preprocessor_config.get(
6428 "size", {"height": 768, "width": 768}
6429 )["height"]
6430
6431 # Image sequence length (256 tokens = 16x16 for Gemma3n)
6432 image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
6433 image_size = self.hparams_vision["image_size"]
6434 self.hparams_vision["patch_size"] = image_size // image_seq_length
6435
6436 # remap audio hparams
6437 assert self.hparams_audio is not None
6438 self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"]
6439 self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"]
6440 self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"]
6441 self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144)
6442
6443 def set_gguf_parameters(self):
6444 super().set_gguf_parameters()
6445
6446 # vision params
6447 self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV)
6448 self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
6449
6450 # audio params
6451 assert self.hparams_audio is not None
6452 self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA)
6453 self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
6454 self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
6455
6456 def tensor_force_quant(self, name, new_name, bid, n_dims):
6457 # Force quantization settings for specific tensor types
6458 if "input_projection" in name or "input_proj" in name:
6459 return gguf.GGMLQuantizationType.F16
6460 if ".embeddings." in name or "stem" in name:
6461 return gguf.GGMLQuantizationType.F32
6462 return super().tensor_force_quant(name, new_name, bid, n_dims)
6463
6464 def custom_map(self, name: str) -> str:
6465 """Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping."""
6466 parts = name.split(".")
6467 # MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix
6468 if len(parts) >= 7:
6469 bid, sid = parts[4], parts[5]
6470 suffix = ".".join(parts[6:])
6471 template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}"
6472 if template in self.block_tensor_mapping:
6473 return self.block_tensor_mapping[template].format(bid=bid, sid=sid)
6474
6475 raise ValueError(f"Unknown name: {name}")
6476
6477 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6478 if ConformerAudioModel.is_audio_tensor(name):
6479 name = name.replace("model.audio_tower.conformer.", "conformer.layers.")
6480 yield from super().modify_tensors(data_torch, name, bid)
6481
6482 # Gemma3n uses
6483 # - model.embed_vision.* for projection layers
6484 # - model.vision_tower.* for vision encoder
6485 # Skip non-vision tensors
6486 if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")):
6487 return
6488
6489 if name.startswith("model.vision_tower.timm_model.blocks."):
6490 # Double-indexed block tensors through custom logic
6491 yield (self.custom_map(name), data_torch)
6492 return
6493 else:
6494 # Route non-repeating tensors (conv_stem, msfa, embedding, etc.) and anything not caught above through tensor_mapping.py
6495 new_name = self.map_tensor_name(name)
6496
6497 if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"):
6498 data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1]
6499
6500 yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)
6501
6502
6503@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
6504class Gemma3NModel(Gemma3Model):
6505 model_arch = gguf.MODEL_ARCH.GEMMA3N
6506 norm_shift = 0.0 # same value as the Gemma3p5RMSNorm scale_shift in the Python reference code
6507
6508 _altup_proj: list[Tensor] = []
6509 _altup_unembd: list[Tensor] = []
6510
6511 def __init__(self, *args, **kwargs):
6512 super().__init__(*args, **kwargs)
6513 assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs"
6514 self._altup_proj = [
6515 torch.Tensor(), # to be replaced
6516 torch.Tensor(), # to be replaced
6517 torch.Tensor(), # to be replaced
6518 ]
6519 self._altup_unembd = [
6520 torch.Tensor(), # to be replaced
6521 torch.Tensor(), # to be replaced
6522 torch.Tensor(), # to be replaced
6523 ]
6524
6525 def set_vocab(self):
6526 # For Gemma3n multimodal models, we need the FULL vocab_size (262400)
6527 # which includes special tokens from 262144-262399 for vision/audio.
6528 # The vocab_size_per_layer_input (262144) is only the embedding size per layer.
6529 # Temporarily override the hparams lookup order to prioritize vocab_size.
6530
6531 # Store original vocab_size_per_layer_input if it exists
6532 vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input")
6533
6534 # Temporarily remove vocab_size_per_layer_input to force using vocab_size
6535 if vocab_size_per_layer_input is not None:
6536 del self.hparams["vocab_size_per_layer_input"]
6537
6538 # Call parent set_vocab which will now use vocab_size (262400)
6539 super().set_vocab()
6540
6541 # Restore vocab_size_per_layer_input for later use
6542 if vocab_size_per_layer_input is not None:
6543 self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input
6544
6545 def set_gguf_parameters(self):
6546 super().set_gguf_parameters()
6547 self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
6548 self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"])
6549 self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"])
6550 self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"])
6551
6552 activation_sparsity_scale = []
6553 for s in self.hparams["activation_sparsity_pattern"]:
6554 normal_dist = torch.distributions.normal.Normal(0, 1)
6555 std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32))
6556 activation_sparsity_scale.append(std_multiplier.item())
6557 self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale)
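# For illustration: a target sparsity of 0.95 maps to the 95th-percentile z-score of the standard
# normal, icdf(0.95) ~= 1.6449 (hypothetical value; the actual pattern comes from the model config).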
6558
6559 sliding_window_pattern = []
6560 for t in self.hparams["layer_types"]:
6561 sliding_window_pattern.append(t == "sliding_attention")
6562 self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
6563
6564 def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None:
6565 has_all = all(m.numel() > 0 for m in matrices)
6566 if not has_all:
6567 return None
6568 else:
6569 return torch.stack(matrices, dim=0)
6570
6571 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6572 if name.endswith("_scale"):
6573 name = name + ".weight"
6574
6575 # TODO: implement self.prediction_coefs.weight.clamp_(...)
6576
6577 if "language_model." not in name:
6578 return # skip non-language model tensors
6579
6580 # Pad token embeddings for vision/audio special tokens (262144-262399)
6581 if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name:
6582 # Move to CPU to avoid meta device issues during padding
6583 data_torch = data_torch.to(device="cpu")
6584
6585 vocab_size = self.hparams.get("vocab_size", 262400)
6586 current_size = data_torch.shape[0] # First dimension is vocab_size
6587
6588 if current_size < vocab_size:
6589 # Pad with zeros for vision/audio tokens (they get embeddings from vision tower)
6590 padding_size = vocab_size - current_size
6591 tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings"
6592 logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)")
6593
6594 # Create padding with zeros (vision tokens won't use these embeddings)
6595 padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device)
6596 data_torch = torch.cat([data_torch, padding], dim=0)
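# e.g. for Gemma3n this pads from 262144 rows up to vocab_size = 262400, i.e. 256 zero rows for the
# vision/audio token ids 262144-262399 (per the comments in set_vocab above).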
6597
6598 # Continue with normal processing
6599 name = name.replace("language_model.", "")
6600 yield from ModelBase.modify_tensors(self, data_torch, name, bid)
6601 return
6602
6603 if "altup_unembed_projections" in name:
6604 data_torch = data_torch.to(device="cpu")
6605 # altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based
6606 # They should NOT be padded
6607 if ".0." in name:
6608 self._altup_unembd[0] = data_torch
6609 elif ".1." in name:
6610 self._altup_unembd[1] = data_torch
6611 elif ".2." in name:
6612 self._altup_unembd[2] = data_torch
6613 else:
6614 raise ValueError(f"Unknown name: {name}")
6615 out = self._stack_matrices(self._altup_unembd)
6616 if out is not None:
6617 yield from ModelBase.modify_tensors(self, out, "model.altup_unembed_projections.weight", bid)
6618 return
6619 else:
6620 return
6621
6622 if "altup_projections" in name:
6623 data_torch = data_torch.to(device="cpu")
6624 if ".0." in name:
6625 self._altup_proj[0] = data_torch
6626 elif ".1." in name:
6627 self._altup_proj[1] = data_torch
6628 elif ".2." in name:
6629 self._altup_proj[2] = data_torch
6630 else:
6631 raise ValueError(f"Unknown name: {name}")
6632 out = self._stack_matrices(self._altup_proj)
6633 if out is not None:
6634 yield from ModelBase.modify_tensors(self, out, "model.altup_projections.weight", bid)
6635 return
6636 else:
6637 return
6638
6639 yield from super().modify_tensors(data_torch, name, bid)
6640
6641
6642@ModelBase.register("Starcoder2ForCausalLM")
6643class StarCoder2Model(TextModel):
6644 model_arch = gguf.MODEL_ARCH.STARCODER2
6645
6646
6647@ModelBase.register("Rwkv6ForCausalLM")
6648class Rwkv6Model(TextModel):
6649 model_arch = gguf.MODEL_ARCH.RWKV6
6650
6651 def set_vocab(self):
6652 self._set_vocab_rwkv_world()
6653
6654 def set_gguf_parameters(self):
6655 head_size = self.hparams["head_size"]
6656 hidden_size = self.hparams["hidden_size"]
6657 layer_norm_eps = self.hparams["layer_norm_epsilon"]
6658 rescale_every_n_layers = self.hparams["rescale_every"]
6659 intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
6660 time_mix_extra_dim = 64 if hidden_size == 4096 else 32
6661 time_decay_extra_dim = 128 if hidden_size == 4096 else 64
6662
6663 # RWKV isn't context limited
6664 self.gguf_writer.add_context_length(1048576)
6665 self.gguf_writer.add_embedding_length(hidden_size)
6666 self.gguf_writer.add_block_count(self.block_count)
6667 self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
6668 self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
6669 self.gguf_writer.add_wkv_head_size(head_size)
6670 self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
6671 self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
6672 self.gguf_writer.add_feed_forward_length(intermediate_size)
6673 self.gguf_writer.add_file_type(self.ftype)
6674
6675 # required by llama.cpp, unused
6676 self.gguf_writer.add_head_count(0)
6677
6678 lerp_weights: dict[int, dict[str, Tensor]] = {}
6679
6680 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6681 new_name = self.map_tensor_name(name)
6682
6683 if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
6684 new_name += ".weight"
6685
6686 if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
6687 data_torch = data_torch.transpose(0, 1)
6688
6689 if new_name.endswith("time_mix_w2.weight"):
6690 data_torch = data_torch.permute(0, 2, 1)
6691
6692 if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
6693 data_torch = data_torch.squeeze()
6694
6695 try:
6696 rescale_every_n_layers = self.hparams["rescale_every"]
6697 if rescale_every_n_layers > 0:
6698 if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
6699 data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
6700 except KeyError:
6701 pass
6702
6703 # concat time_mix_lerp weights to reduce some cpu overhead
6704 # also reduces the number of tensors in the model
6705 if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
6706 try:
6707 self.lerp_weights[bid][new_name] = data_torch
6708 except KeyError:
6709 self.lerp_weights[bid] = {new_name: data_torch}
6710 if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
6711 new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
6712 data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
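# Each lerp tensor was squeezed to shape [n_embd] above, so the fused tensor ends up with
# shape [5, 1, 1, n_embd], stacked in w, k, v, r, g order.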
6713 yield (new_name, data)
6714 return
6715
6716 yield (new_name, data_torch)
6717
6718
6719@ModelBase.register("RWKV6Qwen2ForCausalLM")
6720class RWKV6Qwen2Model(Rwkv6Model):
6721 model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
6722
6723 def set_vocab(self):
6724 try:
6725 self._set_vocab_sentencepiece()
6726 except FileNotFoundError:
6727 self._set_vocab_gpt2()
6728
6729 def set_gguf_parameters(self):
6730 num_attention_heads = self.hparams["num_attention_heads"]
6731 num_key_value_heads = self.hparams["num_key_value_heads"]
6732 hidden_size = self.hparams["hidden_size"]
6733 head_size = hidden_size // num_attention_heads
6734 rms_norm_eps = self.hparams["rms_norm_eps"]
6735 intermediate_size = self.hparams["intermediate_size"]
6736 time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32)
6737 time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64)
6738
6739 # RWKV isn't context limited
6740 self.gguf_writer.add_context_length(1048576)
6741 self.gguf_writer.add_embedding_length(hidden_size)
6742 self.gguf_writer.add_block_count(self.block_count)
6743 self.gguf_writer.add_wkv_head_size(head_size)
6744 self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
6745 self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
6746 self.gguf_writer.add_feed_forward_length(intermediate_size)
6747 self.gguf_writer.add_file_type(self.ftype)
6748
6749 # special parameters for time_mixing in RWKV6QWEN2
6750 self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
6751 self.gguf_writer.add_token_shift_count(1)
6752 # RWKV6QWEN2 uses grouped key/value heads like GQA
6753 self.gguf_writer.add_head_count_kv(num_key_value_heads)
6754
6755 # required by llama.cpp, unused
6756 self.gguf_writer.add_head_count(0)
6757
6758 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6759 for new_name, data in super().modify_tensors(data_torch, name, bid):
6760 if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
6761 data = data.view(5, -1, data.shape[-1])
6762 # rwkv6qwen2 stores these in r,k,v,w,g order instead of the original w,k,v,r,g
6763 # permute them here so the rest of the code stays unchanged
6764 data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
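# Index picks from the r,k,v,w,g source layout: 3 -> w, 1 -> k, 2 -> v, 0 -> r, 4 -> g,
# which restores the w,k,v,r,g order.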
6765 if "w2" in new_name:
6766 data = data.view(5, -1, data.shape[-1])
6767 yield (new_name, data)
6768 continue
6769 yield (new_name, data)
6770
6771
6772@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM")
6773class Rwkv7Model(TextModel):
6774 model_arch = gguf.MODEL_ARCH.RWKV7
6775
6776 def set_vocab(self):
6777 self._set_vocab_rwkv_world()
6778
6779 def calc_lora_rank(self, hidden_size, exponent, multiplier):
6780 return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32
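# For illustration with hypothetical values hidden_size=4096, exponent=0.5, multiplier=1.8:
# 4096 ** 0.5 = 64, round(64 * 1.8 / 32) = round(3.6) = 4, giving a rank of 4 * 32 = 128.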
6781
6782 def set_gguf_parameters(self):
6783 try:
6784 head_size = self.hparams["head_size"]
6785 layer_norm_eps = self.hparams["layer_norm_epsilon"]
6786 except KeyError:
6787 head_size = self.hparams["head_dim"]
6788 layer_norm_eps = self.hparams["norm_eps"]
6789 hidden_size = self.hparams["hidden_size"]
6790 intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4)
6791
6792 # ICLR: In-Context-Learning-Rate
6793 try:
6794 lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
6795 lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
6796 lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
6797 lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
6798 except KeyError:
6799 lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
6800 lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
6801 lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
6802 lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
6803
6804 # RWKV isn't context limited
6805 self.gguf_writer.add_context_length(1048576)
6806 self.gguf_writer.add_embedding_length(hidden_size)
6807 self.gguf_writer.add_block_count(self.block_count)
6808 self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
6809 self.gguf_writer.add_wkv_head_size(head_size)
6810 self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
6811 self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr)
6812 self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix)
6813 self.gguf_writer.add_gate_lora_rank(lora_rank_gate)
6814 self.gguf_writer.add_feed_forward_length(intermediate_size)
6815 self.gguf_writer.add_file_type(self.ftype)
6816
6817 # required by llama.cpp, unused
6818 self.gguf_writer.add_head_count(0)
6819
6820 lerp_weights: dict[int, dict[str, Tensor]] = {}
6821 lora_needs_transpose: bool = True
6822
6823 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
6824 # unify tensor names here to make life easier
6825 name = name.replace("blocks", "layers").replace("ffn", "feed_forward")
6826 name = name.replace("self_attn", "attention").replace("attn", "attention")
6827 name = name.replace("time_mixer.", "")
6828 # lora layer names in fla-hub's impl
6829 if "_lora.lora" in name:
6830 self.lora_needs_transpose = False
6831 name = name.replace("_lora.lora.0.weight", "1.weight")
6832 name = name.replace("_lora.lora.2.weight", "2.weight")
6833 name = name.replace("_lora.lora.2.bias", "0.weight")
6834
6835 name = name.replace("feed_forward_norm", "ln2")
6836 name = name.replace("g_norm", "ln_x")
6837
6838 if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0:
6839 # some models have dummy v0/v1/v2 on first layer while others don't
6840 # ignore them all since they are not used
6841 return
6842
6843 wkv_has_gate = self.hparams.get("wkv_has_gate", True)
6844 lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"]
6845
6846 if bid is not None and "attention.x_" in name:
6847 if "attention.x_x" in name:
6848 # already concatenated
6849 new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
6850 data = data_torch.reshape(len(lerp_list), 1, 1, -1)
6851 yield (new_name, data)
6852 else:
6853 try:
6854 self.lerp_weights[bid][name] = data_torch
6855 except KeyError:
6856 self.lerp_weights[bid] = {name: data_torch}
6857 if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list):
6858 new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
6859 data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0)
6860 yield (new_name, data)
6861 return
6862 else:
6863 data_torch = data_torch.squeeze()
6864 new_name = self.map_tensor_name(name)
6865
6866 if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
6867 new_name += ".weight"
6868
6869 if self.lora_needs_transpose and any(
6870 new_name.endswith(t) for t in [
6871 "time_mix_w1.weight", "time_mix_w2.weight",
6872 "time_mix_a1.weight", "time_mix_a2.weight",
6873 "time_mix_v1.weight", "time_mix_v2.weight",
6874 "time_mix_g1.weight", "time_mix_g2.weight",
6875 ]
6876 ):
6877 data_torch = data_torch.transpose(0, 1)
6878
6879 if 'r_k' in new_name:
6880 data_torch = data_torch.flatten()
6881
6882 if bid == 0 and "time_mix_a" in new_name:
6883 # dummy v0/v1/v2 on first layer
6884 # easiest way to keep llama.cpp happy
6885 yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch)
6886
6887 yield (new_name, data_torch)
6888
6889
6890@ModelBase.register("RwkvHybridForCausalLM")
6891class ARwkv7Model(Rwkv7Model):
6892 model_arch = gguf.MODEL_ARCH.ARWKV7
6893
6894 def set_vocab(self):
6895 try:
6896 self._set_vocab_sentencepiece()
6897 except FileNotFoundError:
6898 self._set_vocab_gpt2()
6899
6900 def set_gguf_parameters(self):
6901 hidden_size = self.hparams["hidden_size"]
6902 head_size = self.hparams["head_size"]
6903 rms_norm_eps = self.hparams["rms_norm_eps"]
6904 intermediate_size = self.hparams["intermediate_size"]
6905 wkv_has_gate = self.hparams["wkv_has_gate"]
6906 assert self.hparams["wkv_version"] == 7
6907
6908 # ICLR: In-Context-Learning-Rate
6909 lora_rank_decay = 64
6910 lora_rank_iclr = 64
6911 lora_rank_value_residual_mix = 32
6912 lora_rank_gate = 128 if wkv_has_gate else 0
6913
6914 # RWKV isn't context limited
6915 self.gguf_writer.add_context_length(1048576)
6916 self.gguf_writer.add_embedding_length(hidden_size)
6917 self.gguf_writer.add_block_count(self.block_count)
6918 self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
6919 self.gguf_writer.add_wkv_head_size(head_size)
6920 self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
6921 self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr)
6922 self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix)
6923 self.gguf_writer.add_gate_lora_rank(lora_rank_gate)
6924 self.gguf_writer.add_feed_forward_length(intermediate_size)
6925 self.gguf_writer.add_file_type(self.ftype)
6926 self.gguf_writer.add_token_shift_count(1)
6927
6928 # required by llama.cpp, unused
6929 self.gguf_writer.add_head_count(0)
6930
6931
6932@ModelBase.register("MaincoderForCausalLM")
6933class MaincoderModel(TextModel):
6934 model_arch = gguf.MODEL_ARCH.MAINCODER
6935
6936 def set_gguf_parameters(self):
6937 super().set_gguf_parameters()
6938
6939 if (head_dim := self.hparams.get("head_dim")) is not None:
6940 self.gguf_writer.add_rope_dimension_count(head_dim)
6941
6942
6943@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
6944class MambaModel(TextModel):
6945 model_arch = gguf.MODEL_ARCH.MAMBA
6946
6947 def __init__(self, dir_model: Path, *args, **kwargs):
6948 # Avoid using AutoConfig for hparams
6949 hparams = kwargs.pop("hparams", None)
6950 if hparams is None:
6951 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
6952 hparams = json.load(f)
6953 super().__init__(dir_model, *args, hparams=hparams, **kwargs)
6954
6955 def set_vocab(self):
6956 vocab_size = self.hparams["vocab_size"]
6957 # Round vocab size to next multiple of 8
6958 pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8)
6959 # pad using ceiling division
6960 # ref: https://stackoverflow.com/a/17511341/22827863
6961 vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
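# For illustration with hypothetical values vocab_size=50277 and pad_vocab=8:
# -(50277 // -8) = 6285, so the padded vocab size becomes 6285 * 8 = 50280.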
6962 self.hparams["vocab_size"] = vocab_size
6963
6964 if (self.dir_model / "tokenizer.json").is_file():
6965 self._set_vocab_gpt2()
6966 elif (self.dir_model / "tokenizer.model").is_file():
6967 self._set_vocab_sentencepiece()
6968 else:
6969 # Use the GPT-NeoX tokenizer when no tokenizer files are present
6970 self._set_vocab_builtin("gpt-neox", vocab_size)
6971
6972 def set_gguf_parameters(self):
6973 d_model = self.find_hparam(["hidden_size", "d_model"])
6974 d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
6975 d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
6976 d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
6977 # ceiling division
6978 # ref: https://stackoverflow.com/a/17511341/22827863
6979 # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
6980 dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
6981 rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
6982 use_dt_b_c_norm = False
6983 # For falconmamba we do apply RMS norm on B / DT and C layers
6984 if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
6985 use_dt_b_c_norm = True
6986 # Fail early for models which don't have a block expansion factor of 2
6987 assert d_inner == 2 * d_model
6988
6989 self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
6990 self.gguf_writer.add_embedding_length(d_model)
6991 self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
6992 self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
6993 self.gguf_writer.add_block_count(self.block_count)
6994 self.gguf_writer.add_ssm_conv_kernel(d_conv)
6995 self.gguf_writer.add_ssm_inner_size(d_inner)
6996 self.gguf_writer.add_ssm_state_size(d_state)
6997 self.gguf_writer.add_ssm_time_step_rank(dt_rank)
6998 self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
6999 self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
7000 self.gguf_writer.add_file_type(self.ftype)
7001
7002 _tok_embd = None
7003
7004 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
7005 output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
7006 tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
7007
7008 new_name = self.map_tensor_name(name)
7009
7010 if name.endswith(".A_log"):
7011 logger.debug("A_log --> A ==> " + new_name)
7012 data_torch = -torch.exp(data_torch)
7013
7014 # [4 1 8192 1] -> [4 8192 1 1]
7015 if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
7016 data_torch = data_torch.squeeze()
7017
7018 # assuming token_embd.weight is seen before output.weight
7019 if self._tok_embd is not None and new_name == output_name:
7020 if torch.equal(self._tok_embd, data_torch):
7021 logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting")
7022 return
7023 elif new_name == tok_embd_name:
7024 self._tok_embd = data_torch
7025
7026 yield from super().modify_tensors(data_torch, new_name, bid)
7027
7028
7029@ModelBase.register("Mamba2ForCausalLM")
7030class Mamba2Model(TextModel):
7031 model_arch = gguf.MODEL_ARCH.MAMBA2
7032
7033 def __init__(self, dir_model: Path, *args, **kwargs):
7034 # Avoid using AutoConfig for hparams
7035 # It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
7036 hparams = kwargs.pop("hparams", None)
7037 if hparams is None:
7038 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
7039 hparams = json.load(f)
7040 super().__init__(dir_model, *args, hparams=hparams, **kwargs)
7041 self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
7042 self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
7043 self.n_group = self.find_hparam(["n_groups"], optional=True) or 1
7044
7045 def set_vocab(self):
7046 vocab_size = self.hparams["vocab_size"]
7047 # Round vocab size to next multiple of 16
7048 pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
7049 # pad using ceiling division
7050 # ref: https://stackoverflow.com/a/17511341/22827863
7051 vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
7052 self.hparams["vocab_size"] = vocab_size
7053
7054 if (self.dir_model / "tokenizer.model").is_file():
7055 self._set_vocab_sentencepiece()
7056 elif (self.dir_model / "tokenizer.model.v3").is_file():
7057 # mamba-codestral
7058 raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
7059 elif (self.dir_model / "tokenizer.json").is_file():
7060 self._set_vocab_gpt2()
7061 else:
7062 # Use the GPT-NeoX tokenizer when no tokenizer files are present
7063 self._set_vocab_builtin("gpt-neox", vocab_size)
7064
7065 def set_gguf_parameters(self):
7066 d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
7067 d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
7068 head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
7069
7070 rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
7071
7072 # Fail early for models which don't have a block expansion factor of 2
7073 # TODO: does this really matter?
7074 # skip the assertion for FalconH1 Model
7075 if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
7076 assert self.d_inner == 2 * self.d_model
7077 assert self.d_inner % head_dim == 0
7078
7079 self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
7080 self.gguf_writer.add_embedding_length(self.d_model)
7081 self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
7082 self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
7083 self.gguf_writer.add_block_count(self.block_count)
7084 self.gguf_writer.add_ssm_conv_kernel(d_conv)
7085 self.gguf_writer.add_ssm_inner_size(self.d_inner)
7086 self.gguf_writer.add_ssm_state_size(d_state)
7087 self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
7088 self.gguf_writer.add_ssm_group_count(self.n_group)
7089 self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
7090 self.gguf_writer.add_file_type(self.ftype)
7091
7092 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
7093
7094 if name.startswith("model.backbone") or name.startswith("model.lm_head"):
7095 # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
7096 name = name.removeprefix("model.")
7097
7098 if name.endswith(".dt_bias"):
7099 name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
7100
7101 new_name = self.map_tensor_name(name)
7102
7103 if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
7104 data_torch = data_torch.squeeze()
7105 elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
7106 gguf.MODEL_TENSOR.SSM_A,
7107 gguf.MODEL_TENSOR.SSM_D,
7108 ]):
7109 # unsqueeze A to use similar shape semantics as Mamba-1
7110 # (D is also unsqueezed, but for more straightforward broadcast internally)
7111 data_torch = data_torch.reshape((*data_torch.shape, 1))
7112 elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
7113 data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
7114
7115 if name.endswith(".A_log"):
7116 logger.debug("A_log --> A ==> " + new_name)
7117 data_torch = -torch.exp(data_torch)
7118
7119 yield (new_name, data_torch)
7120
7121
7122@ModelBase.register("JambaForCausalLM")
7123class JambaModel(TextModel):
7124 model_arch = gguf.MODEL_ARCH.JAMBA
7125
7126 def set_vocab(self):
7127 if (self.dir_model / "tokenizer.model").is_file():
7128 self._set_vocab_sentencepiece()
7129 else:
7130 self._set_vocab_llama_hf()
7131 self.gguf_writer.add_add_space_prefix(False)
7132
7133 def set_gguf_parameters(self):
7134 d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
7135 d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
7136 d_inner = self.hparams["mamba_expand"] * d_model
7137 d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
7138 # ceiling division
7139 # ref: https://stackoverflow.com/a/17511341/22827863
7140 # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
7141 dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16)
7142 rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6
7143 n_kv_head = self.hparams["num_key_value_heads"]
7144 attn_offset = self.hparams["attn_layer_offset"]
7145 attn_period = self.hparams["attn_layer_period"]
7146 n_kv_vec = [0 for _ in range(attn_offset)] + [
7147 n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count)
7148 ]
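# For illustration with hypothetical values attn_layer_offset=4, attn_layer_period=8 and 32 blocks:
# only layers 4, 12, 20 and 28 get n_kv_head KV heads; every other layer gets 0 (no attention KV cache).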
7149
7150 self.gguf_writer.add_block_count(self.block_count)
7151 self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
7152 self.gguf_writer.add_embedding_length(d_model)
7153 self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
7154 self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
7155 self.gguf_writer.add_head_count_kv(n_kv_vec)
7156 self.gguf_writer.add_ssm_conv_kernel(d_conv)
7157 self.gguf_writer.add_ssm_inner_size(d_inner)
7158 self.gguf_writer.add_ssm_state_size(d_state)
7159 self.gguf_writer.add_ssm_time_step_rank(dt_rank)
7160 self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
7161 self.gguf_writer.add_expert_count(self.hparams["num_experts"])
7162 self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
7163 self.gguf_writer.add_file_type(self.ftype)
7164
7165 _experts: list[dict[str, Tensor]] | None = None
7166
7167 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
7168
7169 # Mini-Jamba
7170 name = name.replace(".moe.", ".feed_forward.")
7171 if bid is not None:
7172 moe_offset = self.hparams["expert_layer_offset"]
7173 moe_period = self.hparams["expert_layer_period"]
7174
7175 if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
7176 name = name.replace(".experts.0.", ".")
7177
7178 # process the experts separately
7179 if ".feed_forward.experts." in name:
7180 n_experts = self.hparams["num_experts"]
7181
7182 assert bid is not None
7183
7184 if self._experts is None:
7185 self._experts = [{} for _ in range(self.block_count)]
7186
7187 self._experts[bid][name] = data_torch
7188
7189 if len(self._experts[bid]) >= n_experts * 3:
7190
7191 # merge the experts into a single 3d tensor
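                # each stacked tensor below has shape (n_experts, *expert_weight_shape)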
7192 for wid in ["down_proj", "gate_proj", "up_proj"]:
7193 datas: list[Tensor] = []
7194
7195 for xid in range(n_experts):
7196 ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
7197 datas.append(self._experts[bid][ename])
7198 del self._experts[bid][ename]
7199
7200 data_torch = torch.stack(datas, dim=0)
7201
7202 # using the same merged name as qwen2moe
7203 merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
7204
7205 new_name = self.map_tensor_name(merged_name)
7206
7207 yield new_name, data_torch
7208 return
7209
7210 new_name = self.map_tensor_name(name)
7211
7212 if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
7213 data_torch = data_torch.squeeze()
7214
7215 if name.endswith(".A_log"):
7216 logger.debug("A_log --> A ==> " + new_name)
7217 data_torch = -torch.exp(data_torch)
7218
7219 yield (new_name, data_torch)
7220
7221 def prepare_tensors(self):
7222 super().prepare_tensors()
7223
7224 if self._experts is not None:
7225 # flatten `list[dict[str, Tensor]]` into `list[str]`
7226 experts = [k for d in self._experts for k in d.keys()]
7227 if len(experts) > 0:
7228 raise ValueError(f"Unprocessed experts: {experts}")
7229
7230
7231@ModelBase.register("CohereForCausalLM")
7232class CommandR2Model(TextModel):
7233 model_arch = gguf.MODEL_ARCH.COMMAND_R
7234
7235 def __init__(self, *args, **kwargs):
7236 super().__init__(*args, **kwargs)
7237
7238        # max_position_embeddings = 8192 in config.json, but the model was actually
7239        # trained on a 128k context length
7240        # aya-23 models don't have model_max_length specified
7241 self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
7242
7243 def set_gguf_parameters(self):
7244 super().set_gguf_parameters()
7245 self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
7246 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
7247
7248
7249@ModelBase.register("Cohere2ForCausalLM")
7250class Cohere2Model(TextModel):
7251 model_arch = gguf.MODEL_ARCH.COHERE2
7252
7253 def set_gguf_parameters(self):
7254 super().set_gguf_parameters()
7255
7256 self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
7257 self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
7258 self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
7259
7260 rotary_pct = self.hparams["rotary_pct"]
7261 hidden_size = self.hparams["hidden_size"]
7262 num_attention_heads = self.hparams["num_attention_heads"]
7263 self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
7264 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
7265
7266
7267@ModelBase.register("OlmoForCausalLM")
7268@ModelBase.register("OLMoForCausalLM")
7269class OlmoModel(TextModel):
7270 model_arch = gguf.MODEL_ARCH.OLMO
7271
7272 def set_gguf_parameters(self):
7273 super().set_gguf_parameters()
7274 self.gguf_writer.add_layer_norm_eps(1e-5)
7275 clip_qkv = self.hparams.get("clip_qkv")
7276 if clip_qkv is not None:
7277 self.gguf_writer.add_clamp_kqv(clip_qkv)
7278
7279 # Same as super class, but permuting q_proj, k_proj
7280 # Copied from: LlamaModel
7281 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
7282 n_head = self.hparams["num_attention_heads"]
7283 n_kv_head = self.hparams.get("num_key_value_heads")
7284
7285 if name.endswith("q_proj.weight"):
7286 data_torch = LlamaModel.permute(data_torch, n_head, n_head)
7287 if name.endswith("k_proj.weight"):
7288 data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
7289
7290 yield from super().modify_tensors(data_torch, name, bid)
7291
7292
7293@ModelBase.register("SeedOssForCausalLM")
7294class SeedOssModel(TextModel):
7295 model_arch = gguf.MODEL_ARCH.SEED_OSS
7296
7297
7298@ModelBase.register("Olmo2ForCausalLM")
7299@ModelBase.register("Olmo3ForCausalLM")
7300class Olmo2Model(TextModel):
7301 model_arch = gguf.MODEL_ARCH.OLMO2
7302
7303 def set_gguf_parameters(self):
7304 super().set_gguf_parameters()
7305
7306 if "sliding_window" in self.hparams:
7307 self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
7308
7309 sliding_window_pattern = []
7310 if "layer_types" in self.hparams:
7311 sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
7312 else:
7313 # Olmo2 does not use sliding window attention.
7314 # Olmo3 defaults to using sliding window for all layers except every 4th.
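            # i.e. layers 3, 7, 11, ... (0-indexed) use full attention; all other layers use sliding window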
7315 for i in range(self.hparams["num_hidden_layers"]):
7316 sliding_window_pattern.append((i + 1) % 4 != 0)
7317
7318 self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
7319
7320
7321@ModelBase.register("OlmoeForCausalLM")
7322class OlmoeModel(TextModel):
7323 model_arch = gguf.MODEL_ARCH.OLMOE
7324
7325 def set_gguf_parameters(self):
7326 super().set_gguf_parameters()
7327 self.gguf_writer.add_layer_norm_rms_eps(1e-5)
7328 if (n_experts := self.hparams.get("num_experts")) is not None:
7329 self.gguf_writer.add_expert_count(n_experts)
7330
7331 _experts: list[dict[str, Tensor]] | None = None
7332
7333 # Copied from: Qwen2MoeModel
7334 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
7335 # process the experts separately
7336 if name.find("experts") != -1:
7337 n_experts = self.hparams["num_experts"]
7338 assert bid is not None
7339
7340 if self._experts is None:
7341 self._experts = [{} for _ in range(self.block_count)]
7342
7343 self._experts[bid][name] = data_torch
7344
7345 if len(self._experts[bid]) >= n_experts * 3:
7346 # merge the experts into a single 3d tensor
7347 for w_name in ["down_proj", "gate_proj", "up_proj"]:
7348 datas: list[Tensor] = []
7349
7350 for xid in range(n_experts):
7351 ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
7352 datas.append(self._experts[bid][ename])
7353 del self._experts[bid][ename]
7354
7355 data_torch = torch.stack(datas, dim=0)
7356
7357 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
7358
7359 yield from super().modify_tensors(data_torch, merged_name, bid)
7360 return
7361 else:
7362 return
7363
7364 yield from super().modify_tensors(data_torch, name, bid)
7365
7366 # Copied from: Qwen2MoeModel
7367 def prepare_tensors(self):
7368 super().prepare_tensors()
7369
7370 if self._experts is not None:
7371 # flatten `list[dict[str, Tensor]]` into `list[str]`
7372 experts = [k for d in self._experts for k in d.keys()]
7373 if len(experts) > 0:
7374 raise ValueError(f"Unprocessed experts: {experts}")
7375
7376
7377@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM")
7378class JinaBertV2Model(BertModel):
7379 model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
7380
7381 def set_vocab(self):
7382 tokenizer_class = 'BertTokenizer'
7383 with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
7384 tokenizer_class = json.load(f)['tokenizer_class']
7385
7386 if tokenizer_class == 'BertTokenizer':
7387 super().set_vocab()
7388 elif tokenizer_class == 'RobertaTokenizer':
7389 self._set_vocab_gpt2()
7390 self.gguf_writer.add_token_type_count(2)
7391 else:
7392 raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
7393
7394
7395@ModelBase.register("OpenELMForCausalLM")
7396class OpenELMModel(TextModel):
7397 model_arch = gguf.MODEL_ARCH.OPENELM
7398
7399 @staticmethod
7400 def _make_divisible(v: float | int, divisor: int) -> int:
7401 # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
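        # illustrative values: _make_divisible(640.0, 256) -> 768 (rounds to the nearest multiple of 256);
        # _make_divisible(300.0, 256) -> 512 (the 10% guard below bumps 256 back up)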
7402 new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
7403        # Make sure that rounding down does not reduce the value by more than 10%.
7404 if new_v < 0.9 * v:
7405 new_v += divisor
7406 return new_v
7407
7408 def __init__(self, *args, **kwargs):
7409 super().__init__(*args, **kwargs)
7410
7411 ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
7412 ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
7413 self._n_embd: int = self.hparams["model_dim"]
7414 self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
7415 self._num_query_heads: list[int] = self.hparams["num_query_heads"]
7416 self._ffn_dims: list[int] = [
7417 OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
7418 for multiplier in ffn_multipliers
7419 ]
7420 assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
7421 assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
7422
7423 # Uses the tokenizer from meta-llama/Llama-2-7b-hf
7424 def set_vocab(self):
7425 try:
7426 self._set_vocab_sentencepiece()
7427 except FileNotFoundError:
7428 self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
7429
7430 def set_gguf_parameters(self):
7431 n_embd = self._n_embd
7432 head_dim = self.hparams["head_dim"]
7433 rot_pct = 1.0
7434 assert self.block_count == len(self._num_kv_heads)
7435 assert self.block_count == len(self._num_query_heads)
7436 assert self.block_count == len(self._ffn_dims)
7437
7438 self.gguf_writer.add_block_count(self.block_count)
7439 self.gguf_writer.add_context_length(self.hparams["max_context_length"])
7440 self.gguf_writer.add_embedding_length(n_embd)
7441 self.gguf_writer.add_feed_forward_length(self._ffn_dims)
7442 self.gguf_writer.add_head_count(self._num_query_heads)
7443 self.gguf_writer.add_head_count_kv(self._num_kv_heads)
7444 self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
7445 # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
7446 self.gguf_writer.add_layer_norm_rms_eps(1e-6)
7447 self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
7448 self.gguf_writer.add_key_length(head_dim)
7449 self.gguf_writer.add_value_length(head_dim)
7450 self.gguf_writer.add_file_type(self.ftype)
7451
7452 def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
7453 if "n_layers" in keys:
7454 return self.hparams["num_transformer_layers"]
7455
7456 return super().find_hparam(keys, optional)
7457
7458 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
7459
7460        # split the fused ffn proj_1 into gate and up projections
7461 if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
7462 ff_dim = self._ffn_dims[bid]
7463 yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
7464 yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
7465 return
7466
7467 yield (self.map_tensor_name(name), data_torch)
7468
7469
7470@ModelBase.register("ArcticForCausalLM")
7471class ArcticModel(TextModel):
7472 model_arch = gguf.MODEL_ARCH.ARCTIC
7473
7474 def set_vocab(self):
7475 # The reason for using a custom implementation here is that the
7476 # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
7477 # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
7478 from sentencepiece import SentencePieceProcessor
7479
7480 tokenizer_path = self.dir_model / 'tokenizer.model'
7481
7482 if not tokenizer_path.is_file():
7483 logger.error(f'Error: Missing {tokenizer_path}')
7484 sys.exit(1)
7485
7486 # Read the whole vocabulary from the tokenizer.model file
7487 tokenizer = SentencePieceProcessor()
7488 tokenizer.LoadFromFile(str(tokenizer_path))
7489
7490 vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
7491
7492 tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
7493 scores: list[float] = [-10000.0] * vocab_size
7494 toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
7495
7496 for token_id in range(tokenizer.vocab_size()):
7497
7498 piece = tokenizer.IdToPiece(token_id)
7499 text = piece.encode("utf-8")
7500 score = tokenizer.GetScore(token_id)
7501
7502 toktype = SentencePieceTokenTypes.NORMAL
7503 if tokenizer.IsUnknown(token_id):
7504 toktype = SentencePieceTokenTypes.UNKNOWN
7505 elif tokenizer.IsControl(token_id):
7506 toktype = SentencePieceTokenTypes.CONTROL
7507 elif tokenizer.IsUnused(token_id):
7508 toktype = SentencePieceTokenTypes.UNUSED
7509 elif tokenizer.IsByte(token_id):
7510 toktype = SentencePieceTokenTypes.BYTE
7511
7512 tokens[token_id] = text
7513 scores[token_id] = score
7514 toktypes[token_id] = toktype
7515
7516        # Use the added_tokens_decoder field from tokenizer_config.json as the source
7517 # of information about added/redefined tokens and modify them accordingly.
7518 tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
7519 if tokenizer_config_file.is_file():
7520 with open(tokenizer_config_file, "r", encoding="utf-8") as f:
7521 tokenizer_config_json = json.load(f)
7522
7523 if "added_tokens_decoder" in tokenizer_config_json:
7524 added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
7525 for token_id, token_json in added_tokens_decoder.items():
7526 token_id = int(token_id)
7527 if token_id >= vocab_size:
7528 logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
7529 continue
7530
7531 token_content = token_json["content"]
7532 token_type = SentencePieceTokenTypes.USER_DEFINED
7533 token_score = -10000.0
7534
7535 # Map unk_token to UNKNOWN, other special tokens to CONTROL
7536 # Set the score to 0.0 as in the original tokenizer.model
7537 if ("special" in token_json) and token_json["special"]:
7538 if token_content == tokenizer_config_json["unk_token"]:
7539 token_type = SentencePieceTokenTypes.UNKNOWN
7540 else:
7541 token_type = SentencePieceTokenTypes.CONTROL
7542 token_score = 0.0
7543
7544 logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
7545 tokens[token_id] = token_content.encode("utf-8")
7546 toktypes[token_id] = token_type
7547 scores[token_id] = token_score
7548
7549 self.gguf_writer.add_tokenizer_model("llama")
7550 self.gguf_writer.add_tokenizer_pre("default")
7551 self.gguf_writer.add_token_list(tokens)
7552 self.gguf_writer.add_token_scores(scores)
7553 self.gguf_writer.add_token_types(toktypes)
7554
7555 special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
7556 special_vocab.add_to_gguf(self.gguf_writer)
7557
7558 def set_gguf_parameters(self):
7559 super().set_gguf_parameters()
7560 hparams = self.hparams
7561 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
7562 self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
7563
7564 _experts: list[dict[str, Tensor]] | None = None
7565
7566 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
7567 n_head = self.hparams["num_attention_heads"]
7568 n_kv_head = self.hparams.get("num_key_value_heads")
7569
7570 if name.endswith("q_proj.weight"):
7571 data_torch = LlamaModel.permute(data_torch, n_head, n_head)
7572 if name.endswith("k_proj.weight"):
7573 data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
7574
7575 # process the experts separately
7576 if name.find("block_sparse_moe.experts") != -1:
7577 n_experts = self.hparams["num_local_experts"]
7578
7579 assert bid is not None
7580
7581 if self._experts is None:
7582 self._experts = [{} for _ in range(self.block_count)]
7583
7584 self._experts[bid][name] = data_torch
7585
7586 if len(self._experts[bid]) >= n_experts * 3:
7587 # merge the experts into a single 3d tensor
7588 for wid in ["w1", "w2", "w3"]:
7589 datas: list[Tensor] = []
7590
7591 for xid in range(n_experts):
7592 ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
7593 datas.append(self._experts[bid][ename])
7594 del self._experts[bid][ename]
7595
7596 data_torch = torch.stack(datas, dim=0)
7597
7598 merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
7599
7600 yield from super().modify_tensors(data_torch, merged_name, bid)
7601 return
7602 else:
7603 return
7604
7605 yield from super().modify_tensors(data_torch, name, bid)
7606
7607 def prepare_tensors(self):
7608 super().prepare_tensors()
7609
7610 if self._experts is not None:
7611 # flatten `list[dict[str, Tensor]]` into `list[str]`
7612 experts = [k for d in self._experts for k in d.keys()]
7613 if len(experts) > 0:
7614 raise ValueError(f"Unprocessed experts: {experts}")
7615
7616
7617@ModelBase.register("DeepseekForCausalLM")
7618class DeepseekModel(TextModel):
7619 model_arch = gguf.MODEL_ARCH.DEEPSEEK
7620
7621 def set_vocab(self):
7622 try:
7623 self._set_vocab_sentencepiece()
7624 except FileNotFoundError:
7625 self._set_vocab_gpt2()
7626
7627 def set_gguf_parameters(self):
7628 super().set_gguf_parameters()
7629 hparams = self.hparams
7630 if (rope_dim := hparams.get("head_dim")) is None:
7631 rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
7632
7633 self.gguf_writer.add_rope_dimension_count(rope_dim)
7634 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
7635 self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
7636 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
7637 self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
7638 self.gguf_writer.add_expert_weights_scale(1.0)
7639 self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
7640 self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
7641
7642 _experts: list[dict[str, Tensor]] | None = None
7643
7644 @staticmethod
7645 def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
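        # splits each head's rows into two halves and interleaves them pairwise,
        # undoing the HF half-split RoPE layout (same permutation as LlamaModel.permute)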
7646 if n_head_kv is not None and n_head != n_head_kv:
7647 n_head = n_head_kv
7648 return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
7649 .swapaxes(1, 2)
7650 .reshape(weights.shape))
7651
7652 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
7653 n_head = self.hparams["num_attention_heads"]
7654 n_kv_head = self.hparams.get("num_key_value_heads")
7655
7656 if name.endswith(("q_proj.weight", "q_proj.bias")):
7657 data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
7658 if name.endswith(("k_proj.weight", "k_proj.bias")):
7659 data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
7660
7661 # process the experts separately
7662 if name.find("mlp.experts") != -1:
7663 n_experts = self.hparams["n_routed_experts"]
7664 assert bid is not None
7665
7666 if self._experts is None:
7667 self._experts = [{} for _ in range(self.block_count)]
7668
7669 self._experts[bid][name] = data_torch
7670
7671 if len(self._experts[bid]) >= n_experts * 3:
7672 # merge the experts into a single 3d tensor
7673 for w_name in ["down_proj", "gate_proj", "up_proj"]:
7674 datas: list[Tensor] = []
7675
7676 for xid in range(n_experts):
7677 ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
7678 datas.append(self._experts[bid][ename])
7679 del self._experts[bid][ename]
7680
7681 data_torch = torch.stack(datas, dim=0)
7682
7683 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
7684
7685 yield from super().modify_tensors(data_torch, merged_name, bid)
7686 return
7687 else:
7688 return
7689
7690 yield from super().modify_tensors(data_torch, name, bid)
7691
7692 def prepare_tensors(self):
7693 super().prepare_tensors()
7694
7695 if self._experts is not None:
7696 # flatten `list[dict[str, Tensor]]` into `list[str]`
7697 experts = [k for d in self._experts for k in d.keys()]
7698 if len(experts) > 0:
7699 raise ValueError(f"Unprocessed experts: {experts}")
7700
7701
7702@ModelBase.register(
7703 "DeepseekV2ForCausalLM",
7704 "DeepseekV3ForCausalLM",
7705 "KimiVLForConditionalGeneration",
7706 "KimiK25ForConditionalGeneration",
7707 "YoutuForCausalLM",
7708 "YoutuVLForConditionalGeneration",
7709)
7710class DeepseekV2Model(TextModel):
7711 model_arch = gguf.MODEL_ARCH.DEEPSEEK2
7712
7713 def set_vocab(self):
7714 try:
7715 self._set_vocab_gpt2()
7716 return
7717 except Exception:
7718 pass
7719
7720 from transformers import AutoTokenizer
7721 tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
7722 tokpre = self.get_vocab_base_pre(tokenizer)
7723
7724 if tokpre == "kimi-k2":
7725            # Build the merges list using an approach similar to HunYuanMoE
7726 merges = []
7727 vocab = {}
7728 mergeable_ranks = tokenizer.model._mergeable_ranks
7729 for token, rank in mergeable_ranks.items():
7730 vocab[QwenModel.token_bytes_to_string(token)] = rank
7731 if len(token) == 1:
7732 continue
7733 merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
7734 if len(merged) == 2:
7735 merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
7736
7737 # Build token list
7738 vocab_size = self.hparams["vocab_size"]
7739 special_tokens = tokenizer.special_tokens
7740 reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
7741 tokens: list[str] = []
7742 toktypes: list[int] = []
7743
7744 for i in range(vocab_size):
7745 if i not in reverse_vocab:
7746 tokens.append(f"[PAD{i}]")
7747 toktypes.append(gguf.TokenType.UNUSED)
7748 else:
7749 token = reverse_vocab[i]
7750 tokens.append(token)
7751 if i in special_tokens.values():
7752 toktypes.append(gguf.TokenType.CONTROL)
7753 else:
7754 toktypes.append(gguf.TokenType.NORMAL)
7755
7756 self.gguf_writer.add_tokenizer_model("gpt2")
7757 self.gguf_writer.add_tokenizer_pre(tokpre)
7758 self.gguf_writer.add_token_list(tokens)
7759 self.gguf_writer.add_token_types(toktypes)
7760 self.gguf_writer.add_token_merges(merges)
7761
7762 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
7763 special_vocab.add_to_gguf(self.gguf_writer)
7764 else:
7765 raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
7766
7767 def set_gguf_parameters(self):
7768
7769        # note: deepseek2 with MLA converts into MQA (i.e. GQA with 1 group)
7770 self.hparams["num_key_value_heads"] = 1
7771
7772 super().set_gguf_parameters()
7773 hparams = self.hparams
7774
7775 # first_k_dense_replace: number of leading layers using dense FFN instead of MoE
7776 # For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers
7777 # For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers
7778 has_moe = hparams.get("n_routed_experts") is not None
7779 first_k_dense_replace = hparams.get("first_k_dense_replace")
7780 if first_k_dense_replace is None:
7781 # Default: if no MoE, all layers are dense; if MoE, none are dense
7782 first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
7783 self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
7784 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
7785 if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
7786 self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
7787 self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
7788
7789        # note: deepseek2 with MLA converts into MQA with larger heads, then decompresses back to MHA
7790 self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
7791 self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
7792 self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
7793 self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
7794
7795 # MoE parameters (required by C++ code for DEEPSEEK2 arch)
7796 # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
7797 moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False)
7798 self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
7799
7800 if (n_routed_experts := hparams.get("n_routed_experts")) is not None:
7801 self.gguf_writer.add_expert_count(n_routed_experts)
7802
7803 # expert_shared_count is required by C++ code, default to 0 for non-MoE models
7804 n_shared_experts = hparams.get("n_shared_experts", 0)
7805 self.gguf_writer.add_expert_shared_count(n_shared_experts)
7806
7807 # When not set, C++ code will use scale_w = false to skip the no-op scaling
7808 if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None:
7809 self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
7810
7811 if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
7812 self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
7813
7814 self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
7815
7816 if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
7817 # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
7818 # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
7819 # ref https://github.com/ggml-org/llama.cpp/pull/17945
7820 self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all)
7821
7822 _experts: list[dict[str, Tensor]] | None = None
7823
7824 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
7825 # skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5
7826 if "vision_tower" in name or "multi_modal_projector" in name or "mm_projector" in name:
7827 return
7828 if name.startswith("siglip2.") or name.startswith("merger."):
7829 return
7830 if name.startswith("language_model."):
7831 name = name.replace("language_model.", "")
7832
7833 # skip lm_head.weight if tie_word_embeddings is True
7834 if self.hparams.get("tie_word_embeddings", False):
7835 if name == "lm_head.weight" or name == "model.lm_head.weight":
7836 logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)")
7837 return
7838
7839 # rename e_score_correction_bias tensors
7840 if name.endswith("e_score_correction_bias"):
7841 name = name.replace("e_score_correction_bias", "e_score_correction.bias")
7842
7843 # skip Multi-Token Prediction (MTP) layers
7844 block_count = self.hparams["num_hidden_layers"]
7845 match = re.match(r"model.layers.(\d+)", name)
7846 if match and int(match.group(1)) >= block_count:
7847 return
7848
7849 # process the experts separately
7850 if name.find("mlp.experts") != -1:
7851 n_experts = self.hparams["n_routed_experts"]
7852 assert bid is not None
7853
7854 if self._experts is None:
7855 self._experts = [{} for _ in range(self.block_count)]
7856
7857 self._experts[bid][name] = data_torch
7858
7859 if len(self._experts[bid]) >= n_experts * 3:
7860 # merge the experts into a single 3d tensor
7861 for w_name in ["down_proj", "gate_proj", "up_proj"]:
7862 datas: list[Tensor] = []
7863
7864 for xid in range(n_experts):
7865 ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
7866 datas.append(self._experts[bid][ename])
7867 del self._experts[bid][ename]
7868
7869 data_torch = torch.stack(datas, dim=0)
7870
7871 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
7872
7873 yield from super().modify_tensors(data_torch, merged_name, bid)
7874 return
7875 else:
7876 return
7877
7878        # note: MLA with the absorption optimization needs kv_b_proj split into k_b_proj and v_b_proj, with k_b_proj transposed
7879 if name.endswith("kv_b_proj.weight"):
7880 name_kb = name.replace("kv_b_proj", "k_b_proj")
7881 name_vb = name.replace("kv_b_proj", "v_b_proj")
7882
7883 n_head_kv = self.hparams["num_key_value_heads"]
7884 v_head_dim = self.hparams["v_head_dim"]
7885 qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
7886
7887 assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
7888
7889 kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
7890 k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
7891 k_b = k_b.transpose(1, 2)
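            # resulting shapes: k_b is (n_head_kv, kv_lora_rank, qk_nope_head_dim) after the transpose,
            # v_b is (n_head_kv, v_head_dim, kv_lora_rank)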
7892
7893 yield from super().modify_tensors(k_b, name_kb, bid)
7894 yield from super().modify_tensors(v_b, name_vb, bid)
7895 return
7896
7897 yield from super().modify_tensors(data_torch, name, bid)
7898
7899 def prepare_tensors(self):
7900 super().prepare_tensors()
7901
7902 if self._experts is not None:
7903 # flatten `list[dict[str, Tensor]]` into `list[str]`
7904 experts = [k for d in self._experts for k in d.keys()]
7905 if len(experts) > 0:
7906 raise ValueError(f"Unprocessed experts: {experts}")
7907
7908
7909@ModelBase.register("MiniMaxM2ForCausalLM")
7910class MiniMaxM2Model(TextModel):
7911 model_arch = gguf.MODEL_ARCH.MINIMAXM2
7912 _experts_cache: dict[int, dict[str, Tensor]] = {}
7913
7914 def __init__(self, *args, **kwargs):
7915 super().__init__(*args, **kwargs)
7916 self.hparams["num_experts"] = self.hparams["num_local_experts"]
7917
7918 def set_gguf_parameters(self):
7919 super().set_gguf_parameters()
7920
7921 self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"]))
7922 self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"]))
7923
7924 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
7925 if name.endswith("e_score_correction_bias"):
7926 name = name.replace("e_score_correction_bias", "e_score_correction.bias")
7927
7928 # merge expert weights
7929 if 'experts' in name:
7930 n_experts = self.hparams["num_experts"]
7931 assert bid is not None
7932
7933 expert_cache = self._experts_cache.setdefault(bid, {})
7934 expert_cache[name] = data_torch
7935 expert_weights = ["w1", "w2", "w3"]
7936
7937 # not enough expert weights to merge
7938 if len(expert_cache) < n_experts * len(expert_weights):
7939 return
7940
7941 for w_name in expert_weights:
7942 datas: list[Tensor] = []
7943
7944 for xid in range(n_experts):
7945 ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
7946 datas.append(expert_cache[ename])
7947 del expert_cache[ename]
7948
7949 data_torch = torch.stack(datas, dim=0)
7950 merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
7951 new_name = self.map_tensor_name(merged_name)
7952 yield from super().modify_tensors(data_torch, new_name, bid)
7953
7954 del self._experts_cache[bid]
7955 return
7956
7957 yield from super().modify_tensors(data_torch, name, bid)
7958
7959
7960@ModelBase.register("MiMoV2FlashForCausalLM")
7961class MimoV2Model(TextModel):
7962 model_arch = gguf.MODEL_ARCH.MIMO2
7963
7964 def set_gguf_parameters(self):
7965 super().set_gguf_parameters()
7966
7967 assert self.hparams["swa_head_dim"] == self.hparams["head_dim"]
7968 assert self.hparams["swa_num_attention_heads"] == self.hparams["num_attention_heads"]
7969 assert self.hparams["swa_v_head_dim"] == self.hparams["v_head_dim"]
7970 assert self.hparams["topk_method"] == "noaux_tc"
7971
7972 n_head_kv = self.hparams["num_key_value_heads"]
7973 n_head_kv_swa = self.hparams["swa_num_key_value_heads"]
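        # hybrid_layer_pattern marks sliding-window layers with 1: those layers get the SWA KV head count,
        # full-attention layers get the regular one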
7974 n_head_kv_arr = [n_head_kv_swa if use_swa == 1 else n_head_kv for use_swa in self.hparams["hybrid_layer_pattern"]]
7975 self.gguf_writer.add_head_count_kv(n_head_kv_arr)
7976
7977 self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
7978 self.gguf_writer.add_sliding_window_pattern(self.hparams["hybrid_layer_pattern"])
7979 self.gguf_writer.add_value_length(self.hparams["v_head_dim"])
7980 self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
7981 self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
7982
7983 rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
7984 self.gguf_writer.add_rope_dimension_count(rope_dim)
7985
7986 self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
7987
7988 _experts: list[dict[str, Tensor]] | None = None
7989
7990 def modify_tensors(self, data_torch, name, bid):
7991 if name.endswith("e_score_correction_bias"):
7992 name = name.replace("e_score_correction_bias", "e_score_correction.bias")
7993
7994 if "attention_sink" in name and not name.endswith(".weight"):
7995 name += ".weight"
7996
7997        # TODO: MiMo V2 does not indicate the number of next-token-prediction layers, so we cannot handle them the same way as GLM4_MOE
7998 if "model.mtp." in name:
7999 return
8000
8001 # process the experts separately
8002 if name.find("mlp.experts") != -1:
8003 n_experts = self.hparams["n_routed_experts"]
8004 assert bid is not None
8005
8006 if self._experts is None:
8007 self._experts = [{} for _ in range(self.block_count)]
8008
8009 self._experts[bid][name] = data_torch
8010
8011 if len(self._experts[bid]) >= n_experts * 3:
8012 # merge the experts into a single 3d tensor
8013 for w_name in ["gate_proj", "up_proj", "down_proj"]:
8014 datas: list[Tensor] = []
8015
8016 for xid in range(n_experts):
8017 ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
8018 datas.append(self._experts[bid][ename_to_retrieve])
8019 del self._experts[bid][ename_to_retrieve]
8020
8021 data_torch = torch.stack(datas, dim=0)
8022 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
8023
8024 yield from super().modify_tensors(data_torch, merged_name, bid)
8025 return
8026 else:
8027 return
8028 yield from super().modify_tensors(data_torch, name, bid)
8029
8030 def prepare_tensors(self):
8031 super().prepare_tensors()
8032
8033 if self._experts is not None:
8034 # flatten `list[dict[str, Tensor]]` into `list[str]`
8035 experts = [k for d in self._experts for k in d.keys()]
8036 if len(experts) > 0:
8037 raise ValueError(f"Unprocessed experts: {experts}")
8038
8039
8040@ModelBase.register("Step3p5ForCausalLM")
8041class Step35Model(TextModel):
8042 model_arch = gguf.MODEL_ARCH.STEP35
8043
8044 def set_gguf_parameters(self):
8045 rope_theta = self.hparams.get("rope_theta")
8046 if isinstance(rope_theta, list):
8047 self.hparams["rope_theta"] = float(rope_theta[0])
8048 self.hparams["local_rope_theta"] = float(rope_theta[1])
8049 self.rope_parameters["rope_theta"] = self.hparams["rope_theta"]
8050 self.rope_parameters["sliding_attention"] = {"rope_theta": self.hparams["local_rope_theta"]}
8051
8052 super().set_gguf_parameters()
8053
8054 layer_types = self.hparams.get("layer_types") or []
8055 partial_rotary_factors = self.hparams.get("partial_rotary_factors") or []
8056 attn_other = self.hparams.get("attention_other_setting") or {}
8057
8058 n_head_base = self.hparams["num_attention_heads"]
8059 n_kv_base = self.hparams["num_attention_groups"]
8060
8061 n_head_swa = attn_other.get("num_attention_heads", n_head_base)
8062 n_kv_swa = attn_other.get("num_attention_groups", n_kv_base)
8063
8064 layer_types = layer_types[: self.block_count]
8065 partial_rotary_factors = partial_rotary_factors[: self.block_count]
8066 assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors
8067 head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types]
8068 kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types]
8069 swa_pat = [lt == "sliding_attention" for lt in layer_types]
8070
8071 self.gguf_writer.add_head_count(head_arr)
8072 self.gguf_writer.add_head_count_kv(kv_arr)
8073
8074 self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
8075 self.gguf_writer.add_sliding_window_pattern(swa_pat)
8076
8077 self.gguf_writer.add_value_length(self.hparams["head_dim"])
8078
8079 # MoE params
8080 self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
8081 self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
8082 self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
8083 self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["share_expert_dim"])
8084
8085 if (moe_router_scaling_factor := self.hparams.get("moe_router_scaling_factor")) is not None:
8086 self.gguf_writer.add_expert_weights_scale(moe_router_scaling_factor)
8087 if (norm_expert_weight := self.hparams.get("norm_expert_weight")) is not None:
8088 self.gguf_writer.add_expert_weights_norm(norm_expert_weight)
8089
8090 # leading dense blocks
8091 leading_dense = 0
8092 moe_layers_enum = self.hparams.get("moe_layers_enum")
8093 if isinstance(moe_layers_enum, str) and moe_layers_enum.strip():
8094 moe_layers = sorted(int(i) for i in moe_layers_enum.strip().split(","))
8095 if moe_layers:
8096 leading_dense = max(0, moe_layers[0])
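                # e.g. moe_layers_enum = "2,3,4,..." -> moe_layers[0] == 2, so the first two blocks remain dense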
8097 self.gguf_writer.add_leading_dense_block_count(leading_dense)
8098 self.gguf_writer.add_moe_every_n_layers(int(self.hparams.get("moe_every_n_layer", 1)))
8099
8100 self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5))
8101
8102 # Optional per-layer SwiGLU clamps.
8103 if (limits := self.hparams.get("swiglu_limits")) is not None:
8104 limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]]
8105 self.gguf_writer.add_swiglu_clamp_exp(limits_f)
8106 if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None:
8107 limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]]
8108 self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f)
8109
8110 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
8111 # remove mtp layers
8112 if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None:
8113 il = int(m.group(1))
8114 n_main = int(self.hparams.get("num_hidden_layers", self.block_count))
8115 if il >= n_main:
8116 return
8117 if name.endswith("norm.weight"):
8118 data_torch += 1.0
8119 # Map router bias (expert selection bias) to a GGUF bias tensor
8120 if name.endswith(".moe.router_bias"):
8121 name += ".bias"
8122
8123 if name.endswith((".self_attn.g_proj.weight", ".moe.gate.weight", ".moe.up_proj.weight", ".moe.gate_proj.weight", ".moe.down_proj.weight")):
8124 data_torch = data_torch.squeeze().contiguous()
8125
8126 yield from super().modify_tensors(data_torch, name, bid)
8127
8128 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
8129 # Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3").
8130 # llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS).
8131 rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
8132 rope_type = rope_params.get("rope_type") or ""
8133 if rope_type.lower() != "llama3":
8134 return
8135
8136 # Step35 configs can carry per-layer rope_theta as a list; for llama3 rope factors we use the base value.
8137 rope_theta = self.hparams.get("rope_theta", 10000.0)
8138 if isinstance(rope_theta, list):
8139 rope_theta = rope_theta[0]
8140 base = float(rope_theta)
8141 if (dim := self.hparams.get("head_dim")) is None:
8142 dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
8143 dim = int(dim)
8144
8145 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
8146
8147 factor = float(rope_params.get("factor", 8.0))
8148 low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
8149 high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
8150 old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
8151
8152 low_freq_wavelen = old_context_len / low_freq_factor
8153 high_freq_wavelen = old_context_len / high_freq_factor
8154
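        # per-frequency factors: 1.0 for wavelengths below high_freq_wavelen, `factor` for wavelengths
        # above low_freq_wavelen, and a smooth interpolation in between (llama3-style scaling)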
8155 rope_factors: list[float] = []
8156 for freq in freqs:
8157 wavelen = 2 * math.pi / float(freq)
8158 if wavelen < high_freq_wavelen:
8159 rope_factors.append(1.0)
8160 elif wavelen > low_freq_wavelen:
8161 rope_factors.append(factor)
8162 else:
8163 smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
8164 rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth))
8165
8166 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
8167
8168
8169@ModelBase.register("PanguEmbeddedForCausalLM")
8170class PanguEmbeddedModel(TextModel):
8171 model_arch = gguf.MODEL_ARCH.PANGU_EMBED
8172
8173 def set_vocab(self):
8174 self._set_vocab_sentencepiece()
8175
8176 tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
8177 if tokenizer_config_file.is_file():
8178 with open(tokenizer_config_file, "r", encoding="utf-8") as f:
8179 tokenizer_config_json = json.load(f)
8180 if "add_prefix_space" in tokenizer_config_json:
8181 self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
8182
8183 def set_gguf_parameters(self):
8184 super().set_gguf_parameters()
8185 hparams = self.hparams
8186 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
8187
8188        # PanguEmbedded's hparams loaded from config.json may not include head_dim
8189 if (rope_dim := hparams.get("head_dim")) is None:
8190 rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
8191 self.gguf_writer.add_rope_dimension_count(rope_dim)
8192
8193 if hparams.get("head_dim") is None:
8194 self.gguf_writer.add_key_length(rope_dim)
8195 self.gguf_writer.add_value_length(rope_dim)
8196
8197 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
8198 if name == "lm_head.weight":
8199 if self.hparams.get("tie_word_embeddings", False):
8200 logger.info("Skipping tied output layer 'lm_head.weight'")
8201 return
8202 yield from super().modify_tensors(data_torch, name, bid)
8203
8204
8205@ModelBase.register("Dots1ForCausalLM")
8206class Dots1Model(Qwen2MoeModel):
8207 model_arch = gguf.MODEL_ARCH.DOTS1
8208
8209 def __init__(self, *args, **kwargs):
8210 super().__init__(*args, **kwargs)
8211 self.hparams["num_experts"] = self.hparams["n_routed_experts"]
8212
8213 def set_gguf_parameters(self):
8214 super().set_gguf_parameters()
8215 self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
8216 self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
8217 self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
8218 self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
8219
8220 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
8221 if name.endswith("e_score_correction_bias"):
8222 name = name.replace("e_score_correction_bias", "e_score_correction.bias")
8223 if "shared_experts" in name:
8224 yield from ModelBase.modify_tensors(self, data_torch, name, bid)
8225 else:
8226 yield from super().modify_tensors(data_torch, name, bid)
8227
8228
8229@ModelBase.register("PLMForCausalLM")
8230class PLMModel(TextModel):
8231 model_arch = gguf.MODEL_ARCH.PLM
8232
8233 def set_vocab(self):
8234 self._set_vocab_gpt2()
8235
8236 def set_gguf_parameters(self):
8237 super().set_gguf_parameters()
8238 hparams = self.hparams
8239 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
8240 self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
8241 self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
8242 self.gguf_writer.add_value_length(hparams["v_head_dim"])
8243 self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
8244
8245 def prepare_tensors(self):
8246 super().prepare_tensors()
8247
8248
8249@ModelBase.register("T5WithLMHeadModel")
8250@ModelBase.register("T5ForConditionalGeneration")
8251@ModelBase.register("MT5ForConditionalGeneration")
8252@ModelBase.register("UMT5ForConditionalGeneration")
8253@ModelBase.register("UMT5Model")
8254class T5Model(TextModel):
8255 model_arch = gguf.MODEL_ARCH.T5
8256
8257 def __init__(self, *args, **kwargs):
8258 super().__init__(*args, **kwargs)
8259 self.shared_token_embeddings_found = False
8260
8261 def set_vocab(self):
8262 # to avoid TypeError: Descriptors cannot be created directly
8263 # exception when importing sentencepiece_model_pb2
8264 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
8265 from sentencepiece import SentencePieceProcessor
8266 from sentencepiece import sentencepiece_model_pb2 as model
8267
8268 tokenizer_path = self.dir_model / 'tokenizer.model'
8269
8270        # many older models use the spiece.model filename for the tokenizer model
8271 if not tokenizer_path.is_file():
8272 tokenizer_path = self.dir_model / 'spiece.model'
8273
8274 if not tokenizer_path.is_file():
8275 raise FileNotFoundError(f"File not found: {tokenizer_path}")
8276
8277 sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
8278 sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
8279
8280 # some models like Pile-T5 family use BPE tokenizer instead of Unigram
8281 if sentencepiece_model.trainer_spec.model_type == 2: # BPE
8282            # ensure the tokenizer model file name is correct
8283 assert tokenizer_path.name == 'tokenizer.model'
8284 return self._set_vocab_sentencepiece()
8285 else:
8286 assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
8287
8288 add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
8289 remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
8290 precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
8291
8292 tokenizer = SentencePieceProcessor()
8293 tokenizer.LoadFromFile(str(tokenizer_path))
8294
8295 vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
8296
8297 tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
8298 scores: list[float] = [-10000.0] * vocab_size
8299 toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
8300
8301 for token_id in range(tokenizer.vocab_size()):
8302 piece = tokenizer.IdToPiece(token_id)
8303 text = piece.encode("utf-8")
8304 score = tokenizer.GetScore(token_id)
8305
8306 toktype = SentencePieceTokenTypes.NORMAL
8307 if tokenizer.IsUnknown(token_id):
8308 toktype = SentencePieceTokenTypes.UNKNOWN
8309 elif tokenizer.IsControl(token_id):
8310 toktype = SentencePieceTokenTypes.CONTROL
8311 elif tokenizer.IsUnused(token_id):
8312 toktype = SentencePieceTokenTypes.UNUSED
8313 elif tokenizer.IsByte(token_id):
8314 toktype = SentencePieceTokenTypes.BYTE
8315
8316 tokens[token_id] = text
8317 scores[token_id] = score
8318 toktypes[token_id] = toktype
8319
8320 added_tokens_file = self.dir_model / 'added_tokens.json'
8321 if added_tokens_file.is_file():
8322 with open(added_tokens_file, "r", encoding="utf-8") as f:
8323 added_tokens_json = json.load(f)
8324 for key in added_tokens_json:
8325 token_id = added_tokens_json[key]
8326 if token_id >= vocab_size:
8327 logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
8328 continue
8329
8330 tokens[token_id] = key.encode("utf-8")
8331 scores[token_id] = -1000.0
8332 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
8333
8334 if vocab_size > len(tokens):
8335 pad_count = vocab_size - len(tokens)
8336 logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
8337 for i in range(1, pad_count + 1):
8338 tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
8339 scores.append(-1000.0)
8340 toktypes.append(SentencePieceTokenTypes.UNUSED)
8341
8342 self.gguf_writer.add_tokenizer_model("t5")
8343 self.gguf_writer.add_tokenizer_pre("default")
8344 self.gguf_writer.add_token_list(tokens)
8345 self.gguf_writer.add_token_scores(scores)
8346 self.gguf_writer.add_token_types(toktypes)
8347 self.gguf_writer.add_add_space_prefix(add_prefix)
8348 self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
8349 if precompiled_charsmap:
8350 self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
8351
8352 special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
8353 special_vocab.add_to_gguf(self.gguf_writer)
8354
8355 def set_gguf_parameters(self):
8356 if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
8357 logger.warning("Couldn't find context length in config.json, assuming default value of 512")
8358 n_ctx = 512
8359 self.gguf_writer.add_context_length(n_ctx)
8360 self.gguf_writer.add_embedding_length(self.hparams["d_model"])
8361 self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
8362 self.gguf_writer.add_block_count(self.block_count)
8363 if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None:
8364 self.gguf_writer.add_decoder_block_count(dec_n_layer)
8365 self.gguf_writer.add_head_count(self.hparams["num_heads"])
8366 self.gguf_writer.add_key_length(self.hparams["d_kv"])
8367 self.gguf_writer.add_value_length(self.hparams["d_kv"])
8368 self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
8369 self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
8370 self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
8371 self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
8372 self.gguf_writer.add_file_type(self.ftype)
8373
8374 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
8375        # T5-based models store the shared token embeddings tensor under varying names: "encoder.embed_tokens.weight",
8376        # "decoder.embed_tokens.weight" or "shared.weight". Some models even store multiple copies of it in their
8377        # safetensors files. We use the first of these tensors as the token embeddings for both encoder and decoder
8378        # and ignore the remaining ones.
8379 if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
8380 if not self.shared_token_embeddings_found:
8381 name = "shared.weight"
8382 self.shared_token_embeddings_found = True
8383 else:
8384 logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
8385 return
8386
8387 yield from super().modify_tensors(data_torch, name, bid)
8388
8389
8390@ModelBase.register("T5EncoderModel")
8391class T5EncoderModel(TextModel):
8392 model_arch = gguf.MODEL_ARCH.T5ENCODER
8393
8394 def __init__(self, *args, **kwargs):
8395 super().__init__(*args, **kwargs)
8396 self.shared_token_embeddings_found = False
8397
8398 def set_vocab(self):
8399 # to avoid TypeError: Descriptors cannot be created directly
8400 # exception when importing sentencepiece_model_pb2
8401 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
8402 from sentencepiece import SentencePieceProcessor
8403 from sentencepiece import sentencepiece_model_pb2 as model
8404
8405 tokenizer_path = self.dir_model / 'tokenizer.model'
8406
8407        # many older models use the spiece.model filename for the tokenizer model
8408 if not tokenizer_path.is_file():
8409 tokenizer_path = self.dir_model / 'spiece.model'
8410
8411 if not tokenizer_path.is_file():
8412 raise FileNotFoundError(f"File not found: {tokenizer_path}")
8413
8414 sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
8415 sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
8416
8417 # some models like Pile-T5 family use BPE tokenizer instead of Unigram
8418 if sentencepiece_model.trainer_spec.model_type == 2: # BPE
8419            # ensure the tokenizer model file name is correct
8420 assert tokenizer_path.name == 'tokenizer.model'
8421 return self._set_vocab_sentencepiece()
8422 else:
8423 assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
8424
8425 add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
8426 remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
8427 precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
8428
8429 tokenizer = SentencePieceProcessor()
8430 tokenizer.LoadFromFile(str(tokenizer_path))
8431
8432 vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
8433
8434 tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
8435 scores: list[float] = [-10000.0] * vocab_size
8436 toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
8437
8438 for token_id in range(tokenizer.vocab_size()):
8439 piece = tokenizer.IdToPiece(token_id)
8440 text = piece.encode("utf-8")
8441 score = tokenizer.GetScore(token_id)
8442
8443 toktype = SentencePieceTokenTypes.NORMAL
8444 if tokenizer.IsUnknown(token_id):
8445 toktype = SentencePieceTokenTypes.UNKNOWN
8446 elif tokenizer.IsControl(token_id):
8447 toktype = SentencePieceTokenTypes.CONTROL
8448 elif tokenizer.IsUnused(token_id):
8449 toktype = SentencePieceTokenTypes.UNUSED
8450 elif tokenizer.IsByte(token_id):
8451 toktype = SentencePieceTokenTypes.BYTE
8452
8453 tokens[token_id] = text
8454 scores[token_id] = score
8455 toktypes[token_id] = toktype
8456
8457 added_tokens_file = self.dir_model / 'added_tokens.json'
8458 if added_tokens_file.is_file():
8459 with open(added_tokens_file, "r", encoding="utf-8") as f:
8460 added_tokens_json = json.load(f)
8461 for key in added_tokens_json:
8462 token_id = added_tokens_json[key]
8463 if token_id >= vocab_size:
8464 logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
8465 continue
8466
8467 tokens[token_id] = key.encode("utf-8")
8468 scores[token_id] = -1000.0
8469 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
8470
8471 if vocab_size > len(tokens):
8472 pad_count = vocab_size - len(tokens)
8473 logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
8474 for i in range(1, pad_count + 1):
8475 tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
8476 scores.append(-1000.0)
8477 toktypes.append(SentencePieceTokenTypes.UNUSED)
8478
8479 self.gguf_writer.add_tokenizer_model("t5")
8480 self.gguf_writer.add_tokenizer_pre("default")
8481 self.gguf_writer.add_token_list(tokens)
8482 self.gguf_writer.add_token_scores(scores)
8483 self.gguf_writer.add_token_types(toktypes)
8484 self.gguf_writer.add_add_space_prefix(add_prefix)
8485 self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
8486 if precompiled_charsmap:
8487 self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
8488
8489 special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
8490 special_vocab.add_to_gguf(self.gguf_writer)
8491
8492 def set_gguf_parameters(self):
8493 if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
8494 logger.warning("Couldn't find context length in config.json, assuming default value of 512")
8495 n_ctx = 512
8496 self.gguf_writer.add_context_length(n_ctx)
8497 self.gguf_writer.add_embedding_length(self.hparams["d_model"])
8498 self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
8499 self.gguf_writer.add_block_count(self.block_count)
8500 self.gguf_writer.add_head_count(self.hparams["num_heads"])
8501 self.gguf_writer.add_key_length(self.hparams["d_kv"])
8502 self.gguf_writer.add_value_length(self.hparams["d_kv"])
8503 self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
8504 self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
8505 self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
8506 self.gguf_writer.add_file_type(self.ftype)
8507
8508 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
8509        # T5-based models store the shared token embeddings tensor under varying names: "encoder.embed_tokens.weight",
8510        # "decoder.embed_tokens.weight" or "shared.weight". Some models even store multiple copies of it in their
8511        # safetensors files. We use the first of these tensors as the token embeddings for both encoder and decoder
8512        # and ignore the remaining ones.
8513 if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
8514 if not self.shared_token_embeddings_found:
8515 name = "shared.weight"
8516 self.shared_token_embeddings_found = True
8517 else:
8518 logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
8519 return
8520
8521 yield from super().modify_tensors(data_torch, name, bid)
8522
8523
8524@ModelBase.register("JAISLMHeadModel")
8525class JaisModel(TextModel):
8526 model_arch = gguf.MODEL_ARCH.JAIS
8527
8528 def __init__(self, *args, **kwargs):
8529 super().__init__(*args, **kwargs)
8530
8531 # SwiGLU activation
8532 assert self.hparams["activation_function"] == "swiglu"
8533 # ALiBi position embedding
8534 assert self.hparams["position_embedding_type"] == "alibi"
8535
8536 # Embeddings scale
8537 self.embeddings_scale = 1.0
8538 if 'mup_embeddings_scale' in self.hparams:
8539 self.embeddings_scale = self.hparams['mup_embeddings_scale']
8540 elif 'embeddings_scale' in self.hparams:
8541 self.embeddings_scale = self.hparams['embeddings_scale']
8542 else:
8543 assert False
8544
8545 self.width_scale = 1.0
8546 if 'mup_output_alpha' in self.hparams:
8547 assert 'mup_width_scale' in self.hparams
8548 self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
8549 elif 'width_scale' in self.hparams:
8550 self.width_scale = self.hparams['width_scale']
8551 else:
8552 assert False
8553
8554 self.max_alibi_bias = 8.0
8555
8556 def set_vocab(self):
8557 self._set_vocab_gpt2()
8558
8559 def set_gguf_parameters(self):
8560 self.gguf_writer.add_block_count(self.block_count)
8561 self.gguf_writer.add_context_length(self.hparams["n_positions"])
8562 self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
8563 self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
8564 self.gguf_writer.add_head_count(self.hparams["n_head"])
8565 self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
8566 self.gguf_writer.add_file_type(self.ftype)
8567
8568 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
8569 # we don't need these
8570 if name.endswith(".attn.bias"):
8571 return
8572
8573 if name.endswith("relative_pe.slopes"):
8574 # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
8575 # Some other models have max_alibi_bias spelled out explicitly in the hyperparams,
8576 # but Jais's PyTorch model simply precalculates the slope values and places them
8577 # in relative_pe.slopes
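# (Illustrative numbers, not taken from any particular checkpoint: for n_head = 32 the
# closest power of two is 32; a first slope of 2**-0.25 ~= 0.8409 then recovers
# max_alibi_bias = -log2(0.8409) * 32 = 8.0, the usual ALiBi default.)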
8578 n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
8579 first_val = float(data_torch[0].item())
8580 self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
8581
8582 return
8583
8584 if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
8585 data_torch = data_torch.transpose(1, 0)
8586
8587 new_name = self.map_tensor_name(name)
8588
8589 if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
8590 yield from super().modify_tensors(data_torch * self.embeddings_scale, new_name, bid)
8591 elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
8592 yield from super().modify_tensors(data_torch * self.width_scale, new_name, bid)
8593 else:
8594 yield from super().modify_tensors(data_torch, new_name, bid)
8595
8596 def prepare_tensors(self):
8597 super().prepare_tensors()
8598 self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
8599
8600
8601@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
8602class Glm4Model(TextModel):
8603 model_arch = gguf.MODEL_ARCH.GLM4
8604 use_mrope = False
8605 partial_rotary_factor = 0.5
8606
8607 def __init__(self, *args, **kwargs):
8608 super().__init__(*args, **kwargs)
8609 self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5)
8610 if "mrope_section" in self.rope_parameters:
8611 self.use_mrope = True
8612 logger.info("Q/K weight will need to be permuted for M-RoPE")
8613
8614 def set_vocab(self):
8615 from transformers import AutoTokenizer
8616 tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
8617 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
8618 tokens, toktypes, tokpre = self.get_vocab_base()
8619 self.gguf_writer.add_tokenizer_model("gpt2")
8620 self.gguf_writer.add_tokenizer_pre(tokpre)
8621 self.gguf_writer.add_token_list(tokens)
8622 self.gguf_writer.add_token_types(toktypes)
8623 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
8624 special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
8625 special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
8626 special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
8627 special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
8628 special_vocab.add_to_gguf(self.gguf_writer)
8629
8630 def set_gguf_parameters(self):
8631 super().set_gguf_parameters()
8632 if (rope_dim := self.hparams.get("head_dim")) is None:
8633 rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
8634 self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor))
8635
8636 @staticmethod
8637 def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor:
8638 orig_shape = weights.shape
8639 if len(orig_shape) == 1:
8640 weights = weights.unsqueeze(1) # [out_dim, 1]
8641 if len(weights.shape) != 2:
8642 raise ValueError("Only 1D and 2D tensors are supported.")
8643 n_effective_heads = weights.shape[0] // head_dim
8644 if n_head_kv is not None and n_effective_heads != n_head:
8645 if n_effective_heads != n_head_kv:
8646 raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}")
8647 rotary_dim = int(head_dim * partial_rotary_factor)
8648 if rotary_dim % 2 != 0:
8649 raise ValueError("rotary_dim must be even.")
8650 reshaped = weights.reshape(n_effective_heads, head_dim, -1)
8651 rot_part = reshaped[:, :rotary_dim, :]
8652 non_rot_part = reshaped[:, rotary_dim:, :]
8653 permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1)
8654 combined = torch.cat((permuted_rot, non_rot_part), dim=1)
8655 result = combined.reshape(weights.shape)
8656 return result if len(orig_shape) != 1 else result.squeeze(1)
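# Sketch of the reordering (hypothetical sizes, for illustration only): with head_dim = 8
# and partial_rotary_factor = 0.5, rotary_dim = 4, so within each head the first four rows
# [0, 1, 2, 3] are regrouped as [0, 2, 1, 3] (even indices first, then odd), while the
# non-rotary rows [4..7] are left untouched.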
8657
8658 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
8659 if name.startswith("model.visual."): # ignore visual part of Glm4v
8660 return
8661 elif name.startswith("model.language_model."):
8662 name = name.replace("language_model.", "") # for Glm4v
8663 if self.use_mrope:
8664 n_head = self.hparams["num_attention_heads"]
8665 n_kv_head = self.hparams["num_key_value_heads"]
8666 n_embd = self.hparams["hidden_size"]
8667 head_dim = n_embd // n_head
8668 # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here
8669 if name.endswith(("q_proj.weight", "q_proj.bias")):
8670 data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor)
8671 if name.endswith(("k_proj.weight", "k_proj.bias")):
8672 data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor)
8673 yield from super().modify_tensors(data_torch, name, bid)
8674
8675
8676@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration")
8677class Glm4MoeModel(TextModel):
8678 model_arch = gguf.MODEL_ARCH.GLM4_MOE
8679
8680 def __init__(self, *args, **kwargs):
8681 super().__init__(*args, **kwargs)
8682 # GLM4_MOE has num_hidden_layers + num_nextn_predict_layers actual layers (including the NextN/MTP layers)
8683 self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
8684 self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
8685
8686 def set_vocab(self):
8687 from transformers import AutoTokenizer
8688
8689 tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
8690 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
8691 tokens, toktypes, tokpre = self.get_vocab_base()
8692 self.gguf_writer.add_tokenizer_model("gpt2")
8693 self.gguf_writer.add_tokenizer_pre(tokpre)
8694 self.gguf_writer.add_token_list(tokens)
8695 self.gguf_writer.add_token_types(toktypes)
8696
8697 # Special tokens
8698 # Note: Using <|endoftext|> (151329) for eot causes endless generation
8699 special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331
8700 special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336
8701 special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
8702 special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338
8703
8704 special_vocab.add_to_gguf(self.gguf_writer)
8705
8706 def set_gguf_parameters(self):
8707 super().set_gguf_parameters()
8708 if (rope_dim := self.hparams.get("head_dim")) is None:
8709 rope_dim = (
8710 self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
8711 )
8712 self.gguf_writer.add_rope_dimension_count(
8713 int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
8714 )
8715
8716 # MoE parameters - Use only routed expert count (shared experts handled separately)
8717 if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None:
8718 self.gguf_writer.add_expert_count(n_routed_experts)
8719 if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
8720 self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
8721 if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
8722 self.gguf_writer.add_expert_shared_count(n_shared_experts)
8723 if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None:
8724 self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
8725
8726 # Expert gating function (sigmoid for GLM4_MOE)
8727 self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
8728
8729 # Routed scaling factor
8730 if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None:
8731 self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
8732
8733 # Normalise topk probabilities
8734 if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
8735 self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
8736
8737 # NextN/MTP prediction layers
8738 if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
8739 self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
8740
8741 _experts: list[dict[str, Tensor]] | None = None
8742
8743 # note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already
8744 def modify_tensors(
8745 self, data_torch: Tensor, name: str, bid: int | None
8746 ) -> Iterable[tuple[str, Tensor]]:
8747 if name.startswith("model.visual."): # ignore visual part
8748 return
8749 elif name.startswith("model.language_model."):
8750 name = name.replace("language_model.", "") # for multimodal variants
8751
8752 # Handle main token embedding (but not layer-specific NextN embeddings)
8753 if name == "model.embed_tokens.weight" and ".layers." not in name:
8754 yield from super().modify_tensors(data_torch, "token_embd.weight", bid)
8755 return
8756
8757 # Handle routed experts
8758 if name.find("mlp.experts") != -1:
8759 n_experts = self.hparams["n_routed_experts"]
8760 assert bid is not None
8761
8762 if self._experts is None:
8763 self._experts = [{} for _ in range(self.block_count)]
8764
8765 self._experts[bid][name] = data_torch
8766
8767 if len(self._experts[bid]) >= n_experts * 3:
8768 # merge the experts into a single 3d tensor
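# Each expert contributes a 2D weight matrix; torch.stack below turns the n_experts
# per-layer matrices into a single 3D tensor with one slice per expert, the layout
# the rest of the conversion expects for MoE expert weights.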
8769 for w_name in ["down_proj", "gate_proj", "up_proj"]:
8770 datas: list[Tensor] = []
8771
8772 for xid in range(n_experts):
8773 ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
8774 datas.append(self._experts[bid][ename])
8775 del self._experts[bid][ename]
8776
8777 data_torch = torch.stack(datas, dim=0)
8778
8779 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
8780
8781 yield from super().modify_tensors(data_torch, merged_name, bid)
8782 return
8783 else:
8784 return
8785
8786 if name.endswith("e_score_correction_bias"):
8787 name = name.replace("e_score_correction_bias", "e_score_correction.bias")
8788
8789 yield from super().modify_tensors(data_torch, name, bid)
8790
8791 def prepare_tensors(self):
8792 super().prepare_tensors()
8793 if self._experts is not None:
8794 # flatten `list[dict[str, Tensor]]` into `list[str]`
8795 experts = [k for d in self._experts for k in d.keys()]
8796 if len(experts) > 0:
8797 raise ValueError(f"Unprocessed experts: {experts}")
8798
8799
8800@ModelBase.register("Glm4MoeLiteForCausalLM")
8801class Glm4MoeLiteModel(DeepseekV2Model):
8802 model_arch = gguf.MODEL_ARCH.DEEPSEEK2
8803
8804 # copied from Glm4MoeModel
8805 def set_vocab(self):
8806 from transformers import AutoTokenizer
8807
8808 tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
8809 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
8810 tokens, toktypes, tokpre = self.get_vocab_base()
8811 self.gguf_writer.add_tokenizer_model("gpt2")
8812 self.gguf_writer.add_tokenizer_pre(tokpre)
8813 self.gguf_writer.add_token_list(tokens)
8814 self.gguf_writer.add_token_types(toktypes)
8815
8816 # Special tokens
8817 # Note: Using <|endoftext|> (151329) for eot causes endless generation
8818 special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331
8819 special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336
8820 special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
8821 special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338
8822
8823 special_vocab.add_to_gguf(self.gguf_writer)
8824
8825
8826@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
8827class ChatGLMModel(TextModel):
8828 model_arch = gguf.MODEL_ARCH.CHATGLM
8829
8830 def set_vocab_chatglm3(self):
8831 dir_model = self.dir_model
8832 hparams = self.hparams
8833 tokens: list[bytes] = []
8834 toktypes: list[int] = []
8835 scores: list[float] = []
8836
8837 from transformers import AutoTokenizer
8838 tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
8839 vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
8840 assert max(tokenizer.get_vocab().values()) < vocab_size
8841 role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
8842 special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
8843 for token_id in range(vocab_size):
8844 piece = tokenizer._convert_id_to_token(token_id)
8845 if token_id == 0:
8846 piece = "<unk>"
8847 elif token_id == 1:
8848 piece = "<bos>"
8849 elif token_id == 2:
8850 piece = "<eos>"
8851
8852 text = piece.encode("utf-8")
8853 score = 0.0
8854 # Following the tokenizer's Python implementation (https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
8855 # the score is only valid if the token id is less than tokenizer.tokenizer.sp_model.vocab_size()
8856 if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
8857 score = tokenizer.tokenizer.sp_model.get_score(token_id)
8858
8859 if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
8860 if piece in special_tokens:
8861 toktype = SentencePieceTokenTypes.CONTROL
8862 elif len(piece) == 0:
8863 text = f"[PAD{token_id}]".encode("utf-8")
8864 toktype = SentencePieceTokenTypes.UNUSED
8865 else:
8866 toktype = SentencePieceTokenTypes.USER_DEFINED
8867 tokens.append(text)
8868 scores.append(score)
8869 toktypes.append(toktype)
8870 continue
8871
8872 toktype = SentencePieceTokenTypes.NORMAL
8873 if tokenizer.tokenizer.sp_model.is_unknown(token_id):
8874 toktype = SentencePieceTokenTypes.UNKNOWN
8875 elif tokenizer.tokenizer.sp_model.is_control(token_id):
8876 toktype = SentencePieceTokenTypes.CONTROL
8877 elif tokenizer.tokenizer.sp_model.is_unused(token_id):
8878 toktype = SentencePieceTokenTypes.UNUSED
8879 elif tokenizer.tokenizer.sp_model.is_byte(token_id):
8880 toktype = SentencePieceTokenTypes.BYTE
8881
8882 tokens.append(text)
8883 scores.append(score)
8884 toktypes.append(toktype)
8885
8886 self.gguf_writer.add_tokenizer_model("llama")
8887 # glm3 needs prefix and suffix formatted as:
8888 # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
8889 self.gguf_writer.add_tokenizer_pre("chatglm-spm")
8890 self.gguf_writer.add_token_list(tokens)
8891 self.gguf_writer.add_token_scores(scores)
8892 self.gguf_writer.add_token_types(toktypes)
8893
8894 special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
8895 special_vocab.add_to_gguf(self.gguf_writer)
8896
8897 @staticmethod
8898 def token_bytes_to_string(b):
8899 from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
8900 byte_encoder = bytes_to_unicode()
8901 return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
8902
8903 @staticmethod
8904 def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
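# Greedy BPE merge: repeatedly find the adjacent pair with the lowest merge rank in
# mergeable_ranks and fuse it, stopping once no remaining pair is mergeable or the
# best rank is not below max_rank.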
8905 parts = [bytes([b]) for b in token]
8906 while True:
8907 min_idx = None
8908 min_rank = None
8909 for i, pair in enumerate(zip(parts[:-1], parts[1:])):
8910 rank = mergeable_ranks.get(pair[0] + pair[1])
8911 if rank is not None and (min_rank is None or rank < min_rank):
8912 min_idx = i
8913 min_rank = rank
8914 if min_rank is None or (max_rank is not None and min_rank >= max_rank):
8915 break
8916 assert min_idx is not None
8917 parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
8918 return parts
8919
8920 def set_vocab(self):
8921 if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
8922 self.set_vocab_chatglm3()
8923 return
8924
8925 dir_model = self.dir_model
8926 hparams = self.hparams
8927 tokens: list[str] = []
8928 toktypes: list[int] = []
8929
8930 from transformers import AutoTokenizer
8931 tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
8932 vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
8933 assert max(tokenizer.get_vocab().values()) < vocab_size
8934
8935 tokens, toktypes, tokpre = self.get_vocab_base()
8936 self.gguf_writer.add_tokenizer_model("gpt2")
8937 self.gguf_writer.add_tokenizer_pre(tokpre)
8938 self.gguf_writer.add_token_list(tokens)
8939 self.gguf_writer.add_token_types(toktypes)
8940 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
8941 # only add special tokens when they were not already loaded from config.json
8942 special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
8943 special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
8944 # this one is usually not in config.json anyway
8945 special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
8946 special_vocab.add_to_gguf(self.gguf_writer)
8947
8948 def set_gguf_parameters(self):
8949 n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
8950 n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
8951 n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
8952 self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
8953 self.gguf_writer.add_embedding_length(n_embed)
8954 self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
8955 self.gguf_writer.add_block_count(self.block_count)
8956 self.gguf_writer.add_head_count(n_head)
8957 self.gguf_writer.add_head_count_kv(n_head_kv)
8958 self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
8959 self.gguf_writer.add_file_type(self.ftype)
8960 if "attention_dim" in self.hparams:
8961 rope_dim = self.hparams["attention_dim"]
8962 else:
8963 rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
8964 self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
8965 self.gguf_writer.add_add_bos_token(False)
8966 rope_freq = 10000
8967 if "rope_ratio" in self.hparams:
8968 rope_freq = rope_freq * self.hparams["rope_ratio"]
8969 self.gguf_writer.add_rope_freq_base(rope_freq)
8970
8971 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
8972 if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
8973 return
8974
8975 name = name.removeprefix("transformer.")
8976 yield from super().modify_tensors(data_torch, name, bid)
8977
8978
8979@ModelBase.register("NemotronForCausalLM")
8980class NemotronModel(TextModel):
8981 model_arch = gguf.MODEL_ARCH.NEMOTRON
8982
8983 def set_vocab(self):
8984 self._set_vocab_sentencepiece()
8985 self.gguf_writer.add_pad_token_id(0)
8986 self.gguf_writer.add_unk_token_id(1)
8987
8988 def set_gguf_parameters(self):
8989 super().set_gguf_parameters()
8990 hparams = self.hparams
8991 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
8992
8993 f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
8994 self.gguf_writer.add_layer_norm_eps(f_norm_eps)
8995
8996 # * Partial RoPE
8997 rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
8998 n_embd = self.find_hparam(["hidden_size", "n_embd"])
8999 n_head = self.find_hparam(["num_attention_heads", "n_head"])
9000 self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
9001
9002 # * RopeScaling for Nemotron
9003 if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
9004 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
9005 else:
9006 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
9007 self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
9008
9009 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9010 # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
9011 # model.layers.{l}.input_layernorm.weight
9012 # model.layers.{l}.post_attention_layernorm.weight
9013 # model.norm.weight
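# (layernorm1p evaluates (1 + weight) * normalized(x), so folding the +1 into the stored
# weight lets the stock LayerNorm kernel be reused unchanged.)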
9014 if name.endswith("norm.weight"):
9015 data_torch = data_torch + 1
9016
9017 yield from super().modify_tensors(data_torch, name, bid)
9018
9019
9020@ModelBase.register("ExaoneForCausalLM")
9021class ExaoneModel(TextModel):
9022 model_arch = gguf.MODEL_ARCH.EXAONE
9023
9024 def set_gguf_parameters(self):
9025 super().set_gguf_parameters()
9026 hparams = self.hparams
9027
9028 assert (hparams["activation_function"] == "silu")
9029
9030 rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
9031 rotary_factor = rotary_factor if rotary_factor is not None else 1.0
9032 self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
9033
9034 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
9035 if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
9036 if rope_params.get("rope_type", '').lower() == "llama3":
9037 base = self.rope_parameters.get("rope_theta", 10000.0)
9038 if (dim := self.hparams.get("head_dim")) is None:
9039 dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
9040 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
9041
9042 factor = rope_params.get("factor", 8.0)
9043 low_freq_factor = rope_params.get("low_freq_factor", 1.0)
9044 high_freq_factor = rope_params.get("high_freq_factor", 4.0)
9045 old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
9046
9047 low_freq_wavelen = old_context_len / low_freq_factor
9048 high_freq_wavelen = old_context_len / high_freq_factor
9049 assert low_freq_wavelen != high_freq_wavelen
9050
9051 rope_factors = []
9052 for freq in freqs:
9053 wavelen = 2 * math.pi / freq
9054 if wavelen < high_freq_wavelen:
9055 rope_factors.append(1)
9056 elif wavelen > low_freq_wavelen:
9057 rope_factors.append(factor)
9058 else:
9059 smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
9060 rope_factors.append(1 / ((1 - smooth) / factor + smooth))
9061
9062 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
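# (Worked example with the defaults above, for illustration only: factor = 8,
# low_freq_factor = 1, high_freq_factor = 4 and old_context_len = 8192 give
# high_freq_wavelen = 2048 and low_freq_wavelen = 8192; dimensions with a wavelength
# below 2048 keep a factor of 1, those above 8192 get the full factor of 8, and the
# band in between is smoothly interpolated.)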
9063
9064
9065@ModelBase.register("Exaone4ForCausalLM")
9066class Exaone4Model(TextModel):
9067 model_arch = gguf.MODEL_ARCH.EXAONE4
9068
9069 def set_vocab(self):
9070 tokens, toktypes, tokpre = self.get_vocab_base()
9071 self.gguf_writer.add_tokenizer_model("gpt2")
9072 self.gguf_writer.add_tokenizer_pre(tokpre)
9073 self.gguf_writer.add_token_list(tokens)
9074 self.gguf_writer.add_token_types(toktypes)
9075
9076 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
9077 special_vocab.add_to_gguf(self.gguf_writer)
9078
9079 def set_gguf_parameters(self):
9080 super().set_gguf_parameters()
9081 hparams = self.hparams
9082 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
9083
9084 if hparams.get("sliding_window") is not None:
9085 self.gguf_writer.add_sliding_window(hparams["sliding_window"])
9086 if "layer_types" in hparams:
9087 self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
9088 elif "sliding_window_pattern" in hparams:
9089 sliding_window_pattern = []
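# The pattern is either a string such as "LLLG" (L = local/sliding, G = global) tiled
# across the layers, or an integer N meaning every N-th layer uses global attention
# while the rest use sliding attention.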
9090 if isinstance(hparams["sliding_window_pattern"], str): # e.g. LLLG
9091 for i in range(hparams["num_hidden_layers"]):
9092 sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L")
9093 if isinstance(hparams["sliding_window_pattern"], int): # e.g. 4
9094 for i in range(hparams["num_hidden_layers"]):
9095 sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0)
9096 if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
9097 self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
9098
9099 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
9100 if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
9101 if rope_params.get("rope_type", '').lower() == "llama3":
9102 base = rope_params.get("rope_theta", 10_000.0)
9103 if (dim := self.hparams.get("head_dim")) is None:
9104 dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
9105 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
9106
9107 factor = rope_params.get("factor", 16.0)
9108 low_freq_factor = rope_params.get("low_freq_factor", 1.0)
9109 high_freq_factor = rope_params.get("high_freq_factor", 4.0)
9110 old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
9111
9112 low_freq_wavelen = old_context_len / low_freq_factor
9113 high_freq_wavelen = old_context_len / high_freq_factor
9114
9115 rope_factors = []
9116 for freq in freqs:
9117 wavelen = 2 * math.pi / freq
9118 if wavelen < high_freq_wavelen:
9119 rope_factors.append(1)
9120 elif wavelen > low_freq_wavelen:
9121 rope_factors.append(factor)
9122 else:
9123 smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
9124 rope_factors.append(1 / ((1 - smooth) / factor + smooth))
9125
9126 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
9127
9128
9129@ModelBase.register("ExaoneMoEForCausalLM")
9130class ExaoneMoEModel(Exaone4Model):
9131 model_arch = gguf.MODEL_ARCH.EXAONE_MOE
9132
9133 def __init__(self, *args, **kwargs):
9134 super().__init__(*args, **kwargs)
9135 self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
9136 self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
9137
9138 def set_gguf_parameters(self):
9139 super().set_gguf_parameters()
9140 self.gguf_writer.add_expert_count(self.hparams["num_experts"])
9141 moe_intermediate_size = self.hparams["moe_intermediate_size"]
9142 num_shared_experts = self.hparams["num_shared_experts"]
9143 self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
9144 self.gguf_writer.add_expert_shared_count(num_shared_experts)
9145 self.gguf_writer.add_expert_shared_feed_forward_length(moe_intermediate_size * num_shared_experts)
9146 self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
9147 self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
9148 n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0))
9149 self.gguf_writer.add_leading_dense_block_count(n_dense_layer)
9150 self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0))
9151
9152 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
9153
9154 _experts: list[dict[str, Tensor]] | None = None
9155
9156 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9157 if name.startswith("mtp."):
9158 if name.find("layers.") != -1:
9159 # `mtp.layers.0.[module_name]` format
9160 name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + self.hparams['num_hidden_layers']}")
9161 else:
9162 # mtp fc/norm weights
9163 remapper = {
9164 "mtp.fc": "model.layers.{bid}.eh_proj",
9165 "mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm",
9166 "mtp.pre_fc_norm_hidden": "model.layers.{bid}.hnorm",
9167 "mtp.norm": "model.layers.{bid}.shared_head.norm",
9168 }
9169 _n = Path(name)
9170 new_name = remapper[_n.stem] + _n.suffix
9171
9172 # set shared weights for all NextN/MTP layers
9173 for bid in range(self.hparams['num_hidden_layers'], self.block_count):
9174 yield from super().modify_tensors(data_torch, new_name.format(bid=bid), bid)
9175 return
9176
9177 if name.endswith("e_score_correction_bias"):
9178 name = name.replace("e_score_correction_bias", "e_score_correction.bias")
9179
9180 if name.find("mlp.experts") != -1:
9181 n_experts = self.hparams["num_experts"]
9182 assert bid is not None
9183
9184 if self._experts is None:
9185 self._experts = [{} for _ in range(self.block_count)]
9186
9187 self._experts[bid][name] = data_torch
9188
9189 if len(self._experts[bid]) >= n_experts * 3:
9190 # merge the experts into a single 3d tensor
9191 for w_name in ["down_proj", "gate_proj", "up_proj"]:
9192 datas: list[Tensor] = []
9193
9194 for xid in range(n_experts):
9195 ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
9196 datas.append(self._experts[bid][ename])
9197 del self._experts[bid][ename]
9198
9199 data_torch = torch.stack(datas, dim=0)
9200
9201 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
9202
9203 new_name = self.map_tensor_name(merged_name)
9204
9205 yield from super().modify_tensors(data_torch, new_name, bid)
9206 return
9207 else:
9208 return
9209
9210 yield from super().modify_tensors(data_torch, name, bid)
9211
9212 def prepare_tensors(self):
9213 super().prepare_tensors()
9214 if self._experts is not None:
9215 # flatten `list[dict[str, Tensor]]` into `list[str]`
9216 experts = [k for d in self._experts for k in d.keys()]
9217 if len(experts) > 0:
9218 raise ValueError(f"Unprocessed experts: {experts}")
9219
9220
9221@ModelBase.register("GraniteForCausalLM")
9222class GraniteModel(LlamaModel):
9223 """Conversion for IBM's GraniteForCausalLM"""
9224 model_arch = gguf.MODEL_ARCH.GRANITE
9225
9226 def set_gguf_parameters(self):
9227 """Granite uses standard llama parameters with the following differences:
9228
9229 - No head_dim support
9230 - New multiplier params:
9231 - attention_scale
9232 - embedding_scale
9233 - residual_scale
9234 - logits_scaling
9235 """
9236 if head_dim := self.hparams.pop("head_dim", None):
9237 logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
9238 super().set_gguf_parameters()
9239 # NOTE: Convert _multiplier params to _scale params for naming
9240 # consistency
9241 if attention_scale := self.hparams.get("attention_multiplier"):
9242 self.gguf_writer.add_attention_scale(attention_scale)
9243 logger.info("gguf: (granite) attention_scale = %s", attention_scale)
9244 if embedding_scale := self.hparams.get("embedding_multiplier"):
9245 self.gguf_writer.add_embedding_scale(embedding_scale)
9246 logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
9247 if residual_scale := self.hparams.get("residual_multiplier"):
9248 self.gguf_writer.add_residual_scale(residual_scale)
9249 logger.info("gguf: (granite) residual_scale = %s", residual_scale)
9250 if logits_scale := self.hparams.get("logits_scaling"):
9251 self.gguf_writer.add_logit_scale(logits_scale)
9252 logger.info("gguf: (granite) logits_scale = %s", logits_scale)
9253
9254
9255@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
9256class GraniteMoeModel(GraniteModel):
9257 """Conversion for IBM's GraniteMoeForCausalLM"""
9258 model_arch = gguf.MODEL_ARCH.GRANITE_MOE
9259
9260 def set_gguf_parameters(self):
9261 """GraniteMoeShared uses GraniteMoe parameters plus the following:
9262 - shared_intermediate_size
9263 """
9264 super().set_gguf_parameters()
9265 if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"):
9266 self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length)
9267 logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length)
9268
9269 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9270 """In modeling_granitemoe, the JetMoe implementation of parallel experts
9271 is used. This essentially merges w1 and w3 into a single tensor with 2x
9272 the hidden size that is then split during forward. To keep compatibility
9273 with existing mixtral support, we pull them apart here.
9274 """
9275
9276 if name.endswith("block_sparse_moe.input_linear.weight"):
9277 ffn_dim = self.hparams["intermediate_size"]
9278 assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
9279 gate, up = data_torch.split(ffn_dim, dim=-2)
9280 yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid)
9281 yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid)
9282 return
9283
9284 has_experts = bool(self.hparams.get('num_local_experts'))
9285
9286 if name.endswith("shared_mlp.input_linear.weight"):
9287 ffn_dim = self.hparams["shared_intermediate_size"]
9288 assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
9289 gate, up = data_torch.split(ffn_dim, dim=-2)
9290 if has_experts:
9291 yield from ModelBase.modify_tensors(self, gate,self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), bid)
9292 yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), bid)
9293 return
9294 yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid)
9295 yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid)
9296 return
9297
9298 if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
9299 yield from ModelBase.modify_tensors(self, data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), bid)
9300 return
9301
9302 yield from super().modify_tensors(data_torch, name, bid)
9303
9304
9305@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM")
9306class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
9307 """GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM
9308 layers and optionally uses MoE w/ a shared expert"""
9309 model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID
9310 undo_permute = True
9311
9312 def __init__(self, *args, **kwargs):
9313
9314 # Hybrid mamba models use a prefix for the mamba-specific params.
9315 # TODO: Extend this if the prefix(es) need to be configurable
9316 self.hparam_prefixes = ["mamba"]
9317
9318 super().__init__(*args, **kwargs)
9319
9320 # Lists of which layers use ssm vs attention
9321 self._attn_layers = self.get_attn_layers()
9322 self._ssm_layers = [
9323 i for i in range(self.block_count)
9324 if i not in self._attn_layers
9325 ]
9326
9327 # There are some models in this family that are non-hybrid, but keep the
9328 # same parent class by setting all layers to "attention." If this is the
9329 # case, the model architecture needs to be updated to a standard
9330 # "granite" or "granitemoe" model
9331 if not self._ssm_layers:
9332 has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
9333 new_arch = (
9334 gguf.MODEL_ARCH.GRANITE_MOE
9335 if has_experts else
9336 gguf.MODEL_ARCH.GRANITE
9337 )
9338 self.model_arch = new_arch
9339 self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
9340 self.gguf_writer.add_architecture()
9341
9342 # n_group and d_inner are used during reshape_tensors for mamba2
9343 # NOTE: Explicitly include the hparam prefix for d_model to
9344 # disambiguate it from the top-level head_dim
9345 # NOTE 2: If needed for future models, this can be isolated in a method
9346 # to separate the prefix setting and the keys used
9347 self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"])
9348 self.n_group = self.find_hparam(["n_groups", "num_groups"])
9349 self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model
9350
9351 def get_attn_layers(self):
9352 # Explicit list of layer type names
9353 if layer_types := self.hparams.get("layer_types"):
9354 return [
9355 i for i, typ in enumerate(layer_types)
9356 if typ == "attention"
9357 ]
9358
9359 # Layer types indicated by index or period
9360 attn_layers = self.hparams.get("attn_layer_indices", [])
9361 if not attn_layers:
9362 attn_period = self.hparams.get("attn_layer_period")
9363 assert attn_period, "Didn't find attn_layer_indices or attn_layer_period"
9364 attn_offset = self.hparams.get("attn_layer_offset")
9365 assert attn_offset is not None, "No attention layer offset set with attn_layer_period"
9366 attn_layers = [
9367 i for i in range(self.block_count)
9368 if i % attn_period == attn_offset
9369 ]
9370 return attn_layers
9371
9372 def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
9373 prefixed = []
9374 for pfx in self.hparam_prefixes:
9375 prefixed.extend(
9376 "_".join([pfx, k])
9377 for k in keys
9378 )
9379 keys = list(keys) + prefixed
9380 return Mamba2Model.find_hparam(self, keys, *args, **kwargs)
9381
9382 def modify_tensors(
9383 self, data_torch: Tensor, name: str, bid: int | None
9384 ) -> Iterable[tuple[str, Tensor]]:
9385 if (
9386 name.endswith("block_sparse_moe.input_linear.weight")
9387 or "shared_mlp" in name
9388 ):
9389 yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
9390 return
9391
9392 # Determine whether this is a mamba layer or an attention layer
9393 if bid in self._ssm_layers:
9394 yield from Mamba2Model.modify_tensors(self, data_torch, name, bid)
9395 return
9396 elif bid in self._attn_layers:
9397 yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
9398 return
9399 yield from ModelBase.modify_tensors(self, data_torch, name, bid)
9400
9401 def set_gguf_parameters(self):
9402 """This method merges params from both parents and some that are
9403 specific to this model. The result is some duplication of how the params
9404 get set. The following warnings are expected during conversion:
9405
9406 WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv'
9407 WARNING:Duplicated key name 'granitehybrid.context_length'
9408 """
9409 GraniteMoeModel.set_gguf_parameters(self)
9410
9411 ## Mamba mixer params ##
9412 self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
9413 self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state", "state_dim", "ssm_state_size"]))
9414 self.gguf_writer.add_ssm_group_count(self.n_group)
9415 self.gguf_writer.add_ssm_inner_size(self.d_inner)
9416 # NOTE: The mamba_dt_rank is _not_ the right field for how this is used
9417 # in llama.cpp
9418 self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads", "num_heads"]))
9419
9420 ## Attention params ##
9421 head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
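# Per-layer KV head counts: attention layers get head_count_kv, SSM layers get 0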
9422 head_count_kv_vec = [
9423 head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
9424 ]
9425 if rope_dim := self.hparams.get("attn_rotary_emb"):
9426 self.gguf_writer.add_rope_dimension_count(rope_dim)
9427 self.gguf_writer.add_head_count_kv(head_count_kv_vec)
9428
9429 ## If Bamba or non-hybrid, use rope, otherwise don't
9430 use_rope = (
9431 "BambaForCausalLM" in self.hparams["architectures"]
9432 or not self._ssm_layers
9433 )
9434 self.gguf_writer.add_rope_scaling_finetuned(use_rope)
9435 if not use_rope:
9436 self.gguf_writer.add_context_length(2**20)
9437
9438 ## Validation ##
9439 d_head = self.find_hparam(["d_head"], optional=True) or 64
9440 assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
9441 assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
9442
9443 def set_vocab(self):
9444 self.hparams["pad_vocab_size_multiple"] = 8
9445 Mamba2Model.set_vocab(self)
9446
9447
9448@ModelBase.register("NemotronHForCausalLM")
9449class NemotronHModel(GraniteHybridModel):
9450 """Hybrid mamba2/attention model from NVIDIA"""
9451 model_arch = gguf.MODEL_ARCH.NEMOTRON_H
9452 is_moe: bool = False
9453
9454 def __init__(self, *args, **kwargs):
9455 # We have to determine the correct model architecture (MoE vs non-MoE) before
9456 # calling the parent __init__. This is because the parent constructor
9457 # uses self.model_arch to build the tensor name map, and all MoE-specific
9458 # mappings would be missed if it were called with the default non-MoE arch.
9459 hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
9460 if "num_experts_per_tok" in hparams:
9461 self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
9462 self.is_moe = True
9463
9464 super().__init__(*args, **kwargs)
9465
9466 # Save the top-level head_dim for later
9467 self.head_dim = self.hparams.get("head_dim", self.hparams.get("attention_head_dim"))
9468 assert self.head_dim is not None, "Could not find the attention head dim in config"
9469
9470 # Don't use expand to calculate d_inner
9471 self.d_inner = self.find_hparam(["num_heads"]) * self.d_model
9472
9473 # Update the ssm / attn / mlp layers
9474 # M: Mamba2, *: Attention, -: MLP
9475 # MoE:
9476 # M: Mamba2, *: Attention, E: Expert
9477 hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
9478 self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
9479 self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]
9480
9481 def get_attn_layers(self):
9482 hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
9483 assert len(hybrid_override_pattern) == self.block_count, "Mismatch between hybrid override and num_hidden_layers!"
9484 return [i for i, val in enumerate(hybrid_override_pattern) if val == "*"]
9485
9486 def set_gguf_parameters(self):
9487 super().set_gguf_parameters()
9488
9489 self.gguf_writer.add_key_length(self.head_dim)
9490 self.gguf_writer.add_value_length(self.head_dim)
9491
9492 # Set feed_forward_length
9493 # NOTE: This will trigger an override warning. This is preferable to
9494 # duplicating all the parent logic
9495 if not self.is_moe:
9496 n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
9497 self.gguf_writer.add_feed_forward_length([
9498 n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
9499 ])
9500 else:
9501 moe_intermediate_size = self.hparams["moe_intermediate_size"]
9502 self.gguf_writer.add_feed_forward_length([
9503 moe_intermediate_size if i in self._mlp_layers else 0 for i in range(self.block_count)
9504 ])
9505 self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
9506 self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
9507 self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["moe_shared_expert_intermediate_size"])
9508 self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
9509 self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
9510 self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
9511 self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
9512 self.gguf_writer.add_expert_group_count(self.hparams["n_group"])
9513
9514 # number of experts used per token (top-k)
9515 if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
9516 self.gguf_writer.add_expert_used_count(n_experts_used)
9517
9518 def set_vocab(self):
9519 super().set_vocab()
9520
9521 # The tokenizer _does_ add a BOS token (via post_processor type
9522 # TemplateProcessing) but does not set add_bos_token to true in the
9523 # config, so we need to explicitly override it here.
9524 if not self.is_moe:
9525 self.gguf_writer.add_add_bos_token(True)
9526
9527 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9528 if self.is_moe and bid is not None:
9529 if name.endswith("mixer.gate.e_score_correction_bias"):
9530 new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
9531 yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)
9532 return
9533
9534 if name.endswith("mixer.dt_bias"):
9535 new_name = name.replace("dt_bias", "dt.bias")
9536 yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)
9537 return
9538
9539 if name.endswith("mixer.conv1d.weight"):
9540 squeezed_data = data_torch.squeeze()
9541 yield from ModelBase.modify_tensors(self, squeezed_data, name, bid)
9542 return
9543
9544 if name.endswith("mixer.A_log"):
9545 transformed_data = -torch.exp(data_torch)
9546 reshaped_data = transformed_data.squeeze().reshape(-1, 1)
9547 yield from ModelBase.modify_tensors(self, reshaped_data, name, bid)
9548 return
9549
9550 if name.endswith("mixer.D"):
9551 reshaped_data = data_torch.squeeze().reshape(-1, 1)
9552 yield from ModelBase.modify_tensors(self, reshaped_data, name, bid)
9553 return
9554
9555 if name.endswith("mixer.norm.weight"):
9556 reshaped_data = data_torch.reshape(self.n_group, -1)
9557 yield from ModelBase.modify_tensors(self, reshaped_data, name, bid)
9558 return
9559
9560 if name.find("mixer.experts") != -1:
9561 n_experts = self.hparams["n_routed_experts"]
9562 assert bid is not None
9563
9564 if self._experts is None:
9565 self._experts = [{} for _ in range(self.block_count)]
9566
9567 self._experts[bid][name] = data_torch
9568
9569 if len(self._experts[bid]) >= n_experts * 2:
9570 # merge the experts into a single tensor
9571 for w_name in ["down_proj", "up_proj"]:
9572 datas: list[Tensor] = []
9573
9574 for xid in range(n_experts):
9575 ename = f"backbone.layers.{bid}.mixer.experts.{xid}.{w_name}.weight"
9576 datas.append(self._experts[bid][ename])
9577 del self._experts[bid][ename]
9578
9579 data_torch = torch.stack(datas, dim=0)
9580 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
9581
9582 yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid)
9583 return
9584 else:
9585 return
9586
9587 yield from super().modify_tensors(data_torch, name, bid)
9588
9589 def prepare_tensors(self):
9590 super().prepare_tensors()
9591
9592 if self._experts is not None:
9593 # flatten `list[dict[str, Tensor]]` into `list[str]`
9594 experts = [k for d in self._experts for k in d.keys()]
9595 if len(experts) > 0:
9596 raise ValueError(f"Unprocessed experts: {experts}")
9597
9598
9599@ModelBase.register("LlamaBidirectionalModel")
9600class LlamaEmbedNemotronModel(LlamaModel):
9601 model_arch = gguf.MODEL_ARCH.LLAMA_EMBED
9602
9603
9604@ModelBase.register("BailingMoeForCausalLM")
9605class BailingMoeModel(TextModel):
9606 model_arch = gguf.MODEL_ARCH.BAILINGMOE
9607
9608 def set_vocab(self):
9609 self._set_vocab_gpt2()
9610
9611 def set_gguf_parameters(self):
9612 super().set_gguf_parameters()
9613 hparams = self.hparams
9614 if (rope_dim := hparams.get("head_dim")) is None:
9615 rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
9616
9617 self.gguf_writer.add_rope_dimension_count(rope_dim)
9618 self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
9619 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
9620 self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
9621 self.gguf_writer.add_expert_weights_scale(1.0)
9622 self.gguf_writer.add_expert_count(hparams["num_experts"])
9623 self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
9624 self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
9625
9626 _experts: list[dict[str, Tensor]] | None = None
9627
9628 @staticmethod
9629 def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
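# Regroups each head's rows from the HF rotate_half (half-split) layout into the
# interleaved layout expected by llama.cpp's standard RoPE, the same transform
# applied to Llama-family Q/K weights elsewhere in this script.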
9630 if n_head_kv is not None and n_head != n_head_kv:
9631 n_head = n_head_kv
9632 return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
9633 .swapaxes(1, 2)
9634 .reshape(weights.shape))
9635
9636 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9637 n_head = self.hparams["num_attention_heads"]
9638 n_kv_head = self.hparams.get("num_key_value_heads")
9639 n_embd = self.hparams["hidden_size"]
9640 if (head_dim := self.hparams.get("head_dim")) is None:
9641 head_dim = n_embd // n_head
9642
9643 output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
9644
9645 if name.endswith("attention.dense.weight"):
9646 yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), bid)
9647 return
9648 elif name.endswith("query_key_value.weight"):
9649 q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
9650
9651 yield from super().modify_tensors(BailingMoeModel.permute(q, n_head, n_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid)
9652 yield from super().modify_tensors(BailingMoeModel.permute(k, n_head, n_kv_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid)
9653 yield from super().modify_tensors(v,self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid)
9654 return
9655 elif name.find("mlp.experts") != -1:
9656 n_experts = self.hparams["num_experts"]
9657 assert bid is not None
9658
9659 if self._experts is None:
9660 self._experts = [{} for _ in range(self.block_count)]
9661
9662 self._experts[bid][name] = data_torch
9663
9664 if len(self._experts[bid]) >= n_experts * 3:
9665 # merge the experts into a single 3d tensor
9666 for w_name in ["down_proj", "gate_proj", "up_proj"]:
9667 datas: list[Tensor] = []
9668
9669 for xid in range(n_experts):
9670 ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
9671 datas.append(self._experts[bid][ename])
9672 del self._experts[bid][ename]
9673
9674 data_torch = torch.stack(datas, dim=0)
9675
9676 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
9677
9678 new_name = self.map_tensor_name(merged_name)
9679
9680 yield from super().modify_tensors(data_torch, new_name, bid)
9681
9682 return
9683
9684 new_name = self.map_tensor_name(name)
9685
9686 if new_name == output_name and self.hparams.get("norm_head"):
9687 data_torch = data_torch.float()
9688 data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
9689
9690 yield from super().modify_tensors(data_torch, new_name, bid)
9691
9692 def prepare_tensors(self):
9693 super().prepare_tensors()
9694
9695 if self._experts is not None:
9696 # flatten `list[dict[str, Tensor]]` into `list[str]`
9697 experts = [k for d in self._experts for k in d.keys()]
9698 if len(experts) > 0:
9699 raise ValueError(f"Unprocessed experts: {experts}")
9700
9701
9702@ModelBase.register("BailingMoeV2ForCausalLM")
9703class BailingMoeV2Model(TextModel):
9704 model_arch = gguf.MODEL_ARCH.BAILINGMOE2
9705
9706 def __init__(self, *args, **kwargs):
9707 super().__init__(*args, **kwargs)
9708 if nextn_layers := self.hparams.get("num_nextn_predict_layers", 0):
9709 self.block_count = self.hparams["num_hidden_layers"] + nextn_layers
9710 self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
9711
9712 def set_vocab(self):
9713 self._set_vocab_gpt2()
9714
9715 def set_gguf_parameters(self):
9716 super().set_gguf_parameters()
9717 hparams = self.hparams
9718 if (rope_dim := hparams.get("head_dim")) is None:
9719 rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
9720
9721 self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
9722 self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
9723 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
9724 self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
9725 self.gguf_writer.add_expert_shared_feed_forward_length(hparams.get("moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"]))
9726 self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
9727 self.gguf_writer.add_expert_count(hparams["num_experts"])
9728 self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
9729 self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
9730
9731 if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
9732 self.gguf_writer.add_nextn_predict_layers(nextn_layers)
9733
9734 _experts: list[dict[str, Tensor]] | None = None
9735
9736 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9737 if "mlp.experts" in name:
9738 n_experts = self.hparams["num_experts"]
9739 assert bid is not None
9740
9741 if self._experts is None:
9742 self._experts = [{} for _ in range(self.block_count)]
9743
9744 self._experts[bid][name] = data_torch
9745
9746 if len(self._experts[bid]) >= n_experts * 3:
9747 # merge the experts into a single 3d tensor
9748 for w_name in ["down_proj", "gate_proj", "up_proj"]:
9749 datas: list[Tensor] = []
9750
9751 for xid in range(n_experts):
9752 ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
9753 datas.append(self._experts[bid][ename])
9754 del self._experts[bid][ename]
9755
9756 data_torch = torch.stack(datas, dim=0)
9757
9758 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
9759
9760 yield from super().modify_tensors(data_torch, merged_name, bid)
9761 return
9762
9763 if name.endswith(".expert_bias"):
9764 name = name.replace(".expert_bias", ".expert_bias.bias")
9765
9766 yield from super().modify_tensors(data_torch, name, bid)
9767
9768 def prepare_tensors(self):
9769 super().prepare_tensors()
9770
9771 if self._experts is not None:
9772 # flatten `list[dict[str, Tensor]]` into `list[str]`
9773 experts = [k for d in self._experts for k in d.keys()]
9774 if len(experts) > 0:
9775 raise ValueError(f"Unprocessed experts: {experts}")
9776
9777
9778@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
9779class GroveMoeModel(TextModel):
9780 model_arch = gguf.MODEL_ARCH.GROVEMOE
9781
9782 def set_gguf_parameters(self):
9783 super().set_gguf_parameters()
9784 if (n_experts := self.hparams.get("num_experts")) is not None:
9785 self.gguf_writer.add_expert_count(n_experts)
9786 if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
9787 self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
9788 logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
9789 # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299
9790 self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128)
9791 # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298
9792 self.gguf_writer.add_experts_per_group(2)
9793 # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
9794 self.gguf_writer.add_expert_group_scale(0.05)
9795
9796 _experts: list[dict[str, Tensor]] | None = None
9797 _chunk_experts: list[dict[str, Tensor]] | None = None
9798
9799 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9800 if name.endswith(".expert_bias"):
9801 # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303
9802 return
9803
9804 # process the experts separately
9805 if name.find("chunk_experts") != -1:
9806 n_experts = self.hparams["num_experts"] // 2 # see add_experts_per_group
9807 assert bid is not None
9808
9809 if self._chunk_experts is None:
9810 self._chunk_experts = [{} for _ in range(self.block_count)]
9811
9812 self._chunk_experts[bid][name] = data_torch
9813
9814 if len(self._chunk_experts[bid]) >= n_experts * 3:
9815 # merge the experts into a single 3d tensor
9816 for w_name in ["down_proj", "gate_proj", "up_proj"]:
9817 datas: list[Tensor] = []
9818
9819 for xid in range(n_experts):
9820 ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight"
9821 datas.append(self._chunk_experts[bid][ename])
9822 del self._chunk_experts[bid][ename]
9823
9824 data_torch = torch.stack(datas, dim=0)
9825
9826 merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight"
9827
9828 yield from super().modify_tensors(data_torch, merged_name, bid)
9829 return
9830 else:
9831 return
9832 elif name.find("experts") != -1:
9833 n_experts = self.hparams["num_experts"]
9834 assert bid is not None
9835
9836 if self._experts is None:
9837 self._experts = [{} for _ in range(self.block_count)]
9838
9839 self._experts[bid][name] = data_torch
9840
9841 if len(self._experts[bid]) >= n_experts * 3:
9842 # merge the experts into a single 3d tensor
9843 for w_name in ["down_proj", "gate_proj", "up_proj"]:
9844 datas: list[Tensor] = []
9845
9846 for xid in range(n_experts):
9847 ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
9848 datas.append(self._experts[bid][ename])
9849 del self._experts[bid][ename]
9850
9851 data_torch = torch.stack(datas, dim=0)
9852
9853 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
9854
9855 yield from super().modify_tensors(data_torch, merged_name, bid)
9856 return
9857 else:
9858 return
9859
9860 yield from super().modify_tensors(data_torch, name, bid)
9861
9862 def prepare_tensors(self):
9863 super().prepare_tensors()
9864
9865 if self._chunk_experts is not None:
9866 # flatten `list[dict[str, Tensor]]` into `list[str]`
9867 chunk_experts = [k for d in self._chunk_experts for k in d.keys()]
9868 if len(chunk_experts) > 0:
9869 raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}")
9870
9871 if self._experts is not None:
9872 # flatten `list[dict[str, Tensor]]` into `list[str]`
9873 experts = [k for d in self._experts for k in d.keys()]
9874 if len(experts) > 0:
9875 raise ValueError(f"Unprocessed experts: {experts}")
9876
9877
9878@ModelBase.register("ChameleonForConditionalGeneration")
9879@ModelBase.register("ChameleonForCausalLM") # obsolete
9880class ChameleonModel(TextModel):
9881 model_arch = gguf.MODEL_ARCH.CHAMELEON
9882
9883 def set_gguf_parameters(self):
9884 super().set_gguf_parameters()
9885 self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
9886
9887 def set_vocab(self):
9888 self._set_vocab_gpt2()
9889
9890 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9891 # ignore image tokenizer for now
9892 # TODO: remove this once image support is implemented for Chameleon
9893 if name.startswith("model.vqmodel"):
9894 return
9895
9896 n_head = self.hparams["num_attention_heads"]
9897 n_kv_head = self.hparams.get("num_key_value_heads")
9898 hidden_dim = self.hparams.get("hidden_size")
9899
9900 if name.endswith(("q_proj.weight", "q_proj.bias")):
9901 data_torch = LlamaModel.permute(data_torch, n_head, n_head)
9902 if name.endswith(("k_proj.weight", "k_proj.bias")):
9903 data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
9904 if name.endswith(("q_norm.weight", "q_norm.bias")):
9905 data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
9906 if name.endswith(("k_norm.weight", "k_norm.bias")):
9907 data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
9908
9909 yield from super().modify_tensors(data_torch, name, bid)
9910
9911 # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
9912 @staticmethod
9913 def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
9914 head_dim = hidden_dim // n_heads
9915 data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
9916 data_torch = data_torch.repeat_interleave(n_heads, 0)
9917 return data_torch
9918
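# Worked example of the permute above with hypothetical sizes (n_heads = 2,
# hidden_dim = 8, so head_dim = 4): the first row of the norm weight has its two
# halves interleaved and the result is broadcast to every head. Illustrative
# only; it is not called by the conversion script:
def _sketch_chameleon_reverse_permute():
    import torch
    x = torch.arange(8.0).view(2, 4)  # hypothetical q_norm weight, one row per head
    y = ChameleonModel._reverse_hf_permute(x, 2, 8)
    # row [0, 1, 2, 3] -> halves [0, 1] and [2, 3] interleaved -> [0, 2, 1, 3]
    assert y.tolist() == [[0.0, 2.0, 1.0, 3.0], [0.0, 2.0, 1.0, 3.0]]
    return y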
9919
9920@ModelBase.register("UltravoxModel")
9921class UltravoxModel(TextModel):
9922 model_arch = gguf.MODEL_ARCH.LLAMA # dummy
9923
9924 def __init__(self, *args, **kwargs):
9925 super().__init__(*args, **kwargs)
9926         raise NotImplementedError("Ultravox does not have a text decoder; it relies on Llama or other models for text. To convert the audio encoder, please use the --mmproj argument")
9927
9928
9929@ModelBase.register("GlmasrModel")
9930class GlmASRWhisperEncoderModel(MmprojModel):
9931 has_vision_encoder = False
9932 has_audio_encoder = True
9933
9934 def __init__(self, *args, **kwargs):
9935 super().__init__(*args, **kwargs)
9936 if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
9937 self.hparams["hidden_size"] = self.hparams["d_model"]
9938 self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
9939 self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
9940
9941 def set_gguf_parameters(self):
9942 super().set_gguf_parameters()
9943 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA)
9944 self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
9945 self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
9946 self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"])
9947
9948 def tensor_force_quant(self, name, new_name, bid, n_dims):
9949 if ".conv" in name and ".weight" in name:
9950 return gguf.GGMLQuantizationType.F16
9951 return super().tensor_force_quant(name, new_name, bid, n_dims)
9952
9953 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9954 if name.startswith("model.") or name.startswith("lm_head."):
9955 # skip language model tensors
9956 return
9957
9958 if name.startswith("audio_encoder.whisper."):
9959             name = name.replace("audio_encoder.whisper.", "audio_tower.")
9960 if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name:
9961 name = name.replace("audio_encoder.", "audio_encoder.adapting.")
9962
9963 if name.startswith("audio_encoder.audio_bos_eos_token."):
9964 yield from super().modify_tensors(data_torch[0], "model.vision.boi", bid)
9965 yield from super().modify_tensors(data_torch[1], "model.vision.eoi", bid)
9966 return
9967
9968 if name.startswith("audio_encoder.adapting."):
9969             name = name.replace("audio_encoder.adapting.", "audio.multi_modal_projector.")
9970 if ".layer_norm." in name:
9971 name = name.replace(".layer_norm.", ".ln_pre.")
9972 if ".0." in name:
9973 name = name.replace(".0.", ".linear_1.")
9974 if ".2." in name:
9975 name = name.replace(".2.", ".linear_2.")
9976 if ".proj." in name:
9977 return
9978
9979 if "conv1.bias" in name or "conv2.bias" in name:
9980             # add a trailing dimension to the conv1 and conv2 bias (a 2-D tensor is expected here)
9981 data_torch = data_torch.unsqueeze(-1)
9982
9983 yield from super().modify_tensors(data_torch, name, bid)
9984
9985
9986@ModelBase.register("Qwen2AudioForConditionalGeneration")
9987class WhisperEncoderModel(MmprojModel):
9988 has_vision_encoder = False # no vision encoder
9989 has_audio_encoder = True
9990
9991 def __init__(self, *args, **kwargs):
9992 super().__init__(*args, **kwargs)
9993 if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
9994 self.hparams["hidden_size"] = self.hparams["d_model"]
9995 self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
9996 self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
9997
9998 def set_gguf_parameters(self):
9999 super().set_gguf_parameters()
10000 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A)
10001 self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
10002 self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
10003
10004 def tensor_force_quant(self, name, new_name, bid, n_dims):
10005 if ".conv" in name and ".weight" in name:
10006 return gguf.GGMLQuantizationType.F16
10007 return super().tensor_force_quant(name, new_name, bid, n_dims)
10008
10009 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10010 if name.startswith("language_model."):
10011 # skip language model tensors
10012 return
10013
10014         # prevent naming clashes with vision tensors
10015 if name.startswith("multi_modal_projector"):
10016 name = "audio." + name
10017
10018 if "conv1.bias" in name or "conv2.bias" in name:
10019             # add a trailing dimension to the conv1 and conv2 bias (a 2-D tensor is expected here)
10020 data_torch = data_torch.unsqueeze(-1)
10021
10022 yield from super().modify_tensors(data_torch, name, bid)
10023
10024
10025@ModelBase.register("UltravoxModel")
10026class UltravoxWhisperEncoderModel(WhisperEncoderModel):
10027 has_vision_encoder = False # no vision encoder
10028 has_audio_encoder = True
10029
10030 def set_gguf_parameters(self):
10031 super().set_gguf_parameters()
10032 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
10033 self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
10034
10035
10036@ModelBase.register("VoxtralForConditionalGeneration")
10037class VoxtralWhisperEncoderModel(WhisperEncoderModel):
10038 has_vision_encoder = False # no vision encoder
10039 has_audio_encoder = True
10040
10041 def set_gguf_parameters(self):
10042 super().set_gguf_parameters()
10043 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL)
10044 self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size
10045
10046
10047@ModelBase.register("AudioFlamingo3ForConditionalGeneration")
10048class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel):
10049 def set_gguf_parameters(self):
10050 super().set_gguf_parameters()
10051 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO)
10052
10053 def tensor_force_quant(self, name, new_name, bid, n_dims):
10054 if ".conv" in name and ".weight" in name:
10055             # the model was trained in BF16; to be safe, avoid quantizing the conv weights to FP16
10056 return gguf.GGMLQuantizationType.F32
10057 return super().tensor_force_quant(name, new_name, bid, n_dims)
10058
10059
10060@ModelBase.register("FalconH1ForCausalLM")
10061class FalconH1Model(Mamba2Model):
10062 model_arch = gguf.MODEL_ARCH.FALCON_H1
10063
10064 def __init__(self, *args, **kwargs):
10065 # Set the hparam prefixes for Falcon Mamba2
10066 self.hparam_prefixes = ["mamba"]
10067
10068 # Initialize the base Mamba2Model
10069 super().__init__(*args, **kwargs)
10070
10071 # Use Llama conversion for attention
10072 self._transformer_model_class = LlamaModel
10073
10074 # n_group and d_inner are used during reshape_tensors for mamba2
10075 self.n_group = self.find_hparam(["n_groups"])
10076 self.d_inner = self.find_hparam(["mamba_d_ssm"])
10077 self.d_head = self.find_hparam(["d_head"])
10078
10079 # Initialize any Falcon Mamba2 specific attributes
10080 self.has_attention = True # Falcon Mamba2 has attention components
10081
10082 # Load Falcon-H1 multipliers from hyperparameters
10083 self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
10084 self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
10085 self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
10086 self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
10087 self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
10088 self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
10089 self.intermediate_size = self.find_hparam(["intermediate_size"])
10090 self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
10091
10092 def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
10093 prefixed = []
10094 for pfx in self.hparam_prefixes:
10095 prefixed.extend(
10096 "_".join([pfx, k])
10097 for k in keys
10098 )
10099 keys = list(keys) + prefixed
10100 return super().find_hparam(keys, *args, **kwargs)
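    # Example of the prefixing above: find_hparam(["d_state"]) will also try
    # "mamba_d_state", so Falcon-H1 configs that namespace their Mamba2 keys
    # with a "mamba_" prefix resolve through the same lookups as plain Mamba2.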
10101
10102 def set_vocab(self):
10103 self._set_vocab_gpt2()
10104
10105 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10106 tensors = list(super().modify_tensors(data_torch, name, bid))
10107 tensor = tensors[0][1]
10108
10109 if "down_proj" in name:
10110 tensor = tensor * self.mlp_multipliers[1]
10111 elif "gate_proj" in name:
10112 tensor = tensor * self.mlp_multipliers[0]
10113 elif "k_proj" in name:
10114 tensor = tensor * self.key_multiplier * self.attention_in_multiplier
10115 elif "q_proj" in name:
10116 tensor = tensor * self.attention_in_multiplier
10117 elif "v_proj" in name:
10118 tensor = tensor * self.attention_in_multiplier
10119 elif "o_proj" in name:
10120 tensor = tensor * self.attention_out_multiplier
10121 elif "out_proj" in name:
10122 tensor = tensor * self.ssm_out_multiplier
10123 elif "in_proj" in name:
10124 tensor = tensor * self.ssm_in_multiplier
10125 zxbcdt_multipliers = self.hparams["ssm_multipliers"]
10126 intermediate_size = self.hparams["mamba_d_ssm"]
10127 groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
10128 tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
10129 tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
10130 tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
10131 tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
10132 tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
10133 elif "lm_head" in name:
10134 tensor = tensor * self.hparams["lm_head_multiplier"]
10135 elif "embed_tokens" in name:
10136 tensor = tensor * self.hparams["embedding_multiplier"]
10137 elif "mamba.norm" in name:
10138 tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
10139
10140 tensors = [(tensors[0][0], tensor)]
10141 return tensors
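    # Note on the in_proj scaling above: the fused Mamba2 in_proj rows are
    # assumed to follow the usual [z | x | B | C | dt] packing with sizes
    # [d_ssm | d_ssm | n_groups * d_state | n_groups * d_state | remainder],
    # which is why the slices use mamba_d_ssm and mamba_n_groups * mamba_d_state
    # as offsets; each segment gets its own entry from ssm_multipliers.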
10142
10143 def set_gguf_parameters(self):
10144 super().set_gguf_parameters()
10145
10146 ## General Params ##
10147 self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
10148 # Override some Mamba2 defaults
10149 self.gguf_writer.add_block_count(self.block_count)
10150 self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0))
10151 self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
10152
10153 ## Attention params ##
10154 self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2
10155 self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
10156 self.gguf_writer.add_key_length(self.hparams["head_dim"])
10157 self.gguf_writer.add_value_length(self.hparams["head_dim"])
10158
10159 ## Validation ##
10160 assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
10161 assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"
10162
10163 # Add any other Falcon Mamba2 specific configuration
10164 self.gguf_writer.add_rope_freq_base(self.rope_parameters["rope_theta"])
10165
10166
10167@ModelBase.register("HunYuanMoEV1ForCausalLM")
10168class HunYuanMoEModel(TextModel):
10169 model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
10170
10171 def set_vocab(self):
10172 from transformers import AutoTokenizer
10173 tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
10174
10175 # 1. Get the pre-tokenizer identifier hash
10176 tokpre = self.get_vocab_base_pre(tokenizer)
10177
10178 # 2. Reverse-engineer the merges list from mergeable_ranks
10179 merges = []
10180 vocab = {}
10181 mergeable_ranks = tokenizer.mergeable_ranks
10182 for token, rank in mergeable_ranks.items():
10183 vocab[QwenModel.token_bytes_to_string(token)] = rank
10184 if len(token) == 1:
10185 continue
10186 merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
10187 if len(merged) == 2: # todo this is an assert in Qwen, why?
10188 merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
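        # Rough illustration of the merge reconstruction (hypothetical ranks):
        # given mergeable_ranks = {b"a": 0, b"b": 1, b"ab": 2}, the call
        # QwenModel.bpe(mergeable_ranks, b"ab", max_rank=2) re-splits the token
        # into [b"a", b"b"], and the recorded merge line becomes "a b".
        # Single-byte tokens are skipped because they cannot result from a merge.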
10189
10190 # 3. Generate the tokens and toktypes lists
10191 vocab_size = self.hparams["vocab_size"]
10192 assert tokenizer.vocab_size == vocab_size
10193 special_tokens = tokenizer.special_tokens
10194 reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
10195 tokens: list[str] = []
10196 toktypes: list[int] = []
10197 for i in range(vocab_size):
10198 if i not in reverse_vocab:
10199 tokens.append(f"[PAD{i}]")
10200 toktypes.append(gguf.TokenType.UNUSED)
10201 else:
10202 token = reverse_vocab[i]
10203 tokens.append(token)
10204 if i in special_tokens.values():
10205 toktypes.append(gguf.TokenType.CONTROL)
10206 else:
10207 toktypes.append(gguf.TokenType.NORMAL)
10208
10209 # 4. Write all vocab-related fields to the GGUF writer
10210 self.gguf_writer.add_tokenizer_model("gpt2")
10211 self.gguf_writer.add_tokenizer_pre(tokpre)
10212 self.gguf_writer.add_token_list(tokens)
10213 self.gguf_writer.add_token_types(toktypes)
10214 self.gguf_writer.add_token_merges(merges)
10215
10216 # 5. Add special tokens and chat templates
10217 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
10218 special_vocab.add_to_gguf(self.gguf_writer)
10219 # FIX for BOS token: Overwrite incorrect id read from config.json
10220 self.gguf_writer.add_bos_token_id(127959) # <|bos|>
10221
10222 def set_gguf_parameters(self):
10223 super().set_gguf_parameters()
10224 hparams = self.hparams
10225
10226 self.gguf_writer.add_expert_count(hparams["num_experts"])
10227 self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
10228
10229 moe_intermediate_size = hparams["moe_intermediate_size"]
10230 assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
10231 self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
10232
10233 moe_topk = hparams["moe_topk"]
10234 assert all(topk == moe_topk[0] for topk in moe_topk)
10235 self.gguf_writer.add_expert_used_count(moe_topk[0])
10236
10237 moe_shared_expert = hparams["num_shared_expert"]
10238 assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
10239 self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
10240
10241 # Rope
10242 if self.rope_parameters.get("rope_type") == "dynamic":
10243 # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
10244 # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
10245 alpha = self.rope_parameters.get("alpha", 1000)
10246 base = self.rope_parameters.get("rope_theta", 10000.0)
10247 dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
10248 scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
10249 self.gguf_writer.add_rope_freq_base(scaled_base)
10250 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
10251 self.gguf_writer.add_rope_scaling_factor(1)
10252 # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
10253 self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
10254 self.gguf_writer.add_context_length(256 * 1024) # 256k context length
10255
10256 # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
10257             assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024], \
10258 "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
10259
10260 _experts: list[dict[str, Tensor]] | None = None
10261
10262 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10263 if name == "lm_head.weight":
10264 if self.hparams.get("tie_word_embeddings", False):
10265 logger.info("Skipping tied output layer 'lm_head.weight'")
10266 return
10267
10268 if name.find("mlp.experts") != -1:
10269 n_experts = self.hparams["num_experts"]
10270 assert bid is not None
10271
10272 if self._experts is None:
10273 self._experts = [{} for _ in range(self.block_count)]
10274
10275 self._experts[bid][name] = data_torch
10276
10277 if len(self._experts[bid]) >= n_experts * 3:
10278 # merge the experts into a single 3d tensor
10279 for w_name in ["down_proj", "gate_proj", "up_proj"]:
10280 datas: list[Tensor] = []
10281
10282 for xid in range(n_experts):
10283 ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
10284 datas.append(self._experts[bid][ename])
10285 del self._experts[bid][ename]
10286
10287 data_torch = torch.stack(datas, dim=0)
10288 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
10289
10290 yield from super().modify_tensors(data_torch, merged_name, bid)
10291 return
10292 else:
10293 return
10294
10295 yield from super().modify_tensors(data_torch, name, bid)
10296
10297 def prepare_tensors(self):
10298 super().prepare_tensors()
10299 if self._experts is not None:
10300 experts = [k for d in self._experts for k in d.keys()]
10301 if len(experts) > 0:
10302 raise ValueError(f"Unprocessed experts: {experts}")
10303
10304
10305@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
10306class LLaDAMoEModel(TextModel):
10307 model_arch = gguf.MODEL_ARCH.LLADA_MOE
10308
10309 def set_gguf_parameters(self):
10310 super().set_gguf_parameters()
10311 if (n_experts := self.hparams.get("num_experts")) is not None:
10312 self.gguf_writer.add_expert_count(n_experts)
10313
10314 if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
10315 self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
10316
10317 # number of experts used per token (top-k)
10318 if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
10319 self.gguf_writer.add_expert_used_count(n_experts_used)
10320
10321 self.gguf_writer.add_mask_token_id(156895)
10322 self.gguf_writer.add_causal_attention(False)
10323 self.gguf_writer.add_diffusion_shift_logits(False)
10324
10325 _experts: list[dict[str, Tensor]] | None = None
10326
10327 # Copied from: Qwen2MoeModel
10328 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10329 # process the experts separately
10330 if name.find("experts") != -1:
10331 n_experts = self.hparams["num_experts"]
10332 assert bid is not None
10333
10334 if self._experts is None:
10335 self._experts = [{} for _ in range(self.block_count)]
10336
10337 self._experts[bid][name] = data_torch
10338
10339 if len(self._experts[bid]) >= n_experts * 3:
10340 # merge the experts into a single 3d tensor
10341 for w_name in ["down_proj", "gate_proj", "up_proj"]:
10342 datas: list[Tensor] = []
10343
10344 for xid in range(n_experts):
10345 ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
10346 datas.append(self._experts[bid][ename])
10347 del self._experts[bid][ename]
10348
10349 data_torch = torch.stack(datas, dim=0)
10350
10351 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
10352
10353 yield from super().modify_tensors(data_torch, merged_name, bid)
10354 return
10355 else:
10356 return
10357
10358 yield from super().modify_tensors(data_torch, name, bid)
10359
10360 # Copied from: Qwen2MoeModel
10361 def prepare_tensors(self):
10362 super().prepare_tensors()
10363
10364 if self._experts is not None:
10365 # flatten `list[dict[str, Tensor]]` into `list[str]`
10366 experts = [k for d in self._experts for k in d.keys()]
10367 if len(experts) > 0:
10368 raise ValueError(f"Unprocessed experts: {experts}")
10369
10370
10371@ModelBase.register("HunYuanDenseV1ForCausalLM")
10372class HunYuanModel(TextModel):
10373 model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
10374
10375 def set_vocab(self):
10376 if (self.dir_model / "tokenizer.json").is_file():
10377 self._set_vocab_gpt2()
10378 else:
10379 from transformers import AutoTokenizer
10380 tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
10381
10382 # 1. Get the pre-tokenizer identifier hash
10383 tokpre = self.get_vocab_base_pre(tokenizer)
10384
10385 # 2. Reverse-engineer the merges list from mergeable_ranks
10386 merges = []
10387 vocab = {}
10388 mergeable_ranks = tokenizer.mergeable_ranks
10389 for token, rank in mergeable_ranks.items():
10390 vocab[QwenModel.token_bytes_to_string(token)] = rank
10391 if len(token) == 1:
10392 continue
10393 merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
10394 if len(merged) == 2:
10395 merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
10396
10397 # 3. Generate the tokens and toktypes lists
10398 vocab_size = self.hparams["vocab_size"]
10399 assert tokenizer.vocab_size == vocab_size
10400 special_tokens = tokenizer.special_tokens
10401 reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
10402 tokens: list[str] = []
10403 toktypes: list[int] = []
10404 for i in range(vocab_size):
10405 if i not in reverse_vocab:
10406 tokens.append(f"[PAD{i}]")
10407 toktypes.append(gguf.TokenType.UNUSED)
10408 else:
10409 token = reverse_vocab[i]
10410 tokens.append(token)
10411 if i in special_tokens.values():
10412 toktypes.append(gguf.TokenType.CONTROL)
10413 else:
10414 toktypes.append(gguf.TokenType.NORMAL)
10415
10416 # 4. Write all vocab-related fields to the GGUF writer
10417 self.gguf_writer.add_tokenizer_model("gpt2")
10418 self.gguf_writer.add_tokenizer_pre(tokpre)
10419 self.gguf_writer.add_token_list(tokens)
10420 self.gguf_writer.add_token_types(toktypes)
10421 self.gguf_writer.add_token_merges(merges)
10422
10423 # 5. Add special tokens and chat templates
10424 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
10425 special_vocab.add_to_gguf(self.gguf_writer)
10426 # FIX for BOS token: Overwrite incorrect id read from config.json
10427 if self.hparams['hidden_size'] == 4096:
10428 self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
10429
10430 def set_gguf_parameters(self):
10431 super().set_gguf_parameters()
10432 hparams = self.hparams
10433
10434 # Rope
10435 if self.rope_parameters.get("rope_type") == "dynamic":
10436 # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
10437 # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
10438 alpha = self.rope_parameters.get("alpha", 50)
10439 base = self.rope_parameters.get("rope_theta", 10000.0)
10440 dim = hparams["head_dim"]
10441 scaled_base = base * (alpha ** (dim / (dim - 2)))
10442 self.gguf_writer.add_rope_freq_base(scaled_base)
10443 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
10444 self.gguf_writer.add_rope_scaling_factor(1)
10445 # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
10446 self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
10447 self.gguf_writer.add_context_length(256 * 1024) # 256k context length
10448
10449 # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
10450             assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024], \
10451 "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
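        # Worked example with the defaults above (hypothetical, since head_dim
        # may differ per checkpoint): alpha = 50, base = 10000.0, dim = 128
        # gives scaled_base = 10000.0 * 50 ** (128 / 126) ~= 5.3e5.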
10452
10453 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10454 if name == "lm_head.weight":
10455 if self.hparams.get("tie_word_embeddings", False):
10456 logger.info("Skipping tied output layer 'lm_head.weight'")
10457 return
10458
10459 yield from super().modify_tensors(data_torch, name, bid)
10460
10461
10462@ModelBase.register("SmolLM3ForCausalLM")
10463class SmolLM3Model(LlamaModel):
10464 model_arch = gguf.MODEL_ARCH.SMOLLM3
10465
10466
10467@ModelBase.register("GptOssForCausalLM")
10468class GptOssModel(TextModel):
10469 model_arch = gguf.MODEL_ARCH.GPT_OSS
10470
10471 # TODO: remove once MXFP4 is supported more generally
10472 def dequant_model(self):
10473 quant_config = self.hparams.get("quantization_config")
10474 if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
10475 return
10476 return super().dequant_model()
10477
10478 def transform_nibble_layout(self, tensor):
10479 assert tensor.dtype == torch.uint8
10480 assert tensor.shape[-1] == 16
10481 # swap nibbles
10482 t_lo = tensor & 0x0F
10483 t_hi = tensor & 0xF0
10484 t_swapped = (t_lo << 4) | (t_hi >> 4)
10485 tensor = t_swapped
10486 # transform aaaa...bbbb... to abababab...
10487 blk_a, blk_b = tensor.chunk(2, dim=-1)
10488 # get a_
10489 blk_a0 = (blk_a & 0xF0).view(-1, 1)
10490 blk_a1 = (blk_a << 4).view(-1, 1)
10491 blk_a = torch.stack((blk_a0, blk_a1), dim=2).view(tensor.shape)
10492 # get _b
10493 blk_b0 = (blk_b >> 4).view(-1, 1)
10494 blk_b1 = (blk_b & 0x0F).view(-1, 1)
10495 blk_b = torch.stack((blk_b0, blk_b1), dim=2).view(tensor.shape)
10496 # swap once more
10497 out = blk_a | blk_b
10498 out_h = out & 0xF0
10499 out_l = out & 0x0F
10500 out = (out_h >> 4) | (out_l << 4)
10501 return out
10502
10503 def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor):
10504 assert blocks.dtype == torch.uint8
10505 assert scales.dtype == torch.uint8
10506 scales = scales.unsqueeze(-1)
10507 assert len(blocks.shape) == 4
10508 assert len(scales.shape) == 4
10509 blocks = self.transform_nibble_layout(blocks)
10510 new_data = torch.concat((scales, blocks), dim=-1)
10511 new_shape = [new_data.shape[0], new_data.shape[1], new_data.shape[2] * 32]
10512 logger.info(f"Repacked {new_name} with shape {new_shape} and quantization MXFP4")
10513 # flatten last dim
10514 new_data = new_data.view(new_data.shape[0], new_data.shape[1], new_data.shape[2] * new_data.shape[3])
10515 new_data = new_data.numpy()
10516 self.gguf_writer.add_tensor(new_name, new_data, raw_dtype=gguf.GGMLQuantizationType.MXFP4)
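    # Layout note for the repack above: each MXFP4 block covers 32 logical
    # elements and, in ggml's packing, is stored as 1 scale byte (E8M0) followed
    # by 16 bytes of packed FP4 nibbles, i.e. 17 bytes per block. The shape
    # reported above counts logical elements (n_blocks * 32), while the
    # concatenated scales+blocks buffer holds n_blocks * 17 bytes per row.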
10517
10518 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
10519 blocks0: Tensor = torch.zeros(1)
10520 blocks1: Tensor = torch.zeros(1)
10521 # we assume that tensors are loaded in the correct order
10522 for name, data_torch in self.get_tensors():
10523 if "mlp.experts.down_proj_blocks" in name:
10524 blocks0 = data_torch
10525 elif "mlp.experts.down_proj_scales" in name:
10526 new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
10527 self.repack_mxfp4(new_name, blocks0, data_torch)
10528 elif "mlp.experts.gate_up_proj_blocks" in name:
10529 blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
10530 elif "mlp.experts.gate_up_proj_scales" in name:
10531 scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :]
10532 new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight"))
10533 new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
10534 self.repack_mxfp4(new_name_gate, blocks0, scales0)
10535 self.repack_mxfp4(new_name_up, blocks1, scales1)
10536 return []
10537
10538 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10539 if "sinks" in name:
10540 name += ".weight"
10541
10542 # correct naming for down_proj
10543 if "down_proj" in name:
10544 if name.endswith("_bias"):
10545 name = name.replace("down_proj_bias", "down_proj.bias")
10546 elif "_blocks" not in name and "_scales" not in name:
10547 logger.warning(f"{name} is not in MXFP4, performance may be degraded")
10548 name = name.replace("down_proj", "down_proj.weight")
10549 data_torch = data_torch.transpose(-1, -2)
10550 else:
10551 # otherwise, it should already be repacked to ggml MXFP4 format
10552 return
10553
10554 # split the gate_up into gate and up
10555 if "gate_up_proj" in name:
10556 if name.endswith("_bias"):
10557 name_up = name.replace("gate_up_proj_bias", "up_proj.bias")
10558 name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias")
10559 gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2]
10560 yield from super().modify_tensors(gate_proj_bias, name_gate, bid)
10561 yield from super().modify_tensors(up_proj_bias, name_up, bid)
10562 elif "_blocks" not in name and "_scales" not in name:
10563 logger.warning(f"{name} is not in MXFP4, performance may be degraded")
10564 name_up = name.replace("gate_up_proj", "up_proj.weight")
10565 name_gate = name.replace("gate_up_proj", "gate_proj.weight")
10566 data_torch = data_torch.transpose(-1, -2)
10567 gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :]
10568 yield from super().modify_tensors(gate_proj_weight, name_gate, bid)
10569 yield from super().modify_tensors(up_proj_weight, name_up, bid)
10570 else:
10571 yield from super().modify_tensors(data_torch, name, bid)
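    # Illustration of the interleaved gate/up packing handled above: if the
    # fused projection stores its output features as [g0, u0, g1, u1, ...],
    # then slicing with [..., ::2] recovers the gate half [g0, g1, ...] and
    # [..., 1::2] the up half [u0, u1, ...]; the same even/odd split is applied
    # to the MXFP4 blocks and scales in generate_extra_tensors.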
10572
10573 def set_vocab(self):
10574 self._set_vocab_gpt2()
10575
10576 def set_gguf_parameters(self):
10577 super().set_gguf_parameters()
10578 self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
10579 self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"])
10580
10581
10582@ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM")
10583class LFM2Model(TextModel):
10584 model_arch = gguf.MODEL_ARCH.LFM2
10585
10586 def _add_feed_forward_length(self):
10587 ff_dim = self.hparams["block_ff_dim"]
10588
10589 auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
10591 ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
10592 multiple_of = self.hparams["block_multiple_of"]
10593
10594 if auto_adjust_ff_dim:
10595 ff_dim = int(2 * ff_dim / 3)
10596 # custom dim factor multiplier
10597 if ffn_dim_multiplier is not None:
10598 ff_dim = int(ffn_dim_multiplier * ff_dim)
10599 ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
10600
10601 self.gguf_writer.add_feed_forward_length(ff_dim)
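    # Worked example of the adjustment above (hypothetical values):
    # block_ff_dim = 9216, auto_adjust = True, ffn_dim_multiplier = 1.0,
    # multiple_of = 256 -> ff_dim = int(2 * 9216 / 3) = 6144, which is already
    # a multiple of 256, so 6144 is written as the feed-forward length.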
10602
10603 def set_gguf_parameters(self):
10604 # set num_key_value_heads only for attention layers
10605 self.hparams["num_key_value_heads"] = [
10606 self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
10607 for layer_type in self.hparams["layer_types"]
10608 ]
10609
10610 super().set_gguf_parameters()
10611 self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
10612 self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
10613 self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
10614 self._add_feed_forward_length()
10615
10616 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10617 if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name):
10618 # skip multimodal tensors
10619 return
10620
10621 name = name.replace("language_model.", "") # vision
10622 name = name.replace("lfm.", "model.") # audio
10623
10624 # conv op requires 2d tensor
10625 if 'conv.conv' in name:
10626 data_torch = data_torch.squeeze(1)
10627
10628 yield from super().modify_tensors(data_torch, name, bid)
10629
10630 def _is_vision_tensor(self, name: str) -> bool:
10631 return "vision_tower" in name or "multi_modal_projector" in name
10632
10633
10634@ModelBase.register("Lfm2Model")
10635class LFM2ColBertModel(LFM2Model):
10636 model_arch = gguf.MODEL_ARCH.LFM2
10637 dense_tensor_name = "dense_2"
10638
10639 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10640 if not name.startswith(self.dense_tensor_name):
10641 name = "model." + name
10642
10643 yield from super().modify_tensors(data_torch, name, bid)
10644
10645 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
10646 # dense tensor is stored in a separate safetensors file
10647 from safetensors.torch import load_file
10648 tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
10649 assert tensors_file.is_file()
10650 tensor = load_file(tensors_file)["linear.weight"]
10651 self.gguf_writer.add_embedding_length_out(tensor.shape[0])
10652 yield f"{self.dense_tensor_name}.weight", tensor.clone()
10653
10654
10655@ModelBase.register("Lfm2MoeForCausalLM")
10656class LFM2MoeModel(TextModel):
10657 model_arch = gguf.MODEL_ARCH.LFM2MOE
10658
10659 def set_gguf_parameters(self):
10660 # set num_key_value_heads only for attention layers
10661 self.hparams["num_key_value_heads"] = [
10662 self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
10663 for layer_type in self.hparams["layer_types"]
10664 ]
10665
10666 super().set_gguf_parameters()
10667
10668 self.gguf_writer.add_expert_count(self.hparams["num_experts"])
10669 self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
10670 self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"])
10671 self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
10672
10673 self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
10674 self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
10675
10676 # cache for experts weights for merging
10677 _experts_cache: dict[int, dict[str, Tensor]] = {}
10678
10679 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10680 # conv op requires 2d tensor
10681 if 'conv.conv' in name:
10682 data_torch = data_torch.squeeze(1)
10683
10684 if name.endswith(".expert_bias"):
10685 name = name.replace(".expert_bias", ".expert_bias.bias")
10686
10687 # merge expert weights
10688 if 'experts' in name:
10689 n_experts = self.hparams["num_experts"]
10690 assert bid is not None
10691
10692 expert_cache = self._experts_cache.setdefault(bid, {})
10693 expert_cache[name] = data_torch
10694 expert_weights = ["w1", "w2", "w3"]
10695
10696 # not enough expert weights to merge
10697 if len(expert_cache) < n_experts * len(expert_weights):
10698 return
10699
10700 for w_name in expert_weights:
10701 datas: list[Tensor] = []
10702
10703 for xid in range(n_experts):
10704 ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight"
10705 datas.append(expert_cache[ename])
10706 del expert_cache[ename]
10707
10708 data_torch = torch.stack(datas, dim=0)
10709 merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight"
10710
10711 yield from super().modify_tensors(data_torch, merged_name, bid)
10712
10713 del self._experts_cache[bid]
10714 return
10715
10716 yield from super().modify_tensors(data_torch, name, bid)
10717
10718 def prepare_tensors(self):
10719 super().prepare_tensors()
10720 assert not self._experts_cache
10721
10722
10723@ModelBase.register("Lfm2VlForConditionalGeneration")
10724class LFM2VLModel(MmprojModel):
10725 def __init__(self, *args, **kwargs):
10726 super().__init__(*args, **kwargs)
10727 assert self.hparams_vision is not None
10728         # TODO(tarek): image_size is not specified for dynamic resolution, so set it here for compatibility
10729 self.hparams_vision["image_size"] = 256
10730
10731 def set_gguf_parameters(self):
10732 super().set_gguf_parameters()
10733 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2)
10734 self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["layer_norm_eps"]))
10735 self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2))
10736 self.gguf_writer.add_vision_use_gelu(True)
10737         # vision_feature_layer uses Python negative indexing, e.g. -1 selects the last layer -> vision_feature_layers_to_drop = 0
10738 vision_feature_layers_to_drop = -(self.global_config.get("vision_feature_layer", -1) + 1)
10739 self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop)
10740
10741 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10742 is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
10743
10744 if is_vision_tensor:
10745 # remove "model." prefix
10746 name = name.replace("model.vision_tower.", "vision_tower.")
10747 name = name.replace("model.multi_modal_projector.", "multi_modal_projector.")
10748
10749 if "patch_embedding.weight" in name:
10750 data_torch = data_torch.view(data_torch.shape[0], 16, 16, 3).permute(0, 3, 1, 2)
10751
10752 yield from super().modify_tensors(data_torch, name, bid)
10753 return
10754
10755 return # skip other tensors
10756
10757
10758@ModelBase.register("Lfm2AudioForConditionalGeneration")
10759class LFM2AudioModel(ConformerAudioModel):
10760 has_vision_encoder = False
10761 has_audio_encoder = True
10762 model_name = "Lfm2AudioEncoder"
10763
10764 def get_audio_config(self) -> dict[str, Any] | None:
10765 return self.global_config.get("encoder")
10766
10767 def set_gguf_parameters(self):
10768 assert self.hparams_audio is not None
10769 self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
10770 self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"]
10771 self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"]
10772 super().set_gguf_parameters()
10773 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2A)
10774 self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
10775 self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
10776
10777 def modify_tensors(self, data_torch, name, bid):
10778 # skip language model tensors
10779 if name.startswith("lfm."):
10780 return
10781
10782 # for training only
10783 if any(p in name for p in ["audio_loss_weight"]):
10784 return
10785
10786 # for audio output
10787 if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]):
10788 return
10789
10790 yield from super().modify_tensors(data_torch, name, bid)
10791
10792
10793@ModelBase.register("SmallThinkerForCausalLM")
10794class SmallThinkerModel(TextModel):
10795 model_arch = gguf.MODEL_ARCH.SMALLTHINKER
10796
10797 def set_gguf_parameters(self):
10798 super().set_gguf_parameters()
10799 if (n_experts := self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))) is not None:
10800 self.gguf_writer.add_expert_count(n_experts)
10801 if (n_experts_used := self.hparams.get("num_experts_per_tok", self.hparams.get("moe_num_active_primary_experts"))) is not None:
10802 self.gguf_writer.add_expert_used_count(n_experts_used)
10803 if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None:
10804 self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
10805 self.gguf_writer.add_feed_forward_length(moe_intermediate_size)
10806 logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
10807         if self.hparams.get('moe_primary_router_apply_softmax'):
10808 self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
10809 else:
10810 self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
10811
10812 sliding_window_layout = self.hparams.get("sliding_window_layout")
10813 if sliding_window_layout:
10814 for i in sliding_window_layout:
10815 if i != 0:
10816 sliding_window = self.hparams.get("sliding_window_size")
10817 if sliding_window:
10818 self.gguf_writer.add_sliding_window(sliding_window)
10819 break
10820
10821 _experts: list[dict[str, Tensor]] | None = None
10822
10823 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10824 # process the experts separately
10825 if name.find("experts") != -1:
10826 n_experts = self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))
10827 assert bid is not None
10828
10829 if self._experts is None:
10830 self._experts = [{} for _ in range(self.block_count)]
10831
10832 self._experts[bid][name] = data_torch
10833
10834 if len(self._experts[bid]) >= n_experts * 3:
10835 # merge the experts into a single 3d tensor
10836 for w_name in ["down", "gate", "up"]:
10837 datas: list[Tensor] = []
10838
10839 for xid in range(n_experts):
10840 ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
10841 datas.append(self._experts[bid][ename])
10842 del self._experts[bid][ename]
10843
10844 data_torch = torch.stack(datas, dim=0)
10845
10846 merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
10847
10848 yield from super().modify_tensors(data_torch, merged_name, bid)
10849 return
10850 else:
10851 return
10852
10853 yield from super().modify_tensors(data_torch, name, bid)
10854
10855 def prepare_tensors(self):
10856 super().prepare_tensors()
10857
10858 if self._experts is not None:
10859 # flatten `list[dict[str, Tensor]]` into `list[str]`
10860 experts = [k for d in self._experts for k in d.keys()]
10861 if len(experts) > 0:
10862 raise ValueError(f"Unprocessed experts: {experts}")
10863
10864
10865@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification")
10866class ModernBertModel(BertModel):
10867 model_arch = gguf.MODEL_ARCH.MODERN_BERT
10868
10869 def set_vocab(self):
10870 self.gguf_writer.add_add_bos_token(True)
10871 self.gguf_writer.add_add_eos_token(True)
10872 self.gguf_writer.add_add_sep_token(True)
10873 self._set_vocab_gpt2()
10874
10875 def set_gguf_parameters(self):
10876 super().set_gguf_parameters()
10877 self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
10878 if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None:
10879 self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
10880 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
10881 self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
10882
10883 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10884 # these layers act as MLM head, so we don't need them
10885 if name.startswith("decoder."):
10886 return
10887
10888 if name.startswith("model."):
10889 name = name[6:]
10890
10891 yield from super().modify_tensors(data_torch, name, bid)
10892
10893
10894@ModelBase.register("ApertusForCausalLM")
10895class ApertusModel(LlamaModel):
10896 model_arch = gguf.MODEL_ARCH.APERTUS
10897 undo_permute = False
10898
10899 _alpha_n = {}
10900 _alpha_p = {}
10901 _beta = {}
10902 _eps = {}
10903
10904 def modify_tensors(self, data_torch, name, bid):
10905 # Handle xIELU activation parameters
10906 n_layers = self.hparams["num_hidden_layers"]
10907 if name.endswith(".act_fn.alpha_n"):
10908 self._alpha_n[bid] = data_torch.to("cpu").float().item()
10909 if (len(self._alpha_n) == n_layers):
10910 self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)])
10911 return
10912 if name.endswith(".act_fn.alpha_p"):
10913 self._alpha_p[bid] = data_torch.to("cpu").float().item()
10914 if (len(self._alpha_p) == n_layers):
10915 self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)])
10916 return
10917 if name.endswith(".act_fn.beta"):
10918 self._beta[bid] = data_torch.to("cpu").float().item()
10919 if (len(self._beta) == n_layers):
10920 self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)])
10921 return
10922 if name.endswith(".act_fn.eps"):
10923 self._eps[bid] = data_torch.to("cpu").float().item()
10924 if (len(self._eps) == n_layers):
10925 self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)])
10926 return
10927
10928 yield from super().modify_tensors(data_torch, name, bid)
10929
10930
10931class MistralModel(LlamaModel):
10932 model_arch = gguf.MODEL_ARCH.MISTRAL3
10933 model_name = "Mistral"
10934 hf_arch = ""
10935 is_mistral_format = True
10936 undo_permute = False
10937
10938 def __init__(self, *args, **kwargs):
10939 super().__init__(*args, **kwargs)
10940 # for compatibility, we use LLAMA arch for older models
10941         # TODO: remove this once everyone migrates to a newer version of llama.cpp
10942 if "llama_4_scaling" not in self.hparams:
10943 self.model_arch = gguf.MODEL_ARCH.LLAMA
10944 self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
10945 self.gguf_writer.add_architecture()
10946 self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
10947
10948 def dequant_model(self):
10949 # transform quantization config into HF format
10950 quant_config = self.hparams.get("quantization")
10951 if quant_config is not None:
10952 assert quant_config["qformat_weight"] == "fp8_e4m3"
10953 self.hparams["quantization_config"] = {
10954 "activation_scheme": "static",
10955 "quant_method": "fp8",
10956 "weight_block_size": None,
10957 }
10958 return super().dequant_model()
10959
10960 @staticmethod
10961 def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
10962 assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
10963 assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
10964 f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
10965 )
10966
10967 if vocab.tokenizer.version == TokenizerVersion.v1:
10968 return "mistral-v1"
10969 elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.spm:
10970 return "mistral-v3"
10971 elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.tekken:
10972 return "mistral-v3-tekken"
10973 elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.spm:
10974 return "mistral-v7"
10975 elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.tekken:
10976 return "mistral-v7-tekken"
10977 elif vocab.tokenizer.version == TokenizerVersion.v11:
10978 template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja"
10979 elif vocab.tokenizer.version == TokenizerVersion.v13:
10980 template_file = "unsloth-mistral-Devstral-Small-2507.jinja"
10981 else:
10982 err_message = f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}"
10983 if is_mistral_format:
10984 err_message += (
10985                     ". Please pass the --disable-mistral-community-chat-template argument to the CLI "
10986                     "if you want to skip this error and use Mistral's official `mistral-common` pre-processing library instead."
10987 )
10988 raise ValueError(err_message)
10989
10990 template_path = templates_dir / template_file
10991 if not template_path.exists():
10992 raise FileNotFoundError(f"Template file not found: {template_path}")
10993
10994 with open(template_path, "r", encoding="utf-8") as f:
10995 template = f.read()
10996
10997 return template
10998
10999 def set_gguf_parameters(self):
11000 super().set_gguf_parameters()
11001 MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
11002
11003 @staticmethod
11004 def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
11005 if "yarn" in hparams:
11006 yarn_params = hparams["yarn"]
11007 gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
11008 gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
11009 gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
11010 gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
11011 gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
11012 gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
11013
11014 if "llama_4_scaling" in hparams:
11015 gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
11016
11017
11018class MistralMoeModel(DeepseekV2Model):
11019 model_arch = gguf.MODEL_ARCH.DEEPSEEK2
11020 model_name = "Mistral"
11021 hf_arch = ""
11022 is_mistral_format = True
11023
11024 def __init__(self, *args, **kwargs):
11025 super().__init__(*args, **kwargs)
11026 logger.info("Using MistralMoeModel")
11027 # remap hparams from Mistral MoE format to DeepseekV2 format
11028         # we do it this way so we can reuse the DeepseekV2Model set_gguf_parameters logic
11029 # ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py
11030 config = self.hparams
11031 # Mistral key -> HF key
11032 config_mapping = {
11033 "dim": "hidden_size",
11034 "norm_eps": "rms_norm_eps",
11035 "n_kv_heads": "num_key_value_heads",
11036 "n_layers": "num_hidden_layers",
11037 "n_heads": "num_attention_heads",
11038 "hidden_dim": "intermediate_size",
11039 }
11040 # HF key -> (Mistral key, default value)
11041 top_level_mapping_with_default = {
11042 "model_type": ("model_type", "transformer"),
11043 "hidden_act": ("activation", "silu"),
11044 "tie_word_embeddings": ("tied_embeddings", False),
11045 "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
11046 "max_position_embeddings": ("max_position_embeddings", 128_000),
11047 }
11048 # mapping top-level keys
11049 for key, new_key in config_mapping.items():
11050 if key in config:
11051 config[new_key] = config[key]
11052 for new_key, (key, default_value) in top_level_mapping_with_default.items():
11053 config[new_key] = config.get(key, default_value)
11054 # mapping MoE-specific keys
11055 moe_config_map = {
11056 "route_every_n": "moe_layer_freq",
11057 "first_k_dense_replace": "first_k_dense_replace",
11058 "num_experts_per_tok": "num_experts_per_tok",
11059 "num_experts": "n_routed_experts",
11060 "expert_hidden_dim": "moe_intermediate_size",
11061 "routed_scale": "routed_scaling_factor",
11062 "num_shared_experts": "n_shared_experts",
11063 "num_expert_groups": "n_group",
11064 "num_expert_groups_per_tok": "topk_group",
11065 }
11066 moe = config["moe"]
11067 for key, new_key in moe_config_map.items():
11068 if key in moe:
11069 config[new_key] = moe[key]
11070 # provide missing values
11071 config["topk_method"] = None
11072 config["norm_topk_prob"] = True
11073 config["scoring_func"] = "softmax"
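        # Hypothetical example of the remapping above: a Mistral params.json
        # containing {"dim": 4096, "n_layers": 32, "n_heads": 32,
        # "moe": {"num_experts": 8, "expert_hidden_dim": 14336}} would gain the
        # HF-style keys hidden_size=4096, num_hidden_layers=32,
        # num_attention_heads=32, n_routed_experts=8 and
        # moe_intermediate_size=14336, which DeepseekV2Model already understands.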
11074
11075 def set_vocab(self):
11076 self._set_vocab_mistral()
11077
11078 def set_gguf_parameters(self):
11079 super().set_gguf_parameters()
11080 MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
11081 yarn_params = self.hparams["yarn"]
11082 self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
11083
11084 # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
11085 # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
11086 # ref https://github.com/ggml-org/llama.cpp/pull/17945
11087 self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1
11088
11089 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
11090 if name.startswith("vision_") or name.startswith("patch_merger.") or "mm_projector" in name:
11091 return
11092
11093 # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic
11094 if name.endswith(".qscale_act"):
11095 name = name.replace(".qscale_act", ".input_scale")
11096 if name.endswith(".qscale_weight"):
11097 name = name.replace(".qscale_weight", ".weight_scale")
11098 if ".wkv_b." in name:
11099 name = name.replace(".wkv_b.", ".kv_b_proj.")
11100 if ".experts." in name:
11101 name = name.replace(".experts.", ".mlp.experts.")
11102 name = name.replace(".w1.", ".gate_proj.")
11103 name = name.replace(".w2.", ".down_proj.")
11104 name = name.replace(".w3.", ".up_proj.")
11105 name = "model." + name
11106
11107 yield from super().modify_tensors(data_torch, name, bid)
11108
11109
11110class PixtralModel(LlavaVisionModel):
11111 model_name = "Pixtral"
11112 hf_arch = ""
11113 is_mistral_format = True
11114
11115 def set_gguf_parameters(self):
11116 super().set_gguf_parameters()
11117 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
11118
11119 self.gguf_writer.add_vision_attention_layernorm_eps(
11120 self.find_hparam(["norm_eps"])
11121 )
11122 self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"]))
11123
11124 self.gguf_writer.add_vision_use_silu(True)
11125
11126 # spatial_merge_size
11127 if self.find_vparam(["mm_projector_id"]) == "patch_merge":
11128 self.gguf_writer.add_vision_spatial_merge_size(
11129 self.find_vparam(["spatial_merge_size"])
11130 )
11131
11132 def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
11133 if name == "vision_language_adapter.w_in.weight":
11134 return "mm.1.weight"
11135 elif name == "vision_language_adapter.w_out.weight":
11136 return "mm.2.weight"
11137 return super().map_tensor_name(name, try_suffixes)
11138
11139
11140@ModelBase.register("LightOnOCRForConditionalGeneration")
11141class LightOnOCRVisionModel(LlavaVisionModel):
11142 is_mistral_format = False
11143 use_break_tok = False
11144
11145 def set_gguf_parameters(self):
11146 super().set_gguf_parameters()
11147 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR)
11148
11149 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
11150 name = name.replace("model.vision_encoder.", "vision_tower.")
11151 name = name.replace("model.vision_projection.", "multi_modal_projector.")
11152 yield from super().modify_tensors(data_torch, name, bid)
11153
11154
11155@ModelBase.register("KimiVLForConditionalGeneration")
11156class KimiVLModel(MmprojModel):
11157 def __init__(self, *args, **kwargs):
11158 super().__init__(*args, **kwargs)
11159 assert self.hparams_vision is not None
11160 self.hparams_vision["image_size"] = 64 * 14 # for compatibility
11161
11162 def set_gguf_parameters(self):
11163 super().set_gguf_parameters()
11164 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL)
11165 self.gguf_writer.add_vision_use_gelu(True)
11166 self.gguf_writer.add_vision_projector_scale_factor(2)
11167 # eps is the same as pytorch's default value
11168 assert self.hparams_vision is not None
11169 self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5))
11170
11171 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11172 is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name
11173
11174 if is_vision_tensor:
11175 if "pos_emb.weight" in name:
11176 data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2])
11177
11178 if "wqkv" in name:
11179 split_dim = 0 if "weight" in name else -1
11180 wq, wk, wv = data_torch.chunk(3, dim=split_dim)
11181 yield from super().modify_tensors(wq, name.replace("wqkv", "wq"), bid)
11182 yield from super().modify_tensors(wk, name.replace("wqkv", "wk"), bid)
11183 yield from super().modify_tensors(wv, name.replace("wqkv", "wv"), bid)
11184 else:
11185 yield from super().modify_tensors(data_torch, name, bid)
11186
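# A small sketch of the fused-QKV split used by KimiVLModel.modify_tensors above: the
# "wqkv" tensor is assumed to stack Q, K and V along the output dimension, so chunking
# along dim 0 (or dim -1 for the 1-D bias) recovers the three projections. The sizes
# below are hypothetical.
def _sketch_kimivl_wqkv_split():
    import torch
    hidden = 8                              # hypothetical embedding width
    wqkv = torch.randn(3 * hidden, hidden)  # fused [q; k; v] projection weight
    wq, wk, wv = wqkv.chunk(3, dim=0)       # each is (hidden, hidden)
    bqkv = torch.randn(3 * hidden)          # fused bias
    bq, bk, bv = bqkv.chunk(3, dim=-1)      # each is (hidden,)
    return wq.shape, bq.shape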
11187
11188@ModelBase.register("KimiK25ForConditionalGeneration")
11189class KimiK25Model(MmprojModel):
11190 """Kimi-K2.5 with MoonViT3d vision encoder"""
11191
11192 def __init__(self, *args, **kwargs):
11193 super().__init__(*args, **kwargs)
11194
11195 assert self.hparams_vision is not None, "Kimi-K2.5 requires vision_config in model config"
11196
11197 self.merge_kernel_size = tuple(self.hparams_vision.get("merge_kernel_size", [2, 2]))
11198 self.patch_size = self.hparams_vision.get("patch_size", 14)
11199
11200 # Set image_size for compatibility with base class
11201 # Use position embedding dimensions as image_size reference
11202 pos_emb_h = self.hparams_vision.get("init_pos_emb_height", 64)
11203 self.hparams_vision["image_size"] = pos_emb_h * self.patch_size
11204
11205 def set_gguf_parameters(self):
11206 # Base class MmprojModel.set_gguf_parameters() already writes:
11207 # - vision_block_count, vision_head_count, vision_embedding_length
11208 # - vision_feed_forward_length, vision_patch_size, image_mean, image_std
11209 # via find_vparam() which handles the vt_* prefixed keys in Kimi-K2.5's config
11210 super().set_gguf_parameters()
11211 assert self.hparams_vision is not None
11212
11213 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIK25)
11214
11215 # Position embedding parameters (for interpolation)
11216 self.gguf_writer.add_uint32("vision.pos_emb_height", self.hparams_vision.get("init_pos_emb_height", 64))
11217 self.gguf_writer.add_uint32("vision.pos_emb_width", self.hparams_vision.get("init_pos_emb_width", 64))
11218 self.gguf_writer.add_uint32("vision.pos_emb_time", self.hparams_vision.get("init_pos_emb_time", 4))
11219
11220 # Projector parameters
11221 self.gguf_writer.add_vision_use_gelu(self.hparams_vision.get("projector_hidden_act", "gelu") == "gelu")
11222 self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5))
11223 self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0])
11224
11225 # Image size limits
11226 # Note: in_patch_limit is for images, in_patch_limit_each_frame is for video (not supported yet)
11227 in_patch_limit = self.preprocessor_config.get("in_patch_limit", 16384)
11228 min_patches = 8 # reasonable minimum
11229 pixels_per_patch = self.patch_size ** 2
11230 self.gguf_writer.add_vision_min_pixels(min_patches * pixels_per_patch)
11231 self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch)
11232
11233 @staticmethod
11234 def permute(weights: Tensor, n_head: int) -> Tensor:
11235 out_dim, in_dim = weights.shape
11236 head_dim = out_dim // n_head
11237 w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim)
11238 w = w.permute(0, 2, 1, 3, 4)
11239 return w.reshape(out_dim, in_dim)
11240
11241 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11242 # Only process vision and projector tensors
11243 is_vision = any(x in name for x in ["vision_tower", "mm_projector"])
11244
11245 if not is_vision:
11246 return
11247
11248 assert self.hparams_vision is not None
11249 n_head = self.hparams_vision.get("num_attention_heads", 16)
11250
11251 # Permute Q/K weights/biases from interleaved to split RoPE format
11252 # This allows using build_rope_2d at runtime without post-permutation.
11253 if "wqkv" in name:
11254 out_dim = data_torch.shape[0]
11255 qkv_dim = out_dim // 3
11256 head_dim = qkv_dim // n_head
11257
11258 if "weight" in name:
11259 wq, wk, wv = data_torch[:qkv_dim, :], data_torch[qkv_dim:2 * qkv_dim, :], data_torch[2 * qkv_dim:, :]
11260 wq = self.permute(wq, n_head)
11261 wk = self.permute(wk, n_head)
11262 data_torch = torch.cat([wq, wk, wv], dim=0)
11263 elif "bias" in name:
11264 bq, bk, bv = data_torch[:qkv_dim], data_torch[qkv_dim:2 * qkv_dim], data_torch[2 * qkv_dim:]
11265 bq = bq.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
11266 bk = bk.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
11267 data_torch = torch.cat([bq, bk, bv], dim=0)
11268
11269 # Temporal embeddings: (T, 1, C) -> (T, C)
11270 if "pos_emb.time_weight" in name:
11271 T, _, C = data_torch.shape
11272 data_torch = data_torch.reshape(T, C)
11273
11274 # PatchMergerMLP tensor name mapping
11275 # proj.0.weight -> proj.linear_1.weight
11276 # proj.2.weight -> proj.linear_2.weight
11277 if "mm_projector.proj.0." in name:
11278 name = name.replace(".proj.0.", ".proj.linear_1.")
11279 elif "mm_projector.proj.2." in name:
11280 name = name.replace(".proj.2.", ".proj.linear_2.")
11281
11282 yield from super().modify_tensors(data_torch, name, bid)
11283
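# A tiny sketch showing that KimiK25Model.permute above is a pure row reordering: per
# head, the rows are viewed as (head_dim // 4, 2, 2) pairs and the two pair axes are
# swapped, moving the weights from the interleaved RoPE layout to the split layout
# expected at runtime. The dimensions are hypothetical and chosen small so the
# resulting permutation can be read off directly.
def _sketch_kimik25_rope_permute():
    import torch
    n_head, head_dim, in_dim = 1, 8, 3
    # each row carries its own index so the reordering is visible in the output
    w = torch.arange(n_head * head_dim, dtype=torch.float32).reshape(-1, 1).expand(-1, in_dim).clone()
    out = KimiK25Model.permute(w, n_head)
    # first column shows the source row of each output row:
    # rows [0, 1, 2, 3, 4, 5, 6, 7] -> [0, 1, 4, 5, 2, 3, 6, 7]
    return out[:, 0].tolist()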
11284
11285@ModelBase.register("CogVLMForCausalLM")
11286class CogVLMVisionModel(MmprojModel):
11287
11288 def set_gguf_parameters(self):
11289 super().set_gguf_parameters()
11290 self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
11291 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM)
11292
11293 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11294 if not name.startswith("model.vision."):
11295 return
11296
11297 yield from super().modify_tensors(data_torch, name, bid)
11298
11299
11300@ModelBase.register("CogVLMForCausalLM")
11301class CogVLMModel(LlamaModel):
11302 model_arch = gguf.MODEL_ARCH.COGVLM
11303
11304 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11305 # block vision tensors
11306 if name.startswith("model.vision."):
11307 return
11308
11309 yield from ModelBase.modify_tensors(self, data_torch, name, bid)
11310
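# A small sketch of the complementary filtering used by the two CogVLM conversions
# above: the mmproj class keeps only tensors under "model.vision." while the text class
# drops them, so converting the same checkpoint twice (once with --mmproj, once without)
# partitions the tensors cleanly. The tensor names below are hypothetical.
def _sketch_cogvlm_tensor_partition():
    names = ["model.vision.patch_embed.weight", "model.layers.0.attn.q_proj.weight"]
    vision_part = [n for n in names if n.startswith("model.vision.")]
    text_part = [n for n in names if not n.startswith("model.vision.")]
    return vision_part, text_part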
11311
11312@ModelBase.register("JanusForConditionalGeneration")
11313class JanusProModel(LlamaModel):
11314 model_arch = gguf.MODEL_ARCH.LLAMA # reuse Llama arch
11315
11316 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11317 # Skip vision, aligner, and generation tensors
11318 skip_prefixes = (
11319 'model.vision_model.',
11320 'model.aligner.',
11321 'model.vqmodel.',
11322 'model.generation_embeddings.',
11323 'model.generation_aligner.',
11324 'model.generation_head.',
11325 )
11326 if name.startswith(skip_prefixes):
11327 return
11328
11329 if name.startswith('model.language_model.'):
11330 name = name.replace('model.language_model.', 'model.')
11331 elif name.startswith('language_model.'):
11332 name = name.replace('language_model.', '')
11333
11334 yield from super().modify_tensors(data_torch, name, bid)
11335
11336
11337@ModelBase.register("JanusForConditionalGeneration")
11338class JanusProVisionModel(MmprojModel):
11339 def __init__(self, *args, **kwargs):
11340 super().__init__(*args, **kwargs)
11341 assert self.hparams_vision is not None
11342 if "intermediate_size" not in self.hparams_vision:
11343 mlp_ratio = self.hparams_vision.get("mlp_ratio")
11344 hidden_size = self.hparams_vision.get("hidden_size")
11345 if mlp_ratio is not None and hidden_size is not None:
11346 self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
11347
11348 def set_gguf_parameters(self):
11349 super().set_gguf_parameters()
11350 assert self.hparams_vision is not None
11351
11352 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)
11353
11354 self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
11355
11356 hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
11357 if hidden_act == "gelu":
11358 self.gguf_writer.add_vision_use_gelu(True)
11359 elif hidden_act == "silu":
11360 self.gguf_writer.add_vision_use_silu(True)
11361
11362 def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
11363 """Map aligner tensors to projector format"""
11364 suffix = ".bias" if name.endswith(".bias") else ".weight"
11365
11366 if name.startswith("model.aligner."):
11367 local_name = name[len("model.aligner."):]
11368 elif name.startswith("aligner."):
11369 local_name = name[len("aligner."):]
11370 else:
11371 raise ValueError(f"Unsupported Janus aligner prefix: {name}")
11372
11373 if local_name.startswith("fc1."):
11374 mm_index = 0
11375 elif local_name.startswith("hidden_layers."):
11376 parts = local_name.split(".", 2)
11377 if len(parts) < 3:
11378 raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
11379 mm_index = int(parts[1]) + 1
11380 else:
11381 raise ValueError(f"Unsupported Janus aligner tensor: {name}")
11382
11383 tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
11384 return [(tensor_name, data_torch)]
11385
11386 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11387 # Skip language model tensors as they will be handled by `JanusProModel`
11388 if name.startswith(('model.language_model.', 'language_model.')):
11389 return
11390
11391 # Skip generation-related components
11392 skip_generation_prefixes = (
11393 'model.vqmodel.',
11394 'vqmodel.',
11395 'model.generation_embeddings.',
11396 'generation_embeddings.',
11397 'model.generation_aligner.',
11398 'generation_aligner.',
11399 'model.generation_head.',
11400 'generation_head.',
11401 )
11402 if name.startswith(skip_generation_prefixes):
11403 return
11404
11405 # Handle aligner tensors
11406 if name.startswith(('model.aligner.', 'aligner.')):
11407 yield from self._map_aligner_tensor(data_torch, name)
11408 return
11409
11410 # Handle vision tensors
11411 if name.startswith(('model.vision_model.', 'vision_model.')):
11412 yield from super().modify_tensors(data_torch, name, bid)
11413 return
11414
11415 return
11416
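# A standalone sketch of the aligner-name -> projector-index mapping implemented in
# JanusProVisionModel._map_aligner_tensor above: "fc1" becomes mm index 0 and
# "hidden_layers.N" becomes index N + 1. The names below are hypothetical; the real
# method additionally resolves the final GGUF tensor name via format_tensor_name().
def _sketch_janus_aligner_index(name: str) -> int:
    local_name = name.removeprefix("model.aligner.").removeprefix("aligner.")
    if local_name.startswith("fc1."):
        return 0
    if local_name.startswith("hidden_layers."):
        return int(local_name.split(".", 2)[1]) + 1
    raise ValueError(f"Unsupported Janus aligner tensor: {name}")

# _sketch_janus_aligner_index("model.aligner.fc1.weight")      -> 0
# _sketch_janus_aligner_index("aligner.hidden_layers.0.bias")  -> 1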
11417
11418@ModelBase.register("YoutuVLForConditionalGeneration")
11419class YoutuVLVisionModel(MmprojModel):
11420 def __init__(self, *args, **kwargs):
11421 super().__init__(*args, **kwargs)
11422 assert self.hparams_vision is not None
11423 self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
11424
11425 def set_gguf_parameters(self):
11426 super().set_gguf_parameters()
11427
11428 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL)
11429 self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
11430
11431 # Handle activation function
11432 hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower()
11433 if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"):
11434 self.gguf_writer.add_vision_use_gelu(True)
11435 elif hidden_act == "silu":
11436 self.gguf_writer.add_vision_use_silu(True)
11437 else:
11438 raise ValueError(f"Unsupported activation function for YOUTUVL: {hidden_act}")
11439
11440 self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2))
11441
11442 window_size = self.hparams.get("window_size")
11443 if window_size is not None:
11444 self.gguf_writer.add_vision_window_size(window_size)
11445 # fullatt_block_indexes contains explicit layer indices that use full attention
11446 # e.g., [2, 5, 8, 11] means layers 2, 5, 8, 11 use full attention
11447 # All other layers use window attention
11448 fullatt_block_indexes = self.hparams.get("fullatt_block_indexes")
11449 assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for youtuvl"
11450 # Store the explicit layer indices for YoutuVL (irregular pattern approach)
11451 self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes)
11452
11453 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
11454 # Skip language model tensors
11455 skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.')
11456 if name.startswith(skip_prefixes):
11457 return
11458
11459 # Try to map the tensor using TensorNameMap (handles vision encoder and projector)
11460 try:
11461 yield from super().modify_tensors(data_torch, name, bid)
11462 except ValueError:
11463 # If mapping fails, log warning and skip
11464 logger.warning(f"Cannot map tensor: {name}")
11465 return
11466
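# A small sketch of how the fullatt_block_indexes metadata written above can be
# interpreted: layers listed in the index array use full attention, every other layer
# uses windowed attention. The layer count and indices are hypothetical; the metadata is
# consumed later by the runtime, not in this script.
def _sketch_youtuvl_attention_pattern(n_layers: int = 12, fullatt_block_indexes=(2, 5, 8, 11)):
    full = set(fullatt_block_indexes)
    return ["full" if i in full else "window" for i in range(n_layers)]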
11467
11468@ModelBase.register("SolarOpenForCausalLM")
11469class SolarOpenModel(Glm4MoeModel):
11470 model_arch = gguf.MODEL_ARCH.GLM4_MOE
11471
11472 def set_vocab(self):
11473 from transformers import AutoTokenizer
11474 tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
11475 special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
11476 tokens, toktypes, tokpre = self.get_vocab_base()
11477 self.gguf_writer.add_tokenizer_model("gpt2")
11478 self.gguf_writer.add_tokenizer_pre(tokpre)
11479 self.gguf_writer.add_token_list(tokens)
11480 self.gguf_writer.add_token_types(toktypes)
11481 special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
11482 special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])
11483 special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"])
11484 special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])
11485 special_vocab.add_to_gguf(self.gguf_writer)
11486
11487
11488###### CONVERSION LOGIC ######
11489
11490
11491# tree of lazy tensors
11492class LazyTorchTensor(gguf.LazyBase):
11493 _tensor_type = torch.Tensor
11494 # to keep the type-checker happy
11495 dtype: torch.dtype
11496 shape: torch.Size
11497
11498 # only used when converting a torch.Tensor to a np.ndarray
11499 _dtype_map: dict[torch.dtype, type] = {
11500 torch.float16: np.float16,
11501 torch.float32: np.float32,
11502 torch.uint8: np.uint8,
11503 }
11504
11505 # only used when byteswapping data; only the element size needs to be correct
11506 _dtype_byteswap_map: dict[torch.dtype, type] = {
11507 torch.float64: np.float64,
11508 torch.float32: np.float32,
11509 torch.bfloat16: np.float16,
11510 torch.float16: np.float16,
11511 torch.int64: np.int64,
11512 torch.uint64: np.uint64,
11513 torch.int32: np.int32,
11514 torch.uint32: np.uint32,
11515 torch.int16: np.int16,
11516 torch.uint16: np.uint16,
11517 torch.int8: np.int8,
11518 torch.uint8: np.uint8,
11519 torch.bool: np.uint8,
11520 torch.float8_e4m3fn: np.uint8,
11521 torch.float8_e5m2: np.uint8,
11522 }
11523
11524 # used for safetensors slices
11525 # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
11526 # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
11527 _dtype_str_map: dict[str, torch.dtype] = {
11528 "F64": torch.float64,
11529 "F32": torch.float32,
11530 "BF16": torch.bfloat16,
11531 "F16": torch.float16,
11532 # "U64": torch.uint64,
11533 "I64": torch.int64,
11534 # "U32": torch.uint32,
11535 "I32": torch.int32,
11536 # "U16": torch.uint16,
11537 "I16": torch.int16,
11538 "U8": torch.uint8,
11539 "I8": torch.int8,
11540 "BOOL": torch.bool,
11541 "F8_E4M3": torch.float8_e4m3fn,
11542 "F8_E5M2": torch.float8_e5m2,
11543 }
11544
11545 def numpy(self) -> gguf.LazyNumpyTensor:
11546 dtype = self._dtype_map[self.dtype]
11547 return gguf.LazyNumpyTensor(
11548 meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
11549 args=(self,),
11550 func=(lambda s: s.numpy())
11551 )
11552
11553 @classmethod
11554 def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
11555 return torch.empty(size=shape, dtype=dtype, device="meta")
11556
11557 @classmethod
11558 def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
11559 dtype = cls._dtype_str_map[st_slice.get_dtype()]
11560 shape: tuple[int, ...] = tuple(st_slice.get_shape())
11561 lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:])
11562 return cast(torch.Tensor, lazy)
11563
11564 @classmethod
11565 def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor:
11566 def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor:
11567 def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray:
11568 if sys.byteorder == 'big':
11569 # switch data back to big endian
11570 tensor = tensor.view(dtype).byteswap(inplace=False)
11571 return tensor
11572 dtype = cls._dtype_str_map[tensor.dtype]
11573 numpy_dtype = cls._dtype_byteswap_map[dtype]
11574 return torch.from_numpy(byteswap_tensor(tensor.mmap_bytes(), numpy_dtype)).view(dtype).reshape(tensor.shape)
11575 dtype = cls._dtype_str_map[t.dtype]
11576 shape = t.shape
11577 lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r))
11578 return cast(torch.Tensor, lazy)
11579
11580 @classmethod
11581 def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor):
11582 def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray:
11583 if sys.byteorder == 'big':
11584 # switch data back to big endian
11585 tensor = tensor.view(dtype).byteswap(inplace=False)
11586 return tensor
11587 dtype = cls._dtype_str_map[remote_tensor.dtype]
11588 numpy_dtype = cls._dtype_byteswap_map[dtype]
11589 shape = remote_tensor.shape
11590 meta = cls.meta_with_dtype_and_shape(dtype, shape)
11591 lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.from_numpy(byteswap_tensor(np.frombuffer(r.data(), dtype=numpy_dtype), numpy_dtype)).view(dtype).reshape(shape))
11592 return cast(torch.Tensor, lazy)
11593
11594 @classmethod
11595 def __torch_function__(cls, func, types, args=(), kwargs=None):
11596 del types # unused
11597
11598 if kwargs is None:
11599 kwargs = {}
11600
11601 if func is torch.Tensor.numpy:
11602 return args[0].numpy()
11603
11604 return cls._wrap_fn(func)(*args, **kwargs)
11605
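# A brief sketch of the "meta" device trick used by LazyTorchTensor above: a tensor
# allocated with device="meta" carries only dtype and shape, so shapes can be propagated
# through torch ops without materializing any data. This is what lets the converter plan
# tensor layouts lazily and only load real data when writing. The sizes are hypothetical.
def _sketch_meta_tensor_shapes():
    import torch
    a = torch.empty((4096, 4096), dtype=torch.float16, device="meta")
    b = torch.empty((4096, 11008), dtype=torch.float16, device="meta")
    c = a @ b                     # shape/dtype are propagated; no data is allocated or computed
    return c.shape, c.dtype       # torch.Size([4096, 11008]), torch.float16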
11606
11607def parse_args() -> argparse.Namespace:
11608 parser = argparse.ArgumentParser(
11609 description="Convert a huggingface model to a GGML compatible file")
11610 parser.add_argument(
11611 "--vocab-only", action="store_true",
11612 help="extract only the vocab",
11613 )
11614 parser.add_argument(
11615 "--outfile", type=Path,
11616 help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
11617 )
11618 parser.add_argument(
11619 "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="auto",
11620 help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type",
11621 )
11622 parser.add_argument(
11623 "--bigendian", action="store_true",
11624 help="model is executed on big endian machine",
11625 )
11626 parser.add_argument(
11627 "model", type=str,
11628 help="directory containing model file or huggingface repository ID (if --remote)",
11629 nargs="?",
11630 )
11631 parser.add_argument(
11632 "--use-temp-file", action="store_true",
11633 help="use the tempfile library while processing (helpful when running out of memory, process killed)",
11634 )
11635 parser.add_argument(
11636 "--no-lazy", action="store_true",
11637 help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
11638 )
11639 parser.add_argument(
11640 "--model-name", type=str, default=None,
11641 help="name of the model",
11642 )
11643 parser.add_argument(
11644 "--verbose", action="store_true",
11645 help="increase output verbosity",
11646 )
11647 parser.add_argument(
11648 "--split-max-tensors", type=int, default=0,
11649 help="max tensors in each split",
11650 )
11651 parser.add_argument(
11652 "--split-max-size", type=str, default="0",
11653 help="max size per split N(M|G)",
11654 )
11655 parser.add_argument(
11656 "--dry-run", action="store_true",
11657 help="only print out a split plan and exit, without writing any new files",
11658 )
11659 parser.add_argument(
11660 "--no-tensor-first-split", action="store_true",
11661 help="do not add tensors to the first split (disabled by default)"
11662 )
11663 parser.add_argument(
11664 "--metadata", type=Path,
11665 help="Specify the path for an authorship metadata override file"
11666 )
11667 parser.add_argument(
11668 "--print-supported-models", action="store_true",
11669 help="Print the supported models"
11670 )
11671 parser.add_argument(
11672 "--remote", action="store_true",
11673 help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.",
11674 )
11675 parser.add_argument(
11676 "--mmproj", action="store_true",
11677 help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.",
11678 )
11679 parser.add_argument(
11680 "--mistral-format", action="store_true",
11681 help="Whether the model is stored following the Mistral format.",
11682 )
11683 parser.add_argument(
11684 "--disable-mistral-community-chat-template", action="store_true",
11685 help=(
11686 "Whether to disable usage of Mistral community chat templates. If set, use the Mistral official `mistral-common` library for tokenization and detokenization of Mistral models. "
11687 "Using `mistral-common` ensure correctness and zero-day support of tokenization for models converted from the Mistral format but requires to manually setup the tokenization server."
11688 )
11689 )
11690
11691 parser.add_argument(
11692 "--sentence-transformers-dense-modules", action="store_true",
11693 help=("Whether to include sentence-transformers dense modules. "
11694 "It can be used for sentence-transformers models, like google/embeddinggemma-300m. "
11695 "Default these modules are not included.")
11696 )
11697
11698 args = parser.parse_args()
11699 if not args.print_supported_models and args.model is None:
11700 parser.error("the following arguments are required: model")
11701 return args
11702
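# A quick sketch of how the CLI defined in parse_args() might be exercised
# programmatically by substituting sys.argv. The model path and options are hypothetical;
# normal usage looks roughly like:
#   python convert_hf_to_gguf.py <model-dir> --outtype q8_0 --outfile model-{ftype}.gguf
def _sketch_parse_args_example() -> argparse.Namespace:
    import sys
    from unittest import mock
    argv = ["convert_hf_to_gguf.py", "/models/my-model", "--outtype", "q8_0", "--dry-run"]
    with mock.patch.object(sys, "argv", argv):
        return parse_args()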
11703
11704def split_str_to_n_bytes(split_str: str) -> int:
11705 if split_str.endswith("K"):
11706 n = int(split_str[:-1]) * 1000
11707 elif split_str.endswith("M"):
11708 n = int(split_str[:-1]) * 1000 * 1000
11709 elif split_str.endswith("G"):
11710 n = int(split_str[:-1]) * 1000 * 1000 * 1000
11711 elif split_str.isnumeric():
11712 n = int(split_str)
11713 else:
11714 raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
11715
11716 if n < 0:
11717 raise ValueError(f"Invalid split size: {split_str}, must be positive")
11718
11719 return n
11720
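# A few illustrative values for split_str_to_n_bytes() above. Note that the suffixes are
# decimal multipliers (K = 1000, M = 1000^2, G = 1000^3), not binary KiB/MiB/GiB, and
# that "0" (the default) disables splitting.
def _sketch_split_size_examples():
    assert split_str_to_n_bytes("0") == 0
    assert split_str_to_n_bytes("250K") == 250_000
    assert split_str_to_n_bytes("300M") == 300_000_000
    assert split_str_to_n_bytes("5G") == 5_000_000_000
    return True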
11721
11722def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
11723 # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
11724 # maybe we should fallback to text model's arch in that case, since not many models have both
11725 text_config = hparams.get("text_config", {})
11726 vision_config = hparams.get("vision_config", {})
11727 arch = None
11728 if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
11729 arch = arches[0]
11730 elif "ssm_cfg" in hparams:
11731 # For non-hf Mamba and Mamba2 models
11732 arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
11733
11734 # if "architectures" is found in the sub-config, use that instead
11735 if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
11736 arch = text_config["architectures"][0]
11737 elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
11738 arch = vision_config["architectures"][0]
11739 if arch is None:
11740 raise ValueError("Failed to detect model architecture")
11741 return arch
11742
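# A minimal sketch of how get_model_architecture() resolves the architecture string,
# using a hypothetical hparams dict. A sub-config's "architectures" entry, when present,
# overrides the top-level one for the requested model type.
def _sketch_get_model_architecture():
    hparams = {
        "architectures": ["LlavaForConditionalGeneration"],
        "text_config": {"architectures": ["MistralForCausalLM"]},
        "vision_config": {},
    }
    text_arch = get_model_architecture(hparams, ModelType.TEXT)      # "MistralForCausalLM"
    mmproj_arch = get_model_architecture(hparams, ModelType.MMPROJ)  # "LlavaForConditionalGeneration"
    return text_arch, mmproj_arch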
11743
11744def main() -> None:
11745 args = parse_args()
11746
11747 if args.print_supported_models:
11748 logger.error("Supported models:")
11749 ModelBase.print_registered_models()
11750 sys.exit(0)
11751
11752 if args.verbose:
11753 logging.basicConfig(level=logging.DEBUG)
11754 else:
11755 logging.basicConfig(level=logging.INFO)
11756
11757 if args.remote:
11758 hf_repo_id = args.model
11759 from huggingface_hub import snapshot_download
11760 allowed_patterns = ["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"]
11761 if args.sentence_transformers_dense_modules:
11762 # include sentence-transformers dense modules safetensors files
11763 allowed_patterns.append("*.safetensors")
11764 local_dir = snapshot_download(
11765 repo_id=hf_repo_id,
11766 allow_patterns=allowed_patterns)
11767 dir_model = Path(local_dir)
11768 logger.info(f"Downloaded config and tokenizer to {local_dir}")
11769 else:
11770 hf_repo_id = None
11771 dir_model = Path(args.model)
11772
11773 if not dir_model.is_dir():
11774 logger.error(f'Error: {dir_model} is not a directory')
11775 sys.exit(1)
11776
11777 ftype_map: dict[str, gguf.LlamaFileType] = {
11778 "f32": gguf.LlamaFileType.ALL_F32,
11779 "f16": gguf.LlamaFileType.MOSTLY_F16,
11780 "bf16": gguf.LlamaFileType.MOSTLY_BF16,
11781 "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
11782 "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
11783 "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
11784 "auto": gguf.LlamaFileType.GUESSED,
11785 }
11786
11787 is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
11788 if args.use_temp_file and is_split:
11789 logger.error("Error: Cannot use temp file when splitting")
11790 sys.exit(1)
11791
11792 if args.outfile is not None:
11793 fname_out = args.outfile
11794 elif hf_repo_id:
11795 # if remote, use the model ID as the output file name
11796 fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
11797 else:
11798 fname_out = dir_model
11799
11800 logger.info(f"Loading model: {dir_model.name}")
11801
11802 is_mistral_format = args.mistral_format
11803 if is_mistral_format and not _mistral_common_installed:
11804 raise ImportError(_mistral_import_error_msg)
11805 disable_mistral_community_chat_template = args.disable_mistral_community_chat_template
11806
11807 with torch.inference_mode():
11808 output_type = ftype_map[args.outtype]
11809 model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
11810 hparams = ModelBase.load_hparams(dir_model, is_mistral_format)
11811 if not is_mistral_format:
11812 model_architecture = get_model_architecture(hparams, model_type)
11813 logger.info(f"Model architecture: {model_architecture}")
11814 try:
11815 model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
11816 except NotImplementedError:
11817 logger.error(f"Model {model_architecture} is not supported")
11818 sys.exit(1)
11819 elif args.mmproj:
11820 assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
11821 model_class = PixtralModel
11822 elif "moe" in hparams:
11823 model_class = MistralMoeModel
11824 else:
11825 model_class = MistralModel
11826
11827 model_instance = model_class(dir_model, output_type, fname_out,
11828 is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
11829 eager=args.no_lazy,
11830 metadata_override=args.metadata, model_name=args.model_name,
11831 split_max_tensors=args.split_max_tensors,
11832 split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
11833 small_first_shard=args.no_tensor_first_split,
11834 remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
11835 sentence_transformers_dense_modules=args.sentence_transformers_dense_modules
11836 )
11837
11838 if args.vocab_only:
11839 logger.info("Exporting model vocab...")
11840 model_instance.write_vocab()
11841 logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
11842 else:
11843 logger.info("Exporting model...")
11844 model_instance.write()
11845 out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
11846 logger.info(f"Model successfully exported to {out_path}")
11847
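# A short sketch of the output-name templating used in main() above for remote
# conversions: the repo ID is flattened into a file name containing a "{ftype}"
# placeholder, which is later replaced by the selected output type (see the --outfile
# help). The repo ID below is hypothetical.
def _sketch_remote_outfile_name() -> Path:
    hf_repo_id = "example-org/example-model"
    fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
    # e.g. ./example-org-example-model-{ftype}.gguf
    return fname_out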
11848
11849if __name__ == '__main__':
11850 main()