archive llama.cpp-b8008.tar.gz
corpus lotr.txt map1_bromm.txt map1_dagna.txt map1_keldor.txt map1_skara.txt map1_thrain.txt
llama.cpp
.devops
nix apps.nix devshells.nix docker.nix jetson-support.nix nixpkgs-instances.nix package-gguf-py.nix package.nix python-scripts.nix scope.nix sif.nix
cann.Dockerfile cpu.Dockerfile cuda-new.Dockerfile cuda.Dockerfile intel.Dockerfile llama-cli-cann.Dockerfile llama-cpp-cuda.srpm.spec llama-cpp.srpm.spec musa.Dockerfile rocm.Dockerfile s390x.Dockerfile tools.sh vulkan.Dockerfile
.gemini settings.json
.github
ISSUE_TEMPLATE 010-bug-compilation.yml 011-bug-results.yml 019-bug-misc.yml 020-enhancement.yml 030-research.yml 040-refactor.yml config.yml
actions
get-tag-name action.yml
install-exe action.yml
linux-setup-spacemit action.yml
linux-setup-vulkan action.yml
unarchive-tar action.yml
windows-setup-cuda action.yml
windows-setup-rocm action.yml
workflows bench.yml.disabled build-cache.yml build-cmake-pkg.yml build-linux-cross.yml build.yml check-vendor.yml close-issue.yml copilot-setup-steps.yml docker.yml editorconfig.yml gguf-publish.yml labeler.yml pre-tokenizer-hashes.yml python-check-requirements.yml python-lint.yml python-type-check.yml release.yml server-metal.yml server-webui.yml server.yml update-ops-docs.yml winget.yml
labeler.yml pull_request_template.md
benches
dgx-spark aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.html aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.json aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547_allresults.json dgx-spark.md
mac-m2-ultra mac-m2-ultra.md
ci README-MUSA.md README.md run.sh
cmake arm64-apple-clang.cmake arm64-windows-llvm.cmake build-info.cmake common.cmake download-models.cmake git-vars.cmake license.cmake llama-config.cmake.in llama.pc.in riscv64-spacemit-linux-gnu-gcc.cmake x64-windows-llvm.cmake
common
jinja README.md caps.cpp caps.h lexer.cpp lexer.h parser.cpp parser.h runtime.cpp runtime.h string.cpp string.h utils.h value.cpp value.h
CMakeLists.txt arg.cpp arg.h base64.hpp build-info.cpp.in chat-parser-xml-toolcall.cpp chat-parser-xml-toolcall.h chat-parser.cpp chat-parser.h chat-peg-parser.cpp chat-peg-parser.h chat.cpp chat.h common.cpp common.h console.cpp console.h debug.cpp debug.h download.cpp download.h http.h json-partial.cpp json-partial.h json-schema-to-grammar.cpp json-schema-to-grammar.h llguidance.cpp log.cpp log.h ngram-cache.cpp ngram-cache.h ngram-map.cpp ngram-map.h ngram-mod.cpp ngram-mod.h peg-parser.cpp peg-parser.h preset.cpp preset.h regex-partial.cpp regex-partial.h sampling.cpp sampling.h speculative.cpp speculative.h unicode.cpp unicode.h
docs
android imported-into-android-studio.jpg
backend
VirtGPU configuration.md development.md
snapdragon CMakeUserPresets.json README.md developer.md windows.md
BLIS.md CANN.md CUDA-FEDORA.md OPENCL.md SYCL.md VirtGPU.md ZenDNN.md zDNN.md
development
llama-star idea-arch.key idea-arch.pdf
HOWTO-add-model.md debugging-tests.md parsing.md token_generation_performance_tips.md
multimodal MobileVLM.md gemma3.md glmedge.md granitevision.md llava.md minicpmo2.6.md minicpmo4.0.md minicpmv2.5.md minicpmv2.6.md minicpmv4.0.md minicpmv4.5.md
ops BLAS.csv CANN.csv CPU.csv CUDA.csv Metal.csv OpenCL.csv SYCL.csv Vulkan.csv WebGPU.csv ZenDNN.csv zDNN.csv
android.md build-riscv64-spacemit.md build-s390x.md build.md docker.md function-calling.md install.md llguidance.md multimodal.md ops.md preset.md speculative.md
examples
batched CMakeLists.txt README.md batched.cpp
batched.swift
Sources main.swift
.gitignore Makefile Package.swift README.md
convert-llama2c-to-ggml CMakeLists.txt README.md convert-llama2c-to-ggml.cpp
debug CMakeLists.txt README.md debug.cpp
deprecation-warning README.md deprecation-warning.cpp
diffusion CMakeLists.txt README.md diffusion-cli.cpp
embedding CMakeLists.txt README.md embedding.cpp
eval-callback CMakeLists.txt README.md eval-callback.cpp
gen-docs CMakeLists.txt gen-docs.cpp
gguf CMakeLists.txt gguf.cpp
gguf-hash
deps
rotate-bits package.json rotate-bits.h
sha1 package.json sha1.c sha1.h
sha256 package.json sha256.c sha256.h
xxhash clib.json xxhash.c xxhash.h
CMakeLists.txt README.md gguf-hash.cpp
idle CMakeLists.txt README.md idle.cpp
llama.android
app
src
main
java
com
example
llama MainActivity.kt MessageAdapter.kt
res
drawable bg_assistant_message.xml bg_user_message.xml ic_launcher_background.xml ic_launcher_foreground.xml outline_folder_open_24.xml outline_send_24.xml
layout activity_main.xml item_message_assistant.xml item_message_user.xml
mipmap-anydpi ic_launcher.xml ic_launcher_round.xml
mipmap-hdpi ic_launcher.webp ic_launcher_round.webp
mipmap-mdpi ic_launcher.webp ic_launcher_round.webp
mipmap-xhdpi ic_launcher.webp ic_launcher_round.webp
mipmap-xxhdpi ic_launcher.webp ic_launcher_round.webp
mipmap-xxxhdpi ic_launcher.webp ic_launcher_round.webp
values colors.xml strings.xml themes.xml
xml backup_rules.xml data_extraction_rules.xml
AndroidManifest.xml
.gitignore build.gradle.kts proguard-rules.pro
gradle
wrapper gradle-wrapper.jar gradle-wrapper.properties
libs.versions.toml
lib
src
androidTest
java
android
llama
cpp ExampleInstrumentedTest.kt
main
cpp CMakeLists.txt ai_chat.cpp logging.h
java
com
arm
aichat
gguf FileType.kt GgufMetadata.kt GgufMetadataReader.kt
internal
gguf GgufMetadataReaderImpl.kt
InferenceEngineImpl.kt
AiChat.kt InferenceEngine.kt
AndroidManifest.xml
test
java
android
llama
cpp ExampleUnitTest.kt
.gitignore build.gradle.kts consumer-rules.pro proguard-rules.pro
.gitignore build.gradle.kts gradle.properties gradlew settings.gradle.kts
llama.swiftui
llama.cpp.swift LibLlama.swift
llama.swiftui
Assets.xcassets
AppIcon.appiconset Contents.json
Contents.json
Models LlamaState.swift
Resources
models .gitignore
UI ContentView.swift DownloadButton.swift InputButton.swift LoadCustomButton.swift
llama_swiftuiApp.swift
llama.swiftui.xcodeproj
project.xcworkspace contents.xcworkspacedata
project.pbxproj
.gitignore README.md
lookahead CMakeLists.txt README.md lookahead.cpp
lookup CMakeLists.txt README.md lookup-create.cpp lookup-merge.cpp lookup-stats.cpp lookup.cpp
model-conversion
scripts
causal compare-embeddings-logits.sh compare-logits.py convert-model.sh modelcard.template run-casual-gen-embeddings-org.py run-converted-model-embeddings-logits.sh run-converted-model.sh run-org-model.py
embedding compare-embeddings-logits.sh convert-model.sh modelcard.template run-converted-model.sh run-original-model.py
utils __init__.py check-nmse.py common.py compare_tokens.py create-collection-add-model.sh curl-embedding-server.sh hf-add-model-to-collection.py hf-create-collection.py hf-create-model.py hf-upload-gguf-model.py inspect-converted-model.sh inspect-org-model.py perplexity-gen.sh perplexity-run-simple.sh perplexity-run.sh quantize.sh run-embedding-server.sh semantic_check.py tensor-info.py
.gitignore Makefile README.md requirements.txt
parallel CMakeLists.txt README.md parallel.cpp
passkey CMakeLists.txt README.md passkey.cpp
retrieval CMakeLists.txt README.md retrieval.cpp
save-load-state CMakeLists.txt save-load-state.cpp
simple CMakeLists.txt README.md simple.cpp
simple-chat CMakeLists.txt README.md simple-chat.cpp
simple-cmake-pkg .gitignore CMakeLists.txt README.md
speculative CMakeLists.txt README.md speculative.cpp
speculative-simple CMakeLists.txt README.md speculative-simple.cpp
sycl CMakeLists.txt README.md build.sh ls-sycl-device.cpp run-llama2.sh test.sh win-build-sycl.bat win-run-llama2.bat win-test.bat
training CMakeLists.txt README.md finetune.cpp
CMakeLists.txt convert_legacy_llama.py json_schema_pydantic_example.py json_schema_to_grammar.py llama.vim pydantic_models_to_grammar.py pydantic_models_to_grammar_examples.py reason-act.sh regex_to_grammar.py server-llama2-13B.sh server_embd.py ts-type-to-grammar.sh
ggml
cmake GitVars.cmake common.cmake ggml-config.cmake.in
include ggml-alloc.h ggml-backend.h ggml-blas.h ggml-cann.h ggml-cpp.h ggml-cpu.h ggml-cuda.h ggml-hexagon.h ggml-metal.h ggml-opencl.h ggml-opt.h ggml-rpc.h ggml-sycl.h ggml-virtgpu.h ggml-vulkan.h ggml-webgpu.h ggml-zdnn.h ggml-zendnn.h ggml.h gguf.h
src
ggml-blas CMakeLists.txt ggml-blas.cpp
ggml-cann CMakeLists.txt acl_tensor.cpp acl_tensor.h aclnn_ops.cpp aclnn_ops.h common.h ggml-cann.cpp
ggml-cpu
amx amx.cpp amx.h common.h mmq.cpp mmq.h
arch
arm cpu-feats.cpp quants.c repack.cpp
loongarch quants.c
powerpc cpu-feats.cpp quants.c
riscv cpu-feats.cpp quants.c repack.cpp
s390 cpu-feats.cpp quants.c
wasm quants.c
x86 cpu-feats.cpp quants.c repack.cpp
cmake FindSIMD.cmake
kleidiai kernels.cpp kernels.h kleidiai.cpp kleidiai.h
llamafile sgemm-ppc.h sgemm.cpp sgemm.h
spacemit ime.cpp ime.h ime1_kernels.cpp ime_kernels.h
CMakeLists.txt arch-fallback.h binary-ops.cpp binary-ops.h common.h ggml-cpu-impl.h ggml-cpu.c ggml-cpu.cpp hbm.cpp hbm.h ops.cpp ops.h quants.c quants.h repack.cpp repack.h simd-mappings.h traits.cpp traits.h unary-ops.cpp unary-ops.h vec.cpp vec.h
ggml-cuda
template-instances fattn-mma-f16-instance-ncols1_1-ncols2_16.cu fattn-mma-f16-instance-ncols1_1-ncols2_32.cu fattn-mma-f16-instance-ncols1_1-ncols2_8.cu fattn-mma-f16-instance-ncols1_16-ncols2_1.cu fattn-mma-f16-instance-ncols1_16-ncols2_2.cu fattn-mma-f16-instance-ncols1_16-ncols2_4.cu fattn-mma-f16-instance-ncols1_2-ncols2_16.cu fattn-mma-f16-instance-ncols1_2-ncols2_32.cu fattn-mma-f16-instance-ncols1_2-ncols2_4.cu fattn-mma-f16-instance-ncols1_2-ncols2_8.cu fattn-mma-f16-instance-ncols1_32-ncols2_1.cu fattn-mma-f16-instance-ncols1_32-ncols2_2.cu fattn-mma-f16-instance-ncols1_4-ncols2_16.cu fattn-mma-f16-instance-ncols1_4-ncols2_2.cu fattn-mma-f16-instance-ncols1_4-ncols2_4.cu fattn-mma-f16-instance-ncols1_4-ncols2_8.cu fattn-mma-f16-instance-ncols1_64-ncols2_1.cu fattn-mma-f16-instance-ncols1_8-ncols2_1.cu fattn-mma-f16-instance-ncols1_8-ncols2_2.cu fattn-mma-f16-instance-ncols1_8-ncols2_4.cu fattn-mma-f16-instance-ncols1_8-ncols2_8.cu fattn-tile-instance-dkq112-dv112.cu fattn-tile-instance-dkq128-dv128.cu fattn-tile-instance-dkq256-dv256.cu fattn-tile-instance-dkq40-dv40.cu fattn-tile-instance-dkq576-dv512.cu fattn-tile-instance-dkq64-dv64.cu fattn-tile-instance-dkq72-dv72.cu fattn-tile-instance-dkq80-dv80.cu fattn-tile-instance-dkq96-dv96.cu fattn-vec-instance-f16-f16.cu fattn-vec-instance-f16-q4_0.cu fattn-vec-instance-f16-q4_1.cu fattn-vec-instance-f16-q5_0.cu fattn-vec-instance-f16-q5_1.cu fattn-vec-instance-f16-q8_0.cu fattn-vec-instance-q4_0-f16.cu fattn-vec-instance-q4_0-q4_0.cu fattn-vec-instance-q4_0-q4_1.cu fattn-vec-instance-q4_0-q5_0.cu fattn-vec-instance-q4_0-q5_1.cu fattn-vec-instance-q4_0-q8_0.cu fattn-vec-instance-q4_1-f16.cu fattn-vec-instance-q4_1-q4_0.cu fattn-vec-instance-q4_1-q4_1.cu fattn-vec-instance-q4_1-q5_0.cu fattn-vec-instance-q4_1-q5_1.cu fattn-vec-instance-q4_1-q8_0.cu fattn-vec-instance-q5_0-f16.cu fattn-vec-instance-q5_0-q4_0.cu fattn-vec-instance-q5_0-q4_1.cu fattn-vec-instance-q5_0-q5_0.cu fattn-vec-instance-q5_0-q5_1.cu 
fattn-vec-instance-q5_0-q8_0.cu fattn-vec-instance-q5_1-f16.cu fattn-vec-instance-q5_1-q4_0.cu fattn-vec-instance-q5_1-q4_1.cu fattn-vec-instance-q5_1-q5_0.cu fattn-vec-instance-q5_1-q5_1.cu fattn-vec-instance-q5_1-q8_0.cu fattn-vec-instance-q8_0-f16.cu fattn-vec-instance-q8_0-q4_0.cu fattn-vec-instance-q8_0-q4_1.cu fattn-vec-instance-q8_0-q5_0.cu fattn-vec-instance-q8_0-q5_1.cu fattn-vec-instance-q8_0-q8_0.cu generate_cu_files.py mmf-instance-ncols_1.cu mmf-instance-ncols_10.cu mmf-instance-ncols_11.cu mmf-instance-ncols_12.cu mmf-instance-ncols_13.cu mmf-instance-ncols_14.cu mmf-instance-ncols_15.cu mmf-instance-ncols_16.cu mmf-instance-ncols_2.cu mmf-instance-ncols_3.cu mmf-instance-ncols_4.cu mmf-instance-ncols_5.cu mmf-instance-ncols_6.cu mmf-instance-ncols_7.cu mmf-instance-ncols_8.cu mmf-instance-ncols_9.cu mmq-instance-iq1_s.cu mmq-instance-iq2_s.cu mmq-instance-iq2_xs.cu mmq-instance-iq2_xxs.cu mmq-instance-iq3_s.cu mmq-instance-iq3_xxs.cu mmq-instance-iq4_nl.cu mmq-instance-iq4_xs.cu mmq-instance-mxfp4.cu mmq-instance-q2_k.cu mmq-instance-q3_k.cu mmq-instance-q4_0.cu mmq-instance-q4_1.cu mmq-instance-q4_k.cu mmq-instance-q5_0.cu mmq-instance-q5_1.cu mmq-instance-q5_k.cu mmq-instance-q6_k.cu mmq-instance-q8_0.cu
vendors cuda.h hip.h musa.h
CMakeLists.txt acc.cu acc.cuh add-id.cu add-id.cuh arange.cu arange.cuh argmax.cu argmax.cuh argsort.cu argsort.cuh binbcast.cu binbcast.cuh clamp.cu clamp.cuh common.cuh concat.cu concat.cuh conv-transpose-1d.cu conv-transpose-1d.cuh conv2d-dw.cu conv2d-dw.cuh conv2d-transpose.cu conv2d-transpose.cuh conv2d.cu conv2d.cuh convert.cu convert.cuh count-equal.cu count-equal.cuh cp-async.cuh cpy-utils.cuh cpy.cu cpy.cuh cross-entropy-loss.cu cross-entropy-loss.cuh cumsum.cu cumsum.cuh dequantize.cuh diag.cu diag.cuh diagmask.cu diagmask.cuh fattn-common.cuh fattn-mma-f16.cuh fattn-tile.cu fattn-tile.cuh fattn-vec.cuh fattn-wmma-f16.cu fattn-wmma-f16.cuh fattn.cu fattn.cuh fill.cu fill.cuh getrows.cu getrows.cuh ggml-cuda.cu gla.cu gla.cuh im2col.cu im2col.cuh mean.cu mean.cuh mma.cuh mmf.cu mmf.cuh mmid.cu mmid.cuh mmq.cu mmq.cuh mmvf.cu mmvf.cuh mmvq.cu mmvq.cuh norm.cu norm.cuh opt-step-adamw.cu opt-step-adamw.cuh opt-step-sgd.cu opt-step-sgd.cuh out-prod.cu out-prod.cuh pad.cu pad.cuh pad_reflect_1d.cu pad_reflect_1d.cuh pool2d.cu pool2d.cuh quantize.cu quantize.cuh reduce_rows.cuh roll.cu roll.cuh rope.cu rope.cuh scale.cu scale.cuh set-rows.cu set-rows.cuh set.cu set.cuh softcap.cu softcap.cuh softmax.cu softmax.cuh solve_tri.cu solve_tri.cuh ssm-conv.cu ssm-conv.cuh ssm-scan.cu ssm-scan.cuh sum.cu sum.cuh sumrows.cu sumrows.cuh top-k.cu top-k.cuh topk-moe.cu topk-moe.cuh tri.cu tri.cuh tsembd.cu tsembd.cuh unary.cu unary.cuh upscale.cu upscale.cuh vecdotq.cuh wkv.cu wkv.cuh
ggml-hexagon
htp CMakeLists.txt act-ops.c argsort-ops.c binary-ops.c cmake-toolchain.cmake cpy-ops.c flash-attn-ops.c get-rows-ops.c hex-dma.c hex-dma.h hex-dump.h hex-fastdiv.h hex-utils.h htp-ctx.h htp-msg.h htp-ops.h htp_iface.idl hvx-arith.h hvx-base.h hvx-copy.h hvx-div.h hvx-dump.h hvx-exp.h hvx-floor.h hvx-inverse.h hvx-reduce.h hvx-scale.h hvx-sigmoid.h hvx-sqrt.h hvx-types.h hvx-utils.h main.c matmul-ops.c rope-ops.c set-rows-ops.c softmax-ops.c sum-rows-ops.c unary-ops.c worker-pool.c worker-pool.h
CMakeLists.txt ggml-hexagon.cpp htp-drv.cpp htp-drv.h libdl.h libggml-htp.inf op-desc.h
ggml-hip CMakeLists.txt
ggml-metal CMakeLists.txt ggml-metal-common.cpp ggml-metal-common.h ggml-metal-context.h ggml-metal-context.m ggml-metal-device.cpp ggml-metal-device.h ggml-metal-device.m ggml-metal-impl.h ggml-metal-ops.cpp ggml-metal-ops.h ggml-metal.cpp ggml-metal.metal
ggml-musa CMakeLists.txt mudnn.cu mudnn.cuh
ggml-opencl
kernels add.cl add_id.cl argsort.cl clamp.cl concat.cl conv2d.cl conv2d_f16_f32.cl cpy.cl cvt.cl diag_mask_inf.cl div.cl embed_kernel.py expm1.cl fill.cl flash_attn_f16.cl flash_attn_f32.cl flash_attn_f32_f16.cl gelu.cl gemm_moe_mxfp4_f32.cl gemv_moe_mxfp4_f32.cl gemv_noshuffle.cl gemv_noshuffle_general.cl gemv_noshuffle_general_q8_0_f32.cl get_rows.cl glu.cl group_norm.cl im2col_f16.cl im2col_f32.cl mean.cl mul.cl mul_mat_Ab_Bi_8x4.cl mul_mat_f16_f32.cl mul_mm_f16_f32_kq_kqv.cl mul_mm_f16_f32_l4_lm.cl mul_mm_f32_f32_l4_lm.cl mul_mm_q6_k_f32_l4_lm.cl mul_mm_q8_0_f32_8x4.cl mul_mm_q8_0_f32_l4_lm.cl mul_mv_f16_f16.cl mul_mv_f16_f32.cl mul_mv_f16_f32_1row.cl mul_mv_f16_f32_l4.cl mul_mv_f32_f32.cl mul_mv_id_mxfp4_f32.cl mul_mv_id_mxfp4_f32_flat.cl mul_mv_id_q4_0_f32_8x_flat.cl mul_mv_id_q8_0_f32.cl mul_mv_id_q8_0_f32_flat.cl mul_mv_mxfp4_f32.cl mul_mv_mxfp4_f32_flat.cl mul_mv_q4_0_f32.cl mul_mv_q4_0_f32_1d_16x_flat.cl mul_mv_q4_0_f32_1d_8x_flat.cl mul_mv_q4_0_f32_8x_flat.cl mul_mv_q4_0_f32_v.cl mul_mv_q4_k_f32.cl mul_mv_q6_k_f32.cl mul_mv_q6_k_f32_flat.cl mul_mv_q8_0_f32.cl mul_mv_q8_0_f32_flat.cl norm.cl pad.cl relu.cl repeat.cl rms_norm.cl rope.cl scale.cl set_rows.cl sigmoid.cl silu.cl softmax_4_f16.cl softmax_4_f32.cl softmax_f16.cl softmax_f32.cl softplus.cl solve_tri.cl sqr.cl sqrt.cl ssm_conv.cl sub.cl sum_rows.cl tanh.cl transpose.cl tri.cl tsembd.cl upscale.cl
CMakeLists.txt ggml-opencl.cpp
ggml-rpc CMakeLists.txt ggml-rpc.cpp
ggml-sycl
dpct helper.hpp
CMakeLists.txt add-id.cpp add-id.hpp backend.hpp binbcast.cpp binbcast.hpp common.cpp common.hpp concat.cpp concat.hpp conv.cpp conv.hpp convert.cpp convert.hpp count-equal.cpp count-equal.hpp cpy.cpp cpy.hpp dequantize.hpp dmmv.cpp dmmv.hpp element_wise.cpp element_wise.hpp gemm.hpp getrows.cpp getrows.hpp ggml-sycl.cpp gla.cpp gla.hpp im2col.cpp im2col.hpp mmq.cpp mmq.hpp mmvq.cpp mmvq.hpp norm.cpp norm.hpp outprod.cpp outprod.hpp pad.cpp pad.hpp pad_reflect_1d.cpp pad_reflect_1d.hpp presets.hpp quantize.hpp quants.hpp repeat_back.cpp repeat_back.hpp roll.cpp roll.hpp rope.cpp rope.hpp set.cpp set.hpp set_rows.cpp set_rows.hpp softmax.cpp softmax.hpp ssm_conv.cpp ssm_conv.hpp sycl_hw.cpp sycl_hw.hpp tsembd.cpp tsembd.hpp vecdotq.hpp wkv.cpp wkv.hpp
ggml-virtgpu
backend
shared api_remoting.h apir_backend.gen.h apir_backend.h apir_cs.h apir_cs_ggml.h apir_cs_rpc.h
CMakeLists.txt apir_cs_ggml-rpc-back.cpp backend-convert.h backend-dispatched-backend.cpp backend-dispatched-buffer-type.cpp backend-dispatched-buffer.cpp backend-dispatched-device.cpp backend-dispatched.cpp backend-dispatched.gen.h backend-dispatched.h backend-virgl-apir.h backend.cpp
include apir_hw.h
CMakeLists.txt apir_cs_ggml-rpc-front.cpp ggml-backend-buffer-type.cpp ggml-backend-buffer.cpp ggml-backend-device.cpp ggml-backend-reg.cpp ggml-backend.cpp ggml-remoting.h ggmlremoting_functions.yaml regenerate_remoting.py virtgpu-apir.h virtgpu-forward-backend.cpp virtgpu-forward-buffer-type.cpp virtgpu-forward-buffer.cpp virtgpu-forward-device.cpp virtgpu-forward-impl.h virtgpu-forward.gen.h virtgpu-shm.cpp virtgpu-shm.h virtgpu-utils.cpp virtgpu-utils.h virtgpu.cpp virtgpu.h
ggml-vulkan
cmake host-toolchain.cmake.in
vulkan-shaders
feature-tests bfloat16.comp coopmat.comp coopmat2.comp integer_dot.comp
CMakeLists.txt abs.comp acc.comp add.comp add1.comp add_id.comp arange.comp argmax.comp argsort.comp argsort_large.comp ceil.comp clamp.comp concat.comp contig_copy.comp conv2d_dw.comp conv2d_mm.comp conv_transpose_1d.comp copy.comp copy_from_quant.comp copy_to_quant.comp copy_transpose.comp cos.comp count_equal.comp count_experts.comp cumsum.comp cumsum_multipass1.comp cumsum_multipass2.comp dequant_f32.comp dequant_funcs.glsl dequant_funcs_cm2.glsl dequant_head.glsl dequant_iq1_m.comp dequant_iq1_s.comp dequant_iq2_s.comp dequant_iq2_xs.comp dequant_iq2_xxs.comp dequant_iq3_s.comp dequant_iq3_xxs.comp dequant_iq4_nl.comp dequant_iq4_xs.comp dequant_mxfp4.comp dequant_q2_k.comp dequant_q3_k.comp dequant_q4_0.comp dequant_q4_1.comp dequant_q4_k.comp dequant_q5_0.comp dequant_q5_1.comp dequant_q5_k.comp dequant_q6_k.comp dequant_q8_0.comp diag.comp diag_mask_inf.comp div.comp exp.comp fill.comp flash_attn.comp flash_attn_base.glsl flash_attn_cm1.comp flash_attn_cm2.comp flash_attn_mask_opt.comp flash_attn_split_k_reduce.comp floor.comp geglu.comp geglu_erf.comp geglu_quick.comp gelu.comp gelu_erf.comp gelu_quick.comp generic_binary_head.glsl generic_head.glsl generic_unary_head.glsl get_rows.comp get_rows_quant.comp glu_head.glsl glu_main.glsl group_norm.comp hardsigmoid.comp hardswish.comp im2col.comp im2col_3d.comp l2_norm.comp leaky_relu.comp log.comp mul.comp mul_mat_split_k_reduce.comp mul_mat_vec.comp mul_mat_vec_base.glsl mul_mat_vec_iface.glsl mul_mat_vec_iq1_m.comp mul_mat_vec_iq1_s.comp mul_mat_vec_iq2_s.comp mul_mat_vec_iq2_xs.comp mul_mat_vec_iq2_xxs.comp mul_mat_vec_iq3_s.comp mul_mat_vec_iq3_xxs.comp mul_mat_vec_nc.comp mul_mat_vec_p021.comp mul_mat_vec_q2_k.comp mul_mat_vec_q3_k.comp mul_mat_vec_q4_k.comp mul_mat_vec_q5_k.comp mul_mat_vec_q6_k.comp mul_mat_vecq.comp mul_mat_vecq_funcs.glsl mul_mm.comp mul_mm_cm2.comp mul_mm_funcs.glsl mul_mm_id_funcs.glsl mul_mmq.comp mul_mmq_funcs.glsl mul_mmq_shmem_types.glsl multi_add.comp neg.comp norm.comp 
opt_step_adamw.comp opt_step_sgd.comp pad.comp pool2d.comp quantize_q8_1.comp reglu.comp relu.comp repeat.comp repeat_back.comp rms_norm.comp rms_norm_back.comp rms_norm_partials.comp roll.comp rope_funcs.glsl rope_head.glsl rope_multi.comp rope_neox.comp rope_norm.comp rope_params.glsl rope_vision.comp round.comp rte.glsl scale.comp sigmoid.comp silu.comp silu_back.comp sin.comp soft_max.comp soft_max_back.comp soft_max_large1.comp soft_max_large2.comp soft_max_large3.comp soft_max_large_common.glsl softplus.comp solve_tri.comp sqrt.comp square.comp ssm_conv.comp ssm_scan.comp step.comp sub.comp sum_rows.comp sum_rows.glsl swiglu.comp swiglu_oai.comp tanh.comp timestep_embedding.comp topk_argsort.comp topk_moe.comp topk_nary_search.comp tri.comp trunc.comp types.glsl upscale.comp utils.glsl vulkan-shaders-gen.cpp wkv6.comp wkv7.comp xielu.comp
CMakeLists.txt ggml-vulkan.cpp
ggml-webgpu
wgsl-shaders argmax.wgsl argsort.wgsl argsort_merge.wgsl binary.wgsl common_decls.tmpl cpy.tmpl.wgsl cumsum.wgsl embed_wgsl.py flash_attn.wgsl get_rows.tmpl.wgsl glu.tmpl.wgsl memset.wgsl mul_mat.tmpl.wgsl mul_mat_decls.tmpl mul_mat_reg_tile.tmpl.wgsl mul_mat_subgroup_matrix.tmpl.wgsl mul_mat_vec.tmpl.wgsl pad.wgsl rms_norm.wgsl rope.tmpl.wgsl scale.tmpl.wgsl set_rows.wgsl soft_max.tmpl.wgsl sum_rows.wgsl unary.wgsl
CMakeLists.txt ggml-webgpu-shader-lib.hpp ggml-webgpu.cpp pre_wgsl.hpp
ggml-zdnn .gitignore CMakeLists.txt common.hpp ggml-zdnn.cpp mmf.cpp mmf.hpp utils.cpp utils.hpp
ggml-zendnn CMakeLists.txt ggml-zendnn.cpp
CMakeLists.txt ggml-alloc.c ggml-backend-dl.cpp ggml-backend-dl.h ggml-backend-impl.h ggml-backend-reg.cpp ggml-backend.cpp ggml-common.h ggml-impl.h ggml-opt.cpp ggml-quants.c ggml-quants.h ggml-threading.cpp ggml-threading.h ggml.c ggml.cpp gguf.cpp
.gitignore CMakeLists.txt
gguf-py
examples reader.py writer.py
gguf
scripts gguf_convert_endian.py gguf_dump.py gguf_editor_gui.py gguf_hash.py gguf_new_metadata.py gguf_set_metadata.py
__init__.py constants.py gguf.py gguf_reader.py gguf_writer.py lazy.py metadata.py py.typed quants.py tensor_mapping.py utility.py vocab.py
tests __init__.py test_metadata.py test_quants.py
LICENSE README.md pyproject.toml
grammars README.md arithmetic.gbnf c.gbnf chess.gbnf english.gbnf japanese.gbnf json.gbnf json_arr.gbnf list.gbnf
include llama-cpp.h llama.h
licenses LICENSE-jsonhpp
media llama0-banner.png llama0-logo.png llama1-banner.png llama1-icon-transparent.png llama1-icon-transparent.svg llama1-icon.png llama1-icon.svg llama1-logo.png llama1-logo.svg matmul.png matmul.svg
models
templates Apertus-8B-Instruct.jinja ByteDance-Seed-OSS.jinja CohereForAI-c4ai-command-r-plus-tool_use.jinja CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja GLM-4.6.jinja Kimi-K2-Instruct.jinja Kimi-K2-Thinking.jinja MiMo-VL.jinja MiniMax-M2.jinja Mistral-Small-3.2-24B-Instruct-2506.jinja NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja NVIDIA-Nemotron-Nano-v2.jinja NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja Qwen-QwQ-32B.jinja Qwen-Qwen2.5-7B-Instruct.jinja Qwen-Qwen3-0.6B.jinja Qwen3-Coder.jinja README.md deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja deepseek-ai-DeepSeek-V3.1.jinja fireworks-ai-llama-3-firefunction-v2.jinja google-gemma-2-2b-it.jinja ibm-granite-granite-3.3-2B-Instruct.jinja llama-cpp-deepseek-r1.jinja llama-cpp-lfm2.jinja llama-cpp-rwkv-world.jinja meetkai-functionary-medium-v3.1.jinja meetkai-functionary-medium-v3.2.jinja meta-llama-Llama-3.1-8B-Instruct.jinja meta-llama-Llama-3.2-3B-Instruct.jinja meta-llama-Llama-3.3-70B-Instruct.jinja microsoft-Phi-3.5-mini-instruct.jinja mistralai-Ministral-3-14B-Reasoning-2512.jinja mistralai-Mistral-Nemo-Instruct-2407.jinja moonshotai-Kimi-K2.jinja openai-gpt-oss-120b.jinja unsloth-Apriel-1.5.jinja unsloth-mistral-Devstral-Small-2507.jinja upstage-Solar-Open-100B.jinja
.editorconfig ggml-vocab-aquila.gguf ggml-vocab-baichuan.gguf ggml-vocab-bert-bge.gguf ggml-vocab-bert-bge.gguf.inp ggml-vocab-bert-bge.gguf.out ggml-vocab-command-r.gguf ggml-vocab-command-r.gguf.inp ggml-vocab-command-r.gguf.out ggml-vocab-deepseek-coder.gguf ggml-vocab-deepseek-coder.gguf.inp ggml-vocab-deepseek-coder.gguf.out ggml-vocab-deepseek-llm.gguf ggml-vocab-deepseek-llm.gguf.inp ggml-vocab-deepseek-llm.gguf.out ggml-vocab-falcon.gguf ggml-vocab-falcon.gguf.inp ggml-vocab-falcon.gguf.out ggml-vocab-gpt-2.gguf ggml-vocab-gpt-2.gguf.inp ggml-vocab-gpt-2.gguf.out ggml-vocab-gpt-neox.gguf ggml-vocab-llama-bpe.gguf ggml-vocab-llama-bpe.gguf.inp ggml-vocab-llama-bpe.gguf.out ggml-vocab-llama-spm.gguf ggml-vocab-llama-spm.gguf.inp ggml-vocab-llama-spm.gguf.out ggml-vocab-mpt.gguf ggml-vocab-mpt.gguf.inp ggml-vocab-mpt.gguf.out ggml-vocab-nomic-bert-moe.gguf ggml-vocab-phi-3.gguf ggml-vocab-phi-3.gguf.inp ggml-vocab-phi-3.gguf.out ggml-vocab-qwen2.gguf ggml-vocab-qwen2.gguf.inp ggml-vocab-qwen2.gguf.out ggml-vocab-refact.gguf ggml-vocab-refact.gguf.inp ggml-vocab-refact.gguf.out ggml-vocab-starcoder.gguf ggml-vocab-starcoder.gguf.inp ggml-vocab-starcoder.gguf.out
pocs
vdot CMakeLists.txt q8dot.cpp vdot.cpp
CMakeLists.txt
requirements requirements-all.txt requirements-compare-llama-bench.txt requirements-convert_hf_to_gguf.txt requirements-convert_hf_to_gguf_update.txt requirements-convert_legacy_llama.txt requirements-convert_llama_ggml_to_gguf.txt requirements-convert_lora_to_gguf.txt requirements-gguf_editor_gui.txt requirements-pydantic.txt requirements-server-bench.txt requirements-test-tokenizer-random.txt requirements-tool_bench.txt
scripts
apple validate-apps.sh validate-ios.sh validate-macos.sh validate-tvos.sh validate-visionos.sh
jinja jinja-tester.py requirements.txt
snapdragon
adb llama-cli.farf run-bench.sh run-cli.sh run-completion.sh run-mtmd.sh run-tool.sh
qdc
tests test_bench.py
readme.md requirements.txt
windows run-bench.ps1 run-cli.ps1 run-tool.ps1 setup-build.ps1
bench-models.sh build-info.sh check-requirements.sh compare-commits.sh compare-llama-bench.py compare-logprobs.py create_ops_docs.py debug-test.sh fetch_server_test_models.py gen-authors.sh gen-unicode-data.py get-flags.mk get-hellaswag.sh get-pg.sh get-wikitext-103.sh get-wikitext-2.sh get-winogrande.sh get_chat_template.py hf.sh install-oneapi.bat pr2wt.sh serve-static.js server-bench.py sync-ggml-am.sh sync-ggml.last sync-ggml.sh sync_vendor.py tool_bench.py tool_bench.sh verify-checksum-models.py xxd.cmake
src
models afmoe.cpp apertus.cpp arcee.cpp arctic.cpp arwkv7.cpp baichuan.cpp bailingmoe.cpp bailingmoe2.cpp bert.cpp bitnet.cpp bloom.cpp chameleon.cpp chatglm.cpp codeshell.cpp cogvlm.cpp cohere2-iswa.cpp command-r.cpp dbrx.cpp deci.cpp deepseek.cpp deepseek2.cpp dots1.cpp dream.cpp ernie4-5-moe.cpp ernie4-5.cpp exaone-moe.cpp exaone.cpp exaone4.cpp falcon-h1.cpp falcon.cpp gemma-embedding.cpp gemma.cpp gemma2-iswa.cpp gemma3.cpp gemma3n-iswa.cpp glm4-moe.cpp glm4.cpp gpt2.cpp gptneox.cpp granite-hybrid.cpp granite.cpp graph-context-mamba.cpp grok.cpp grovemoe.cpp hunyuan-dense.cpp hunyuan-moe.cpp internlm2.cpp jais.cpp jamba.cpp kimi-linear.cpp lfm2.cpp llada-moe.cpp llada.cpp llama-iswa.cpp llama.cpp maincoder.cpp mamba.cpp mimo2-iswa.cpp minicpm3.cpp minimax-m2.cpp mistral3.cpp models.h modern-bert.cpp mpt.cpp nemotron-h.cpp nemotron.cpp neo-bert.cpp olmo.cpp olmo2.cpp olmoe.cpp openai-moe-iswa.cpp openelm.cpp orion.cpp pangu-embedded.cpp phi2.cpp phi3.cpp plamo.cpp plamo2.cpp plamo3.cpp plm.cpp qwen.cpp qwen2.cpp qwen2moe.cpp qwen2vl.cpp qwen3.cpp qwen35.cpp qwen35moe.cpp qwen3moe.cpp qwen3next.cpp qwen3vl-moe.cpp qwen3vl.cpp refact.cpp rnd1.cpp rwkv6-base.cpp rwkv6.cpp rwkv6qwen2.cpp rwkv7-base.cpp rwkv7.cpp seed-oss.cpp smallthinker.cpp smollm3.cpp stablelm.cpp starcoder.cpp starcoder2.cpp step35-iswa.cpp t5-dec.cpp t5-enc.cpp wavtokenizer-dec.cpp xverse.cpp
CMakeLists.txt llama-adapter.cpp llama-adapter.h llama-arch.cpp llama-arch.h llama-batch.cpp llama-batch.h llama-chat.cpp llama-chat.h llama-context.cpp llama-context.h llama-cparams.cpp llama-cparams.h llama-grammar.cpp llama-grammar.h llama-graph.cpp llama-graph.h llama-hparams.cpp llama-hparams.h llama-impl.cpp llama-impl.h llama-io.cpp llama-io.h llama-kv-cache-iswa.cpp llama-kv-cache-iswa.h llama-kv-cache.cpp llama-kv-cache.h llama-kv-cells.h llama-memory-hybrid-iswa.cpp llama-memory-hybrid-iswa.h llama-memory-hybrid.cpp llama-memory-hybrid.h llama-memory-recurrent.cpp llama-memory-recurrent.h llama-memory.cpp llama-memory.h llama-mmap.cpp llama-mmap.h llama-model-loader.cpp llama-model-loader.h llama-model-saver.cpp llama-model-saver.h llama-model.cpp llama-model.h llama-quant.cpp llama-quant.h llama-sampler.cpp llama-sampler.h llama-vocab.cpp llama-vocab.h llama.cpp unicode-data.cpp unicode-data.h unicode.cpp unicode.h
tests
peg-parser simple-tokenize.cpp simple-tokenize.h test-basic.cpp test-gbnf-generation.cpp test-json-parser.cpp test-json-serialization.cpp test-unicode.cpp tests.h
.gitignore CMakeLists.txt get-model.cpp get-model.h run-json-schema-to-grammar.mjs test-alloc.cpp test-arg-parser.cpp test-autorelease.cpp test-backend-ops.cpp test-backend-sampler.cpp test-barrier.cpp test-c.c test-chat-parser.cpp test-chat-peg-parser.cpp test-chat-template.cpp test-chat.cpp test-double-float.cpp test-gbnf-validator.cpp test-gguf.cpp test-grammar-integration.cpp test-grammar-llguidance.cpp test-grammar-parser.cpp test-jinja.cpp test-json-partial.cpp test-json-schema-to-grammar.cpp test-llama-grammar.cpp test-log.cpp test-lora-conversion-inference.sh test-model-load-cancel.cpp test-mtmd-c-api.c test-opt.cpp test-peg-parser.cpp test-quantize-fns.cpp test-quantize-perf.cpp test-quantize-stats.cpp test-regex-partial.cpp test-rope.cpp test-sampling.cpp test-state-restore-fragmented.cpp test-thread-safety.cpp test-tokenizer-0.cpp test-tokenizer-0.py test-tokenizer-0.sh test-tokenizer-1-bpe.cpp test-tokenizer-1-spm.cpp test-tokenizer-random.py test-tokenizers-repo.sh testing.h
tools
batched-bench CMakeLists.txt README.md batched-bench.cpp
cli CMakeLists.txt README.md cli.cpp
completion CMakeLists.txt README.md completion.cpp
cvector-generator CMakeLists.txt README.md completions.txt cvector-generator.cpp mean.hpp negative.txt pca.hpp positive.txt
export-lora CMakeLists.txt README.md export-lora.cpp
fit-params CMakeLists.txt README.md fit-params.cpp
gguf-split CMakeLists.txt README.md gguf-split.cpp tests.sh
imatrix CMakeLists.txt README.md imatrix.cpp
llama-bench CMakeLists.txt README.md llama-bench.cpp
mtmd
legacy-models convert_image_encoder_to_gguf.py glmedge-convert-image-encoder-to-gguf.py glmedge-surgery.py llava_surgery.py llava_surgery_v2.py minicpmv-convert-image-encoder-to-gguf.py minicpmv-surgery.py
models cogvlm.cpp conformer.cpp glm4v.cpp internvl.cpp kimik25.cpp kimivl.cpp llama4.cpp llava.cpp minicpmv.cpp mobilenetv5.cpp models.h pixtral.cpp qwen2vl.cpp qwen3vl.cpp siglip.cpp whisper-enc.cpp youtuvl.cpp
CMakeLists.txt README.md clip-graph.h clip-impl.h clip-model.h clip.cpp clip.h deprecation-warning.cpp mtmd-audio.cpp mtmd-audio.h mtmd-cli.cpp mtmd-helper.cpp mtmd-helper.h mtmd.cpp mtmd.h requirements.txt test-1.jpeg test-2.mp3 tests.sh
perplexity CMakeLists.txt README.md perplexity.cpp
quantize CMakeLists.txt README.md quantize.cpp tests.sh
rpc CMakeLists.txt README.md rpc-server.cpp
server
bench README.md bench.py prometheus.yml requirements.txt script.js
public index.html.gz loading.html
public_legacy colorthemes.css completion.js favicon.ico index-new.html index.html index.js json-schema-to-grammar.mjs loading.html prompt-formats.js style.css system-prompts.js theme-beeninorder.css theme-ketivah.css theme-mangotango.css theme-playground.css theme-polarnight.css theme-snowstorm.css
public_simplechat datautils.mjs index.html readme.md simplechat.css simplechat.js simplechat_screens.webp ui.mjs
tests
unit test_basic.py test_chat_completion.py test_compat_anthropic.py test_compat_oai_responses.py test_completion.py test_ctx_shift.py test_embedding.py test_infill.py test_lora.py test_rerank.py test_router.py test_security.py test_sleep.py test_slot_save.py test_speculative.py test_template.py test_tokenize.py test_tool_call.py test_vision_api.py
.gitignore README.md conftest.py pytest.ini requirements.txt tests.sh utils.py
themes
buttons-top README.md buttons_top.png favicon.ico index.html
wild README.md favicon.ico index.html llama_cpp.png llamapattern.png wild.png
README.md
webui
.storybook ModeWatcherDecorator.svelte TooltipProviderDecorator.svelte main.ts preview.ts vitest.setup.ts
docs
architecture high-level-architecture-simplified.md high-level-architecture.md
flows chat-flow.md conversations-flow.md data-flow-simplified-model-mode.md data-flow-simplified-router-mode.md database-flow.md models-flow.md server-flow.md settings-flow.md
scripts dev.sh install-git-hooks.sh post-build.sh
src
lib
components
app
chat
ChatAttachments ChatAttachmentPreview.svelte ChatAttachmentThumbnailFile.svelte ChatAttachmentThumbnailImage.svelte ChatAttachmentsList.svelte ChatAttachmentsViewAll.svelte
ChatForm
ChatFormActions ChatFormActionFileAttachments.svelte ChatFormActionRecord.svelte ChatFormActionSubmit.svelte ChatFormActions.svelte
ChatForm.svelte ChatFormFileInputInvisible.svelte ChatFormHelperText.svelte ChatFormTextarea.svelte
ChatMessages ChatMessage.svelte ChatMessageActions.svelte ChatMessageAssistant.svelte ChatMessageBranchingControls.svelte ChatMessageEditForm.svelte ChatMessageStatistics.svelte ChatMessageSystem.svelte ChatMessageThinkingBlock.svelte ChatMessageUser.svelte ChatMessages.svelte
ChatScreen ChatScreen.svelte ChatScreenDragOverlay.svelte ChatScreenHeader.svelte ChatScreenProcessingInfo.svelte
ChatSettings ChatSettings.svelte ChatSettingsFields.svelte ChatSettingsFooter.svelte ChatSettingsImportExportTab.svelte ChatSettingsParameterSourceIndicator.svelte
ChatSidebar ChatSidebar.svelte ChatSidebarActions.svelte ChatSidebarConversationItem.svelte ChatSidebarSearch.svelte handle-mobile-sidebar-item-click.ts
dialogs DialogChatAttachmentPreview.svelte DialogChatAttachmentsViewAll.svelte DialogChatError.svelte DialogChatSettings.svelte DialogConfirmation.svelte DialogConversationSelection.svelte DialogConversationTitleUpdate.svelte DialogEmptyFileAlert.svelte DialogModelInformation.svelte DialogModelNotAvailable.svelte
misc ActionButton.svelte ActionDropdown.svelte BadgeChatStatistic.svelte BadgeInfo.svelte BadgeModality.svelte CodePreviewDialog.svelte ConversationSelection.svelte CopyToClipboardIcon.svelte KeyboardShortcutInfo.svelte MarkdownContent.svelte RemoveButton.svelte SearchInput.svelte SyntaxHighlightedCode.svelte
models ModelBadge.svelte ModelsSelector.svelte
server ServerErrorSplash.svelte ServerLoadingSplash.svelte ServerStatus.svelte
index.ts
ui
alert alert-description.svelte alert-title.svelte alert.svelte index.ts
alert-dialog alert-dialog-action.svelte alert-dialog-cancel.svelte alert-dialog-content.svelte alert-dialog-description.svelte alert-dialog-footer.svelte alert-dialog-header.svelte alert-dialog-overlay.svelte alert-dialog-title.svelte alert-dialog-trigger.svelte index.ts
badge badge.svelte index.ts
button button.svelte index.ts
card card-action.svelte card-content.svelte card-description.svelte card-footer.svelte card-header.svelte card-title.svelte card.svelte index.ts
checkbox checkbox.svelte index.ts
collapsible collapsible-content.svelte collapsible-trigger.svelte collapsible.svelte index.ts
dialog dialog-close.svelte dialog-content.svelte dialog-description.svelte dialog-footer.svelte dialog-header.svelte dialog-overlay.svelte dialog-title.svelte dialog-trigger.svelte index.ts
dropdown-menu dropdown-menu-checkbox-item.svelte dropdown-menu-content.svelte dropdown-menu-group-heading.svelte dropdown-menu-group.svelte dropdown-menu-item.svelte dropdown-menu-label.svelte dropdown-menu-radio-group.svelte dropdown-menu-radio-item.svelte dropdown-menu-separator.svelte dropdown-menu-shortcut.svelte dropdown-menu-sub-content.svelte dropdown-menu-sub-trigger.svelte dropdown-menu-trigger.svelte index.ts
input index.ts input.svelte
label index.ts label.svelte
popover index.ts popover-close.svelte popover-content.svelte popover-portal.svelte popover-trigger.svelte popover.svelte
scroll-area index.ts scroll-area-scrollbar.svelte scroll-area.svelte
select index.ts select-content.svelte select-group-heading.svelte select-group.svelte select-item.svelte select-label.svelte select-scroll-down-button.svelte select-scroll-up-button.svelte select-separator.svelte select-trigger.svelte
separator index.ts separator.svelte
sheet index.ts sheet-close.svelte sheet-content.svelte sheet-description.svelte sheet-footer.svelte sheet-header.svelte sheet-overlay.svelte sheet-title.svelte sheet-trigger.svelte
sidebar constants.ts context.svelte.ts index.ts sidebar-content.svelte sidebar-footer.svelte sidebar-group-action.svelte sidebar-group-content.svelte sidebar-group-label.svelte sidebar-group.svelte sidebar-header.svelte sidebar-input.svelte sidebar-inset.svelte sidebar-menu-action.svelte sidebar-menu-badge.svelte sidebar-menu-button.svelte sidebar-menu-item.svelte sidebar-menu-skeleton.svelte sidebar-menu-sub-button.svelte sidebar-menu-sub-item.svelte sidebar-menu-sub.svelte sidebar-menu.svelte sidebar-provider.svelte sidebar-rail.svelte sidebar-separator.svelte sidebar-trigger.svelte sidebar.svelte
skeleton index.ts skeleton.svelte
switch index.ts switch.svelte
table index.ts table-body.svelte table-caption.svelte table-cell.svelte table-footer.svelte table-head.svelte table-header.svelte table-row.svelte table.svelte
textarea index.ts textarea.svelte
tooltip index.ts tooltip-content.svelte tooltip-trigger.svelte
utils.ts
constants auto-scroll.ts binary-detection.ts default-context.ts floating-ui-constraints.ts icons.ts input-classes.ts latex-protection.ts literal-html.ts localstorage-keys.ts max-bundle-size.ts precision.ts processing-info.ts settings-config.ts supported-file-types.ts table-html-restorer.ts tooltip-config.ts viewport.ts
enums attachment.ts chat.ts files.ts index.ts model.ts server.ts
hooks is-mobile.svelte.ts use-model-change-validation.svelte.ts use-processing-state.svelte.ts
markdown enhance-code-blocks.ts enhance-links.ts literal-html.ts table-html-restorer.ts
services chat.ts database.ts index.ts models.ts parameter-sync.spec.ts parameter-sync.ts props.ts
stores chat.svelte.ts conversations.svelte.ts models.svelte.ts persisted.svelte.ts server.svelte.ts settings.svelte.ts
types api.d.ts chat.d.ts database.d.ts index.ts models.d.ts settings.d.ts
utils api-headers.ts api-key-validation.ts attachment-display.ts attachment-type.ts audio-recording.ts autoresize-textarea.ts branching.ts browser-only.ts clipboard.ts config-helpers.ts conversation-utils.ts convert-files-to-extra.ts file-preview.ts file-type.ts formatters.ts index.ts is-ime-composing.ts latex-protection.ts modality-file-validation.ts model-names.ts pdf-processing.ts portal-to-body.ts precision.ts process-uploaded-files.ts svg-to-png.ts syntax-highlight-language.ts text-files.ts text.ts webp-to-png.ts
routes
chat
[id] +page.svelte +page.ts
+error.svelte +layout.svelte +page.svelte +page.ts
styles katex-custom.scss
app.css app.d.ts app.html
static favicon.svg loading.html
tests
client
components TestWrapper.svelte
page.svelte.test.ts
e2e demo.test.ts
stories
fixtures
assets 1.jpg beautiful-flowers-lotus.webp example.pdf hf-logo.svg
ai-tutorial.ts api-docs.ts blog-post.ts data-analysis.ts empty.ts math-formulas.ts readme.ts storybook-mocks.ts
ChatForm.stories.svelte ChatMessage.stories.svelte ChatSettings.stories.svelte ChatSidebar.stories.svelte Introduction.mdx MarkdownContent.stories.svelte
unit clipboard.test.ts latex-protection.test.ts model-names.test.ts
.gitignore .npmrc .prettierignore .prettierrc README.md components.json eslint.config.js package-lock.json package.json playwright.config.ts svelte.config.js tsconfig.json vite.config.ts vitest-setup-client.ts
CMakeLists.txt README-dev.md README.md chat-llama2.sh chat.mjs chat.sh server-common.cpp server-common.h server-context.cpp server-context.h server-http.cpp server-http.h server-models.cpp server-models.h server-queue.cpp server-queue.h server-task.cpp server-task.h server.cpp
tokenize CMakeLists.txt tokenize.cpp
tts CMakeLists.txt README.md convert_pt_to_hf.py tts-outetts.py tts.cpp
CMakeLists.txt
vendor
cpp-httplib CMakeLists.txt LICENSE httplib.cpp httplib.h
miniaudio miniaudio.h
nlohmann json.hpp json_fwd.hpp
sheredom subprocess.h
stb stb_image.h
.clang-format .clang-tidy .dockerignore .ecrc .editorconfig .flake8 .gitignore .gitmodules .pre-commit-config.yaml AGENTS.md AUTHORS CLAUDE.md CMakeLists.txt CMakePresets.json CODEOWNERS CONTRIBUTING.md LICENSE Makefile README.md SECURITY.md convert_hf_to_gguf.py convert_hf_to_gguf_update.py convert_llama_ggml_to_gguf.py convert_lora_to_gguf.py flake.lock flake.nix mypy.ini poetry.lock pyproject.toml pyrightconfig.json requirements.txt
maps map1.h map1.txt
papers 2310.11703v2.pdf 2405.14159v2.pdf
prompts lotr.h lotr.txt
.gitignore Dockerfile Makefile README.md compile_flags.txt context.c game.c makext.mk mapeditor.html maps.h minunit.h models.h models.txt nonstd.h npc.c termbox2.h vectordb.c vectordb.h
llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c raw
  1#pragma clang diagnostic ignored "-Wunused-variable"
  2#pragma clang diagnostic ignored "-Wunused-function"
  3#pragma clang diagnostic ignored "-Wunused-but-set-variable"
  4
  5#include <HAP_farf.h>
  6#include <HAP_perf.h>
  7
  8#include <math.h>
  9#include <string.h>
 10
 11#include "hex-dma.h"
 12#include "hvx-utils.h"
 13
 14#define GGML_COMMON_DECL_C
 15#include "ggml-common.h"
 16#include "htp-ctx.h"
 17#include "htp-msg.h"
 18#include "htp-ops.h"
 19
// Binary-op preamble: copies the element counts (ne*) and byte strides (nb*)
// of src0, src1 and dst into local const uint32_t variables.  Expects
// `src0`, `src1` and `dst` (struct htp_tensor *) to be in scope at the
// expansion site.
#define htp_act_preamble3              \
    const uint32_t ne00 = src0->ne[0]; \
    const uint32_t ne01 = src0->ne[1]; \
    const uint32_t ne02 = src0->ne[2]; \
    const uint32_t ne03 = src0->ne[3]; \
                                       \
    const uint32_t ne10 = src1->ne[0]; \
    const uint32_t ne11 = src1->ne[1]; \
    const uint32_t ne12 = src1->ne[2]; \
    const uint32_t ne13 = src1->ne[3]; \
                                       \
    const uint32_t ne0 = dst->ne[0];   \
    const uint32_t ne1 = dst->ne[1];   \
    const uint32_t ne2 = dst->ne[2];   \
    const uint32_t ne3 = dst->ne[3];   \
                                       \
    const uint32_t nb00 = src0->nb[0]; \
    const uint32_t nb01 = src0->nb[1]; \
    const uint32_t nb02 = src0->nb[2]; \
    const uint32_t nb03 = src0->nb[3]; \
                                       \
    const uint32_t nb10 = src1->nb[0]; \
    const uint32_t nb11 = src1->nb[1]; \
    const uint32_t nb12 = src1->nb[2]; \
    const uint32_t nb13 = src1->nb[3]; \
                                       \
    const uint32_t nb0 = dst->nb[0];   \
    const uint32_t nb1 = dst->nb[1];   \
    const uint32_t nb2 = dst->nb[2];   \
    const uint32_t nb3 = dst->nb[3];
 50
// Unary-op preamble: same as htp_act_preamble3 but for ops without src1 —
// copies ne*/nb* of src0 and dst into local const uint32_t variables.
// Expects `src0` and `dst` (struct htp_tensor *) to be in scope.
#define htp_act_preamble2              \
    const uint32_t ne00 = src0->ne[0]; \
    const uint32_t ne01 = src0->ne[1]; \
    const uint32_t ne02 = src0->ne[2]; \
    const uint32_t ne03 = src0->ne[3]; \
                                       \
    const uint32_t ne0 = dst->ne[0];   \
    const uint32_t ne1 = dst->ne[1];   \
    const uint32_t ne2 = dst->ne[2];   \
    const uint32_t ne3 = dst->ne[3];   \
                                       \
    const uint32_t nb00 = src0->nb[0]; \
    const uint32_t nb01 = src0->nb[1]; \
    const uint32_t nb02 = src0->nb[2]; \
    const uint32_t nb03 = src0->nb[3]; \
                                       \
    const uint32_t nb0 = dst->nb[0];   \
    const uint32_t nb1 = dst->nb[1];   \
    const uint32_t nb2 = dst->nb[2];   \
    const uint32_t nb3 = dst->nb[3];
 71
// Per-thread SwiGLU (f32): dst = x1 * sigmoid(x0), computed row by row.
// src0 rows are statically partitioned across the worker threads; each thread
// streams its rows DDR -> VTCM with async DMA, applies the activation with
// HVX kernels, and streams the result back VTCM -> DDR.  The per-thread VTCM
// scratchpads are split in half and used as ping-pong buffers so that DMA and
// compute overlap (the loop prefetches the N+2 block while processing N).
static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0,
                                       const struct htp_tensor * src1,
                                       struct htp_tensor *       dst,
                                       const int32_t *           op_params,
                                       struct htp_spad *         src0_spad,
                                       struct htp_spad *         src1_spad,
                                       struct htp_spad *         dst_spad,
                                       uint32_t                  nth,
                                       uint32_t                  ith,
                                       uint32_t                  src0_nrows_per_thread,
                                       dma_queue *               dma_queue) {
    htp_act_preamble3;

    // non-const: remapped below when src1 is folded into src0
    size_t src0_row_size = nb01;
    size_t src1_row_size = nb11;
    size_t dst_row_size  = nb1;



    const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows

    // static partitioning: thread ith owns rows [src0_start_row, src0_end_row)
    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
    const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
    uint8_t * restrict data_dst        = (uint8_t *) dst->data;

    // If src1 is absent (ne[0] == 0), each src0 row packs both operands:
    // the row splits into two halves of nc = ne00/2 elements, and
    // op_params[1] ("swapped") selects which half feeds sigmoid (x0) vs
    // which one multiplies (x1).
    const bool src1_valid = src1->ne[0];
    const int  nc         = (src1_valid) ? ne00 : ne00 / 2;
    if (!src1_valid) {
        const int32_t swapped = op_params[1];
        data_src1             = data_src0;
        src1_row_size         = src0_row_size;

        const size_t nc_in_bytes = nc * SIZEOF_FP32;
        data_src0 += swapped ? nc_in_bytes : 0;
        data_src1 += swapped ? 0 : nc_in_bytes;
    }

    // row sizes padded to the HVX vector length for aligned VTCM rows
    const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
    const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
    const size_t dst_row_size_aligned  = hex_round_up(dst_row_size, VLEN);

    // this thread's slice of each scratchpad
    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
    uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_spad->size_per_thread);

    // While given src0_spad->size_per_thread, divide it to two ping-pong buffers for src0
    size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
    size_t src1_spad_half_size = src1_spad->size_per_thread / 2;
    size_t dst_spad_half_size  = dst_spad->size_per_thread / 2;

    const int BLOCK = src0_spad_half_size / src0_row_size_aligned;  // How many rows can we process in one block
    if (BLOCK == 0) {
        FARF(ERROR,
             "swiglu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
             src0_spad->size_per_thread, src0_row_size_aligned);
        return;
    }

    // Prime the pipeline: queue up to two blocks (one per ping-pong half).
    // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
    for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);

        // Dummy DMA transaction (count 0) for sequencing (interleaving dst,src,dst,...)
        // so the pops below always come back in (dst, src0, src1) order.
        dma_queue_push_vtcm_to_ddr(dma_queue,
            dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
            dst_row_size, dst_row_size_aligned, 0);

        dma_queue_push_ddr_to_vtcm(dma_queue,
            dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
            src0_row_size_aligned, src0_row_size, block_size);
        dma_queue_push_ddr_to_vtcm(dma_queue,
            dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + (ir * src1_row_size)),
            src1_row_size_aligned, src1_row_size, block_size);
    }

    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);

        // pop in the same (dst, src0, src1) order the pushes were queued;
        // the pops also act as completion waits for the inbound transfers
        float * dst_spad  = (float *) dma_queue_pop(dma_queue).src;
        float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
        float * src1_spad = (float *) dma_queue_pop(dma_queue).dst;

        for (uint32_t ib = 0; ib < block_size; ib++) {
            const float * src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
            const float * src1_spad_ptr = src1_spad + ib * (src1_row_size_aligned / sizeof(float));
            float *       dst_spad_ptr  = dst_spad + ib * (dst_row_size_aligned / sizeof(float));

            //swiglu(x) = x1 * sigmoid(x0)
            hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, nc);
            hvx_mul_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr,
                                (const uint8_t *) src1_spad_ptr, nc);
        }

        // write the finished block back to DDR (async)
        dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size,
                                   dst_row_size_aligned, block_size);

        // prefetch N+2 loop iteration if any (the spad halves just popped are
        // free again once the N+1 block, already in flight, is consumed)
        const uint32_t pref_block = (ir + BLOCK * 2);
        if (pref_block < src0_end_row) {
            const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
                                       src0_row_size_aligned, src0_row_size, pref_block_size);
            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src1_spad, data_src1 + (pref_block * src1_row_size)),
                                       src1_row_size_aligned, src1_row_size, pref_block_size);
        }
    }

    // wait for the outstanding writebacks before returning
    dma_queue_flush(dma_queue);

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "swiglu-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
         ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3,
         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
198
199static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0,
200                                           const struct htp_tensor * src1,
201                                           struct htp_tensor *       dst,
202                                           const int32_t *           op_params,
203                                           struct htp_spad *         src0_spad,
204                                           struct htp_spad *         src1_spad,
205                                           struct htp_spad *         dst_spad,
206                                           uint32_t                  nth,
207                                           uint32_t                  ith,
208                                           uint32_t                  src0_nrows_per_thread,
209                                           dma_queue *               dma_queue) {
210    htp_act_preamble3;
211
212    uint64_t t1, t2;
213    t1 = HAP_perf_get_qtimer_count();
214
215    size_t src0_row_size = nb01;
216    size_t src1_row_size = nb11;
217    size_t dst_row_size  = nb1;
218
219    const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows
220
221    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
222    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
223
224    // no work for this thread
225    if (src0_start_row >= src0_end_row) {
226        return;
227    }
228
229    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
230    const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
231    uint8_t * restrict data_dst        = (uint8_t *) dst->data;
232
233    const bool src1_valid = src1->ne[0];
234    const int  nc         = (src1_valid) ? ne00 : ne00 / 2;
235    if (!src1_valid) {
236        const int32_t swapped = op_params[1];
237        data_src1             = data_src0;
238        src1_row_size         = src0_row_size;
239
240        const size_t nc_in_bytes = nc * SIZEOF_FP32;
241        data_src0 += swapped ? nc_in_bytes : 0;
242        data_src1 += swapped ? 0 : nc_in_bytes;
243    }
244
245    const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
246    const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
247    const size_t dst_row_size_aligned  = hex_round_up(dst_row_size, VLEN);
248
249    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
250    uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
251    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_spad->size_per_thread);
252
253    // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
254    size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
255    size_t src1_spad_half_size = src1_spad->size_per_thread / 2;
256    size_t dst_spad_half_size  = dst_spad->size_per_thread / 2;
257
258    const int BLOCK = src0_spad_half_size / src0_row_size_aligned;  // How many rows can we process in one block
259    if (BLOCK == 0) {
260        FARF(ERROR,
261             "swiglu-oai-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least "
262             "%zu\n",
263             src0_spad->size_per_thread, src0_row_size_aligned);
264        return;
265    }
266    const float alpha = ((const float *) (op_params))[2];
267    const float limit = ((const float *) (op_params))[3];
268
269    // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
270    for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
271        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
272
273        // Dummy DMA transation for sequencing (interleaving dst,src,dst,...)
274        dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
275                                   dst_row_size, dst_row_size_aligned, 0);
276
277        dma_queue_push_ddr_to_vtcm(
278            dma_queue,
279            dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
280            src0_row_size_aligned, src0_row_size, block_size);
281        dma_queue_push_ddr_to_vtcm(
282            dma_queue,
283            dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + (ir * src1_row_size)),
284            src1_row_size_aligned, src1_row_size, block_size);
285    }
286
287    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
288        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
289
290        float * dst_spad  = (float *) dma_queue_pop(dma_queue).src;
291        float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
292        float * src1_spad = (float *) dma_queue_pop(dma_queue).dst;
293
294        for (uint32_t ib = 0; ib < block_size; ib++) {
295            const float * src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
296            const float * src1_spad_ptr = src1_spad + ib * (src1_row_size_aligned / sizeof(float));
297            float *       dst_spad_ptr  = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
298
299            // x (src0_spad_data) = std::min(src0_p[k], limit);
300            hvx_min_scalar_f32((uint8_t *) src0_spad_ptr, (const uint8_t *) src0_spad_ptr, limit, nc);
301            // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
302            hvx_clamp_scalar_f32((uint8_t *) src1_spad_ptr, (const uint8_t *) src1_spad_ptr, -limit, limit, nc);
303            // y (src1_spad_data)  = y1 + 1.f
304            hvx_add_scalar_f32((uint8_t *) src1_spad_ptr, (const uint8_t *) src1_spad_ptr, 1.0, nc);
305            // x1 (dst_spad_data) = alpha * (x)
306            hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, alpha, nc);
307            // x2 (dst_spad_data) = sigmoid(x1) = 1/(1+exp(-x1))
308            hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, nc);
309            // out = x * sigmoid(alpha * x) * (y + 1.f)
310            hvx_mul_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr,
311                                (const uint8_t *) src1_spad_ptr, nc);
312        }
313
314        dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size,
315                                   dst_row_size_aligned, block_size);
316
317        // prefetch N+2 loop iteration if any
318        const uint32_t pref_block = (ir + BLOCK * 2);
319        if (pref_block < src0_end_row) {
320            const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
321            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
322                                       src0_row_size_aligned, src0_row_size, pref_block_size);
323            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src1_spad, data_src1 + (pref_block * src1_row_size)),
324                                       src1_row_size_aligned, src1_row_size, pref_block_size);
325        }
326    }
327
328    dma_queue_flush(dma_queue);
329
330    t2 = HAP_perf_get_qtimer_count();
331
332    FARF(HIGH, "swiglu-oai-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, src0->ne[0],
333         src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], src1->ne[2],
334         src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
335}
336
337
// Per-thread GELU (f32), computed with the sigmoid approximation
// gelu(x) ~= x * sigmoid(1.702 * x), row by row.  src0 rows are statically
// partitioned across threads; rows stream DDR <-> VTCM via async DMA through
// ping-pong scratchpad halves with an N+2 prefetch, mirroring the structure
// of glu_swiglu_f32_per_thread.
static void unary_gelu_f32_per_thread(const struct htp_tensor * src0,
                                       struct htp_tensor *       dst,
                                       const int32_t *           op_params,
                                       struct htp_spad *         src0_spad,
                                       struct htp_spad *         dst_spad,
                                       uint32_t                  nth,
                                       uint32_t                  ith,
                                       uint32_t                  src0_nrows_per_thread,
                                       dma_queue *               dma_queue) {
    htp_act_preamble2;

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    // row sizes in bytes, and the same padded to the HVX vector length
    const size_t src0_row_size = nb01;
    const size_t dst_row_size  = nb1;
    const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
    const size_t dst_row_size_aligned  = hex_round_up(dst_row_size, VLEN);

    const uint32_t src0_nrows = ne01 * ne02 * ne03;

    // static partitioning: thread ith owns rows [src0_start_row, src0_end_row)
    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    const uint8_t * data_src0 = (const uint8_t *) src0->data;
    uint8_t * data_dst        = (uint8_t *) dst->data;

    // this thread's slice of each scratchpad
    uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
    uint8_t * dst_spad_data  = dst_spad->data  + (ith * dst_spad->size_per_thread);

    // While given src0_spad->size_per_thread, divide it to two ping-pong buffers for src0
    size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
    size_t dst_spad_half_size  = dst_spad->size_per_thread  / 2;

    // In gelu = x*sigmoid(x*1.702)
    const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block

    if (BLOCK == 0) {
        FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
                src0_spad->size_per_thread, src0_row_size_aligned);
        return;
    }

    // Prime the pipeline: queue up to two blocks (one per ping-pong half).
    // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
    for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);

        // Dummy DMA transaction (count 0) for sequencing (interleaving dst,src,dst,...)
        // so the pops below always come back in (dst, src0) order.
        dma_queue_push_vtcm_to_ddr(dma_queue,
            dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
            dst_row_size, dst_row_size_aligned, 0);

        dma_queue_push_ddr_to_vtcm(dma_queue,
            dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
            src0_row_size_aligned, src0_row_size, block_size);
    }

    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);

        // pop in the same (dst, src0) order the pushes were queued
        float* dst_spad  = (float *) dma_queue_pop(dma_queue).src;
        float* src0_spad = (float *) dma_queue_pop(dma_queue).dst;

        for (uint32_t ib = 0; ib < block_size; ib++) {
            const float* src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
            float* dst_spad_ptr        = dst_spad  + ib * (dst_row_size_aligned  / sizeof(float));

            // gelu = x * sigmoid(1.702 * x) // current implementation
            hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (float) 1.702, ne0);
            hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0);
            hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0);
        }

        // write the finished block back to DDR (async)
        dma_queue_push_vtcm_to_ddr(dma_queue,
            dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
            dst_row_size, dst_row_size_aligned, block_size);

        // prefetch N+2 loop iteration if any
        const uint32_t pref_block = (ir + BLOCK * 2);
        if (pref_block < src0_end_row) {
            const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
            dma_queue_push_ddr_to_vtcm(dma_queue,
                dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
                src0_row_size_aligned, src0_row_size, pref_block_size);
        }
    }

    // wait for the outstanding writebacks before returning
    dma_queue_flush(dma_queue);

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
         ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
437
438static void unary_gelu_f32(unsigned int n, unsigned int i, void * data) {
439    struct htp_ops_context * octx = (struct htp_ops_context *) data;
440    unary_gelu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
441                               octx->src0_nrows_per_thread, octx->ctx->dma[i]);
442}
443
444
445
// Per-thread SiLU activation over FP32 rows: dst = src0 * sigmoid(src0).
//
// Each thread handles a contiguous slice of src0 rows. Rows are streamed
// through VTCM via a DMA ping-pong scheme: each scratchpad is split into two
// halves, loads/stores for one half overlap compute on the other. Descriptors
// are pushed in a fixed (dst, src0, dst, src0, ...) interleave so the pops in
// the main loop get matching halves back.
//
// src0_nrows_per_thread : rows assigned per thread (last thread may get fewer)
// dma_queue             : this thread's private DMA queue
static void unary_silu_f32_per_thread(const struct htp_tensor * src0,
                                       struct htp_tensor *       dst,
                                       const int32_t *           op_params,
                                       struct htp_spad *         src0_spad,
                                       struct htp_spad *         dst_spad,
                                       uint32_t                  nth,
                                       uint32_t                  ith,
                                       uint32_t                  src0_nrows_per_thread,
                                       dma_queue *               dma_queue) {
    htp_act_preamble2;

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    const size_t src0_row_size = nb01;
    const size_t dst_row_size  = nb1;
    // Rows are padded to the HVX vector length inside VTCM
    const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
    const size_t dst_row_size_aligned  = hex_round_up(dst_row_size, VLEN);

    const uint32_t src0_nrows = ne01 * ne02 * ne03;

    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    const uint8_t * data_src0 = (const uint8_t *) src0->data;
    uint8_t * data_dst        = (uint8_t *) dst->data;

    // Per-thread scratchpad slices within the shared VTCM reservation
    uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
    uint8_t * dst_spad_data  = dst_spad->data  + (ith * dst_spad->size_per_thread);

    // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
    size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
    size_t dst_spad_half_size  = dst_spad->size_per_thread  / 2;

    const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block

    if (BLOCK == 0) {
        FARF(ERROR, "silu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
                src0_spad->size_per_thread, src0_row_size_aligned);
        return;
    }

    // Prime both ping-pong halves with the first (up to) two blocks.
    // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
    for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);

        // Dummy DMA transation for sequencing (interleaving dst,src,dst,...)
        dma_queue_push_vtcm_to_ddr(dma_queue,
            dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
            dst_row_size, dst_row_size_aligned, 0);

        dma_queue_push_ddr_to_vtcm(dma_queue,
            dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
            src0_row_size_aligned, src0_row_size, block_size);
    }

    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);

        // Pops mirror the push interleave above: first the dst half (from the
        // dummy/previous store descriptor), then the src0 half whose load completed.
        float* dst_spad  = (float *) dma_queue_pop(dma_queue).src;
        float* src0_spad = (float *) dma_queue_pop(dma_queue).dst;

        for (uint32_t ib = 0; ib < block_size; ib++) {
            const float* src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
            float* dst_spad_ptr        = dst_spad  + ib * (dst_row_size_aligned  / sizeof(float));

            // silu = x * sigmoid(x)
            hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, ne0);
            hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0);
        }

        // Store the computed block back to DDR (also re-queues this dst half for sequencing)
        dma_queue_push_vtcm_to_ddr(dma_queue,
            dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
            dst_row_size, dst_row_size_aligned, block_size);

        // prefetch N+2 loop iteration if any
        const uint32_t pref_block = (ir + BLOCK * 2);
        if (pref_block < src0_end_row) {
            const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
            dma_queue_push_ddr_to_vtcm(dma_queue,
                dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
                src0_row_size_aligned, src0_row_size, pref_block_size);
        }
    }

    // Wait for all outstanding stores before reporting completion
    dma_queue_flush(dma_queue);

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "silu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
         ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
543
// Coefficients of the tanh-based GELU approximation used by glu_geglu_f32_per_thread:
//   gelu(x) = 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)))
static const float GELU_COEF_A     = 0.044715f;
static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f; // sqrt(2/pi)
546
// Per-thread GEGLU over FP32 rows: dst = gelu(x) * g, with the tanh GELU
// approximation (see GELU_COEF_A / SQRT_2_OVER_PI above).
//
// When src1 is present, x = src0 row and g = src1 row. When src1 is absent
// (ne[0] == 0), each src0 row packs both halves: op_params[1] selects which
// half is x and which is the gate g.
//
// Rows are streamed through VTCM with the same two-half DMA ping-pong scheme
// as the other activation kernels; descriptors are pushed in a fixed
// (dst, src0, src1, ...) interleave that the pops in the main loop rely on.
static void glu_geglu_f32_per_thread(const struct htp_tensor * src0,
                                       const struct htp_tensor * src1,
                                       struct htp_tensor *       dst,
                                       const int32_t *           op_params,
                                       struct htp_spad *         src0_spad,
                                       struct htp_spad *         src1_spad,
                                       struct htp_spad *         dst_spad,
                                       uint32_t                  nth,
                                       uint32_t                  ith,
                                       uint32_t                  src0_nrows_per_thread,
                                       dma_queue *               dma_queue) {
    htp_act_preamble3;

    size_t src0_row_size = nb01;
    size_t src1_row_size = nb11;
    size_t dst_row_size  = nb1;

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows

    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
    const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
    uint8_t * restrict data_dst        = (uint8_t *) dst->data;

    // nc = columns per output row: the full src0 row when the gate comes from
    // src1, otherwise half of it (x and g are packed side by side in src0)
    const bool src1_valid = src1->ne[0];
    const int  nc         = (src1_valid) ? ne00 : ne00 / 2;
    if (!src1_valid) {
        const int32_t swapped = op_params[1];
        data_src1             = data_src0;
        src1_row_size         = src0_row_size;

        // Point data_src0 at the x half and data_src1 at the gate half
        const size_t nc_in_bytes = nc * SIZEOF_FP32;
        data_src0 += swapped ? nc_in_bytes : 0;
        data_src1 += swapped ? 0 : nc_in_bytes;
    }

    const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
    const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
    const size_t dst_row_size_aligned  = hex_round_up(dst_row_size, VLEN);

    // Per-thread scratchpad slices within the shared VTCM reservation
    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
    uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_spad->size_per_thread);

    // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
    size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
    size_t src1_spad_half_size = src1_spad->size_per_thread / 2;
    size_t dst_spad_half_size  = dst_spad->size_per_thread / 2;

    const int BLOCK = src0_spad_half_size / src0_row_size_aligned;  // How many rows can we process in one block
    if (BLOCK == 0) {
        FARF(ERROR,
             "geglu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
             src0_spad->size_per_thread, src0_row_size_aligned);
        return;
    }

    // Prime both ping-pong halves with the first (up to) two blocks.
    // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
    for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);

        // Dummy DMA transation for sequencing (interleaving dst,src,dst,...)
        dma_queue_push_vtcm_to_ddr(dma_queue,
            dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
            dst_row_size, dst_row_size_aligned, 0);

        dma_queue_push_ddr_to_vtcm(dma_queue,
            dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
            src0_row_size_aligned, src0_row_size, block_size);
        dma_queue_push_ddr_to_vtcm(dma_queue,
            dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + (ir * src1_row_size)),
            src1_row_size_aligned, src1_row_size, block_size);
    }

    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);

        // Pops mirror the push interleave above: dst half first, then src0 and src1
        float * dst_spad  = (float *) dma_queue_pop(dma_queue).src;
        float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
        float * src1_spad = (float *) dma_queue_pop(dma_queue).dst;

        for (uint32_t ib = 0; ib < block_size; ib++) {
            const uint8_t * src0_spad_ptr = (const uint8_t *)(src0_spad + ib * (src0_row_size_aligned / sizeof(float)));
            const uint8_t * src1_spad_ptr = (const uint8_t *)(src1_spad + ib * (src1_row_size_aligned / sizeof(float)));
            uint8_t *       dst_spad_ptr  = (uint8_t *)(dst_spad + ib * (dst_row_size_aligned / sizeof(float)));

            // geglu tanh implementation
            // geglu(x, g) = gelu(x) * g
            // gelu(x) = 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)))
            hvx_mul_f32_aaa(dst_spad_ptr, src0_spad_ptr, src0_spad_ptr, nc);                       // res = x*x
            hvx_mul_scalar_f32_aa(dst_spad_ptr, (const uint8_t *)dst_spad_ptr, GELU_COEF_A, nc);   // res = res * GELU_COEF_A
            hvx_add_scalar_f32_aa(dst_spad_ptr, (const uint8_t *)dst_spad_ptr, 1.0f, nc);          // res = res + 1.0f
            hvx_mul_f32_aaa(dst_spad_ptr, src0_spad_ptr, (const uint8_t *)dst_spad_ptr, nc);       // res = res * x
            hvx_mul_scalar_f32_aa(dst_spad_ptr, (const uint8_t*)dst_spad_ptr, SQRT_2_OVER_PI, nc); // res = res * SQRT_2_OVER_PI
            hvx_tanh_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, nc);         // res = tanh(res)
            hvx_add_scalar_f32_aa(dst_spad_ptr, (const uint8_t*)dst_spad_ptr, 1.0f, nc);           // res = res + 1.0f
            hvx_mul_f32_aaa(dst_spad_ptr, src0_spad_ptr, (const uint8_t *)dst_spad_ptr, nc);       // res = res * x
            hvx_mul_scalar_f32_aa(dst_spad_ptr, (const uint8_t *)dst_spad_ptr, 0.5f, nc);          // res = res * 0.5f
            hvx_mul_f32_aaa(dst_spad_ptr, (const uint8_t *)dst_spad_ptr, src1_spad_ptr, nc);       // res = res * g
        }

        // Store the computed block back to DDR (also re-queues this dst half for sequencing)
        dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size,
                                   dst_row_size_aligned, block_size);

        // prefetch N+2 loop iteration if any
        const uint32_t pref_block = (ir + BLOCK * 2);
        if (pref_block < src0_end_row) {
            const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
                                       src0_row_size_aligned, src0_row_size, pref_block_size);
            dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src1_spad, data_src1 + (pref_block * src1_row_size)),
                                       src1_row_size_aligned, src1_row_size, pref_block_size);
        }
    }

    // Wait for all outstanding stores before reporting completion
    dma_queue_flush(dma_queue);

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "geglu-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
         ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3,
         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
680
681static void unary_silu_f32(unsigned int n, unsigned int i, void * data) {
682    struct htp_ops_context * octx = (struct htp_ops_context *) data;
683    unary_silu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
684                               octx->src0_nrows_per_thread, octx->ctx->dma[i]);
685}
686
687static void glu_swiglu_f32(unsigned int n, unsigned int i, void * data) {
688    struct htp_ops_context * octx = (struct htp_ops_context *) data;
689    glu_swiglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
690                               &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
691}
692
693static void glu_swiglu_oai_f32(unsigned int n, unsigned int i, void * data) {
694    struct htp_ops_context * octx = (struct htp_ops_context *) data;
695    glu_swiglu_oai_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
696                                   &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
697}
698
699static void glu_geglu_f32(unsigned int n, unsigned int i, void * data) {
700    struct htp_ops_context * octx = (struct htp_ops_context *) data;
701    glu_geglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
702                               &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
703}
704
705static int execute_op_activations_f32(struct htp_ops_context * octx) {
706    int err = HTP_STATUS_OK;
707
708    const struct htp_tensor * src0 = &octx->src0;
709    const struct htp_tensor * src1 = &octx->src1;
710    struct htp_tensor *       dst  = &octx->dst;
711
712    if (((src0->ne[0] * SIZEOF_FP32) != src0->nb[1]) || ((dst->ne[0] * SIZEOF_FP32) != dst->nb[1])) {
713        FARF(ERROR, "Non-contiguous tensors are not supported at this time \n");
714        return HTP_STATUS_NO_SUPPORT;
715    }
716
717    worker_callback_t act_op_func;
718    const char *      op_type = NULL;
719
720    switch (octx->op) {
721        case HTP_OP_UNARY_SILU:
722            act_op_func = unary_silu_f32;
723            op_type     = "silu-f32";
724            break;
725
726        case HTP_OP_GLU_SWIGLU:
727            act_op_func = glu_swiglu_f32;
728            op_type     = "swiglu-f32";
729            break;
730
731        case HTP_OP_GLU_SWIGLU_OAI:
732            act_op_func = glu_swiglu_oai_f32;
733            op_type     = "swiglu-oai-f32";
734            break;
735        case HTP_OP_UNARY_GELU:
736            act_op_func = unary_gelu_f32;
737            op_type     = "gelu-f32";
738            break;
739
740        case HTP_OP_GLU_GEGLU:
741            act_op_func = glu_geglu_f32;
742            op_type     = "geglu-f32";
743            break;
744        default:
745            FARF(ERROR, "Unsupported activations Op %u\n", octx->op);
746            return HTP_STATUS_NO_SUPPORT;
747    }
748
749    const uint32_t n_threads  = octx->n_threads;
750    const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
751
752    size_t src0_row_size = src0->nb[1];
753    size_t src1_row_size = src1->nb[1]; // zero bytes if src1 is not used
754    size_t dst_row_size  = dst->nb[1];
755
756    const bool src1_valid = src1->ne[0];
757    if (!src1_valid) {
758        src1_row_size = src0_row_size;
759    }
760
761    const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
762    const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
763    const size_t dst_row_size_aligned  = hex_round_up(dst_row_size, VLEN);
764    // VTCM scratchpads for all tensors
765    // N rows per thread, padded to HVX vector size
766
767    size_t spad_size_per_row   = (src0_row_size_aligned + src1_row_size_aligned) + dst_row_size_aligned;
768    size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads* spad_size_per_row);
769
770    // Make sure the reserved vtcm size is sufficient
771    if(vtcm_row_per_thread ==0){
772        FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size,
773             spad_size_per_row * n_threads);
774        return HTP_STATUS_VTCM_TOO_SMALL;
775    }
776
777    octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread;
778    octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread;
779    octx->dst_spad.size_per_thread  = dst_row_size_aligned * vtcm_row_per_thread;
780
781    octx->dst_spad.size  = n_threads* octx->dst_spad.size_per_thread;
782    octx->src0_spad.size = n_threads* octx->src0_spad.size_per_thread;
783    octx->src1_spad.size = n_threads* octx->src1_spad.size_per_thread;
784
785    octx->src0_spad.data = octx->ctx->vtcm_base;
786    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
787    octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;
788
789    if (src1->ne[0]) {
790        FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
791             op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
792             src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
793             octx->dst_spad.size);
794    } else {
795        FARF(HIGH, "%s: %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
796             src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
797             octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
798    }
799
800    if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
801        uint32_t n_jobs = MIN(n_threads, src0_nrows);
802        octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
803        worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs);
804    }
805
806    return err;
807}
808
809int op_activations(struct htp_ops_context * octx) {
810    int err = HTP_STATUS_OK;
811
812    switch (octx->src0.type) {
813        case HTP_TYPE_F32:
814            err = execute_op_activations_f32(octx);
815            break;
816
817        default:
818            err = HTP_STATUS_NO_SUPPORT;
819            break;
820    }
821
822    return err;
823}