llama.cpp
.devops
nix
apps.nix devshells.nix docker.nix jetson-support.nix nixpkgs-instances.nix package-gguf-py.nix package.nix python-scripts.nix scope.nix sif.nix.github
ISSUE_TEMPLATE
010-bug-compilation.yml 011-bug-results.yml 019-bug-misc.yml 020-enhancement.yml 030-research.yml 040-refactor.yml config.ymlworkflows
bench.yml.disabled build-cache.yml build-cmake-pkg.yml build-linux-cross.yml build.yml check-vendor.yml close-issue.yml copilot-setup-steps.yml docker.yml editorconfig.yml gguf-publish.yml labeler.yml pre-tokenizer-hashes.yml python-check-requirements.yml python-lint.yml python-type-check.yml release.yml server-metal.yml server-webui.yml server.yml update-ops-docs.yml winget.ymlbenches
cmake
arm64-apple-clang.cmake arm64-windows-llvm.cmake build-info.cmake common.cmake download-models.cmake git-vars.cmake license.cmake llama-config.cmake.in llama.pc.in riscv64-spacemit-linux-gnu-gcc.cmake x64-windows-llvm.cmakecommon
jinja
README.md caps.cpp caps.h lexer.cpp lexer.h parser.cpp parser.h runtime.cpp runtime.h string.cpp string.h utils.h value.cpp value.hdocs
multimodal
MobileVLM.md gemma3.md glmedge.md granitevision.md llava.md minicpmo2.6.md minicpmo4.0.md minicpmv2.5.md minicpmv2.6.md minicpmv4.0.md minicpmv4.5.mdops
BLAS.csv CANN.csv CPU.csv CUDA.csv Metal.csv OpenCL.csv SYCL.csv Vulkan.csv WebGPU.csv ZenDNN.csv zDNN.csvexamples
llama.android
app
src
lib
.gitignore build.gradle.kts consumer-rules.pro proguard-rules.promodel-conversion
scripts
causal
compare-embeddings-logits.sh compare-logits.py convert-model.sh modelcard.template run-casual-gen-embeddings-org.py run-converted-model-embeddings-logits.sh run-converted-model.sh run-org-model.pyembedding
compare-embeddings-logits.sh convert-model.sh modelcard.template run-converted-model.sh run-original-model.pyutils
__init__.py check-nmse.py common.py compare_tokens.py create-collection-add-model.sh curl-embedding-server.sh hf-add-model-to-collection.py hf-create-collection.py hf-create-model.py hf-upload-gguf-model.py inspect-converted-model.sh inspect-org-model.py perplexity-gen.sh perplexity-run-simple.sh perplexity-run.sh quantize.sh run-embedding-server.sh semantic_check.py tensor-info.pysycl
CMakeLists.txt README.md build.sh ls-sycl-device.cpp run-llama2.sh test.sh win-build-sycl.bat win-run-llama2.bat win-test.batggml
include
ggml-alloc.h ggml-backend.h ggml-blas.h ggml-cann.h ggml-cpp.h ggml-cpu.h ggml-cuda.h ggml-hexagon.h ggml-metal.h ggml-opencl.h ggml-opt.h ggml-rpc.h ggml-sycl.h ggml-virtgpu.h ggml-vulkan.h ggml-webgpu.h ggml-zdnn.h ggml-zendnn.h ggml.h gguf.hsrc
ggml-cann
CMakeLists.txt acl_tensor.cpp acl_tensor.h aclnn_ops.cpp aclnn_ops.h common.h ggml-cann.cppggml-cpu
CMakeLists.txt arch-fallback.h binary-ops.cpp binary-ops.h common.h ggml-cpu-impl.h ggml-cpu.c ggml-cpu.cpp hbm.cpp hbm.h ops.cpp ops.h quants.c quants.h repack.cpp repack.h simd-mappings.h traits.cpp traits.h unary-ops.cpp unary-ops.h vec.cpp vec.hggml-cuda
template-instances
fattn-mma-f16-instance-ncols1_1-ncols2_16.cu fattn-mma-f16-instance-ncols1_1-ncols2_32.cu fattn-mma-f16-instance-ncols1_1-ncols2_8.cu fattn-mma-f16-instance-ncols1_16-ncols2_1.cu fattn-mma-f16-instance-ncols1_16-ncols2_2.cu fattn-mma-f16-instance-ncols1_16-ncols2_4.cu fattn-mma-f16-instance-ncols1_2-ncols2_16.cu fattn-mma-f16-instance-ncols1_2-ncols2_32.cu fattn-mma-f16-instance-ncols1_2-ncols2_4.cu fattn-mma-f16-instance-ncols1_2-ncols2_8.cu fattn-mma-f16-instance-ncols1_32-ncols2_1.cu fattn-mma-f16-instance-ncols1_32-ncols2_2.cu fattn-mma-f16-instance-ncols1_4-ncols2_16.cu fattn-mma-f16-instance-ncols1_4-ncols2_2.cu fattn-mma-f16-instance-ncols1_4-ncols2_4.cu fattn-mma-f16-instance-ncols1_4-ncols2_8.cu fattn-mma-f16-instance-ncols1_64-ncols2_1.cu fattn-mma-f16-instance-ncols1_8-ncols2_1.cu fattn-mma-f16-instance-ncols1_8-ncols2_2.cu fattn-mma-f16-instance-ncols1_8-ncols2_4.cu fattn-mma-f16-instance-ncols1_8-ncols2_8.cu fattn-tile-instance-dkq112-dv112.cu fattn-tile-instance-dkq128-dv128.cu fattn-tile-instance-dkq256-dv256.cu fattn-tile-instance-dkq40-dv40.cu fattn-tile-instance-dkq576-dv512.cu fattn-tile-instance-dkq64-dv64.cu fattn-tile-instance-dkq72-dv72.cu fattn-tile-instance-dkq80-dv80.cu fattn-tile-instance-dkq96-dv96.cu fattn-vec-instance-f16-f16.cu fattn-vec-instance-f16-q4_0.cu fattn-vec-instance-f16-q4_1.cu fattn-vec-instance-f16-q5_0.cu fattn-vec-instance-f16-q5_1.cu fattn-vec-instance-f16-q8_0.cu fattn-vec-instance-q4_0-f16.cu fattn-vec-instance-q4_0-q4_0.cu fattn-vec-instance-q4_0-q4_1.cu fattn-vec-instance-q4_0-q5_0.cu fattn-vec-instance-q4_0-q5_1.cu fattn-vec-instance-q4_0-q8_0.cu fattn-vec-instance-q4_1-f16.cu fattn-vec-instance-q4_1-q4_0.cu fattn-vec-instance-q4_1-q4_1.cu fattn-vec-instance-q4_1-q5_0.cu fattn-vec-instance-q4_1-q5_1.cu fattn-vec-instance-q4_1-q8_0.cu fattn-vec-instance-q5_0-f16.cu fattn-vec-instance-q5_0-q4_0.cu fattn-vec-instance-q5_0-q4_1.cu fattn-vec-instance-q5_0-q5_0.cu fattn-vec-instance-q5_0-q5_1.cu 
fattn-vec-instance-q5_0-q8_0.cu fattn-vec-instance-q5_1-f16.cu fattn-vec-instance-q5_1-q4_0.cu fattn-vec-instance-q5_1-q4_1.cu fattn-vec-instance-q5_1-q5_0.cu fattn-vec-instance-q5_1-q5_1.cu fattn-vec-instance-q5_1-q8_0.cu fattn-vec-instance-q8_0-f16.cu fattn-vec-instance-q8_0-q4_0.cu fattn-vec-instance-q8_0-q4_1.cu fattn-vec-instance-q8_0-q5_0.cu fattn-vec-instance-q8_0-q5_1.cu fattn-vec-instance-q8_0-q8_0.cu generate_cu_files.py mmf-instance-ncols_1.cu mmf-instance-ncols_10.cu mmf-instance-ncols_11.cu mmf-instance-ncols_12.cu mmf-instance-ncols_13.cu mmf-instance-ncols_14.cu mmf-instance-ncols_15.cu mmf-instance-ncols_16.cu mmf-instance-ncols_2.cu mmf-instance-ncols_3.cu mmf-instance-ncols_4.cu mmf-instance-ncols_5.cu mmf-instance-ncols_6.cu mmf-instance-ncols_7.cu mmf-instance-ncols_8.cu mmf-instance-ncols_9.cu mmq-instance-iq1_s.cu mmq-instance-iq2_s.cu mmq-instance-iq2_xs.cu mmq-instance-iq2_xxs.cu mmq-instance-iq3_s.cu mmq-instance-iq3_xxs.cu mmq-instance-iq4_nl.cu mmq-instance-iq4_xs.cu mmq-instance-mxfp4.cu mmq-instance-q2_k.cu mmq-instance-q3_k.cu mmq-instance-q4_0.cu mmq-instance-q4_1.cu mmq-instance-q4_k.cu mmq-instance-q5_0.cu mmq-instance-q5_1.cu mmq-instance-q5_k.cu mmq-instance-q6_k.cu mmq-instance-q8_0.cuggml-hexagon
htp
CMakeLists.txt act-ops.c argsort-ops.c binary-ops.c cmake-toolchain.cmake cpy-ops.c flash-attn-ops.c get-rows-ops.c hex-dma.c hex-dma.h hex-dump.h hex-fastdiv.h hex-utils.h htp-ctx.h htp-msg.h htp-ops.h htp_iface.idl hvx-arith.h hvx-base.h hvx-copy.h hvx-div.h hvx-dump.h hvx-exp.h hvx-floor.h hvx-inverse.h hvx-reduce.h hvx-scale.h hvx-sigmoid.h hvx-sqrt.h hvx-types.h hvx-utils.h main.c matmul-ops.c rope-ops.c set-rows-ops.c softmax-ops.c sum-rows-ops.c unary-ops.c worker-pool.c worker-pool.hggml-metal
CMakeLists.txt ggml-metal-common.cpp ggml-metal-common.h ggml-metal-context.h ggml-metal-context.m ggml-metal-device.cpp ggml-metal-device.h ggml-metal-device.m ggml-metal-impl.h ggml-metal-ops.cpp ggml-metal-ops.h ggml-metal.cpp ggml-metal.metalggml-opencl
kernels
add.cl add_id.cl argsort.cl clamp.cl concat.cl conv2d.cl conv2d_f16_f32.cl cpy.cl cvt.cl diag_mask_inf.cl div.cl embed_kernel.py expm1.cl fill.cl flash_attn_f16.cl flash_attn_f32.cl flash_attn_f32_f16.cl gelu.cl gemm_moe_mxfp4_f32.cl gemv_moe_mxfp4_f32.cl gemv_noshuffle.cl gemv_noshuffle_general.cl gemv_noshuffle_general_q8_0_f32.cl get_rows.cl glu.cl group_norm.cl im2col_f16.cl im2col_f32.cl mean.cl mul.cl mul_mat_Ab_Bi_8x4.cl mul_mat_f16_f32.cl mul_mm_f16_f32_kq_kqv.cl mul_mm_f16_f32_l4_lm.cl mul_mm_f32_f32_l4_lm.cl mul_mm_q6_k_f32_l4_lm.cl mul_mm_q8_0_f32_8x4.cl mul_mm_q8_0_f32_l4_lm.cl mul_mv_f16_f16.cl mul_mv_f16_f32.cl mul_mv_f16_f32_1row.cl mul_mv_f16_f32_l4.cl mul_mv_f32_f32.cl mul_mv_id_mxfp4_f32.cl mul_mv_id_mxfp4_f32_flat.cl mul_mv_id_q4_0_f32_8x_flat.cl mul_mv_id_q8_0_f32.cl mul_mv_id_q8_0_f32_flat.cl mul_mv_mxfp4_f32.cl mul_mv_mxfp4_f32_flat.cl mul_mv_q4_0_f32.cl mul_mv_q4_0_f32_1d_16x_flat.cl mul_mv_q4_0_f32_1d_8x_flat.cl mul_mv_q4_0_f32_8x_flat.cl mul_mv_q4_0_f32_v.cl mul_mv_q4_k_f32.cl mul_mv_q6_k_f32.cl mul_mv_q6_k_f32_flat.cl mul_mv_q8_0_f32.cl mul_mv_q8_0_f32_flat.cl norm.cl pad.cl relu.cl repeat.cl rms_norm.cl rope.cl scale.cl set_rows.cl sigmoid.cl silu.cl softmax_4_f16.cl softmax_4_f32.cl softmax_f16.cl softmax_f32.cl softplus.cl solve_tri.cl sqr.cl sqrt.cl ssm_conv.cl sub.cl sum_rows.cl tanh.cl transpose.cl tri.cl tsembd.cl upscale.clggml-sycl
CMakeLists.txt add-id.cpp add-id.hpp backend.hpp binbcast.cpp binbcast.hpp common.cpp common.hpp concat.cpp concat.hpp conv.cpp conv.hpp convert.cpp convert.hpp count-equal.cpp count-equal.hpp cpy.cpp cpy.hpp dequantize.hpp dmmv.cpp dmmv.hpp element_wise.cpp element_wise.hpp gemm.hpp getrows.cpp getrows.hpp ggml-sycl.cpp gla.cpp gla.hpp im2col.cpp im2col.hpp mmq.cpp mmq.hpp mmvq.cpp mmvq.hpp norm.cpp norm.hpp outprod.cpp outprod.hpp pad.cpp pad.hpp pad_reflect_1d.cpp pad_reflect_1d.hpp presets.hpp quantize.hpp quants.hpp repeat_back.cpp repeat_back.hpp roll.cpp roll.hpp rope.cpp rope.hpp set.cpp set.hpp set_rows.cpp set_rows.hpp softmax.cpp softmax.hpp ssm_conv.cpp ssm_conv.hpp sycl_hw.cpp sycl_hw.hpp tsembd.cpp tsembd.hpp vecdotq.hpp wkv.cpp wkv.hppggml-virtgpu
backend
CMakeLists.txt apir_cs_ggml-rpc-back.cpp backend-convert.h backend-dispatched-backend.cpp backend-dispatched-buffer-type.cpp backend-dispatched-buffer.cpp backend-dispatched-device.cpp backend-dispatched.cpp backend-dispatched.gen.h backend-dispatched.h backend-virgl-apir.h backend.cppggml-vulkan
vulkan-shaders
CMakeLists.txt abs.comp acc.comp add.comp add1.comp add_id.comp arange.comp argmax.comp argsort.comp argsort_large.comp ceil.comp clamp.comp concat.comp contig_copy.comp conv2d_dw.comp conv2d_mm.comp conv_transpose_1d.comp copy.comp copy_from_quant.comp copy_to_quant.comp copy_transpose.comp cos.comp count_equal.comp count_experts.comp cumsum.comp cumsum_multipass1.comp cumsum_multipass2.comp dequant_f32.comp dequant_funcs.glsl dequant_funcs_cm2.glsl dequant_head.glsl dequant_iq1_m.comp dequant_iq1_s.comp dequant_iq2_s.comp dequant_iq2_xs.comp dequant_iq2_xxs.comp dequant_iq3_s.comp dequant_iq3_xxs.comp dequant_iq4_nl.comp dequant_iq4_xs.comp dequant_mxfp4.comp dequant_q2_k.comp dequant_q3_k.comp dequant_q4_0.comp dequant_q4_1.comp dequant_q4_k.comp dequant_q5_0.comp dequant_q5_1.comp dequant_q5_k.comp dequant_q6_k.comp dequant_q8_0.comp diag.comp diag_mask_inf.comp div.comp exp.comp fill.comp flash_attn.comp flash_attn_base.glsl flash_attn_cm1.comp flash_attn_cm2.comp flash_attn_mask_opt.comp flash_attn_split_k_reduce.comp floor.comp geglu.comp geglu_erf.comp geglu_quick.comp gelu.comp gelu_erf.comp gelu_quick.comp generic_binary_head.glsl generic_head.glsl generic_unary_head.glsl get_rows.comp get_rows_quant.comp glu_head.glsl glu_main.glsl group_norm.comp hardsigmoid.comp hardswish.comp im2col.comp im2col_3d.comp l2_norm.comp leaky_relu.comp log.comp mul.comp mul_mat_split_k_reduce.comp mul_mat_vec.comp mul_mat_vec_base.glsl mul_mat_vec_iface.glsl mul_mat_vec_iq1_m.comp mul_mat_vec_iq1_s.comp mul_mat_vec_iq2_s.comp mul_mat_vec_iq2_xs.comp mul_mat_vec_iq2_xxs.comp mul_mat_vec_iq3_s.comp mul_mat_vec_iq3_xxs.comp mul_mat_vec_nc.comp mul_mat_vec_p021.comp mul_mat_vec_q2_k.comp mul_mat_vec_q3_k.comp mul_mat_vec_q4_k.comp mul_mat_vec_q5_k.comp mul_mat_vec_q6_k.comp mul_mat_vecq.comp mul_mat_vecq_funcs.glsl mul_mm.comp mul_mm_cm2.comp mul_mm_funcs.glsl mul_mm_id_funcs.glsl mul_mmq.comp mul_mmq_funcs.glsl mul_mmq_shmem_types.glsl multi_add.comp neg.comp norm.comp 
opt_step_adamw.comp opt_step_sgd.comp pad.comp pool2d.comp quantize_q8_1.comp reglu.comp relu.comp repeat.comp repeat_back.comp rms_norm.comp rms_norm_back.comp rms_norm_partials.comp roll.comp rope_funcs.glsl rope_head.glsl rope_multi.comp rope_neox.comp rope_norm.comp rope_params.glsl rope_vision.comp round.comp rte.glsl scale.comp sigmoid.comp silu.comp silu_back.comp sin.comp soft_max.comp soft_max_back.comp soft_max_large1.comp soft_max_large2.comp soft_max_large3.comp soft_max_large_common.glsl softplus.comp solve_tri.comp sqrt.comp square.comp ssm_conv.comp ssm_scan.comp step.comp sub.comp sum_rows.comp sum_rows.glsl swiglu.comp swiglu_oai.comp tanh.comp timestep_embedding.comp topk_argsort.comp topk_moe.comp topk_nary_search.comp tri.comp trunc.comp types.glsl upscale.comp utils.glsl vulkan-shaders-gen.cpp wkv6.comp wkv7.comp xielu.compggml-webgpu
wgsl-shaders
argmax.wgsl argsort.wgsl argsort_merge.wgsl binary.wgsl common_decls.tmpl cpy.tmpl.wgsl cumsum.wgsl embed_wgsl.py flash_attn.wgsl get_rows.tmpl.wgsl glu.tmpl.wgsl memset.wgsl mul_mat.tmpl.wgsl mul_mat_decls.tmpl mul_mat_reg_tile.tmpl.wgsl mul_mat_subgroup_matrix.tmpl.wgsl mul_mat_vec.tmpl.wgsl pad.wgsl rms_norm.wgsl rope.tmpl.wgsl scale.tmpl.wgsl set_rows.wgsl soft_max.tmpl.wgsl sum_rows.wgsl unary.wgslgguf-py
gguf
scripts
gguf_convert_endian.py gguf_dump.py gguf_editor_gui.py gguf_hash.py gguf_new_metadata.py gguf_set_metadata.pygrammars
README.md arithmetic.gbnf c.gbnf chess.gbnf english.gbnf japanese.gbnf json.gbnf json_arr.gbnf list.gbnfmedia
llama0-banner.png llama0-logo.png llama1-banner.png llama1-icon-transparent.png llama1-icon-transparent.svg llama1-icon.png llama1-icon.svg llama1-logo.png llama1-logo.svg matmul.png matmul.svgmodels
templates
Apertus-8B-Instruct.jinja ByteDance-Seed-OSS.jinja CohereForAI-c4ai-command-r-plus-tool_use.jinja CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja GLM-4.6.jinja Kimi-K2-Instruct.jinja Kimi-K2-Thinking.jinja MiMo-VL.jinja MiniMax-M2.jinja Mistral-Small-3.2-24B-Instruct-2506.jinja NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja NVIDIA-Nemotron-Nano-v2.jinja NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja Qwen-QwQ-32B.jinja Qwen-Qwen2.5-7B-Instruct.jinja Qwen-Qwen3-0.6B.jinja Qwen3-Coder.jinja README.md deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja deepseek-ai-DeepSeek-V3.1.jinja fireworks-ai-llama-3-firefunction-v2.jinja google-gemma-2-2b-it.jinja ibm-granite-granite-3.3-2B-Instruct.jinja llama-cpp-deepseek-r1.jinja llama-cpp-lfm2.jinja llama-cpp-rwkv-world.jinja meetkai-functionary-medium-v3.1.jinja meetkai-functionary-medium-v3.2.jinja meta-llama-Llama-3.1-8B-Instruct.jinja meta-llama-Llama-3.2-3B-Instruct.jinja meta-llama-Llama-3.3-70B-Instruct.jinja microsoft-Phi-3.5-mini-instruct.jinja mistralai-Ministral-3-14B-Reasoning-2512.jinja mistralai-Mistral-Nemo-Instruct-2407.jinja moonshotai-Kimi-K2.jinja openai-gpt-oss-120b.jinja unsloth-Apriel-1.5.jinja unsloth-mistral-Devstral-Small-2507.jinja upstage-Solar-Open-100B.jinjarequirements
requirements-all.txt requirements-compare-llama-bench.txt requirements-convert_hf_to_gguf.txt requirements-convert_hf_to_gguf_update.txt requirements-convert_legacy_llama.txt requirements-convert_llama_ggml_to_gguf.txt requirements-convert_lora_to_gguf.txt requirements-gguf_editor_gui.txt requirements-pydantic.txt requirements-server-bench.txt requirements-test-tokenizer-random.txt requirements-tool_bench.txtscripts
bench-models.sh build-info.sh check-requirements.sh compare-commits.sh compare-llama-bench.py compare-logprobs.py create_ops_docs.py debug-test.sh fetch_server_test_models.py gen-authors.sh gen-unicode-data.py get-flags.mk get-hellaswag.sh get-pg.sh get-wikitext-103.sh get-wikitext-2.sh get-winogrande.sh get_chat_template.py hf.sh install-oneapi.bat pr2wt.sh serve-static.js server-bench.py sync-ggml-am.sh sync-ggml.last sync-ggml.sh sync_vendor.py tool_bench.py tool_bench.sh verify-checksum-models.py xxd.cmakesrc
models
afmoe.cpp apertus.cpp arcee.cpp arctic.cpp arwkv7.cpp baichuan.cpp bailingmoe.cpp bailingmoe2.cpp bert.cpp bitnet.cpp bloom.cpp chameleon.cpp chatglm.cpp codeshell.cpp cogvlm.cpp cohere2-iswa.cpp command-r.cpp dbrx.cpp deci.cpp deepseek.cpp deepseek2.cpp dots1.cpp dream.cpp ernie4-5-moe.cpp ernie4-5.cpp exaone-moe.cpp exaone.cpp exaone4.cpp falcon-h1.cpp falcon.cpp gemma-embedding.cpp gemma.cpp gemma2-iswa.cpp gemma3.cpp gemma3n-iswa.cpp glm4-moe.cpp glm4.cpp gpt2.cpp gptneox.cpp granite-hybrid.cpp granite.cpp graph-context-mamba.cpp grok.cpp grovemoe.cpp hunyuan-dense.cpp hunyuan-moe.cpp internlm2.cpp jais.cpp jamba.cpp kimi-linear.cpp lfm2.cpp llada-moe.cpp llada.cpp llama-iswa.cpp llama.cpp maincoder.cpp mamba.cpp mimo2-iswa.cpp minicpm3.cpp minimax-m2.cpp mistral3.cpp models.h modern-bert.cpp mpt.cpp nemotron-h.cpp nemotron.cpp neo-bert.cpp olmo.cpp olmo2.cpp olmoe.cpp openai-moe-iswa.cpp openelm.cpp orion.cpp pangu-embedded.cpp phi2.cpp phi3.cpp plamo.cpp plamo2.cpp plamo3.cpp plm.cpp qwen.cpp qwen2.cpp qwen2moe.cpp qwen2vl.cpp qwen3.cpp qwen35.cpp qwen35moe.cpp qwen3moe.cpp qwen3next.cpp qwen3vl-moe.cpp qwen3vl.cpp refact.cpp rnd1.cpp rwkv6-base.cpp rwkv6.cpp rwkv6qwen2.cpp rwkv7-base.cpp rwkv7.cpp seed-oss.cpp smallthinker.cpp smollm3.cpp stablelm.cpp starcoder.cpp starcoder2.cpp step35-iswa.cpp t5-dec.cpp t5-enc.cpp wavtokenizer-dec.cpp xverse.cpptests
peg-parser
simple-tokenize.cpp simple-tokenize.h test-basic.cpp test-gbnf-generation.cpp test-json-parser.cpp test-json-serialization.cpp test-unicode.cpp tests.htools
cvector-generator
CMakeLists.txt README.md completions.txt cvector-generator.cpp mean.hpp negative.txt pca.hpp positive.txtmtmd
legacy-models
convert_image_encoder_to_gguf.py glmedge-convert-image-encoder-to-gguf.py glmedge-surgery.py llava_surgery.py llava_surgery_v2.py minicpmv-convert-image-encoder-to-gguf.py minicpmv-surgery.pymodels
cogvlm.cpp conformer.cpp glm4v.cpp internvl.cpp kimik25.cpp kimivl.cpp llama4.cpp llava.cpp minicpmv.cpp mobilenetv5.cpp models.h pixtral.cpp qwen2vl.cpp qwen3vl.cpp siglip.cpp whisper-enc.cpp youtuvl.cppserver
public_legacy
colorthemes.css completion.js favicon.ico index-new.html index.html index.js json-schema-to-grammar.mjs loading.html prompt-formats.js style.css system-prompts.js theme-beeninorder.css theme-ketivah.css theme-mangotango.css theme-playground.css theme-polarnight.css theme-snowstorm.csspublic_simplechat
datautils.mjs index.html readme.md simplechat.css simplechat.js simplechat_screens.webp ui.mjstests
unit
test_basic.py test_chat_completion.py test_compat_anthropic.py test_compat_oai_responses.py test_completion.py test_ctx_shift.py test_embedding.py test_infill.py test_lora.py test_rerank.py test_router.py test_security.py test_sleep.py test_slot_save.py test_speculative.py test_template.py test_tokenize.py test_tool_call.py test_vision_api.pywebui
.storybook
ModeWatcherDecorator.svelte TooltipProviderDecorator.svelte main.ts preview.ts vitest.setup.tssrc
lib
components
app
chat
ChatAttachments
ChatAttachmentPreview.svelte ChatAttachmentThumbnailFile.svelte ChatAttachmentThumbnailImage.svelte ChatAttachmentsList.svelte ChatAttachmentsViewAll.svelteChatForm
ChatFormActions
ChatFormActionFileAttachments.svelte ChatFormActionRecord.svelte ChatFormActionSubmit.svelte ChatFormActions.svelteChatMessages
ChatMessage.svelte ChatMessageActions.svelte ChatMessageAssistant.svelte ChatMessageBranchingControls.svelte ChatMessageEditForm.svelte ChatMessageStatistics.svelte ChatMessageSystem.svelte ChatMessageThinkingBlock.svelte ChatMessageUser.svelte ChatMessages.svelteChatScreen
ChatScreen.svelte ChatScreenDragOverlay.svelte ChatScreenHeader.svelte ChatScreenProcessingInfo.sveltedialogs
DialogChatAttachmentPreview.svelte DialogChatAttachmentsViewAll.svelte DialogChatError.svelte DialogChatSettings.svelte DialogConfirmation.svelte DialogConversationSelection.svelte DialogConversationTitleUpdate.svelte DialogEmptyFileAlert.svelte DialogModelInformation.svelte DialogModelNotAvailable.sveltemisc
ActionButton.svelte ActionDropdown.svelte BadgeChatStatistic.svelte BadgeInfo.svelte BadgeModality.svelte CodePreviewDialog.svelte ConversationSelection.svelte CopyToClipboardIcon.svelte KeyboardShortcutInfo.svelte MarkdownContent.svelte RemoveButton.svelte SearchInput.svelte SyntaxHighlightedCode.svelteui
alert-dialog
alert-dialog-action.svelte alert-dialog-cancel.svelte alert-dialog-content.svelte alert-dialog-description.svelte alert-dialog-footer.svelte alert-dialog-header.svelte alert-dialog-overlay.svelte alert-dialog-title.svelte alert-dialog-trigger.svelte index.tscard
card-action.svelte card-content.svelte card-description.svelte card-footer.svelte card-header.svelte card-title.svelte card.svelte index.tsdialog
dialog-close.svelte dialog-content.svelte dialog-description.svelte dialog-footer.svelte dialog-header.svelte dialog-overlay.svelte dialog-title.svelte dialog-trigger.svelte index.tsdropdown-menu
dropdown-menu-checkbox-item.svelte dropdown-menu-content.svelte dropdown-menu-group-heading.svelte dropdown-menu-group.svelte dropdown-menu-item.svelte dropdown-menu-label.svelte dropdown-menu-radio-group.svelte dropdown-menu-radio-item.svelte dropdown-menu-separator.svelte dropdown-menu-shortcut.svelte dropdown-menu-sub-content.svelte dropdown-menu-sub-trigger.svelte dropdown-menu-trigger.svelte index.tspopover
index.ts popover-close.svelte popover-content.svelte popover-portal.svelte popover-trigger.svelte popover.svelteselect
index.ts select-content.svelte select-group-heading.svelte select-group.svelte select-item.svelte select-label.svelte select-scroll-down-button.svelte select-scroll-up-button.svelte select-separator.svelte select-trigger.sveltesheet
index.ts sheet-close.svelte sheet-content.svelte sheet-description.svelte sheet-footer.svelte sheet-header.svelte sheet-overlay.svelte sheet-title.svelte sheet-trigger.sveltesidebar
constants.ts context.svelte.ts index.ts sidebar-content.svelte sidebar-footer.svelte sidebar-group-action.svelte sidebar-group-content.svelte sidebar-group-label.svelte sidebar-group.svelte sidebar-header.svelte sidebar-input.svelte sidebar-inset.svelte sidebar-menu-action.svelte sidebar-menu-badge.svelte sidebar-menu-button.svelte sidebar-menu-item.svelte sidebar-menu-skeleton.svelte sidebar-menu-sub-button.svelte sidebar-menu-sub-item.svelte sidebar-menu-sub.svelte sidebar-menu.svelte sidebar-provider.svelte sidebar-rail.svelte sidebar-separator.svelte sidebar-trigger.svelte sidebar.sveltetable
index.ts table-body.svelte table-caption.svelte table-cell.svelte table-footer.svelte table-head.svelte table-header.svelte table-row.svelte table.svelteconstants
auto-scroll.ts binary-detection.ts default-context.ts floating-ui-constraints.ts icons.ts input-classes.ts latex-protection.ts literal-html.ts localstorage-keys.ts max-bundle-size.ts precision.ts processing-info.ts settings-config.ts supported-file-types.ts table-html-restorer.ts tooltip-config.ts viewport.tsstores
chat.svelte.ts conversations.svelte.ts models.svelte.ts persisted.svelte.ts server.svelte.ts settings.svelte.tsutils
api-headers.ts api-key-validation.ts attachment-display.ts attachment-type.ts audio-recording.ts autoresize-textarea.ts branching.ts browser-only.ts clipboard.ts config-helpers.ts conversation-utils.ts convert-files-to-extra.ts file-preview.ts file-type.ts formatters.ts index.ts is-ime-composing.ts latex-protection.ts modality-file-validation.ts model-names.ts pdf-processing.ts portal-to-body.ts precision.ts process-uploaded-files.ts svg-to-png.ts syntax-highlight-language.ts text-files.ts text.ts webp-to-png.tstests
llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c
raw
1#pragma clang diagnostic ignored "-Wunused-variable"
2#pragma clang diagnostic ignored "-Wunused-function"
3#pragma clang diagnostic ignored "-Wunused-but-set-variable"
4
5#include <HAP_farf.h>
6#include <HAP_perf.h>
7
8#include <math.h>
9#include <string.h>
10
11#include "hex-dma.h"
12#include "hvx-utils.h"
13
14#define GGML_COMMON_DECL_C
15#include "ggml-common.h"
16#include "htp-ctx.h"
17#include "htp-msg.h"
18#include "htp-ops.h"
19
// Preamble for activation ops with two source tensors: pulls the element
// counts (ne*) and byte strides (nb*) of src0, src1 and dst into local
// const uint32_t variables. Expands in the caller's scope and expects
// `src0`, `src1` and `dst` tensor pointers to be in scope. Unused locals
// are tolerated via the -Wunused-variable pragma at the top of this file.
#define htp_act_preamble3                  \
    const uint32_t ne00 = src0->ne[0];     \
    const uint32_t ne01 = src0->ne[1];     \
    const uint32_t ne02 = src0->ne[2];     \
    const uint32_t ne03 = src0->ne[3];     \
                                           \
    const uint32_t ne10 = src1->ne[0];     \
    const uint32_t ne11 = src1->ne[1];     \
    const uint32_t ne12 = src1->ne[2];     \
    const uint32_t ne13 = src1->ne[3];     \
                                           \
    const uint32_t ne0 = dst->ne[0];       \
    const uint32_t ne1 = dst->ne[1];       \
    const uint32_t ne2 = dst->ne[2];       \
    const uint32_t ne3 = dst->ne[3];       \
                                           \
    const uint32_t nb00 = src0->nb[0];     \
    const uint32_t nb01 = src0->nb[1];     \
    const uint32_t nb02 = src0->nb[2];     \
    const uint32_t nb03 = src0->nb[3];     \
                                           \
    const uint32_t nb10 = src1->nb[0];     \
    const uint32_t nb11 = src1->nb[1];     \
    const uint32_t nb12 = src1->nb[2];     \
    const uint32_t nb13 = src1->nb[3];     \
                                           \
    const uint32_t nb0 = dst->nb[0];       \
    const uint32_t nb1 = dst->nb[1];       \
    const uint32_t nb2 = dst->nb[2];       \
    const uint32_t nb3 = dst->nb[3];
50
// Preamble for single-source activation ops: pulls the element counts (ne*)
// and byte strides (nb*) of src0 and dst into local const uint32_t variables.
// Expands in the caller's scope and expects `src0` and `dst` tensor pointers
// to be in scope.
#define htp_act_preamble2                  \
    const uint32_t ne00 = src0->ne[0];     \
    const uint32_t ne01 = src0->ne[1];     \
    const uint32_t ne02 = src0->ne[2];     \
    const uint32_t ne03 = src0->ne[3];     \
                                           \
    const uint32_t ne0 = dst->ne[0];       \
    const uint32_t ne1 = dst->ne[1];       \
    const uint32_t ne2 = dst->ne[2];       \
    const uint32_t ne3 = dst->ne[3];       \
                                           \
    const uint32_t nb00 = src0->nb[0];     \
    const uint32_t nb01 = src0->nb[1];     \
    const uint32_t nb02 = src0->nb[2];     \
    const uint32_t nb03 = src0->nb[3];     \
                                           \
    const uint32_t nb0 = dst->nb[0];       \
    const uint32_t nb1 = dst->nb[1];       \
    const uint32_t nb2 = dst->nb[2];       \
    const uint32_t nb3 = dst->nb[3];
71
72static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0,
73 const struct htp_tensor * src1,
74 struct htp_tensor * dst,
75 const int32_t * op_params,
76 struct htp_spad * src0_spad,
77 struct htp_spad * src1_spad,
78 struct htp_spad * dst_spad,
79 uint32_t nth,
80 uint32_t ith,
81 uint32_t src0_nrows_per_thread,
82 dma_queue * dma_queue) {
83 htp_act_preamble3;
84
85 size_t src0_row_size = nb01;
86 size_t src1_row_size = nb11;
87 size_t dst_row_size = nb1;
88
89
90
91 const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
92
93 const uint32_t src0_start_row = src0_nrows_per_thread * ith;
94 const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
95
96 // no work for this thread
97 if (src0_start_row >= src0_end_row) {
98 return;
99 }
100
101 uint64_t t1, t2;
102 t1 = HAP_perf_get_qtimer_count();
103
104 const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
105 const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
106 uint8_t * restrict data_dst = (uint8_t *) dst->data;
107
108 const bool src1_valid = src1->ne[0];
109 const int nc = (src1_valid) ? ne00 : ne00 / 2;
110 if (!src1_valid) {
111 const int32_t swapped = op_params[1];
112 data_src1 = data_src0;
113 src1_row_size = src0_row_size;
114
115 const size_t nc_in_bytes = nc * SIZEOF_FP32;
116 data_src0 += swapped ? nc_in_bytes : 0;
117 data_src1 += swapped ? 0 : nc_in_bytes;
118 }
119
120 const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
121 const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
122 const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
123
124 uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
125 uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
126 uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
127
128 // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
129 size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
130 size_t src1_spad_half_size = src1_spad->size_per_thread / 2;
131 size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
132
133 const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
134 if (BLOCK == 0) {
135 FARF(ERROR,
136 "swiglu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
137 src0_spad->size_per_thread, src0_row_size_aligned);
138 return;
139 }
140
141 // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
142 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
143 const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
144
145 // Dummy DMA transation for sequencing (interleaving dst,src,dst,...)
146 dma_queue_push_vtcm_to_ddr(dma_queue,
147 dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
148 dst_row_size, dst_row_size_aligned, 0);
149
150 dma_queue_push_ddr_to_vtcm(dma_queue,
151 dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
152 src0_row_size_aligned, src0_row_size, block_size);
153 dma_queue_push_ddr_to_vtcm(dma_queue,
154 dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + (ir * src1_row_size)),
155 src1_row_size_aligned, src1_row_size, block_size);
156 }
157
158 for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
159 const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
160
161 float * dst_spad = (float *) dma_queue_pop(dma_queue).src;
162 float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
163 float * src1_spad = (float *) dma_queue_pop(dma_queue).dst;
164
165 for (uint32_t ib = 0; ib < block_size; ib++) {
166 const float * src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
167 const float * src1_spad_ptr = src1_spad + ib * (src1_row_size_aligned / sizeof(float));
168 float * dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
169
170 //swiglu(x) = x1 * sigmoid(x0)
171 hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, nc);
172 hvx_mul_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr,
173 (const uint8_t *) src1_spad_ptr, nc);
174 }
175
176 dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size,
177 dst_row_size_aligned, block_size);
178
179 // prefetch N+2 loop iteration if any
180 const uint32_t pref_block = (ir + BLOCK * 2);
181 if (pref_block < src0_end_row) {
182 const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
183 dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
184 src0_row_size_aligned, src0_row_size, pref_block_size);
185 dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src1_spad, data_src1 + (pref_block * src1_row_size)),
186 src1_row_size_aligned, src1_row_size, pref_block_size);
187 }
188 }
189
190 dma_queue_flush(dma_queue);
191
192 t2 = HAP_perf_get_qtimer_count();
193
194 FARF(HIGH, "swiglu-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
195 ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3,
196 (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
197}
198
199static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0,
200 const struct htp_tensor * src1,
201 struct htp_tensor * dst,
202 const int32_t * op_params,
203 struct htp_spad * src0_spad,
204 struct htp_spad * src1_spad,
205 struct htp_spad * dst_spad,
206 uint32_t nth,
207 uint32_t ith,
208 uint32_t src0_nrows_per_thread,
209 dma_queue * dma_queue) {
210 htp_act_preamble3;
211
212 uint64_t t1, t2;
213 t1 = HAP_perf_get_qtimer_count();
214
215 size_t src0_row_size = nb01;
216 size_t src1_row_size = nb11;
217 size_t dst_row_size = nb1;
218
219 const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
220
221 const uint32_t src0_start_row = src0_nrows_per_thread * ith;
222 const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
223
224 // no work for this thread
225 if (src0_start_row >= src0_end_row) {
226 return;
227 }
228
229 const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
230 const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
231 uint8_t * restrict data_dst = (uint8_t *) dst->data;
232
233 const bool src1_valid = src1->ne[0];
234 const int nc = (src1_valid) ? ne00 : ne00 / 2;
235 if (!src1_valid) {
236 const int32_t swapped = op_params[1];
237 data_src1 = data_src0;
238 src1_row_size = src0_row_size;
239
240 const size_t nc_in_bytes = nc * SIZEOF_FP32;
241 data_src0 += swapped ? nc_in_bytes : 0;
242 data_src1 += swapped ? 0 : nc_in_bytes;
243 }
244
245 const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
246 const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
247 const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
248
249 uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
250 uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
251 uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
252
253 // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
254 size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
255 size_t src1_spad_half_size = src1_spad->size_per_thread / 2;
256 size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
257
258 const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
259 if (BLOCK == 0) {
260 FARF(ERROR,
261 "swiglu-oai-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least "
262 "%zu\n",
263 src0_spad->size_per_thread, src0_row_size_aligned);
264 return;
265 }
266 const float alpha = ((const float *) (op_params))[2];
267 const float limit = ((const float *) (op_params))[3];
268
269 // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
270 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
271 const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
272
273 // Dummy DMA transation for sequencing (interleaving dst,src,dst,...)
274 dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
275 dst_row_size, dst_row_size_aligned, 0);
276
277 dma_queue_push_ddr_to_vtcm(
278 dma_queue,
279 dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
280 src0_row_size_aligned, src0_row_size, block_size);
281 dma_queue_push_ddr_to_vtcm(
282 dma_queue,
283 dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + (ir * src1_row_size)),
284 src1_row_size_aligned, src1_row_size, block_size);
285 }
286
287 for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
288 const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
289
290 float * dst_spad = (float *) dma_queue_pop(dma_queue).src;
291 float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
292 float * src1_spad = (float *) dma_queue_pop(dma_queue).dst;
293
294 for (uint32_t ib = 0; ib < block_size; ib++) {
295 const float * src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
296 const float * src1_spad_ptr = src1_spad + ib * (src1_row_size_aligned / sizeof(float));
297 float * dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
298
299 // x (src0_spad_data) = std::min(src0_p[k], limit);
300 hvx_min_scalar_f32((uint8_t *) src0_spad_ptr, (const uint8_t *) src0_spad_ptr, limit, nc);
301 // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
302 hvx_clamp_scalar_f32((uint8_t *) src1_spad_ptr, (const uint8_t *) src1_spad_ptr, -limit, limit, nc);
303 // y (src1_spad_data) = y1 + 1.f
304 hvx_add_scalar_f32((uint8_t *) src1_spad_ptr, (const uint8_t *) src1_spad_ptr, 1.0, nc);
305 // x1 (dst_spad_data) = alpha * (x)
306 hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, alpha, nc);
307 // x2 (dst_spad_data) = sigmoid(x1) = 1/(1+exp(-x1))
308 hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, nc);
309 // out = x * sigmoid(alpha * x) * (y + 1.f)
310 hvx_mul_mul_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr,
311 (const uint8_t *) src1_spad_ptr, nc);
312 }
313
314 dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size,
315 dst_row_size_aligned, block_size);
316
317 // prefetch N+2 loop iteration if any
318 const uint32_t pref_block = (ir + BLOCK * 2);
319 if (pref_block < src0_end_row) {
320 const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
321 dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
322 src0_row_size_aligned, src0_row_size, pref_block_size);
323 dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src1_spad, data_src1 + (pref_block * src1_row_size)),
324 src1_row_size_aligned, src1_row_size, pref_block_size);
325 }
326 }
327
328 dma_queue_flush(dma_queue);
329
330 t2 = HAP_perf_get_qtimer_count();
331
332 FARF(HIGH, "swiglu-oai-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, src0->ne[0],
333 src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], src1->ne[2],
334 src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
335}
336
337
338static void unary_gelu_f32_per_thread(const struct htp_tensor * src0,
339 struct htp_tensor * dst,
340 const int32_t * op_params,
341 struct htp_spad * src0_spad,
342 struct htp_spad * dst_spad,
343 uint32_t nth,
344 uint32_t ith,
345 uint32_t src0_nrows_per_thread,
346 dma_queue * dma_queue) {
347 htp_act_preamble2;
348
349 uint64_t t1, t2;
350 t1 = HAP_perf_get_qtimer_count();
351
352 const size_t src0_row_size = nb01;
353 const size_t dst_row_size = nb1;
354 const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
355 const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
356
357 const uint32_t src0_nrows = ne01 * ne02 * ne03;
358
359 const uint32_t src0_start_row = src0_nrows_per_thread * ith;
360 const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
361
362 // no work for this thread
363 if (src0_start_row >= src0_end_row) {
364 return;
365 }
366
367 const uint8_t * data_src0 = (const uint8_t *) src0->data;
368 uint8_t * data_dst = (uint8_t *) dst->data;
369
370 uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
371 uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
372
373 // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
374 size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
375 size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
376
377 // In gelu = x*sigmoid(x*1.702)
378 const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
379
380 if (BLOCK == 0) {
381 FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
382 src0_spad->size_per_thread, src0_row_size_aligned);
383 return;
384 }
385
386 // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
387 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
388 const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
389
390 // Dummy DMA transation for sequencing (interleaving dst,src,dst,...)
391 dma_queue_push_vtcm_to_ddr(dma_queue,
392 dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
393 dst_row_size, dst_row_size_aligned, 0);
394
395 dma_queue_push_ddr_to_vtcm(dma_queue,
396 dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
397 src0_row_size_aligned, src0_row_size, block_size);
398 }
399
400 for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
401 const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
402
403 float* dst_spad = (float *) dma_queue_pop(dma_queue).src;
404 float* src0_spad = (float *) dma_queue_pop(dma_queue).dst;
405
406 for (uint32_t ib = 0; ib < block_size; ib++) {
407 const float* src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
408 float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
409
410 // gelu = x * sigmoid(1.702 * x) // current implementation
411 hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (float) 1.702, ne0);
412 hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0);
413 hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0);
414 }
415
416 dma_queue_push_vtcm_to_ddr(dma_queue,
417 dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
418 dst_row_size, dst_row_size_aligned, block_size);
419
420 // prefetch N+2 loop iteration if any
421 const uint32_t pref_block = (ir + BLOCK * 2);
422 if (pref_block < src0_end_row) {
423 const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
424 dma_queue_push_ddr_to_vtcm(dma_queue,
425 dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
426 src0_row_size_aligned, src0_row_size, pref_block_size);
427 }
428 }
429
430 dma_queue_flush(dma_queue);
431
432 t2 = HAP_perf_get_qtimer_count();
433
434 FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
435 ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
436}
437
438static void unary_gelu_f32(unsigned int n, unsigned int i, void * data) {
439 struct htp_ops_context * octx = (struct htp_ops_context *) data;
440 unary_gelu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
441 octx->src0_nrows_per_thread, octx->ctx->dma[i]);
442}
443
444
445
446static void unary_silu_f32_per_thread(const struct htp_tensor * src0,
447 struct htp_tensor * dst,
448 const int32_t * op_params,
449 struct htp_spad * src0_spad,
450 struct htp_spad * dst_spad,
451 uint32_t nth,
452 uint32_t ith,
453 uint32_t src0_nrows_per_thread,
454 dma_queue * dma_queue) {
455 htp_act_preamble2;
456
457 uint64_t t1, t2;
458 t1 = HAP_perf_get_qtimer_count();
459
460 const size_t src0_row_size = nb01;
461 const size_t dst_row_size = nb1;
462 const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
463 const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
464
465 const uint32_t src0_nrows = ne01 * ne02 * ne03;
466
467 const uint32_t src0_start_row = src0_nrows_per_thread * ith;
468 const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
469
470 // no work for this thread
471 if (src0_start_row >= src0_end_row) {
472 return;
473 }
474
475 const uint8_t * data_src0 = (const uint8_t *) src0->data;
476 uint8_t * data_dst = (uint8_t *) dst->data;
477
478 uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
479 uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
480
481 // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
482 size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
483 size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
484
485 const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
486
487 if (BLOCK == 0) {
488 FARF(ERROR, "silu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
489 src0_spad->size_per_thread, src0_row_size_aligned);
490 return;
491 }
492
493 // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
494 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
495 const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
496
497 // Dummy DMA transation for sequencing (interleaving dst,src,dst,...)
498 dma_queue_push_vtcm_to_ddr(dma_queue,
499 dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
500 dst_row_size, dst_row_size_aligned, 0);
501
502 dma_queue_push_ddr_to_vtcm(dma_queue,
503 dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
504 src0_row_size_aligned, src0_row_size, block_size);
505 }
506
507 for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
508 const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
509
510 float* dst_spad = (float *) dma_queue_pop(dma_queue).src;
511 float* src0_spad = (float *) dma_queue_pop(dma_queue).dst;
512
513 for (uint32_t ib = 0; ib < block_size; ib++) {
514 const float* src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
515 float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
516
517 // silu = x * sigmoid(x)
518 hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, ne0);
519 hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0);
520 }
521
522 dma_queue_push_vtcm_to_ddr(dma_queue,
523 dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
524 dst_row_size, dst_row_size_aligned, block_size);
525
526 // prefetch N+2 loop iteration if any
527 const uint32_t pref_block = (ir + BLOCK * 2);
528 if (pref_block < src0_end_row) {
529 const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
530 dma_queue_push_ddr_to_vtcm(dma_queue,
531 dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
532 src0_row_size_aligned, src0_row_size, pref_block_size);
533 }
534 }
535
536 dma_queue_flush(dma_queue);
537
538 t2 = HAP_perf_get_qtimer_count();
539
540 FARF(HIGH, "silu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
541 ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
542}
543
// Coefficients of the tanh-based GELU approximation used by geglu below:
//   gelu(x) = 0.5*x*(1 + tanh(SQRT_2_OVER_PI * x * (1 + GELU_COEF_A*x*x)))
static const float GELU_COEF_A = 0.044715f;
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
546
547static void glu_geglu_f32_per_thread(const struct htp_tensor * src0,
548 const struct htp_tensor * src1,
549 struct htp_tensor * dst,
550 const int32_t * op_params,
551 struct htp_spad * src0_spad,
552 struct htp_spad * src1_spad,
553 struct htp_spad * dst_spad,
554 uint32_t nth,
555 uint32_t ith,
556 uint32_t src0_nrows_per_thread,
557 dma_queue * dma_queue) {
558 htp_act_preamble3;
559
560 size_t src0_row_size = nb01;
561 size_t src1_row_size = nb11;
562 size_t dst_row_size = nb1;
563
564 uint64_t t1, t2;
565 t1 = HAP_perf_get_qtimer_count();
566
567 const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
568
569 const uint32_t src0_start_row = src0_nrows_per_thread * ith;
570 const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
571
572 // no work for this thread
573 if (src0_start_row >= src0_end_row) {
574 return;
575 }
576
577 const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
578 const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
579 uint8_t * restrict data_dst = (uint8_t *) dst->data;
580
581 const bool src1_valid = src1->ne[0];
582 const int nc = (src1_valid) ? ne00 : ne00 / 2;
583 if (!src1_valid) {
584 const int32_t swapped = op_params[1];
585 data_src1 = data_src0;
586 src1_row_size = src0_row_size;
587
588 const size_t nc_in_bytes = nc * SIZEOF_FP32;
589 data_src0 += swapped ? nc_in_bytes : 0;
590 data_src1 += swapped ? 0 : nc_in_bytes;
591 }
592
593 const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
594 const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
595 const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
596
597 uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
598 uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
599 uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
600
601 // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
602 size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
603 size_t src1_spad_half_size = src1_spad->size_per_thread / 2;
604 size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
605
606 const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
607 if (BLOCK == 0) {
608 FARF(ERROR,
609 "geglu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
610 src0_spad->size_per_thread, src0_row_size_aligned);
611 return;
612 }
613
614 // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
615 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
616 const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
617
618 // Dummy DMA transation for sequencing (interleaving dst,src,dst,...)
619 dma_queue_push_vtcm_to_ddr(dma_queue,
620 dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
621 dst_row_size, dst_row_size_aligned, 0);
622
623 dma_queue_push_ddr_to_vtcm(dma_queue,
624 dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
625 src0_row_size_aligned, src0_row_size, block_size);
626 dma_queue_push_ddr_to_vtcm(dma_queue,
627 dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + (ir * src1_row_size)),
628 src1_row_size_aligned, src1_row_size, block_size);
629 }
630
631 for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
632 const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
633
634 float * dst_spad = (float *) dma_queue_pop(dma_queue).src;
635 float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
636 float * src1_spad = (float *) dma_queue_pop(dma_queue).dst;
637
638 for (uint32_t ib = 0; ib < block_size; ib++) {
639 const uint8_t * src0_spad_ptr = (const uint8_t *)(src0_spad + ib * (src0_row_size_aligned / sizeof(float)));
640 const uint8_t * src1_spad_ptr = (const uint8_t *)(src1_spad + ib * (src1_row_size_aligned / sizeof(float)));
641 uint8_t * dst_spad_ptr = (uint8_t *)(dst_spad + ib * (dst_row_size_aligned / sizeof(float)));
642
643 // geglu tanh implementation
644 // geglu(x, g) = gelu(x) * g
645 // gelu(x) = 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)))
646 hvx_mul_f32_aaa(dst_spad_ptr, src0_spad_ptr, src0_spad_ptr, nc); // res = x*x
647 hvx_mul_scalar_f32_aa(dst_spad_ptr, (const uint8_t *)dst_spad_ptr, GELU_COEF_A, nc); // res = res * GELU_COEF_A
648 hvx_add_scalar_f32_aa(dst_spad_ptr, (const uint8_t *)dst_spad_ptr, 1.0f, nc); // res = res + 1.0f
649 hvx_mul_f32_aaa(dst_spad_ptr, src0_spad_ptr, (const uint8_t *)dst_spad_ptr, nc); // res = res * x
650 hvx_mul_scalar_f32_aa(dst_spad_ptr, (const uint8_t*)dst_spad_ptr, SQRT_2_OVER_PI, nc); // res = result * SQRT_2_OVER_PI
651 hvx_tanh_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, nc); // res = tanh(res)
652 hvx_add_scalar_f32_aa(dst_spad_ptr, (const uint8_t*)dst_spad_ptr, 1.0f, nc); // res = res + 1.0f
653 hvx_mul_f32_aaa(dst_spad_ptr, src0_spad_ptr, (const uint8_t *)dst_spad_ptr, nc); // res = res * x
654 hvx_mul_scalar_f32_aa(dst_spad_ptr, (const uint8_t *)dst_spad_ptr, 0.5f, nc); // res = res + 0.5f
655 hvx_mul_f32_aaa(dst_spad_ptr, (const uint8_t *)dst_spad_ptr, src1_spad_ptr, nc); // res = res * g
656 }
657
658 dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), dst_row_size,
659 dst_row_size_aligned, block_size);
660
661 // prefetch N+2 loop iteration if any
662 const uint32_t pref_block = (ir + BLOCK * 2);
663 if (pref_block < src0_end_row) {
664 const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
665 dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
666 src0_row_size_aligned, src0_row_size, pref_block_size);
667 dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src1_spad, data_src1 + (pref_block * src1_row_size)),
668 src1_row_size_aligned, src1_row_size, pref_block_size);
669 }
670 }
671
672 dma_queue_flush(dma_queue);
673
674 t2 = HAP_perf_get_qtimer_count();
675
676 FARF(HIGH, "geglu-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
677 ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3,
678 (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
679}
680
681static void unary_silu_f32(unsigned int n, unsigned int i, void * data) {
682 struct htp_ops_context * octx = (struct htp_ops_context *) data;
683 unary_silu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
684 octx->src0_nrows_per_thread, octx->ctx->dma[i]);
685}
686
687static void glu_swiglu_f32(unsigned int n, unsigned int i, void * data) {
688 struct htp_ops_context * octx = (struct htp_ops_context *) data;
689 glu_swiglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
690 &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
691}
692
693static void glu_swiglu_oai_f32(unsigned int n, unsigned int i, void * data) {
694 struct htp_ops_context * octx = (struct htp_ops_context *) data;
695 glu_swiglu_oai_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
696 &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
697}
698
699static void glu_geglu_f32(unsigned int n, unsigned int i, void * data) {
700 struct htp_ops_context * octx = (struct htp_ops_context *) data;
701 glu_geglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
702 &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
703}
704
705static int execute_op_activations_f32(struct htp_ops_context * octx) {
706 int err = HTP_STATUS_OK;
707
708 const struct htp_tensor * src0 = &octx->src0;
709 const struct htp_tensor * src1 = &octx->src1;
710 struct htp_tensor * dst = &octx->dst;
711
712 if (((src0->ne[0] * SIZEOF_FP32) != src0->nb[1]) || ((dst->ne[0] * SIZEOF_FP32) != dst->nb[1])) {
713 FARF(ERROR, "Non-contiguous tensors are not supported at this time \n");
714 return HTP_STATUS_NO_SUPPORT;
715 }
716
717 worker_callback_t act_op_func;
718 const char * op_type = NULL;
719
720 switch (octx->op) {
721 case HTP_OP_UNARY_SILU:
722 act_op_func = unary_silu_f32;
723 op_type = "silu-f32";
724 break;
725
726 case HTP_OP_GLU_SWIGLU:
727 act_op_func = glu_swiglu_f32;
728 op_type = "swiglu-f32";
729 break;
730
731 case HTP_OP_GLU_SWIGLU_OAI:
732 act_op_func = glu_swiglu_oai_f32;
733 op_type = "swiglu-oai-f32";
734 break;
735 case HTP_OP_UNARY_GELU:
736 act_op_func = unary_gelu_f32;
737 op_type = "gelu-f32";
738 break;
739
740 case HTP_OP_GLU_GEGLU:
741 act_op_func = glu_geglu_f32;
742 op_type = "geglu-f32";
743 break;
744 default:
745 FARF(ERROR, "Unsupported activations Op %u\n", octx->op);
746 return HTP_STATUS_NO_SUPPORT;
747 }
748
749 const uint32_t n_threads = octx->n_threads;
750 const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
751
752 size_t src0_row_size = src0->nb[1];
753 size_t src1_row_size = src1->nb[1]; // zero bytes if src1 is not used
754 size_t dst_row_size = dst->nb[1];
755
756 const bool src1_valid = src1->ne[0];
757 if (!src1_valid) {
758 src1_row_size = src0_row_size;
759 }
760
761 const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
762 const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
763 const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
764 // VTCM scratchpads for all tensors
765 // N rows per thread, padded to HVX vector size
766
767 size_t spad_size_per_row = (src0_row_size_aligned + src1_row_size_aligned) + dst_row_size_aligned;
768 size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads* spad_size_per_row);
769
770 // Make sure the reserved vtcm size is sufficient
771 if(vtcm_row_per_thread ==0){
772 FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size,
773 spad_size_per_row * n_threads);
774 return HTP_STATUS_VTCM_TOO_SMALL;
775 }
776
777 octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread;
778 octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread;
779 octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread;
780
781 octx->dst_spad.size = n_threads* octx->dst_spad.size_per_thread;
782 octx->src0_spad.size = n_threads* octx->src0_spad.size_per_thread;
783 octx->src1_spad.size = n_threads* octx->src1_spad.size_per_thread;
784
785 octx->src0_spad.data = octx->ctx->vtcm_base;
786 octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
787 octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
788
789 if (src1->ne[0]) {
790 FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
791 op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
792 src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
793 octx->dst_spad.size);
794 } else {
795 FARF(HIGH, "%s: %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
796 src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
797 octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
798 }
799
800 if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
801 uint32_t n_jobs = MIN(n_threads, src0_nrows);
802 octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
803 worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs);
804 }
805
806 return err;
807}
808
809int op_activations(struct htp_ops_context * octx) {
810 int err = HTP_STATUS_OK;
811
812 switch (octx->src0.type) {
813 case HTP_TYPE_F32:
814 err = execute_op_activations_f32(octx);
815 break;
816
817 default:
818 err = HTP_STATUS_NO_SUPPORT;
819 break;
820 }
821
822 return err;
823}