Diffstat (limited to 'llama.cpp/ggml/src/ggml-cann')
-rwxr-xr-x  llama.cpp/ggml/src/ggml-cann/CMakeLists.txt      89
-rw-r--r--  llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp     195
-rw-r--r--  llama.cpp/ggml/src/ggml-cann/acl_tensor.h       349
-rw-r--r--  llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp     4021
-rw-r--r--  llama.cpp/ggml/src/ggml-cann/aclnn_ops.h       1119
-rw-r--r--  llama.cpp/ggml/src/ggml-cann/common.h           641
-rw-r--r--  llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp     2881
7 files changed, 9295 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt b/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt
new file mode 100755
index 0000000..aee5e7b
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt
@@ -0,0 +1,89 @@
+if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
+ set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
+ message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
+endif()
+
+# Auto-detect the SoC type and version; if detection fails, abort the build.
+set(SOC_VERSION "")
+function(detect_ascend_soc_type SOC_VERSION)
+ execute_process(
+ COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'"
+ OUTPUT_VARIABLE npu_info
+ RESULT_VARIABLE npu_result
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ if("${npu_info}" STREQUAL "" OR ${npu_result})
+ message(FATAL_ERROR "Auto-detection of the Ascend SoC type failed, please specify SOC_TYPE manually or check that the Ascend device is working normally.")
+ endif()
+ set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE)
+endfunction()
+
+if(NOT SOC_TYPE)
+ detect_ascend_soc_type(SOC_VERSION)
+ set(SOC_TYPE "${SOC_VERSION}")
+ message(STATUS "CANN: SOC_VERSION auto-detected is: ${SOC_VERSION}")
+endif()
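+
+# Illustrative configure invocation (flag names outside this file, e.g. GGML_CANN, and the
+# SoC value are assumptions; adjust to the actual build setup and device):
+#   cmake -B build -DGGML_CANN=on -DSOC_TYPE=Ascend910B3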
+
+string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION needs to be lowercase
+
+# Construct the SoC-specific compile option ASCEND_<Soc_Major_SN>, e.g. ASCEND_910B, ASCEND_310P.
+string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
+set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
+string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
+message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
+option(USE_ACL_GRAPH "Enable CANN graph execution (ACL graph mode)" OFF)
+
+if(USE_ACL_GRAPH AND (SOC_TYPE_MAJOR_SN STREQUAL "310P" OR SOC_TYPE_COMPILE_OPTION STREQUAL "ASCEND_310P"))
+ message(FATAL_ERROR
+ "CANN Graph (ACL graph mode) is not supported on 310P devices. "
+ "Please build with -DUSE_ACL_GRAPH=OFF or use a supported SOC.")
+endif()
+
+if (CANN_INSTALL_DIR)
+ # Only supported on Linux.
+ if (NOT UNIX)
+ message(FATAL_ERROR "CANN: the CANN toolkit supports Unix-like systems, but not ${CMAKE_SYSTEM_NAME}")
+ endif()
+
+ # Supported platforms: x86-64, arm64
+ if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+ elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
+ else()
+ message(FATAL_ERROR "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
+ endif()
+
+ # Set header and libs
+ set(CANN_INCLUDE_DIRS
+ ${CANN_INSTALL_DIR}/include
+ ${CANN_INSTALL_DIR}/include/aclnn
+ ${CANN_INSTALL_DIR}/acllib/include
+ )
+
+ list(APPEND CANN_LIBRARIES
+ ascendcl
+ nnopbase
+ opapi
+ acl_op_compiler
+ )
+
+ file(GLOB GGML_SOURCES_CANN "*.cpp")
+
+ ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN})
+ target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES})
+ target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS})
+ target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
+
+ target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
+
+ if (USE_ACL_GRAPH)
+ target_compile_definitions(ggml-cann PRIVATE USE_ACL_GRAPH)
+ message(STATUS "CANN: USE_ACL_GRAPH is enabled.")
+ else()
+ message(STATUS "CANN: USE_ACL_GRAPH is disabled.")
+ endif()
+
+ message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
+ message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
+else()
+ message(FATAL_ERROR "CANN: Can't find CANN_INSTALL_DIR, did you forget to source set_var.sh?")
+endif()
diff --git a/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp b/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp
new file mode 100644
index 0000000..e95d3c4
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2023-2026 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "acl_tensor.h"
+
+#include <algorithm>
+#include <cstring>
+
+aclDataType ggml_cann_type_mapping(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_F32:
+ return ACL_FLOAT;
+ case GGML_TYPE_F16:
+ return ACL_FLOAT16;
+ case GGML_TYPE_BF16:
+ return ACL_BF16;
+ case GGML_TYPE_I8:
+ return ACL_INT8;
+ case GGML_TYPE_I16:
+ return ACL_INT16;
+ case GGML_TYPE_I32:
+ return ACL_INT32;
+ case GGML_TYPE_Q4_0:
+ return ACL_INT4;
+ case GGML_TYPE_Q8_0:
+ return ACL_INT8;
+ case GGML_TYPE_I64:
+ return ACL_INT64;
+ default:
+ return ACL_DT_UNDEFINED;
+ }
+}
+
+acl_tensor_ptr ggml_cann_create_tensor(const ggml_tensor * tensor,
+ int64_t * ne,
+ size_t * nb,
+ int64_t dims,
+ aclFormat format,
+ size_t offset) {
+ // If the tensor is broadcast, up to GGML_MAX_DIMS additional dimensions
+ // may be added.
+ int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
+
+ if (ne == nullptr) {
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ acl_ne[i] = tensor->ne[i];
+ // The step size of acl is in elements.
+ acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
+ }
+ } else {
+ // With bcast
+ for (int i = 0; i < dims; i++) {
+ acl_ne[i] = ne[i];
+ acl_stride[i] = nb[i] / ggml_element_size(tensor);
+ }
+ }
+
+ int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
+ int64_t acl_storage_len = 1;
+ for (int i = 0; i < final_dims; i++) {
+ acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
+ }
+ size_t elem_offset = offset / ggml_element_size(tensor);
+ acl_storage_len += elem_offset;
+
+ // Reverse ne and stride.
+ std::reverse(acl_ne, acl_ne + final_dims);
+ std::reverse(acl_stride, acl_stride + final_dims);
+
+ aclTensor * raw = aclCreateTensor(acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride, elem_offset,
+ format, &acl_storage_len, 1, tensor->data);
+
+ return acl_tensor_ptr(raw);
+}
+
+acl_int_array_ptr ggml_cann_create_int_array(const int64_t * value, uint64_t size) {
+ aclIntArray * raw = aclCreateIntArray(value, size);
+ return acl_int_array_ptr(raw);
+}
+
+acl_scalar_ptr ggml_cann_create_scalar(void * value, aclDataType dataType) {
+ aclScalar * raw = aclCreateScalar(value, dataType);
+ return acl_scalar_ptr(raw);
+}
+
+bool ggml_cann_need_bcast(const ggml_tensor * t0, const ggml_tensor * t1) {
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) {
+ return true;
+ }
+ }
+ return false;
+}
+
+int64_t ggml_cann_get_bcast_shape(const ggml_tensor * src0,
+ const ggml_tensor * src1,
+ int64_t * bcast_src0_ne,
+ int64_t * bcast_src1_ne,
+ size_t * bcast_src0_nb,
+ size_t * bcast_src1_nb) {
+ GGML_ASSERT(ggml_can_repeat(src1, src0));
+ int bcast_dim_cnt = 0;
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ int64_t nr = src0->ne[i] / src1->ne[i];
+ bcast_src0_ne[bcast_dim_cnt] = src0->ne[i] / nr;
+ bcast_src1_ne[bcast_dim_cnt] = src1->ne[i];
+ bcast_src0_nb[bcast_dim_cnt] = src0->nb[i];
+ bcast_src1_nb[bcast_dim_cnt] = src1->nb[i];
+ bcast_dim_cnt++;
+ if (nr != 1) {
+ // Need to add an extra dim.
+ bcast_src0_ne[bcast_dim_cnt] = nr;
+ bcast_src1_ne[bcast_dim_cnt] = 1;
+ bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] * bcast_src0_ne[bcast_dim_cnt - 1];
+ bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] * bcast_src1_ne[bcast_dim_cnt - 1];
+ bcast_dim_cnt++;
+ }
+ }
+ return bcast_dim_cnt;
+}
+
+int64_t ggml_cann_get_mulmat_bcast_shape(const int64_t * input_ne,
+ const int64_t * weight_ne,
+ const int64_t * dst_ne,
+ const size_t * input_nb,
+ const size_t * weight_nb,
+ const size_t * dst_nb,
+ int64_t * bcast_input_ne,
+ int64_t * bcast_weight_ne,
+ int64_t * bcast_dst_ne,
+ size_t * bcast_input_nb,
+ size_t * bcast_weight_nb,
+ size_t * bcast_dst_nb) {
+ // input and dst should have the same shape, except for the first two dims.
+ GGML_ASSERT(input_ne[2] == dst_ne[2]);
+ GGML_ASSERT(input_ne[3] == dst_ne[3]);
+
+ int bcast_dim_cnt = 0;
+
+ // For mul_mat, an extra dimension is inserted before any dimension in
+ // which the weight needs to be expanded, in order to satisfy the
+ // broadcast rules of matrix multiplication.
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ int64_t nr = input_ne[i] / weight_ne[i];
+ // Do not use bcast in the first two dimensions because we only support
+ // the bcast batch dimension. Just copy them.
+ if (i < 2 || nr == 1) {
+ bcast_input_ne[bcast_dim_cnt] = input_ne[i];
+ bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
+ bcast_dst_ne[bcast_dim_cnt] = dst_ne[i];
+
+ bcast_input_nb[bcast_dim_cnt] = input_nb[i];
+ bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
+ bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+ bcast_dim_cnt++;
+ } else {
+ // Need to add an extra dim.
+ bcast_input_ne[bcast_dim_cnt] = nr;
+ bcast_dst_ne[bcast_dim_cnt] = nr;
+ bcast_weight_ne[bcast_dim_cnt] = 1;
+ bcast_input_nb[bcast_dim_cnt] = input_nb[i];
+ bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+ bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
+ bcast_dim_cnt++;
+
+ bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr;
+ bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr;
+ bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
+ bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] * bcast_input_ne[bcast_dim_cnt - 1];
+ bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] * bcast_dst_ne[bcast_dim_cnt - 1];
+ bcast_weight_nb[bcast_dim_cnt] = bcast_weight_nb[bcast_dim_cnt - 1] * bcast_weight_ne[bcast_dim_cnt - 1];
+ bcast_dim_cnt++;
+ }
+ }
+ return bcast_dim_cnt;
+}
diff --git a/llama.cpp/ggml/src/ggml-cann/acl_tensor.h b/llama.cpp/ggml/src/ggml-cann/acl_tensor.h
new file mode 100644
index 0000000..4737773
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cann/acl_tensor.h
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2023-2026 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CANN_ACL_TENSOR_H
+#define CANN_ACL_TENSOR_H
+
+#include "common.h"
+
+#include <aclnn/aclnn_base.h>
+
+#include <algorithm>
+#include <cstring>
+
+/**
+ * @brief Maps a ggml_type to its corresponding aclDataType.
+ *
+ * @details This function takes a ggml_type as input and returns the corresponding
+ * aclDataType. It supports mapping for various ggml_types. If the input type
+ * does not match any of the predefined ggml_types, the function returns
+ * ACL_DT_UNDEFINED.
+ *
+ * @param type The ggml_type to be mapped.
+ * @return The corresponding aclDataType. If the input type is not recognized,
+ * ACL_DT_UNDEFINED is returned.
+ */
+aclDataType ggml_cann_type_mapping(ggml_type type);
+
+// Deleter for acl objects.
+template <typename T, aclError (*DestroyFunc)(const T *)> struct acl_deleter {
+ void operator()(T * ptr) const noexcept {
+ if (ptr) {
+ ACL_CHECK(DestroyFunc(ptr));
+ }
+ }
+};
+
+using acl_tensor_ptr = std::unique_ptr<aclTensor, acl_deleter<aclTensor, aclDestroyTensor>>;
+using acl_int_array_ptr = std::unique_ptr<aclIntArray, acl_deleter<aclIntArray, aclDestroyIntArray>>;
+using acl_scalar_ptr = std::unique_ptr<aclScalar, acl_deleter<aclScalar, aclDestroyScalar>>;
+using acl_tensor_list_ptr = std::unique_ptr<aclTensorList, acl_deleter<aclTensorList, aclDestroyTensorList>>;
+
+/**
+ * @brief Creates an ACL tensor from a ggml_tensor with optional shape.
+ *
+ * @details This function creates an ACL tensor based on the properties of the
+ * provided ggml_tensor. It supports a custom shape by adjusting dimensions
+ * and strides accordingly. If a custom shape is applied, additional
+ * dimensions and strides are calculated based on the provided parameters.
+ *
+ * @param tensor Pointer to the ggml_tensor to be converted to ACL tensor.
+ * @param ne Pointer to an array containing dimensions. Defaults to nullptr
+ * if no custom shape is applied.
+ * @param nb Pointer to an array containing strides. Defaults to nullptr
+ * if no custom shape is applied.
+ * @param dims Number of dimensions in the tensor. Defaults to 0 if no custom
+ * shape is applied.
+ * @param format ACL tensor format. Defaults to ACL_FORMAT_ND.
+ * @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
+ * @return Pointer to the created ACL tensor.
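+ *
+ * Usage sketch (illustrative only; mirrors how the .cpp sources call it):
+ * @code
+ * // default view: use the ggml tensor's own ne/nb
+ * acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ * // custom view: caller-provided ne/nb, dims, format and byte offset
+ * acl_tensor_ptr acl_view = ggml_cann_create_tensor(src, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
+ * @endcode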
+ */
+acl_tensor_ptr ggml_cann_create_tensor(const ggml_tensor * tensor,
+ int64_t * ne = nullptr,
+ size_t * nb = nullptr,
+ int64_t dims = 0,
+ aclFormat format = ACL_FORMAT_ND,
+ size_t offset = 0);
+
+/**
+ * @brief Template for creating an ACL tensor from provided parameters. typename TYPE
+ * should be size_t or float.
+ *
+ * @details This function creates an ACL tensor using the provided data pointer,
+ * data type, dimensions, strides, format, offset, and additional parameters.
+ * It calculates necessary dimensions and strides based on the provided ne and nb
+ * arrays, adjusting them for the ACL tensor creation. The ACL storage length
+ * is also calculated based on the provided dimensions and strides.
+ *
+ * @param data_ptr Pointer to the data buffer for the ACL tensor.
+ * @param dtype ACL data type of the tensor.
+ * @param type_size Size of each element in the tensor data buffer.
+ * @param ne Pointer to an array containing tensor dimensions.
+ * @param nb Pointer to an array containing tensor strides.
+ * @param dims Number of dimensions of the tensor.
+ * @param format ACL tensor format. Defaults to ACL_FORMAT_ND.
+ * @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
+ * @return Pointer to the created ACL tensor.
+ */
+template <typename TYPE>
+acl_tensor_ptr ggml_cann_create_tensor(void * data_ptr,
+ aclDataType dtype,
+ TYPE type_size,
+ int64_t * ne,
+ TYPE * nb,
+ int64_t dims,
+ aclFormat format = ACL_FORMAT_ND,
+ size_t offset = 0) {
+ int64_t tmp_ne[GGML_MAX_DIMS * 2];
+ int64_t tmp_stride[GGML_MAX_DIMS * 2];
+
+ memcpy(tmp_ne, ne, dims * sizeof(int64_t));
+ for (int i = 0; i < dims; i++) {
+ tmp_stride[i] = nb[i] / type_size;
+ }
+
+ int64_t acl_storage_len = 1;
+ for (int i = 0; i < dims; i++) {
+ acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i];
+ }
+
+ std::reverse(tmp_ne, tmp_ne + dims);
+ std::reverse(tmp_stride, tmp_stride + dims);
+
+ aclTensor * raw =
+ aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size, format, &acl_storage_len, 1, data_ptr);
+
+ return acl_tensor_ptr(raw);
+}
+
+/**
+ * @brief Create an ACL int array resource wrapped in a smart pointer.
+ *
+ * This function constructs an aclIntArray from the provided int64_t values
+ * and returns it as an acl_int_array_ptr (a std::unique_ptr with a custom
+ * deleter). The returned pointer owns the ACL resource and will automatically
+ * destroy it via aclDestroyIntArray().
+ *
+ * @param value Pointer to the int64_t elements.
+ * @param size Number of elements in value.
+ *
+ * @return A smart pointer managing the created ACL int array.
+ */
+acl_int_array_ptr ggml_cann_create_int_array(const int64_t * value, uint64_t size);
+
+/**
+ * @brief Create an ACL scalar resource wrapped in a smart pointer.
+ *
+ * This function constructs an aclScalar from the raw value pointer and ACL
+ * data type, then returns it as an acl_scalar_ptr (a std::unique_ptr with
+ * a custom deleter). The returned pointer owns the ACL scalar and will
+ * automatically destroy it via aclDestroyScalar().
+ *
+ * @param value Pointer to the raw scalar memory.
+ * @param dataType ACL data type of the scalar.
+ *
+ * @return A smart pointer managing the created ACL scalar.
+ */
+acl_scalar_ptr ggml_cann_create_scalar(void * value, aclDataType dataType);
+
+/**
+ * @brief Create an ACL tensor list from multiple tensor smart pointers.
+ *
+ * This function accepts a variadic list of acl_tensor_ptr (a unique_ptr with
+ * custom deleter) and produces an aclTensorList using aclCreateTensorList().
+ *
+ * The lifecycle management of the tensor objects changes as follows:
+ * - aclCreateTensorList() takes ownership of the tensors
+ * - Each input smart pointer releases ownership using release()
+ * - As a result, the tensors will NOT be destroyed by unique_ptr
+ * - Instead, they will be destroyed when aclDestroyTensorList() is called
+ *
+ * This ensures correct ownership transfer and prevents double-free situations.
+ *
+ * @param acl_tensor_ptr Variadic template parameter; each argument must be
+ * a unique_ptr-like type supporting get() and release().
+ *
+ * @param tensors Variadic list of acl_tensor_ptr objects. Ownership of
+ * each tensor is transferred away from these smart pointers.
+ *
+ * @return A smart pointer (acl_tensor_list_ptr) owning the created ACL tensor list.
+ *
+ * @note This implementation is C++11 compatible. The ownership-release process is
+ * executed using a pack expansion inside an initializer list.
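+ *
+ * Usage sketch (illustrative; mirrors ggml_cann_concat() in aclnn_ops.cpp):
+ * @code
+ * acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
+ * acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
+ * acl_tensor_list_ptr list = ggml_cann_create_tensor_list(acl_src0, acl_src1);
+ * // acl_src0 / acl_src1 no longer own the tensors; the list destroys them.
+ * @endcode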
+ */
+template <typename... acl_tensor_ptr> acl_tensor_list_ptr ggml_cann_create_tensor_list(acl_tensor_ptr &&... tensors) {
+ aclTensor * raw_tensors[] = { tensors.get()... };
+ aclTensorList * raw = aclCreateTensorList(raw_tensors, sizeof...(tensors));
+ // aclTensor will release by aclTensorList, so release ownership without
+ // destroying the tensor
+ int dummy[] = { (tensors.release(), 0)... };
+ GGML_UNUSED(dummy);
+ return acl_tensor_list_ptr(raw);
+}
+
+/**
+ * @brief Checks if tensors require broadcasting based on their shapes.
+ *
+ * @details This function determines if two ggml_tensors need to be broadcasted for
+ * element-wise operations. Broadcasting is needed when a dimension of t1
+ * differs from the corresponding dimension of t0 and is not equal to 1.
+ *
+ * @param t0 Pointer to the first ggml_tensor.
+ * @param t1 Pointer to the second ggml_tensor.
+ * @return True if broadcasting is needed, False otherwise.
+ *
+ * @remarks This function iterates over the dimensions of t0 and t1. It checks if each
+ * dimension in t1 differs from t0's corresponding dimension and is not equal
+ * to 1. If such a dimension is found, broadcasting is required to align t1
+ * with t0 for element-wise operations.
+ */
+bool ggml_cann_need_bcast(const ggml_tensor * t0, const ggml_tensor * t1);
+
+/**
+ * @brief Computes broadcast shapes and strides for two ggml_tensors.
+ *
+ * @details This function calculates the broadcast shapes and strides for two ggml_tensors,
+ * following the broadcasting rules similar to numpy. It adjusts dimensions and
+ * strides to ensure compatibility for element-wise operations where one tensor
+ * can be broadcasted to match the shape of another tensor.
+ *
+ * @param src0 Pointer to the first ggml_tensor.
+ * @param src1 Pointer to the second ggml_tensor.
+ * @param bcast_ne_src0 Output array to store broadcasted dimensions for src0.
+ * @param bcast_ne_src1 Output array to store broadcasted dimensions for src1.
+ * @param bcast_nb_src0 Output array to store broadcasted strides for src0.
+ * @param bcast_nb_src1 Output array to store broadcasted strides for src1.
+ * @return Number of dimensions in the broadcasted shape.
+ *
+ * @pre ggml_can_repeat(src1, src0) must return true, indicating src1 can be broadcasted
+ * to match src0.
+ *
+ * @remarks This function iterates over the dimensions of src0 and src1, calculating the
+ * necessary broadcast dimensions and strides. If a dimension requires broadcasting
+ * (i.e., its size in src1 is smaller than in src0), an additional dimension is
+ * added with size calculated to match src0's dimension. This adjustment ensures
+ * that src1 can be element-wise broadcasted to src0's shape.
+ *
+ * How it works:
+ *
+ * \code
+ * if dim0 has padding.
+ * a -> (2, 2) padding = 2
+ * a: [[1, 2, *, *]
+ * [2, 3, *, *]]
+ * nb = (8, 4, 2)
+ *
+ * if a should bcast with b -> (2, 4)
+ * b' -> (2, 2, 2)
+ * b : [[1, 2, 3, 4, *, *]
+ * [5, 6, 7, 8, *, *]]
+ * nb = (12, 6, 1)
+ *
+ * after bcast:
+ * a' -> (2, 1, 2)
+ * a': [[[1, 2], *, *]
+ * [[2, 3], *, *]]
+ * nb = (8, 4, 2, 1)
+ *
+ * b' : [[[1, 2], [3, 4], *, *]
+ * [[5, 6], [7, 8], *, *]]
+ * nb = (12, 6, 2, 1)
+ * \endcode
+ *
+ * dim1 in a is an inserted dim, so an nb entry is added for dim1,
+ * and all of the other nb values shift to the next position in order.
+ */
+int64_t ggml_cann_get_bcast_shape(const ggml_tensor * src0,
+ const ggml_tensor * src1,
+ int64_t * bcast_ne_src0,
+ int64_t * bcast_ne_src1,
+ size_t * bcast_nb_src0,
+ size_t * bcast_nb_src1);
+
+// Bcast macro to avoid duplicate code.
+#define BCAST_SHAPE(src0, src1) \
+ int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2]; \
+ int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2]; \
+ size_t bcast_##src0##_nb[GGML_MAX_DIMS * 2]; \
+ size_t bcast_##src1##_nb[GGML_MAX_DIMS * 2]; \
+ int64_t bcast_dims = ggml_cann_get_bcast_shape(src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, \
+ bcast_##src0##_nb, bcast_##src1##_nb);
+
+#define BCAST_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
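+
+// Illustrative use of the two macros together (mirrors bcast_shape() in aclnn_ops.cpp):
+//   BCAST_SHAPE(src0, src1)
+//   acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
+//   acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));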
+
+/**
+ * @brief Calculates broadcast shapes for matrix multiplication.
+ *
+ * @details This function computes the broadcast shapes required for matrix multiplication
+ * based on the input, weight, and destination tensor shapes. It ensures that the
+ * dimensions of weight tensors are expanded appropriately to satisfy matrix
+ * multiplication broadcast rules.
+ *
+ * @param input_ne Array containing the dimensions of the input tensor.
+ * @param weight_ne Array containing the dimensions of the weight tensor.
+ * @param dst_ne Array containing the dimensions of the destination tensor.
+ * @param input_nb Array containing the strides of the input tensor.
+ * @param weight_nb Array containing the strides of the weight tensor.
+ * @param dst_nb Array containing the strides of the destination tensor.
+ * @param bcast_input_ne Output array for broadcasted input tensor dimensions.
+ * @param bcast_weight_ne Output array for broadcasted weight tensor dimensions.
+ * @param bcast_dst_ne Output array for broadcasted destination tensor dimensions.
+ * @param bcast_input_nb Output array for broadcasted input tensor strides.
+ * @param bcast_weight_nb Output array for broadcasted weight tensor strides.
+ * @param bcast_dst_nb Output array for broadcasted destination tensor strides.
+ * @return The number of dimensions in the broadcasted tensors.
+ *
+ * @remarks This function iterates over the tensor dimensions and calculates the broadcast
+ * shapes needed for matrix multiplication. It ensures that dimensions where
+ * weight tensor requires expansion are appropriately handled to conform with
+ * broadcasting rules.
+ * @note Compared with ggml_cann_get_bcast_shape, the mul_mat broadcast has to insert the new
+ *       dim before the dim being broadcast.
+ * @sa ggml_cann_get_bcast_shape
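+ *
+ * Illustrative example (assumed values): with input batch dims (8, 1) and
+ * weight batch dims (2, 1), dim 2 gives nr = 8 / 2 = 4, so the weight batch
+ * dims become (1, 2, 1) while the input/dst batch dims become (4, 2, 1); the
+ * batched matmul then broadcasts over the inserted size-1 dim.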
+ */
+int64_t ggml_cann_get_mulmat_bcast_shape(const int64_t * input_ne,
+ const int64_t * weight_ne,
+ const int64_t * dst_ne,
+ const size_t * input_nb,
+ const size_t * weight_nb,
+ const size_t * dst_nb,
+ int64_t * bcast_input_ne,
+ int64_t * bcast_weight_ne,
+ int64_t * bcast_dst_ne,
+ size_t * bcast_input_nb,
+ size_t * bcast_weight_nb,
+ size_t * bcast_dst_nb);
+
+// Bcast macro to avoid duplicate code.
+#define BCAST_MUL_MAT_SHAPE(input, weight, dst) \
+ int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2]; \
+ int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2]; \
+ int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2]; \
+ size_t bcast_##input##_nb[GGML_MAX_DIMS * 2]; \
+ size_t bcast_##weight##_nb[GGML_MAX_DIMS * 2]; \
+ size_t bcast_##dst##_nb[GGML_MAX_DIMS * 2]; \
+ int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape( \
+ input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, bcast_##input##_ne, bcast_##weight##_ne, \
+ bcast_##dst##_ne, bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
+
+#define BCAST_MUL_MAT_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
+
+#endif // CANN_ACL_TENSOR_H
diff --git a/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp b/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp
new file mode 100644
index 0000000..fc7c3e3
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -0,0 +1,4021 @@
+/*
+ * Copyright (c) 2023-2026 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "aclnn_ops.h"
+
+#include "ggml-impl.h"
+#include "ggml.h"
+
+#include <aclnnop/aclnn_add.h>
+#include <aclnnop/aclnn_add_rms_norm.h>
+#include <aclnnop/aclnn_addcdiv.h>
+#include <aclnnop/aclnn_argmax.h>
+#include <aclnnop/aclnn_avgpool2d.h>
+#include <aclnnop/aclnn_batch_matmul.h>
+#include <aclnnop/aclnn_cast.h>
+#include <aclnnop/aclnn_clamp.h>
+#include <aclnnop/aclnn_constant_pad_nd.h>
+#include <aclnnop/aclnn_convolution.h>
+#include <aclnnop/aclnn_copy.h>
+#include <aclnnop/aclnn_div.h>
+#include <aclnnop/aclnn_elu.h>
+#include <aclnnop/aclnn_embedding.h>
+#include <aclnnop/aclnn_eq_tensor.h>
+#include <aclnnop/aclnn_exp.h>
+#include <aclnnop/aclnn_fill_scalar.h>
+#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
+#include <aclnnop/aclnn_ger.h>
+#include <aclnnop/aclnn_group_norm.h>
+#include <aclnnop/aclnn_grouped_matmul_v3.h>
+#include <aclnnop/aclnn_gt_scalar.h>
+#include <aclnnop/aclnn_im2col.h>
+#include <aclnnop/aclnn_index_copy.h>
+#include <aclnnop/aclnn_index_fill_tensor.h>
+#include <aclnnop/aclnn_index_select.h>
+#include <aclnnop/aclnn_layer_norm.h>
+#include <aclnnop/aclnn_log.h>
+#include <aclnnop/aclnn_matmul.h>
+#include <aclnnop/aclnn_max_pool.h>
+#include <aclnnop/aclnn_mean.h>
+#include <aclnnop/aclnn_mm.h>
+#include <aclnnop/aclnn_mul.h>
+#include <aclnnop/aclnn_mv.h>
+#include <aclnnop/aclnn_permute.h>
+#include <aclnnop/aclnn_pow.h>
+#include <aclnnop/aclnn_pow_tensor_tensor.h>
+#include <aclnnop/aclnn_reduce_sum.h>
+#include <aclnnop/aclnn_reflection_pad1d.h>
+#include <aclnnop/aclnn_repeat.h>
+#include <aclnnop/aclnn_repeat_interleave.h>
+#include <aclnnop/aclnn_rms_norm.h>
+#include <aclnnop/aclnn_roll.h>
+#include <aclnnop/aclnn_softmax.h>
+#include <aclnnop/aclnn_sub.h>
+#include <aclnnop/aclnn_sum.h>
+#include <aclnnop/aclnn_threshold.h>
+#include <aclnnop/aclnn_tril.h>
+#include <aclnnop/aclnn_triu.h>
+#include <aclnnop/aclnn_upsample_nearest_2d.h>
+#include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
+#include <aclnnop/aclnn_zero.h>
+#include <float.h>
+
+#include <cmath>
+#include <cstring>
+#include <exception>
+#include <vector>
+
+#define GGML_COMMON_DECL_C
+
+#include "../ggml-common.h"
+
+void bcast_shape(ggml_tensor * src0,
+ ggml_tensor * src1,
+ ggml_tensor * dst,
+ acl_tensor_ptr & acl_src0,
+ acl_tensor_ptr & acl_src1,
+ acl_tensor_ptr & acl_dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_can_repeat(src1, src0));
+ // Need bcast
+ if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
+ BCAST_SHAPE(src0, src1)
+ acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
+ acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
+ acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
+ } else {
+ acl_src0 = ggml_cann_create_tensor(src0);
+ acl_src1 = ggml_cann_create_tensor(src1);
+ acl_dst = ggml_cann_create_tensor(dst);
+ }
+}
+
+void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+ ggml_backend_cann_context & ctx,
+ ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ unary_op(ctx, acl_src.get(), acl_dst.get());
+}
+
+void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+ ggml_backend_cann_context & ctx,
+ ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
+
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
+ GGML_ASSERT(ggml_is_contiguous_1(dst));
+ const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+ acl_tensor_ptr acl_src0, acl_src1;
+ if (src1) {
+ GGML_ASSERT(ggml_is_contiguous_1(src1));
+ GGML_ASSERT(src0->type == src1->type);
+
+ acl_src0 = ggml_cann_create_tensor(src0);
+ acl_src1 = ggml_cann_create_tensor(src1);
+ } else {
+ int64_t ne[] = { src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3] };
+ size_t nb[] = { src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3] };
+ acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
+ acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
+ if (swapped) {
+ std::swap(acl_src0, acl_src1);
+ }
+ }
+
+ unary_op(ctx, acl_src0.get(), acl_dst.get());
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst.get(), acl_src1.get());
+}
+
+/**
+ * @brief Repeats elements of a tensor along each dimension according to the
+ * specified repeat array.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor to be repeated.
+ * @param acl_dst The destination tensor after repeating.
+ * @param repeat_array The array specifying the number of repetitions along each
+ * dimension.
+ */
+static void aclnn_repeat(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_dst,
+ int64_t * repeat_array) {
+ // repeat tensor along each dim with repeat_array
+ acl_int_array_ptr repeats = ggml_cann_create_int_array(repeat_array, GGML_MAX_DIMS);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_src, repeats.get(), acl_dst);
+}
+
+/**
+ * @brief Casts the data type of a source tensor to a destination tensor.
+ *
+ * This function casts the data type of the source tensor `acl_src` to the
+ * specified data type `cast_data_type` and stores the result in the destination
+ * tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose data type will be casted.
+ * @param acl_dst The destination tensor where the casted result will be stored.
+ * @param cast_data_type The target data type to which the source tensor will be
+ * casted.
+ */
+static void aclnn_cast(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_dst,
+ aclDataType cast_data_type) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src, cast_data_type, acl_dst);
+}
+
+void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+ GGML_ASSERT(ggml_can_repeat(src, dst));
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
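+ // repeat counts are listed in reversed (ACL) dim order, ne[3] .. ne[0],
+ // matching the dim reversal done in ggml_cann_create_tensor().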
+ int64_t repeatsArray[] = { dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2], dst->ne[1] / src->ne[1],
+ dst->ne[0] / src->ne[0] };
+
+ aclnn_repeat(ctx, acl_src.get(), acl_dst.get(), repeatsArray);
+}
+
+void aclnn_add(ggml_backend_cann_context & ctx, aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst) {
+ float alphaValue = 1.0f;
+ acl_scalar_ptr alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
+ if (acl_dst != nullptr) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha.get(), acl_dst);
+ } else {
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_src0, acl_src1, alpha.get());
+ }
+}
+
+void aclnn_sub(ggml_backend_cann_context & ctx, aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst) {
+ float alphaValue = 1.0f;
+ acl_scalar_ptr alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
+ if (acl_dst != nullptr) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, Sub, acl_src0, acl_src1, alpha.get(), acl_dst);
+ } else {
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSub, acl_src0, acl_src1, alpha.get());
+ }
+}
+
+void aclnn_mul(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_other, aclTensor * acl_dst) {
+ if (acl_dst != nullptr) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_src, acl_other, acl_dst);
+ } else {
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_src, acl_other);
+ }
+}
+
+void aclnn_div(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_other, aclTensor * acl_dst) {
+ if (acl_dst != nullptr) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src, acl_other, acl_dst);
+ } else {
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDiv, acl_src, acl_other);
+ }
+}
+
+/**
+ * @brief Multiplies elements of a tensor by a scalar value, optionally
+ * in-place.
+ *
+ * This function multiplies each element of the source tensor `acl_src` by the
+ * scalar `scale` and stores the result in the destination tensor `acl_dst`. If
+ * `inplace` is true, `acl_dst` will not be used and the operation is performed
+ * in-place on `acl_src`.
+ * The operation is defined as:
+ * \f[
+ * \text {acl_dst }_i=\text {acl_src }_i \times \text {scale}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose elements will be multiplied.
+ * @param scale The scalar value by which each element of `acl_src` will be
+ * multiplied.
+ * @param acl_dst The destination tensor where the result will be stored if
+ * `inplace` is false.
+ * @param inplace Flag indicating whether to perform the operation in-place on
+ * `acl_src`.
+ */
+static void aclnn_muls(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ float scale,
+ aclTensor * acl_dst,
+ bool inplace) {
+ acl_scalar_ptr acl_scale = ggml_cann_create_scalar(&scale, aclDataType::ACL_FLOAT);
+ if (inplace) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_src, acl_scale.get());
+ } else {
+ GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, acl_scale.get(), acl_dst);
+ }
+}
+
+void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ float negative_slope;
+ memcpy(&negative_slope, dst->op_params, sizeof(float));
+ acl_scalar_ptr acl_negative_slope = ggml_cann_create_scalar(&negative_slope, aclDataType::ACL_FLOAT);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src.get(), acl_negative_slope.get(), acl_dst.get());
+}
+
+/**
+ * @brief Concatenates a list of tensors along a specified dimension and stores
+ * the result in a destination tensor.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param tensorList The list of tensors to be concatenated.
+ * @param acl_dst The destination tensor where the concatenated result will be
+ * stored.
+ * @param concat_dim The dimension along which the tensors will be concatenated.
+ */
+static void aclnn_concat(ggml_backend_cann_context & ctx,
+ aclTensorList * tensorList,
+ aclTensor * acl_dst,
+ int64_t concat_dim) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cat, tensorList, concat_dim, acl_dst);
+}
+
+void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
+ acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
+ acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+ GGML_ASSERT(dim >= 0 && dim < 4);
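+ // ggml dims are lowest-first while the ACL tensors were created with the
+ // dims reversed, so the ggml concat dim maps to ACL dim (3 - dim).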
+ int32_t acl_dim = 3 - dim;
+
+ acl_tensor_list_ptr tensor_list = ggml_cann_create_tensor_list(acl_src0, acl_src1);
+ aclnn_concat(ctx, tensor_list.get(), acl_dst.get(), acl_dim);
+}
+
+/**
+ * @brief Creates a tensor with values starting from `start`, incremented by
+ * `step`, and ending before `stop`.
+ *
+ * This function performs the operation:
+ * \f[
+ * \text {out }_{i+1}=\text {out }_i+\text {step}
+ * \f]
+ * the range is [start, stop).
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_dst The destination tensor where the values will be stored.
+ * @param start The starting value of the range.
+ * @param stop The ending value of the range (exclusive).
+ * @param step The step size between consecutive values.
+ * @param n_elements The number of elements in the destination tensor.
+ */
+static void aclnn_arange(ggml_backend_cann_context & ctx,
+ aclTensor * acl_dst,
+ float start,
+ float stop,
+ float step,
+ int64_t n_elements) {
+ int64_t steps = (int64_t) std::ceil((stop - start) / step);
+ GGML_ASSERT(n_elements == steps);
+
+ acl_scalar_ptr acl_start = ggml_cann_create_scalar(&start, aclDataType::ACL_FLOAT);
+ acl_scalar_ptr acl_end = ggml_cann_create_scalar(&stop, aclDataType::ACL_FLOAT);
+ acl_scalar_ptr acl_step = ggml_cann_create_scalar(&step, aclDataType::ACL_FLOAT);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Arange, acl_start.get(), acl_end.get(), acl_step.get(), acl_dst);
+}
+
+void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ int64_t n_elements = ggml_nelements(dst);
+ float start;
+ float stop;
+ float step;
+ memcpy(&start, (float *) dst->op_params + 0, sizeof(float));
+ memcpy(&stop, (float *) dst->op_params + 1, sizeof(float));
+ memcpy(&step, (float *) dst->op_params + 2, sizeof(float));
+
+ aclnn_arange(ctx, acl_dst.get(), start, stop, step, n_elements);
+}
+
+void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+
+ float min;
+ float max;
+ memcpy(&min, dst->op_params, sizeof(float));
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ acl_scalar_ptr acl_min = ggml_cann_create_scalar(&min, aclDataType::ACL_FLOAT);
+ acl_scalar_ptr acl_max = ggml_cann_create_scalar(&max, aclDataType::ACL_FLOAT);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_src.get(), acl_min.get(), acl_max.get(), acl_dst.get());
+}
+
+void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+
+ // scale factor
+ float v;
+ memcpy(&v, dst->op_params, sizeof(float));
+
+ acl_scalar_ptr scale = ggml_cann_create_scalar(&v, aclDataType::ACL_FLOAT);
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src.get(), scale.get(), acl_dst.get());
+}
+
+void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+ enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+ ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
+ void * buffer = temp_buffer_allocator.get();
+ acl_tensor_ptr tmp_tensor =
+ ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne, dst->nb, GGML_MAX_DIMS);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Argsort, acl_src.get(), -1, (order == GGML_SORT_ORDER_DESC ? true : false),
+ tmp_tensor.get());
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cast, tmp_tensor.get(), ggml_cann_type_mapping(dst->type), acl_dst.get());
+}
+
+void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));
+
+ std::vector<int64_t> normData = { dst->ne[0] };
+ acl_int_array_ptr norm = ggml_cann_create_int_array(normData.data(), normData.size());
+ GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src.get(), norm.get(), nullptr, nullptr, eps, acl_dst.get(), nullptr,
+ nullptr);
+}
+
+void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ size_t type_size = ggml_type_size(src->type);
+ int64_t n_bytes = src->ne[3] * src->ne[2] * src->ne[1] * type_size;
+ ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes);
+ void * buffer = temp_buffer_allocator.get();
+
+ int64_t div_ne[] = { 1, src->ne[1], src->ne[2], src->ne[3] };
+ size_t div_nb[GGML_MAX_DIMS];
+ div_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+ div_nb[i] = div_nb[i - 1] * div_ne[i - 1];
+ }
+ acl_tensor_ptr acl_div = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, div_ne, div_nb, GGML_MAX_DIMS);
+
+ std::vector<int64_t> norm_dims = { 3 };
+ acl_int_array_ptr dims_array = ggml_cann_create_int_array(norm_dims.data(), norm_dims.size());
+
+ float p_value = 2.0f;
+ acl_scalar_ptr p_scalar = ggml_cann_create_scalar(&p_value, aclDataType::ACL_FLOAT);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_div.get());
+ GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div.get(), acl_dst.get());
+}
+
+void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
+
+ const int64_t nc = src0->ne[0];
+ const int64_t nr = ggml_nrows(src0);
+
+ int64_t logits_ne[] = { nc, nr };
+ size_t logits_nb[2];
+ logits_nb[0] = ggml_type_size(src0->type);
+ logits_nb[1] = logits_nb[0] * logits_ne[0];
+ acl_tensor_ptr acl_logits = ggml_cann_create_tensor(src0->data, ACL_FLOAT, sizeof(float), logits_ne, logits_nb, 2);
+
+ size_t log_softmax_type_size = sizeof(float);
+ int64_t log_softmax_n_bytes = nr * nc * log_softmax_type_size;
+ ggml_cann_pool_alloc log_softmax_allocator(ctx.pool(), log_softmax_n_bytes);
+ void * log_softmax_buffer = log_softmax_allocator.get();
+
+ int64_t log_softmax_ne[] = { nc, nr };
+ size_t log_softmax_nb[2];
+ log_softmax_nb[0] = log_softmax_type_size;
+ log_softmax_nb[1] = log_softmax_nb[0] * log_softmax_ne[0];
+ acl_tensor_ptr acl_log_softmax = ggml_cann_create_tensor(log_softmax_buffer, ACL_FLOAT, log_softmax_type_size,
+ log_softmax_ne, log_softmax_nb, 2);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, LogSoftmax, acl_logits.get(), 1, acl_log_softmax.get());
+
+ int64_t labels_ne[] = { nc, nr };
+ size_t labels_nb[2];
+ labels_nb[0] = ggml_type_size(src1->type);
+ labels_nb[1] = labels_nb[0] * labels_ne[0];
+ acl_tensor_ptr acl_labels = ggml_cann_create_tensor(src1->data, ACL_FLOAT, sizeof(float), labels_ne, labels_nb, 2);
+
+ size_t mul_type_size = sizeof(float);
+ int64_t mul_n_bytes = nr * nc * mul_type_size;
+ ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_n_bytes);
+ void * mul_buffer = mul_allocator.get();
+
+ int64_t mul_ne[] = { nc, nr };
+ size_t mul_nb[2];
+ mul_nb[0] = mul_type_size;
+ mul_nb[1] = mul_nb[0] * mul_ne[0];
+ acl_tensor_ptr acl_mul_result = ggml_cann_create_tensor(mul_buffer, ACL_FLOAT, mul_type_size, mul_ne, mul_nb, 2);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_log_softmax.get(), acl_labels.get(), acl_mul_result.get());
+
+ size_t sum_per_sample_type_size = sizeof(float);
+ int64_t sum_per_sample_n_bytes = nr * sum_per_sample_type_size;
+ ggml_cann_pool_alloc sum_per_sample_allocator(ctx.pool(), sum_per_sample_n_bytes);
+ void * sum_per_sample_buffer = sum_per_sample_allocator.get();
+
+ int64_t sum_per_sample_ne[] = { nr };
+ size_t sum_per_sample_nb[1];
+ sum_per_sample_nb[0] = sum_per_sample_type_size;
+ acl_tensor_ptr acl_sum_per_sample = ggml_cann_create_tensor(
+ sum_per_sample_buffer, ACL_FLOAT, sum_per_sample_type_size, sum_per_sample_ne, sum_per_sample_nb, 1);
+
+ std::vector<int64_t> sum_dims = { 1 };
+ acl_int_array_ptr dims_array = ggml_cann_create_int_array(sum_dims.data(), sum_dims.size());
+ bool keep_dims = false;
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_mul_result.get(), dims_array.get(), keep_dims, ACL_FLOAT,
+ acl_sum_per_sample.get());
+
+ size_t total_sum_type_size = sizeof(float);
+ int64_t total_sum_n_bytes = 1 * total_sum_type_size;
+ ggml_cann_pool_alloc total_sum_allocator(ctx.pool(), total_sum_n_bytes);
+ void * total_sum_buffer = total_sum_allocator.get();
+
+ int64_t total_sum_ne[] = { 1 };
+ size_t total_sum_nb[1];
+ total_sum_nb[0] = total_sum_type_size;
+
+ acl_tensor_ptr acl_total_sum =
+ ggml_cann_create_tensor(total_sum_buffer, ACL_FLOAT, total_sum_type_size, total_sum_ne, total_sum_nb, 1);
+
+ std::vector<int64_t> total_sum_dims = { 0 };
+ acl_int_array_ptr total_sum_dims_array = ggml_cann_create_int_array(total_sum_dims.data(), total_sum_dims.size());
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_sum_per_sample.get(), total_sum_dims_array.get(), keep_dims, ACL_FLOAT,
+ acl_total_sum.get());
+
+ float value = -1.0f / static_cast<float>(nr);
+ acl_scalar_ptr scale_factor = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
+ acl_tensor_ptr acl_dst =
+ ggml_cann_create_tensor(dst->data, ACL_FLOAT, sizeof(float), total_sum_ne, total_sum_nb, 1);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_total_sum.get(), scale_factor.get(), acl_dst.get());
+}
+
+void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ int n_groups = dst->op_params[0];
+
+ float eps;
+ memcpy(&eps, dst->op_params + 1, sizeof(float));
+
+ int64_t N = src->ne[3];
+ int64_t C = src->ne[2];
+ int64_t HxW = src->ne[1] * src->ne[0];
+
+ size_t type_size = ggml_type_size(src->type);
+ int64_t ne[] = { n_groups, N };
+ size_t nb[] = { type_size, type_size * n_groups };
+ size_t n_bytes = N * n_groups;
+
+ ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2);
+ void * buffer = temp_buffer_allocator.get();
+ acl_tensor_ptr acl_mean_out = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
+ acl_tensor_ptr acl_rstd_out =
+ ggml_cann_create_tensor((char *) buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src.get(), nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst.get(),
+ acl_mean_out.get(), acl_rstd_out.get());
+}
+
+void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
+
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
+ size_t offset = ((int32_t *) dst->op_params)[3];
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
+
+ size_t param_nb[] = { ggml_element_size(src0), nb1, nb2, nb3 };
+
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
+ acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
+
+ acl_scalar_ptr alpha = nullptr;
+ float alphaValue = 1.0f;
+ alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+ if (!inplace) {
+ size_t cpy_size = ggml_nbytes(dst);
+ ACL_CHECK(
+ aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+ acl_tensor_ptr acl_src0 =
+ ggml_cann_create_tensor(src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0.get(), acl_src1.get(), alpha.get(), acl_dst.get());
+ } else {
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), acl_src1.get(), alpha.get());
+ }
+}
+
+/**
+ * @brief Performs sum reduction on a given tensor along specified dimensions.
+ *
+ * This function reduces the input tensor by summing along the specified dimensions.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the reduced result will be stored.
+ * @param dim An array of dimension indices.
+ * @param dim_size The number of dimensions.
+ */
+static void aclnn_reduce_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst, int64_t * dim, size_t dim_size) {
+ GGML_ASSERT(dst->ne[0] == 1);
+ ggml_tensor * src = dst->src[0];
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+ acl_int_array_ptr reduce_dims = ggml_cann_create_int_array(dim, dim_size);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src.get(), reduce_dims.get(), true, ggml_cann_type_mapping(dst->type),
+ acl_dst.get());
+}
+
+void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ int64_t reduce_dims[] = { 3 };
+ aclnn_reduce_sum(ctx, dst, reduce_dims, 1);
+}
+
+void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ int64_t reduce_dims[] = { 0, 1, 2, 3 };
+ aclnn_reduce_sum(ctx, dst, reduce_dims, 4);
+}
+
+void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+ std::vector<int64_t> output_size{ dst->ne[1], dst->ne[0] };
+ acl_int_array_ptr output_size_array = ggml_cann_create_int_array(output_size.data(), 2);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, UpsampleNearest2d, acl_src.get(), output_size_array.get(), acl_dst.get());
+}
+
+/**
+ * @brief Pads a tensor with a specified value along each dimension.
+ *
+ * This function performs padding of the source tensor `acl_src` and stores the
+ * result in the destination tensor `acl_dst`. The padding values for each
+ * dimension are specified in the `paddings` array.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor to be padded.
+ * @param acl_dst The destination tensor where the padded result will be stored.
+ * @param paddings An array specifying the padding values for each dimension.
+ * The size of the array should be twice the number of dimensions of the tensor.
+ * @param value The value to be used for padding. The default value is 0.0.
+ */
+static void aclnn_pad(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_dst,
+ int64_t * paddings,
+ float value = 0.0f) {
+ acl_int_array_ptr acl_pad = ggml_cann_create_int_array(paddings, GGML_MAX_DIMS * 2);
+ acl_scalar_ptr acl_value = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_src, acl_pad.get(), acl_value.get(), acl_dst);
+}
+
+void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ // padding: each value in the array is the amount of padding to add, and the
+ // position of the element selects which side of which dim gets padded:
+ // [dim0.front, dim0.behind, dim1.front, dim1.behind,
+ // dim2.front, dim2.behind, dim3.front, dim3.behind]
+ const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
+ const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
+ const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
+ const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
+ const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
+ const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
+ const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
+ const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
+
+ int64_t paddings[] = { lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3 };
+ aclnn_pad(ctx, acl_src.get(), acl_dst.get(), paddings);
+}
+
+/**
+ * @brief Performs 2D average pooling on the input tensor and stores the result
+ * in the destination tensor.
+ *
+ * This function performs average pooling on the source tensor and stores the
+ * result in the destination tensor. The pooling parameters (kernel size,
+ * strides, padding) are specified in the `op_params` of the destination tensor.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the result will be stored. The source
+ * tensor is referenced by `dst->src[0]`.
+ */
+static void ggml_cann_avg_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+ const int32_t * opts = (const int32_t *) dst->op_params;
+ const int k0 = opts[1];
+ const int k1 = opts[2];
+ const int s0 = opts[3];
+ const int s1 = opts[4];
+ const int p0 = opts[5];
+ const int p1 = opts[6];
+
+ std::vector<int64_t> kernel_dims = { k1, k0 };
+ std::vector<int64_t> stride_dims = { s1, s0 };
+ std::vector<int64_t> padding_avg_dims = { p1, p0 }; // (padH, padW)
+
+ acl_int_array_ptr kernel_size = ggml_cann_create_int_array(kernel_dims.data(), 2);
+ acl_int_array_ptr strides = ggml_cann_create_int_array(stride_dims.data(), 2);
+ acl_int_array_ptr paddings_avg = ggml_cann_create_int_array(padding_avg_dims.data(), 2);
+
+ bool ceil_mode = false;
+ bool count_include_pad = true;
+ int64_t divisor_override = 0;
+ int8_t cube_math_type = 0;
+#ifdef ASCEND_310P
+ cube_math_type = 1;
+#endif
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src.get(), kernel_size.get(), strides.get(), paddings_avg.get(),
+ ceil_mode, count_include_pad, divisor_override, cube_math_type, acl_dst.get());
+}
+
+/**
+ * @brief Performs 2D max pooling on the input tensor and stores the result in
+ * the destination tensor.
+ *
+ * This function performs max pooling on the source tensor and stores the result
+ * in the destination tensor. The pooling parameters (kernel size, strides,
+ * padding) are specified in the `op_params` of the destination tensor.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the result will be stored. The source
+ * tensor is referenced by `dst->src[0]`.
+ */
+static void ggml_cann_max_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+ const int32_t * opts = (const int32_t *) dst->op_params;
+ const int k0 = opts[1];
+ const int k1 = opts[2];
+ const int s0 = opts[3];
+ const int s1 = opts[4];
+ const int p0 = opts[5];
+ const int p1 = opts[6];
+
+ int64_t temp_ne[] = { src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2], src->ne[3] };
+ size_t temp_nb[GGML_MAX_DIMS];
+
+ temp_nb[0] = ggml_element_size(src);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1];
+ }
+
+ ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
+ void * buffer = temp_buffer_allocator.get();
+ acl_tensor_ptr tmp_tensor = ggml_cann_create_tensor(buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
+ GGML_MAX_DIMS, ACL_FORMAT_NCHW);
+
+ // pad: see padding in ggml_cann_pad()
+ int64_t paddings[] = { p0, p0, p1, p1, 0, 0, 0, 0 };
+ float value = -FLT_MAX;
+ aclnn_pad(ctx, acl_src.get(), tmp_tensor.get(), paddings, value);
+
+ // max_pool
+ std::vector<int64_t> kernel_dims = { k1, k0 };
+ std::vector<int64_t> stride_dims = { s1, s0 };
+ // padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end]
+ std::vector<int64_t> padding_max_dims = { 0, 0, 0, 0 };
+ std::vector<int64_t> dilation_size = { 1, 1 };
+ acl_int_array_ptr kernel_size = ggml_cann_create_int_array(kernel_dims.data(), 2);
+ acl_int_array_ptr strides = ggml_cann_create_int_array(stride_dims.data(), 2);
+ acl_int_array_ptr paddings_max = ggml_cann_create_int_array(padding_max_dims.data(), 4);
+ acl_int_array_ptr dilations = ggml_cann_create_int_array(dilation_size.data(), 2);
+
+ bool ceil_mode = false;
+ int64_t auto_pads = 0;
+ GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor.get(), kernel_size.get(), strides.get(), auto_pads,
+ paddings_max.get(), dilations.get(), ceil_mode, acl_dst.get());
+}
+
+void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ const int32_t * opts = (const int32_t *) dst->op_params;
+ enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
+ switch (op) {
+ case GGML_OP_POOL_AVG:
+ ggml_cann_avg_pool2d(ctx, dst);
+ break;
+ case GGML_OP_POOL_MAX:
+ ggml_cann_max_pool2d(ctx, dst);
+ break;
+ case GGML_OP_POOL_COUNT:
+ GGML_ABORT("fatal error");
+ break;
+ }
+}
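+
+// Note (illustrative, mirrors how the two helpers above read op_params):
+// for GGML_OP_POOL_2D the int32 op_params are laid out as
+//   { op, k0, k1, s0, s1, p0, p1 }
+// so opts[0] selects avg/max pooling and opts[1..6] carry the kernel size,
+// stride and padding for the W and H dims.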
+
+/**
+ * @brief Copies data from the source tensor to the destination tensor.
+ *
+ * This function copies data from the source tensor `acl_src` to the destination
+ * tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor from which data will be copied.
+ * @param acl_dst The destination tensor where the data will be copied to.
+ */
+static void cann_copy(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst, acl_src);
+}
+
+void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+
+ if (ggml_are_same_shape(src0, dst)) {
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+ if (dst->type == src0->type) {
+ cann_copy(ctx, acl_src.get(), acl_dst.get());
+ } else {
+ aclnn_cast(ctx, acl_src.get(), acl_dst.get(), ggml_cann_type_mapping(dst->type));
+ }
+ } else {
+ void * src_trans_buffer = src0->data;
+ ggml_cann_pool_alloc src_buffer_allocator;
+ if (!ggml_is_contiguous(src0)) {
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
+ src_buffer_allocator.alloc(ctx.pool(), ggml_nelements(src0) * ggml_type_size(src0->type));
+ src_trans_buffer = src_buffer_allocator.get();
+ size_t src_trans_nb[GGML_MAX_DIMS];
+ src_trans_nb[0] = ggml_type_size(src0->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+ }
+ acl_tensor_ptr src_trans_tensor =
+ ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
+ cann_copy(ctx, acl_src.get(), src_trans_tensor.get());
+ }
+
+ size_t src_reshape_nb[GGML_MAX_DIMS];
+ src_reshape_nb[0] = ggml_type_size(src0->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ src_reshape_nb[i] = src_reshape_nb[i - 1] * dst->ne[i - 1];
+ }
+
+ acl_tensor_ptr trans_acl_src =
+ ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
+ dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ if (dst->type == src0->type) {
+ cann_copy(ctx, trans_acl_src.get(), acl_dst.get());
+ } else {
+ aclnn_cast(ctx, trans_acl_src.get(), acl_dst.get(), ggml_cann_type_mapping(dst->type));
+ }
+ }
+}
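+
+// Summary of ggml_cann_dup (illustrative): when src and dst share a shape the
+// op is a plain copy or cast; otherwise the source is first staged into a
+// contiguous buffer (if needed) and then viewed with dst's shape, relying on
+// both tensors having the same number of elements, before the final copy/cast.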
+
+/**
+ * @brief Creates an ACL tensor initialized with zeros using a provided buffer.
+ *
+ * This function initializes a tensor with zeros using the specified buffer and
+ * tensor parameters.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param buffer The buffer to be used for the tensor data.
+ * @param n_bytes The size of the buffer in bytes.
+ * @param ne An array specifying the extents (sizes) of each dimension of the
+ * tensor.
+ * @param dims The number of dimensions of the tensor.
+ * @param type The data type of the tensor.
+ * @param type_size The size of each element in the tensor data type.
+ * @return A tensor smart pointer initialized with zeros.
+ */
+static acl_tensor_ptr aclnn_zero(ggml_backend_cann_context & ctx,
+ void * buffer,
+ size_t n_bytes,
+ int64_t * ne,
+ int64_t dims,
+ aclDataType type,
+ size_t type_size) {
+ size_t nb[GGML_MAX_DIMS];
+ nb[0] = type_size;
+ for (int i = 1; i < dims; i++) {
+ nb[i] = nb[i - 1] * ne[i - 1];
+ }
+
+ acl_tensor_ptr zero = ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero.get());
+ return zero;
+ GGML_UNUSED(n_bytes);
+}
+
+/**
+ * @brief Creates an ACL tensor initialized with a given value using a provided buffer.
+ *
+ * This function initializes a tensor with a given value using the specified
+ * buffer and tensor parameters.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param buffer The buffer to be used for the tensor data.
+ * @param n_bytes The size of the buffer in bytes.
+ * @param ne An array specifying the extents (sizes) of each dimension of the
+ * tensor.
+ * @param dims The number of dimensions of the tensor.
+ * @param type The data type of the tensor.
+ * @param type_size The size of each element in the tensor data type.
+ * @param value The value to be used for initializing the tensor (default
+ * is 1.0).
+ * @return A tensor smart pointer initialized with value.
+ */
+static acl_tensor_ptr aclnn_values(ggml_backend_cann_context & ctx,
+ void * buffer,
+ size_t n_bytes,
+ int64_t * ne,
+ int64_t dims,
+ aclDataType type,
+ size_t type_size,
+ float value = 1.0f) {
+ acl_tensor_ptr acl_tensor = aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
+ float alpha_host = 1.0f;
+ acl_scalar_ptr alpha = ggml_cann_create_scalar(&alpha_host, aclDataType::ACL_FLOAT);
+ acl_scalar_ptr other = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_tensor.get(), other.get(), alpha.get());
+ return acl_tensor;
+}
+
+/**
+ * @brief Fills a tensor with a scalar value.
+ *
+ * This function fills the destination tensor `acl_dst` with the scalar value
+ * `scalar`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param scalar The scalar value used to fill the tensor.
+ * @param acl_dst The destination tensor to be filled with the scalar value.
+ */
+static void aclnn_fill_scalar(ggml_backend_cann_context & ctx, float scalar, aclTensor * acl_dst) {
+ acl_scalar_ptr acl_scalar = ggml_cann_create_scalar(&scalar, aclDataType::ACL_FLOAT);
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar.get());
+}
+
+/**
+ * @brief Get or expand a cached tensor filled with a scalar value.
+ *
+ * This function manages cached device memory for tensors. If the current
+ * cache size is insufficient for the requested tensor shape, the old memory will
+ * be released and new memory will be allocated. The allocated buffer is
+ * initialized with the given scalar value using CANN operations.
+ * Finally, an aclTensor object is created from the cached memory and returned.
+ *
+ * @param ctx The CANN backend context that manages device memory.
+ * @param buffer A pointer to the cached device buffer (will be allocated
+ * or reallocated if necessary).
+ * @param cache_element The current number of cached elements. This will be
+ * updated when the cache is expanded.
+ * @param ne The tensor shape array (number of elements in each dimension).
+ * @param nb The stride size for each dimension.
+ * @param dtype Data type of cached tensor.
+ * @param dims The number of tensor dimensions.
+ * @param value The scalar value used to fill the tensor when the cache is
+ * (re)initialized.
+ * @return A tensor smart pointer created from the cached buffer.
+ */
+static acl_tensor_ptr get_cache_acl_tensor(ggml_backend_cann_context & ctx,
+ void ** buffer,
+ int64_t & cache_element,
+ int64_t * ne,
+ size_t * nb,
+ ggml_type dtype,
+ int64_t dims,
+ float value) {
+ // Calculate total number of elements
+ int64_t n_element = 1;
+ for (int i = 0; i < dims; i++) {
+ n_element *= ne[i];
+ }
+ size_t size = n_element * ggml_type_size(dtype);
+
+ // Allocate or expand cache if needed
+ if (cache_element < n_element) {
+ if (*buffer != nullptr) {
+ aclrtFree(*buffer);
+ *buffer = nullptr;
+ }
+
+ ACL_CHECK(aclrtMalloc(buffer, size, ACL_MEM_MALLOC_HUGE_FIRST));
+ cache_element = n_element;
+
+ // Initialize cache
+ int64_t pool_ne[1] = { n_element };
+ size_t pool_nb[1] = { ggml_type_size(dtype) };
+ acl_tensor_ptr acl_value =
+ ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), pool_ne, pool_nb, 1);
+ aclnn_fill_scalar(ctx, value, acl_value.get());
+ }
+
+ return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), ne, nb, dims);
+}
+
+void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));
+
+ // build gamma.
+ size_t acl_gamma_nb[GGML_MAX_DIMS];
+    // gamma's type is the same as dst's.
+ acl_gamma_nb[0] = ggml_type_size(dst->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
+ }
+ acl_tensor_ptr acl_gamma = get_cache_acl_tensor(
+ ctx, &ctx.rms_norm_one_tensor_cache.cache, ctx.rms_norm_one_tensor_cache.size, src->ne, acl_gamma_nb, dst->type,
+ 1, // dims
+ 1.0f // value
+ );
+
+ // build rstd.
+ int64_t acl_rstd_ne[] = { src->ne[1], src->ne[2], src->ne[3] };
+ size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
+ // rstd will always be F32.
+ acl_rstd_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+ acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
+ }
+ acl_tensor_ptr acl_rstd =
+ get_cache_acl_tensor(ctx, &ctx.rms_norm_zero_tensor_cache.cache, ctx.rms_norm_zero_tensor_cache.size,
+ acl_rstd_ne, acl_rstd_nb, GGML_TYPE_F32, GGML_MAX_DIMS - 1,
+ 0.0f // value
+ );
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src.get(), acl_gamma.get(), eps, acl_dst.get(), acl_rstd.get());
+}
+
+// TODO: performance is low.
+void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value) {
+ ggml_tensor * src = dst->src[0];
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ const int n_past = ((int32_t *) dst->op_params)[0];
+
+ ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src));
+ void * buffer = one_tensor_allocator.get();
+
+ acl_tensor_ptr mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type),
+ ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS);
+
+ aclnn_fill_scalar(ctx, value, mask_tensor.get());
+
+ float alphaValue = 1.0f;
+ acl_scalar_ptr alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceTriu, mask_tensor.get(), n_past + 1);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src.get(), n_past + 1, acl_dst.get());
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), mask_tensor.get(), alpha.get());
+}
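+
+// Illustrative sketch (not part of the kernel): the three calls above
+// conceptually compute dst = tril(src, n_past + 1) + triu(fill(value), n_past + 1),
+// so entries with column index greater than row + n_past end up at `value`
+// (typically -INF for GGML_OP_DIAG_MASK_INF) while the lower part keeps src.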
+
+/**
+ * @brief Permutes the dimensions of a tensor according to a specified order.
+ *
+ * This function permutes the dimensions of the source tensor `acl_src`
+ * according to the order specified in the `new_dim` array and stores the result
+ * in the destination tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose dimensions will be permuted.
+ * @param acl_dst The destination tensor where the permuted result will be
+ * stored.
+ * @param new_dim An array specifying the new order of dimensions for the
+ * tensor.
+ * @param dims The number of dimensions in the tensor.
+ */
+static void aclnn_permute(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_dst,
+ int64_t * new_dim,
+ uint64_t dims) {
+ acl_int_array_ptr acl_dims = ggml_cann_create_int_array(new_dim, dims);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Permute, acl_src, acl_dims.get(), acl_dst);
+}
+
+static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context & ctx,
+ ggml_tensor * dst,
+ ggml_tensor * src1,
+ aclTensor * tmp_cast_tensor,
+ aclTensor * tmp_im2col_tensor) {
+ // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
+ int64_t dst_ne[] = { dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3] };
+ size_t dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[3] };
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
+
+ int64_t permute_dim[] = { 0, 2, 1 };
+ if (src1->type != dst->type) {
+ aclnn_permute(ctx, tmp_cast_tensor, acl_dst.get(), permute_dim, 3);
+ } else {
+ aclnn_permute(ctx, tmp_im2col_tensor, acl_dst.get(), permute_dim, 3);
+ }
+}
+
+static void ggml_cann_im2col_1d_post_process(ggml_backend_cann_context & ctx,
+ ggml_tensor * dst,
+ ggml_tensor * src1,
+ aclTensor * tmp_cast_tensor,
+ aclTensor * tmp_im2col_tensor,
+ const std::vector<int64_t> & im2col_op_params) {
+ // get params
+ const int64_t KH = im2col_op_params[0];
+ const int64_t KW = im2col_op_params[1];
+ const int64_t IW = im2col_op_params[2];
+ const int64_t IC = im2col_op_params[3];
+ const int64_t N = im2col_op_params[4];
+ const int64_t OH = im2col_op_params[5];
+ const int64_t OW = im2col_op_params[6];
+ const int64_t s0 = im2col_op_params[7];
+ const int64_t p0 = im2col_op_params[8];
+ const int64_t d0 = im2col_op_params[9];
+ const int64_t n_bytes_factor = im2col_op_params[10];
+
+ // Permute: [N, IC * KH * KW, OW * OH] ->
+ // [N, OW * OH * n_bytes_factor, IC * KH * KW]
+ ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
+ tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+ void * tmp_permute_buffer = tmp_permute_allocator.get();
+
+ int64_t tmp_permute_ne[] = { IC * KH * KW, OW * OH * n_bytes_factor, N };
+ size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
+ tmp_permute_nb[0] = ggml_type_size(dst->type);
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+ tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
+ }
+
+ acl_tensor_ptr tmp_permute_tensor =
+ ggml_cann_create_tensor(tmp_permute_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+
+ int64_t permute_dim[] = { 0, 2, 1 };
+ if (src1->type != dst->type) {
+ aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor.get(), permute_dim, 3);
+ } else {
+ aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor.get(), permute_dim, 3);
+ }
+
+ // number of times the kernel moves in W dimension
+ const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
+ size_t offset;
+ void * cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
+
+    // memory copy with offsets to restore the 1D im2col result from the 2D one
+ if (IC > 1) {
+ offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
+ size_t cpy_size = KH * KW * ggml_type_size(dst->type);
+
+ for (int c = 0; c < IC; c++) {
+ cur_permute_buffer = (char *) tmp_permute_buffer + offset + KH * KW * c * ggml_type_size(dst->type);
+ cur_dst_buffer = (char *) dst->data + c * KH * KW * n_step_w * ggml_type_size(dst->type);
+
+ for (int i = 0; i < n_step_w; i++) {
+ ACL_CHECK(aclrtMemcpyAsync(cur_dst_buffer, cpy_size, cur_permute_buffer, cpy_size,
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+ cur_dst_buffer = (char *) cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
+ cur_permute_buffer = (char *) cur_permute_buffer + KH * KW * IC * ggml_type_size(dst->type);
+ }
+ }
+ } else {
+ offset = KH * KW * n_step_w * ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, offset, (char *) tmp_permute_buffer + offset, offset,
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+ }
+}
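+
+// Worked example (illustrative): with IW = 5, KW = 3, s0 = 1, p0 = 0 and
+// d0 = 1 the formula above gives
+//   n_step_w = (5 + 2*0 - 1*(3 - 1) - 1) / 1 + 1 = 3,
+// so each input channel contributes 3 columns of KH * KW elements to the
+// restored 1D im2col result.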
+
+void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0]; // kernel
+ ggml_tensor * src1 = dst->src[1]; // input
+
+ GGML_TENSOR_BINARY_OP_LOCALS;
+
+    // aclnnIm2col only works in 2D; set s1, p1, d1 to 1 to perform a 2D
+    // im2col, then post-process to restore the 1D result.
+ const bool is_2D = ((const int32_t *) (dst->op_params))[6] == 1;
+ const int32_t s0 = ((const int32_t *) (dst->op_params))[0];
+ const int32_t s1 = is_2D ? ((const int32_t *) (dst->op_params))[1] : 1;
+ const int32_t p0 = ((const int32_t *) (dst->op_params))[2];
+ const int32_t p1 = is_2D ? ((const int32_t *) (dst->op_params))[3] : 1;
+ const int32_t d0 = ((const int32_t *) (dst->op_params))[4];
+ const int32_t d1 = is_2D ? ((const int32_t *) (dst->op_params))[5] : 1;
+
+ const int64_t N = ne13;
+ const int64_t IC = ne12;
+ const int64_t KH = ne01;
+ const int64_t KW = ne00;
+ const int64_t IW = ne10;
+
+ const int64_t OH = is_2D ? ne2 : 1;
+ const int64_t OW = ne1;
+
+    // memory allocation is increased to 3x when is_2D == false
+ const int64_t n_bytes_factor = is_2D ? 1 : 3;
+
+ // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
+ acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
+ int64_t tmp_im2col_ne[] = { OW * OH * n_bytes_factor, IC * KH * KW, N };
+ size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
+
+ tmp_im2col_nb[0] = ggml_type_size(src1->type);
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+ tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1];
+ }
+
+    // Calculate im2col.
+    // If dst is f16, the temporary buffer still holds f32 (src1's type), so
+    // allocate src1's element size * dst's element count.
+ ggml_cann_pool_alloc im2col_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
+ void * tmp_im2col_buffer = im2col_allocator.get();
+
+ acl_tensor_ptr tmp_im2col_tensor =
+ ggml_cann_create_tensor(tmp_im2col_buffer, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
+ tmp_im2col_ne, tmp_im2col_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+
+ std::vector<int64_t> kernel_dims = { KH, KW };
+ std::vector<int64_t> dilation_size = { d1, d0 };
+ std::vector<int64_t> padding_dims = { p1, p0 };
+ std::vector<int64_t> stride_dims = { s1, s0 };
+ acl_int_array_ptr kernel_size = ggml_cann_create_int_array(kernel_dims.data(), 2);
+ acl_int_array_ptr dilations = ggml_cann_create_int_array(dilation_size.data(), 2);
+ acl_int_array_ptr paddings = ggml_cann_create_int_array(padding_dims.data(), 2);
+ acl_int_array_ptr strides = ggml_cann_create_int_array(stride_dims.data(), 2);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1.get(), kernel_size.get(), dilations.get(), paddings.get(),
+ strides.get(), tmp_im2col_tensor.get());
+
+ // Cast if dst is f16.
+ acl_tensor_ptr tmp_cast_tensor;
+ ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
+ void * tmp_cast_buffer = nullptr;
+ if (src1->type != dst->type) {
+ tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+ tmp_cast_buffer = tmp_cast_allocator.get();
+ size_t temp_cast_nb[GGML_MAX_DIMS - 1];
+ temp_cast_nb[0] = ggml_type_size(dst->type);
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+ temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
+ }
+
+ tmp_cast_tensor =
+ ggml_cann_create_tensor(tmp_cast_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ tmp_im2col_ne, temp_cast_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+ aclnn_cast(ctx, tmp_im2col_tensor.get(), tmp_cast_tensor.get(), ggml_cann_type_mapping(dst->type));
+ }
+
+ // post-processing
+ if (is_2D) {
+ ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor.get(), tmp_im2col_tensor.get());
+ } else {
+ std::vector<int64_t> im2col_op_params = { KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor };
+ ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor.get(), tmp_im2col_tensor.get(),
+ im2col_op_params);
+ }
+}
+
+/**
+ * @brief Applies element-wise exponential function to the elements of a tensor.
+ *
+ * This function computes the exponential of each element in the source tensor
+ * `acl_src` and stores the result back into the same tensor.
+ * The operation is defined as:
+ * \f[
+ * \text {acl_src }_i=e^{acl\_src_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The tensor on which the exponential function will be applied.
+ */
+static void aclnn_exp(ggml_backend_cann_context & ctx, aclTensor * acl_src) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceExp, acl_src);
+}
+
+void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
+ if (acl_dst == nullptr) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCos, acl_src);
+ } else {
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);
+ }
+}
+
+void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
+ if (acl_dst == nullptr) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSin, acl_src);
+ } else {
+ GGML_CANN_CALL_ACLNN_OP(ctx, Sin, acl_src, acl_dst);
+ }
+}
+
+void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src = dst->src[0];
+
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ const int dim = dst->op_params[0];
+ const int max_period = dst->op_params[1];
+ int half = dim / 2;
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+
+ // arange: [0, ..., half)
+ float start = 0;
+ float stop = half;
+ float step = 1;
+ int64_t n_elements_arange = half;
+ int64_t tmp_arange_ne[] = { half };
+ size_t tmp_arange_nb[] = { sizeof(dst->type) };
+
+ ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * sizeof(dst->type));
+ void * tmp_arange_buffer = arange_allocator.get();
+ acl_tensor_ptr tmp_arange_tensor =
+ ggml_cann_create_tensor(tmp_arange_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ tmp_arange_ne, tmp_arange_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+
+ aclnn_arange(ctx, tmp_arange_tensor.get(), start, stop, step, n_elements_arange);
+
+ // freq
+ float freq_param = -logf(max_period) / half;
+ bool inplace = true;
+ aclnn_muls(ctx, tmp_arange_tensor.get(), freq_param, nullptr, inplace);
+ aclnn_exp(ctx, tmp_arange_tensor.get());
+
+ // permute: src [0,1,2,3]->[0,1,3,2]
+ int64_t tmp_permute_ne[] = { src->ne[1], src->ne[0], src->ne[2], src->ne[3] };
+ size_t tmp_permute_nb[GGML_MAX_DIMS];
+ tmp_permute_nb[0] = ggml_type_size(src->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
+ }
+
+ ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
+ void * tmp_permute_buffer = permute_allocator.get();
+ acl_tensor_ptr tmp_permute_tensor =
+ ggml_cann_create_tensor(tmp_permute_buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type),
+ tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+ int64_t permute_dim[] = { 0, 1, 3, 2 };
+ int64_t num_dims = 4;
+ aclnn_permute(ctx, acl_src.get(), tmp_permute_tensor.get(), permute_dim, num_dims);
+
+ // timestep * freq
+ int64_t tmp_mul_ne[] = { src->ne[1] * half, src->ne[0], src->ne[2], src->ne[3] };
+ size_t tmp_mul_nb[GGML_MAX_DIMS];
+ tmp_mul_nb[0] = ggml_type_size(src->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1];
+ }
+
+ int mul_nelements = src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
+
+ ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
+ void * tmp_mul_buffer = mul_allocator.get();
+ acl_tensor_ptr tmp_mul_tensor =
+ ggml_cann_create_tensor(tmp_mul_buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type),
+ tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+ aclnn_mul(ctx, tmp_permute_tensor.get(), tmp_arange_tensor.get(), tmp_mul_tensor.get());
+
+ // cos
+ ggml_cann_pool_alloc cos_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
+ void * tmp_cos_buffer = cos_allocator.get();
+ acl_tensor_ptr tmp_cos_tensor =
+ ggml_cann_create_tensor(tmp_cos_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+
+ aclnn_cos(ctx, tmp_mul_tensor.get(), tmp_cos_tensor.get());
+
+ // sin
+ ggml_cann_pool_alloc sin_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
+ void * tmp_sin_buffer = sin_allocator.get();
+ acl_tensor_ptr tmp_sin_tensor =
+ ggml_cann_create_tensor(tmp_sin_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+
+ aclnn_sin(ctx, tmp_mul_tensor.get(), tmp_sin_tensor.get());
+
+ // concat
+ int64_t concat_dim = 3;
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+ acl_tensor_list_ptr tensor_list = ggml_cann_create_tensor_list(tmp_cos_tensor, tmp_sin_tensor);
+ aclnn_concat(ctx, tensor_list.get(), acl_dst.get(), concat_dim);
+}
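+
+// Summary of the computation above (illustrative): with
+//   freq_i = exp(-log(max_period) * i / half), 0 <= i < half,
+// the result is dst = concat(cos(t * freq), sin(t * freq)) along the row dim,
+// where t are the source timestep values, i.e. the usual sinusoidal timestep
+// embedding of GGML_OP_TIMESTEP_EMBEDDING.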
+
+/**
+ * @brief Raises each element of a tensor to the power of the corresponding
+ * element in another tensor.
+ *
+ * This function computes the element-wise power of the destination tensor
+ * `acl_dst` raised to the power of the exponent tensor `acl_exp`.
+ * The operation is defined as:
+ * \f[
+ * \text {acl_dst }_i=acl\_dst_i^{\text {acl_exp }_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_dst The destination tensor, which also serves as the base tensor.
+ * @param acl_exp The exponent tensor, each element of which is used to raise
+ * the corresponding element in the destination tensor.
+ */
+static void aclnn_pow_tensor_tensor(ggml_backend_cann_context & ctx, aclTensor * acl_dst, aclTensor * acl_exp) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplacePowTensorTensor, acl_dst, acl_exp);
+}
+
+/**
+ * @brief Generate a range of values and apply a scalar base exponentiation.
+ *
+ * This function creates an evenly spaced sequence from `start` to `stop` (exclusive),
+ * with step size `step`, stores it in a temporary buffer, and then computes:
+ *
+ * @f[
+ * slope[i] = m^{\left( start + i \cdot step \right)}, \quad 0 \le i < size
+ * @f]
+ *
+ * The results are written to the provided @p slope_buffer.
+ *
+ * @param ctx CANN backend context for memory allocation and operator execution.
+ * @param slope_buffer Pointer to the output buffer (float array) for the computed slope values.
+ * @param m Scalar base for the exponentiation.
+ * @param size Number of elements in the generated sequence.
+ * @param start Starting exponent offset.
+ * @param stop Stopping exponent offset (exclusive).
+ * @param step Step size for the exponent increment.
+ * @param dtype Data type for slope tensor.
+ */
+static void aclnn_get_slope_inner(ggml_backend_cann_context & ctx,
+ void * slope_buffer,
+ float m,
+ int64_t size,
+ float start,
+ float stop,
+ float step,
+ ggml_type dtype) {
+ aclDataType acl_type = ggml_cann_type_mapping(dtype);
+ size_t type_size = ggml_type_size(dtype);
+
+ int64_t ne[] = { size };
+ size_t nb[] = { type_size };
+
+ ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * type_size);
+ void * arange_buffer = arange_allocator.get();
+
+ acl_tensor_ptr arange_tensor = ggml_cann_create_tensor(arange_buffer, acl_type, type_size, ne, nb, 1);
+ aclnn_arange(ctx, arange_tensor.get(), start, stop, step, size);
+
+ acl_tensor_ptr slope_tensor = ggml_cann_create_tensor(slope_buffer, acl_type, type_size, ne, nb, 1);
+
+ acl_scalar_ptr sc = ggml_cann_create_scalar(&m, aclDataType::ACL_FLOAT);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc.get(), arange_tensor.get(), slope_tensor.get());
+}
+
+/**
+ * @brief Compute slope values for multiple attention heads based on ALiBi bias parameters.
+ *
+ * This function generates slope values for each attention head according to the ALiBi
+ * (Attention with Linear Biases) method. It splits the computation into two ranges depending
+ * on whether the head index is less than @p n_head_log2 or not, and uses different base values
+ * (`m0` and `m1`) for the exponentiation.
+ *
+ * @f[
+ * slope[h] =
+ * \begin{cases}
+ * m_0^{(h + 1)}, & h < n\_head\_log2 \\
+ * m_1^{\left( 2 \cdot (h - n\_head\_log2) + 1 \right)}, & h \geq n\_head\_log2
+ * \end{cases}
+ * \quad , \quad \text{if } max\_bias > 0
+ * @f]
+ *
+ * If @p max_bias <= 0, all slope values are set to 1.0.
+ *
+ * @param ctx CANN backend context for memory allocation and operator execution.
+ * @param n_head Total number of attention heads.
+ * @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
+ * @param max_bias Maximum bias value for slope computation.
+ * @param dtype Data type for slope tensor.
+ *
+ */
+static void aclnn_get_slope(ggml_backend_cann_context & ctx,
+ int64_t n_head,
+ void * slope_buffer,
+ float max_bias,
+ ggml_type dtype) {
+ const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+ float m0 = powf(2.0f, -(max_bias) / n_head_log2);
+ float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+ // const float slope = (max_bias > 0.0f) ?
+ // h < n_head_log2 ?
+ // powf(m0, h + 1) :
+ // powf(m1, 2*(h - n_head_log2) + 1) :
+ // 1.0f;
+ // arange1
+ float start = 0 + 1;
+ float end = (n_head_log2 - 1) + 1;
+ float step = 1;
+ float count = n_head_log2;
+ // end needs to be +1 because aclnn uses a left-closed, right-open interval.
+ aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step, dtype);
+ if (n_head_log2 < n_head) {
+ // arange2
+ start = 2 * (n_head_log2 - n_head_log2) + 1;
+ end = 2 * ((n_head - 1) - n_head_log2) + 1;
+ step = 2;
+ count = n_head - n_head_log2;
+ aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * sizeof(float), m1, count, start, end + 1, step,
+ dtype);
+ }
+}
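+
+// Worked example (illustrative): for n_head = 12 and max_bias = 8 the code
+// above gives n_head_log2 = 8, m0 = 2^(-8/8) = 0.5 and m1 = 2^(-4/8) ~= 0.707,
+// so slope[h] = 0.5^(h+1) for h < 8 and slope[h] = 0.707^(2*(h-8)+1) for
+// 8 <= h < 12.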
+
+/**
+ * @brief Add ALiBi (Attention with Linear Biases) positional biases to the attention mask.
+ *
+ * This function computes the ALiBi slopes for each attention head (if max_bias > 0),
+ * multiplies them with the attention mask to produce bias tensors, and adds these biases
+ * to the destination tensor (@p dst).
+ *
+ * The function performs necessary broadcasting of the mask and slope tensors to match
+ * the shape of the destination tensor, then applies element-wise multiplication and addition
+ * using CANN operators.
+ *
+ * @param ctx CANN backend context for memory management and operator execution.
+ * @param mask Input attention mask tensor, assumed to be contiguous.
+ * @param dst Destination tensor to which ALiBi biases will be added.
+ * @param dst_ptr Pointer to the memory of the destination tensor.
+ * @param max_bias Maximum bias value controlling the slope scaling.
+ *
+ * @note
+ * - Write data into dst_ptr using only the shape information of the dst tensor.
+ * - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting.
+ */
+static void aclnn_add_alibi(ggml_backend_cann_context & ctx,
+ ggml_tensor * mask,
+ ggml_tensor * dst,
+ void * dst_ptr,
+ float max_bias) {
+ void * slope_buffer = nullptr;
+ void * bias_buffer = nullptr;
+
+ if (max_bias > 0.0f) {
+ int64_t n_heads = dst->ne[2];
+ ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
+ slope_buffer = slope_allocator.get();
+ ggml_cann_pool_alloc bias_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
+ bias_buffer = bias_allocator.get();
+ aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias, GGML_TYPE_F32);
+ }
+
+    // broadcast for mask, slope and dst
+ int64_t nr2 = dst->ne[2] / mask->ne[2];
+ int64_t nr3 = dst->ne[3] / mask->ne[3];
+
+ // broadcast the mask across rows
+ int64_t mask_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], 1, mask->ne[3], 1 };
+    size_t mask_nb[] = { mask->nb[0], mask->nb[1], mask->nb[2],
+                         mask->nb[2], mask->nb[3], mask->nb[3] };
+
+ int64_t dst_ne[] = { dst->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], nr3 };
+    size_t dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[2],
+                        dst->nb[2], dst->nb[3], dst->nb[3] };
+
+ // slope is a 1 dim tensor, slope.ne2 == dst.ne2
+ int64_t slope_ne[] = { 1, 1, mask->ne[2], nr2, 1, 1 };
+ size_t slope_nb[GGML_MAX_DIMS + 2];
+ slope_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
+ slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
+ }
+
+ acl_tensor_ptr acl_slope =
+ ggml_cann_create_tensor(slope_buffer, ACL_FLOAT, sizeof(float), slope_ne, slope_nb, GGML_MAX_DIMS + 2);
+ acl_tensor_ptr acl_mask = ggml_cann_create_tensor(mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
+
+ // write data into dst_ptr using only the shape information of the dst tensor.
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst_ptr, ggml_cann_type_mapping(dst->type),
+ ggml_type_size(dst->type), dst_ne, dst_nb, GGML_MAX_DIMS + 2);
+
+ if (max_bias > 0.0f) {
+ int64_t bias_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], 1 };
+ size_t bias_nb[GGML_MAX_DIMS + 2];
+ bias_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
+ bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1];
+ }
+ acl_tensor_ptr bias_tensor =
+ ggml_cann_create_tensor(bias_buffer, ACL_FLOAT, sizeof(float), bias_ne, bias_nb, GGML_MAX_DIMS + 2);
+
+ aclnn_mul(ctx, acl_slope.get(), acl_mask.get(), bias_tensor.get());
+ aclnn_add(ctx, acl_dst.get(), bias_tensor.get());
+ } else {
+ aclnn_add(ctx, acl_dst.get(), acl_mask.get());
+ }
+}
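+
+// Note on the 6-dim shapes above (illustrative): dims 2 and 3 of dst are each
+// split into (mask extent, repeat factor nr2 / nr3) so that the mask and the
+// per-head slope broadcast against dst without materialising the repeats;
+// this is why GGML_MAX_DIMS + 2 dims are used.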
+
+void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_cann_dup(ctx, dst);
+}
+
+/**
+ * @brief Applies the softmax function to a tensor along a specified dimension.
+ *
+ * This function computes the softmax of the source tensor `acl_src` along the
+ * specified dimension `dim` and stores the result in the destination tensor
+ * `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor on which the softmax function will be
+ * applied.
+ * @param dim The dimension along which the softmax function will be computed.
+ * @param acl_dst The destination tensor where the softmax results will be
+ * stored.
+ */
+static void aclnn_softmax(ggml_backend_cann_context & ctx, aclTensor * acl_src, int64_t dim, aclTensor * acl_dst) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
+}
+
+void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1]; // mask
+
+ acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ float scale = 1.0f;
+ float max_bias = 0.0f;
+
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+ memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+ // input mul scale
+ acl_scalar_ptr acl_scale = ggml_cann_create_scalar(&scale, aclDataType::ACL_FLOAT);
+ ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0));
+ void * src_tensor_buffer = src_tensor_allocator.get();
+ acl_tensor_ptr softmax_tensor = ggml_cann_create_tensor(src_tensor_buffer, ggml_cann_type_mapping(src0->type),
+ ggml_element_size(src0), src0->ne, src0->nb, GGML_MAX_DIMS);
+
+ aclnn_muls(ctx, acl_src0.get(), scale, softmax_tensor.get(), false);
+
+ // mask
+ if (src1) {
+ aclnn_add_alibi(ctx, src1, src0, src_tensor_buffer, max_bias);
+ }
+ // softmax
+ aclnn_softmax(ctx, softmax_tensor.get(), 3, acl_dst.get());
+}
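+
+// Flow of ggml_cann_softmax (illustrative): dst = softmax(src0 * scale + bias)
+// along the last ACL dim (ggml's row dim), where the bias term is the
+// ALiBi-adjusted mask and is only added when a mask tensor is present.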
+
+/**
+ * @brief Performs index select operation on a 4D tensor using the CANN backend.
+ *
+ * This function applies the `IndexSelect` operation along a specific dimension
+ * of the source tensor (`src_buffer`) using the indices from the index tensor (`index`).
+ * It iterates over the last two dimensions of the source tensor, creates the corresponding
+ * CANN tensors for the source, index, and output slices, and executes the `IndexSelect`
+ * operation for each slice.
+ *
+ * @param ctx The context for CANN backend operations.
+ * @param src_buffer The source buffer containing the 4D input tensor data.
+ * @param src_ne The dimensions of the source tensor.
+ * @param src_nb The strides (byte offsets) of the source tensor.
+ * @param dst_buffer The destination buffer where the output tensor data will be written.
+ * @param dst_ne The dimensions of the destination tensor.
+ * @param dst_nb The strides (byte offsets) of the destination tensor.
+ * @param index The index tensor specifying the indices to select from the source tensor.
+ * @param type The data type of the source and destination tensors.
+ */
+static void aclnn_index_select_4d(ggml_backend_cann_context & ctx,
+ void * src_buffer,
+ int64_t * src_ne,
+ size_t * src_nb,
+ void * dst_buffer,
+ int64_t * dst_ne,
+ size_t * dst_nb,
+ ggml_tensor * index,
+ ggml_type type) {
+ for (int64_t i = 0; i < src_ne[3]; i++) {
+ for (int64_t j = 0; j < src_ne[2]; j++) {
+ // src
+ acl_tensor_ptr acl_src_tensor =
+ ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
+ ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
+
+ // index
+ acl_tensor_ptr acl_index = ggml_cann_create_tensor(
+ (char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
+ ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
+
+ // out
+ acl_tensor_ptr acl_out =
+ ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
+ ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
+ GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor.get(), 0, acl_index.get(), acl_out.get());
+ }
+ }
+}
+
+/**
+ * @brief Performs inplace index copy operation on a 4D tensor using the CANN backend.
+ *
+ * This function applies the `IndexCopy` operation along a specific dimension of the
+ * destination tensor (`dst_buffer`) by copying elements from the source tensor (`src_buffer`)
+ * to positions specified by the index tensor (`index`).
+ * It iterates over the last two dimensions of the tensors, creates the corresponding
+ * CANN tensors for source, index, and destination slices, and performs the index copy
+ * operation for each slice.
+ *
+ * @param ctx The context for CANN backend operations.
+ * @param src_buffer The source buffer containing the 4D input tensor data to be copied.
+ * @param src_ne The dimensions of the source tensor.
+ * @param src_nb The strides (byte offsets) of the source tensor.
+ * @param dst_buffer The destination buffer where values will be copied to.
+ * @param dst_ne The dimensions of the destination tensor.
+ * @param dst_nb The strides (byte offsets) of the destination tensor.
+ * @param index The index tensor specifying target positions in the destination tensor.
+ * @param type The data type of the source and destination tensors.
+ */
+static void aclnn_index_copy_4d(ggml_backend_cann_context & ctx,
+ void * src_buffer,
+ int64_t * src_ne,
+ size_t * src_nb,
+ void * dst_buffer,
+ int64_t * dst_ne,
+ size_t * dst_nb,
+ ggml_tensor * index,
+ ggml_type type) {
+ for (int64_t i = 0; i < src_ne[3]; i++) {
+ for (int64_t j = 0; j < src_ne[2]; j++) {
+ // src
+ acl_tensor_ptr acl_src_tensor =
+ ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
+ ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
+
+ // index
+ acl_tensor_ptr acl_index = ggml_cann_create_tensor(
+ (char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
+ ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
+
+ // out
+ acl_tensor_ptr acl_out =
+ ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
+ ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out.get(), 0, acl_index.get(), acl_src_tensor.get());
+ }
+ }
+}
+
+void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0]; // src
+ ggml_tensor * src1 = dst->src[1]; // index
+
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
+ switch (src0->type) {
+ case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
+ if (src0->type == dst->type) {
+ aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1,
+ dst->type);
+ } else {
+ acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
+ ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
+ void * src_trans_buffer = src_buffer_allocator.get();
+ size_t src_trans_nb[GGML_MAX_DIMS];
+ src_trans_nb[0] = dst->nb[0];
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+ }
+ acl_tensor_ptr src_trans_tensor =
+ ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(dst->type),
+ ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
+ aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
+ aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
+ dst->type);
+ }
+ break;
+ case GGML_TYPE_Q8_0:
+ {
+ // add 1 dim for bcast mul.
+ size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], dequant_nb[GGML_MAX_DIMS + 1];
+ int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], *dequant_ne;
+ int64_t scale_offset = 0;
+ // [3,4,5,64] -> [3,4,5,2,32]
+ weight_ne[0] = QK8_0;
+ weight_ne[1] = src0->ne[0] / QK8_0;
+ weight_nb[0] = sizeof(int8_t);
+ weight_nb[1] = weight_nb[0] * weight_ne[0];
+ for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+ weight_ne[i] = src0->ne[i - 1];
+ weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
+ }
+ // [3,4,5,64] -> [3,4,5,2,1]
+ scale_ne[0] = 1;
+ scale_ne[1] = src0->ne[0] / QK8_0;
+ scale_nb[0] = sizeof(uint16_t);
+ scale_nb[1] = scale_nb[0] * scale_ne[0];
+ for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+ scale_ne[i] = src0->ne[i - 1];
+ scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
+ }
+ // [3,4,5,64] -> [3,4,5,2,32]
+ dequant_ne = weight_ne;
+ dequant_nb[0] = ggml_type_size(dst->type);
+ for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
+ dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
+ }
+ scale_offset = ggml_nelements(src0) * sizeof(int8_t);
+ ggml_cann_pool_alloc dequant_buffer_allocator(ctx.pool(),
+ ggml_nelements(src0) * ggml_type_size(dst->type));
+ acl_tensor_ptr acl_weight_tensor = ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t),
+ weight_ne, weight_nb, GGML_MAX_DIMS + 1);
+ acl_tensor_ptr acl_scale_tensor =
+ ggml_cann_create_tensor(src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
+ GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
+ acl_tensor_ptr dequant_tensor =
+ ggml_cann_create_tensor(dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type),
+ ggml_type_size(dst->type), dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
+ aclnn_mul(ctx, acl_weight_tensor.get(), acl_scale_tensor.get(), dequant_tensor.get());
+ dequant_nb[0] = ggml_type_size(dst->type);
+ dequant_ne = src0->ne;
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
+ }
+ aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(), dequant_ne, dequant_nb, dst->data, dst->ne,
+ dst->nb, src1, dst->type);
+ break;
+ }
+ default:
+ GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
+ break;
+ }
+}
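+
+// Layout assumed by the Q8_0 branch above (illustrative): the quantized data is
+// viewed as blocks of QK8_0 = 32 int8 weights, with the fp16 scales packed
+// after all weight bytes (scale_offset = nelements * sizeof(int8_t)); the
+// broadcast mul dequantizes each block as weight * scale before the index
+// select is applied.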
+
+void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0]; // src
+ ggml_tensor * src1 = dst->src[1]; // index
+
+ switch (dst->type) {
+ case GGML_TYPE_F32:
+ {
+ aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1, dst->type);
+ break;
+ }
+ case GGML_TYPE_F16:
+ {
+ acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
+ ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
+ void * src_trans_buffer = src_buffer_allocator.get();
+ size_t src_trans_nb[GGML_MAX_DIMS];
+ src_trans_nb[0] = sizeof(uint16_t);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+ }
+ acl_tensor_ptr src_trans_tensor = ggml_cann_create_tensor(
+ src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
+ aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
+ aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
+ dst->type);
+ break;
+ }
+ default:
+ GGML_ABORT("Unsupported tensor type for GGML_OP_SET_ROWS");
+ break;
+ }
+}
+
+/**
+ * @brief Repeats elements of a tensor along a specified dimension.
+ *
+ * This function repeats each element of the source tensor `acl_src` a specified
+ * number of times (`repeats`) along the specified dimension `dim` and stores
+ * the result in the destination tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose elements will be repeated.
+ * @param acl_dst The destination tensor where the repeated elements will be
+ * stored.
+ * @param dim The dimension along which the elements will be repeated.
+ * @param repeats The number of times each element will be repeated.
+ * @param output_size The size of the output tensor.
+ */
+static void aclnn_repeat_interleave(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_dst,
+ int64_t dim,
+ int64_t repeats,
+ int64_t output_size) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, RepeatInterleaveIntWithDim, acl_src, repeats, dim, output_size, acl_dst);
+}
+
+/**
+ * @brief Performs matrix multiplication with floating-point precision on
+ * tensors using the CANN backend.
+ *
+ * This function performs matrix multiplication of the input tensor and the
+ * weight tensor, handling broadcasting and transposing as needed, and stores
+ * the result in the destination tensor `dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void ggml_cann_mat_mul_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * weight = dst->src[0]; // weight
+ ggml_tensor * input = dst->src[1]; // input
+
+    // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will broadcast
+    // automatically; when weight ne2 or ne3 is not 1, the weight needs to be repeated.
+ BCAST_MUL_MAT_SHAPE(input, weight, dst);
+
+ int64_t n_dims = bcast_dims;
+ if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
+ if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
+ n_dims = 2;
+ } else if (bcast_input_ne[2] == 1) {
+ n_dims = 3;
+ }
+ }
+
+ acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
+ int64_t transpose_ne[] = { bcast_weight_ne[1], bcast_weight_ne[0], bcast_weight_ne[2],
+ bcast_weight_ne[3], bcast_weight_ne[4], bcast_weight_ne[5] };
+ size_t transpose_nb[] = { bcast_weight_nb[1], bcast_weight_nb[0], bcast_weight_nb[2],
+ bcast_weight_nb[3], bcast_weight_nb[4], bcast_weight_nb[5] };
+ acl_tensor_ptr acl_weight_tensor;
+
+ // Only check env once.
+ static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
+ if (weight_to_nz && is_matmul_weight(weight)) {
+ acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
+ } else {
+ acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
+ }
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
+
+ switch (n_dims) {
+ case 2:
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mm, acl_input_tensor.get(), acl_weight_tensor.get(), acl_dst.get(), 2);
+ break;
+ case 3:
+ GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, acl_input_tensor.get(), acl_weight_tensor.get(), acl_dst.get(),
+ 2);
+ break;
+ default:
+            // ALLOW_FP32_DOWN_PRECISION: when the input is
+            // fp32, Atlas A2 will down-convert it to HFLOAT32.
+ GGML_CANN_CALL_ACLNN_OP(ctx, Matmul, acl_input_tensor.get(), acl_weight_tensor.get(), acl_dst.get(), 1);
+ break;
+ }
+}
+
+/**
+ * @brief Performs matrix multiplication with quantized weights and
+ * floating-point inputs using the CANN backend.
+ *
+ * This function performs matrix multiplication of the input tensor `src1` and
+ * the weight tensor `src0`, handling broadcasting, transposing, and
+ * quantization as needed, and stores the result in the destination tensor
+ * `dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void ggml_cann_mul_mat_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst, const enum ggml_type type) {
+ ggml_tensor * src0 = dst->src[0]; // weight
+ ggml_tensor * src1 = dst->src[1]; // input
+
+    // The shape of the weight is NCHW.
+    // Matrix multiplication uses the HW dims;
+    // N and C are regarded as the batch.
+    // The weight needs to be transposed.
+ float weight_elem_size;
+ if (type == GGML_TYPE_Q4_0) {
+ weight_elem_size = float(sizeof(uint8_t)) / 2;
+ } else if (type == GGML_TYPE_Q8_0) {
+ weight_elem_size = float(sizeof(uint8_t));
+ } else {
+ GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
+ }
+ float weight_nb[] = { src0->ne[0] * weight_elem_size, weight_elem_size };
+ size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
+ size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
+
+    // scales are stored at the end of the weights and also need to be transposed.
+ size_t scale_elem_size = sizeof(uint16_t);
+ size_t scale_nb[] = { src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size };
+ size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
+ char * scale_offset = (char *) src0->data + weight_size;
+
+ // input
+ size_t input_elem_size = sizeof(uint16_t);
+ int64_t input_ne[] = { src1->ne[0], src1->ne[1] };
+ size_t input_nb[] = { input_elem_size, input_ne[0] * input_elem_size };
+ size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
+ ggml_cann_pool_alloc input_alloctor(ctx.pool());
+ void * input_buffer = src1->data;
+
+    // cast input to f16 if needed
+ if (src1->type != GGML_TYPE_F16) {
+ acl_tensor_ptr acl_src1_tensor = ggml_cann_create_tensor(src1);
+ input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
+
+ int64_t * input_cast_ne = src1->ne;
+ size_t input_cast_nb[GGML_MAX_DIMS];
+ input_cast_nb[0] = sizeof(uint16_t);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
+ }
+
+ acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(input_buffer, ACL_FLOAT16, input_elem_size,
+ input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
+ aclnn_cast(ctx, acl_src1_tensor.get(), acl_input_tensor.get(), ACL_FLOAT16);
+ }
+
+ // output
+ size_t output_elem_size = sizeof(uint16_t);
+ size_t output_nb[] = { output_elem_size, dst->ne[0] * output_elem_size };
+ ggml_cann_pool_alloc output_allocator(ctx.pool());
+ void * output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+ size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
+
+ // aclnn
+ int64_t max_elem_size = 65535;
+ int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool());
+ for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
+ for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
+ int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
+ int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
+
+ int64_t batch1 = (n1 * src1->ne[2]) + c1;
+ int64_t batch0 = (n0 * src0->ne[2]) + c0;
+
+ acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(
+ (char *) input_buffer + batch1 * input_stride, ACL_FLOAT16, input_elem_size, input_ne, input_nb, 2);
+
+ // first split
+ int64_t weight_ne_offset = 0;
+ int64_t weight_ne[2] = { max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0] };
+ int64_t scale_ne_offset = 0;
+ int64_t scale_ne[2] = { weight_ne[0], weight_ne[1] / QK8_0 };
+ int64_t output_ne_offset = 0;
+ int64_t output_ne[2] = { weight_ne[0], dst->ne[1] };
+
+ acl_tensor_ptr acl_weight_tensor =
+ ggml_cann_create_tensor((char *) src0->data + batch0 * weight_stride, ggml_cann_type_mapping(type),
+ weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
+ acl_tensor_ptr acl_scale_tensor =
+ ggml_cann_create_tensor(scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size, scale_ne,
+ scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset);
+ acl_tensor_ptr acl_output_tensor =
+ ggml_cann_create_tensor((char *) output_buffer + batch1 * output_stride, ACL_FLOAT16, output_elem_size,
+ output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset);
+ int64_t antiquantGroupSize = 0;
+ if (src0->ne[0] > QK8_0) {
+ antiquantGroupSize = QK8_0;
+ }
+ GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor.get(), acl_weight_tensor.get(),
+ acl_scale_tensor.get(), nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
+ acl_output_tensor.get());
+
+ // other splits
+ for (int64_t split = 1; split < split_size; split++) {
+ weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
+ weight_ne[0] =
+ max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
+ scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
+ scale_ne[0] = weight_ne[0];
+ output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
+ output_ne[0] = weight_ne[0];
+
+ acl_weight_tensor =
+ ggml_cann_create_tensor((char *) src0->data + batch0 * weight_stride, ggml_cann_type_mapping(type),
+ weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
+ acl_scale_tensor =
+ ggml_cann_create_tensor(scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size,
+ scale_ne, scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset);
+ acl_output_tensor =
+ ggml_cann_create_tensor((char *) output_buffer + batch1 * output_stride, ACL_FLOAT16,
+ output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset);
+ GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor.get(), acl_weight_tensor.get(),
+ acl_scale_tensor.get(), nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
+ acl_output_tensor.get());
+ }
+ }
+ }
+
+ // cast out
+ if (dst->type != GGML_TYPE_F16) {
+ int64_t * output_cast_ne = dst->ne;
+ size_t output_cast_nb[GGML_MAX_DIMS];
+ output_cast_nb[0] = sizeof(uint16_t);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
+ }
+
+ acl_tensor_ptr acl_output_tensor = ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
+ output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
+ acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
+ aclnn_cast(ctx, acl_output_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
+ }
+}
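+
+// Note (illustrative): the loop above splits src0->ne[1] into chunks of at
+// most max_elem_size = 65535 rows, presumably to stay within an operator
+// limit; e.g. for src0->ne[1] = 100000, split_size = 100000 / 65535 + 1 = 2,
+// so the first call covers 65535 rows and the second the remaining 34465.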
+
+void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ const enum ggml_type type = dst->src[0]->type;
+ switch (type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ ggml_cann_mat_mul_fp(ctx, dst);
+ break;
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q8_0:
+ ggml_cann_mul_mat_quant(ctx, dst, type);
+ break;
+ default:
+ GGML_ABORT("Unsupported type for mul_mat");
+ break;
+ }
+}
+
+/**
+ * @brief Rolls the elements of a tensor along a specified dimension.
+ *
+ * This function rolls the elements of the source tensor `acl_src` by the
+ * specified shifts `shifts` along the specified dimensions `dims`, and stores
+ * the result in the destination tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose elements will be rolled.
+ * @param acl_dst The destination tensor where the rolled elements will be
+ * stored.
+ * @param shifts An array specifying the number of positions by which elements
+ * are shifted.
+ * @param dims An array specifying the dimensions along which elements are
+ * shifted.
+ */
+static void aclnn_roll(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_dst,
+ int64_t * shifts,
+ int64_t * dims) {
+ acl_int_array_ptr acl_shifts = ggml_cann_create_int_array(shifts, 1);
+ acl_int_array_ptr acl_dims = ggml_cann_create_int_array(dims, 1);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Roll, acl_src, acl_shifts.get(), acl_dims.get(), acl_dst);
+}
+
+/**
+ * @brief Fills specified positions of a tensor with a scalar value.
+ *
+ * This function fills the positions in the source tensor `acl_src` specified by
+ * `index` along the dimension `dim` with the scalar value `value`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor where the positions will be filled.
+ * @param dim The dimension along which the positions are specified.
+ * @param index An array specifying the positions to be filled.
+ * @param index_num The number of positions specified in the index array.
+ * @param value The scalar value used to fill the specified positions.
+ */
+static void aclnn_index_fill_tensor(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ int64_t dim,
+ int64_t * index,
+ int64_t index_num,
+ float value) {
+ acl_int_array_ptr acl_index = ggml_cann_create_int_array(index, index_num);
+ acl_scalar_ptr acl_value = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexFillTensor, acl_src, dim, acl_index.get(), acl_value.get());
+}
+
+/**
+ * @brief Initializes and caches all intermediate tensors required for RoPE
+ * (Rotary Position Embedding), including support for Yarn, mRoPE,
+ * i-mRoPE, Neox repeat strategy, independent sectors, frequency factors,
+ * and multi-section rotary groups.
+ *
+ * This function computes and caches the per-dimension θ coefficients used for
+ * Q/K rotary embedding. The cache is shared across layers, and recomputed only
+ * when any dependent parameter changes.
+ *
+ * The function now supports:
+ * - Yarn RoPE extrapolation (via @p corr_dims and @p ext_factor)
+ * - Per-dimension independent sector exponent rules (indep_sects + sections[])
+ * - Multi-section RoPE (mRoPE) index mapping (mrope_used + is_imrope)
+ * - Frequency factor division (src2)
+ * - Neox / normal repeat expansion modes
+ *
+ * @param ctx CANN backend context, containing memory pool,
+ * cached buffers, and runtime stream.
+ * @param dst Destination ggml_tensor whose computation
+ * depends on RoPE (typically Qcur or Kcur).
+ * @param corr_dims [low, high] Yarn correction range.
+ * @param ext_factor Yarn extrapolation strength. 0 = disabled.
+ * @param theta_scale Base multiplier for per-dimension θ exponent.
+ * @param freq_scale Global frequency scaling factor.
+ * @param attn_factor Optional scaling applied to sin/cos (if needed).
+ * @param is_neox Whether to use Neox-style dimension interleave.
+ * @param sections 4-way sector sizes for independent-section RoPE
+ * and multi-section mRoPE (t/h/w/e).
+ * @param mrope_used Whether to enable multi-section rotary embedding.
+ * @param is_imrope Whether to apply interleaved mRoPE rules.
+ * @param indep_sects Whether each dimension runs independent exponent
+ *                            resets based on @p sections.
+ * @param rope_dims   Number of leading dimensions that are rotated; the θ
+ *                            cache covers rope_dims / 2 dimension pairs.
+ */
+static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
+ ggml_tensor * dst,
+ float * corr_dims,
+ float ext_factor,
+ float theta_scale,
+ float freq_scale,
+ float attn_factor,
+ bool is_neox,
+ int sections[4],
+ bool mrope_used,
+ bool is_imrope,
+ bool indep_sects,
+ int64_t rope_dims) {
+ ggml_tensor * src1 = dst->src[1]; // position
+ ggml_tensor * src2 = dst->src[2]; // freq_factors
+
+ int64_t theta_scale_length = rope_dims / 2;
+ int64_t position_length = dst->ne[2];
+
+ // TODO: check theta_scale_length and position_length.
+ if (src2 == nullptr && ctx.rope_cache.cached &&
+ ctx.rope_cache.equal(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor,
+ is_neox, indep_sects, mrope_used, is_imrope, sections)) {
+ // use cache.
+ return;
+ }
+
+ // Step0: calculate tensor shape.
+ int64_t theta_scale_ne[] = { theta_scale_length, 1, 1, 1 };
+ size_t theta_scale_nb[] = { sizeof(float), theta_scale_length * sizeof(float), theta_scale_length * sizeof(float),
+ theta_scale_length * sizeof(float) };
+
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+ int64_t position_ne[] = { 1, 1, position_length, 1 };
+ size_t position_nb[] = { sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), sizeof(int32_t) * position_length };
+
+ int64_t cache_ne[] = { theta_scale_length, 1, position_length, 1 };
+ size_t cache_nb[GGML_MAX_DIMS];
+ cache_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ cache_nb[i] = cache_nb[i - 1] * cache_ne[i - 1];
+ }
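+    // cache layout: ne = [rope_dims / 2, 1, n_positions, 1], i.e. one theta value
+    // per (position, dimension-pair), stored contiguously as float32.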
+
+    // Step 1: compute the per-dimension theta coefficients. Apart from
+    // (1) multiplying by the position,
+    // (2) dividing by freq_factors, and
+    // (3) computing the sine and cosine,
+    // the parameters involved rarely change between calls, so this partial
+    // result is computed once and cached.
+
+    // Step 1.1: prepare the theta_scale exponents. If these exponents change, theta_scale_tensor must be updated.
+ acl_tensor_ptr acl_theta_scale_tensor;
+ bool theta_scale_updated = false;
+ if (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.theta_scale != theta_scale ||
+ ctx.rope_cache.indep_sects != indep_sects) {
+ theta_scale_updated = true;
+ if (ctx.rope_cache.theta_scale_exp_host != nullptr) {
+ free(ctx.rope_cache.theta_scale_exp_host);
+ }
+ ctx.rope_cache.theta_scale_exp_host = (float *) malloc(theta_scale_length * sizeof(float));
+ GGML_ASSERT(ctx.rope_cache.theta_scale_exp_host != nullptr);
+ if (!indep_sects) {
+ ctx.rope_cache.theta_scale_exp_host[0] = 1;
+ for (int i = 1; i < theta_scale_length; i++) {
+ ctx.rope_cache.theta_scale_exp_host[i] = ctx.rope_cache.theta_scale_exp_host[i - 1] * theta_scale;
+ }
+ } else {
+ int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
+ int sec_w = sections[1] + sections[0];
+ int sec_e = sections[2] + sec_w;
+
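+            // With indep_sects the exponent restarts at every section boundary.
+            // Illustrative example: sections = {2, 2, 0, 0} yields the pattern
+            // [1, θ, 1, θ, ...], since every second dimension pair opens a new section.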
+ ctx.rope_cache.theta_scale_exp_host[0] = 1;
+ for (int i = 1; i < theta_scale_length; i++) {
+ int sector = i % sect_dims;
+ if (sector == 0 || sector == sections[0] || sector == sec_w || sector == sec_e) {
+ ctx.rope_cache.theta_scale_exp_host[i] = 1;
+ continue;
+ }
+ ctx.rope_cache.theta_scale_exp_host[i] = ctx.rope_cache.theta_scale_exp_host[i - 1] * theta_scale;
+ }
+ }
+
+ if (ctx.rope_cache.theta_scale_cache != nullptr) {
+ ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
+ }
+ ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
+ ACL_MEM_MALLOC_HUGE_FIRST));
+
+ ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
+ ctx.rope_cache.theta_scale_exp_host, theta_scale_length * sizeof(float),
+ ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream()));
+ }
+ acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
+ theta_scale_ne, theta_scale_nb, 1);
+
+    // Step 1.2: prepare rope_yarn_ramp. If this part changes, theta_scale_tensor must be updated as well.
+ // TODO: acl_yarn_ramp_tensor use rope cache.
+ bool yarn_ramp_tensor_updated = false;
+ acl_tensor_ptr acl_yarn_ramp_tensor;
+ if (ext_factor != 0 && (theta_scale_updated || ctx.rope_cache.theta_scale_length != theta_scale_length ||
+ ctx.rope_cache.freq_scale != freq_scale)) {
+ yarn_ramp_tensor_updated = true;
+ if (ctx.rope_cache.yarn_ramp_cache != nullptr) {
+ ACL_CHECK(aclrtFree(ctx.rope_cache.yarn_ramp_cache));
+ }
+ ACL_CHECK(aclrtMalloc(&ctx.rope_cache.yarn_ramp_cache, theta_scale_length * sizeof(float),
+ ACL_MEM_MALLOC_HUGE_FIRST));
+ // -rope_yarn_ramp
+ // const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
+ // return MIN(1, MAX(0, y)) - 1;
+ acl_yarn_ramp_tensor = ggml_cann_create_tensor(ctx.rope_cache.yarn_ramp_cache, ACL_FLOAT, sizeof(float),
+ theta_scale_ne, theta_scale_nb, 1);
+ float zero_value = 0, one_value = 1;
+ float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
+ acl_scalar_ptr low = ggml_cann_create_scalar(&corr_dims[0], aclDataType::ACL_FLOAT);
+ acl_scalar_ptr zero = ggml_cann_create_scalar(&zero_value, aclDataType::ACL_FLOAT);
+ acl_scalar_ptr one = ggml_cann_create_scalar(&one_value, aclDataType::ACL_FLOAT);
+ acl_scalar_ptr denom_safe = ggml_cann_create_scalar(&denom_safe_value, aclDataType::ACL_FLOAT);
+ acl_scalar_ptr ext_factor_sc = ggml_cann_create_scalar(&ext_factor, aclDataType::ACL_FLOAT);
+
+ aclnn_arange(ctx, acl_yarn_ramp_tensor.get(), 0, theta_scale_length, 1, theta_scale_length);
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor.get(), low.get(), one.get());
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor.get(), denom_safe.get());
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceThreshold, acl_yarn_ramp_tensor.get(), zero.get(), zero.get());
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceClampMax, acl_yarn_ramp_tensor.get(), one.get());
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor.get(), one.get(), one.get());
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), ext_factor_sc.get());
+
+ // theta_interp = freq_scale * theta_extrap;
+ // theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+ // theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
+ // theta = freq_scale * theta_extrap - freq_scale * theta_extrap * ramp_mix + theta_extrap * ramp_mix;
+ // theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
+ //
+        // We cache (freq_scale - freq_scale * ramp_mix + ramp_mix). Since the ramp computed above
+        // is the negated rope_yarn_ramp, this is equivalent to caching
+        // freq_scale + (freq_scale - 1) * (stored ramp value).
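+        // Sanity check of the cached factor: for dims beyond corr_dims[1] the ramp is 0,
+        // so the factor reduces to freq_scale (pure interpolation); for dims below
+        // corr_dims[0] with ext_factor == 1 it becomes 1 (pure extrapolation).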
+ float freq_scale_1 = freq_scale - 1;
+ acl_scalar_ptr freq_scale_sc = ggml_cann_create_scalar(&freq_scale, aclDataType::ACL_FLOAT);
+ acl_scalar_ptr freq_scale_1_sc = ggml_cann_create_scalar(&freq_scale_1, aclDataType::ACL_FLOAT);
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), freq_scale_1_sc.get());
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor.get(), freq_scale_sc.get(), one.get());
+ } else {
+ acl_yarn_ramp_tensor = ggml_cann_create_tensor(ctx.rope_cache.yarn_ramp_cache, ACL_FLOAT, sizeof(float),
+ theta_scale_ne, theta_scale_nb, 1);
+ }
+ // Step 1.3: update theta_scale_tensor according to ext_factor or freq_scale.
+ if (ext_factor != 0) {
+ if (theta_scale_updated || yarn_ramp_tensor_updated) {
+ theta_scale_updated = true;
+ aclnn_mul(ctx, acl_theta_scale_tensor.get(), acl_yarn_ramp_tensor.get());
+ }
+ } else {
+ if (freq_scale != 1 && (ctx.rope_cache.freq_scale != freq_scale || theta_scale_updated)) {
+ theta_scale_updated = true;
+ aclnn_muls(ctx, acl_theta_scale_tensor.get(), freq_scale, nullptr, true);
+ }
+ }
+
+ // Nothing changed, use cache.
+ if (!theta_scale_updated) {
+ acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
+ theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
+ }
+
+ // Step 1.4: prepare select index if mrope
+ acl_tensor_ptr position_select_index_tensor;
+ if (mrope_used) {
+ if (ctx.rope_cache.sections[0] != sections[0] || ctx.rope_cache.sections[1] != sections[1] ||
+ ctx.rope_cache.sections[2] != sections[2] || ctx.rope_cache.sections[3] != sections[3] ||
+ ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.is_imrope != is_imrope) {
+ if (ctx.rope_cache.position_select_index_host != nullptr) {
+ free(ctx.rope_cache.position_select_index_host);
+ }
+ ctx.rope_cache.position_select_index_host = (int *) malloc(theta_scale_length * sizeof(int));
+ GGML_ASSERT(ctx.rope_cache.position_select_index_host != nullptr);
+ int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
+ int sec_w = sections[1] + sections[0];
+ int sec_e = sections[2] + sec_w;
+ // t,h,w,e
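+            // Each dimension pair i is mapped to one of the four position rows (t/h/w/e).
+            // Illustrative example (non-interleaved mrope, sections = {16, 24, 24, 0}):
+            // dims 0..15 use row 0 (t), 16..39 row 1 (h), 40..63 row 2 (w), then repeat.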
+ for (int i = 0; i < theta_scale_length; i++) {
+ int sector = i % sect_dims;
+
+                if (is_imrope) { // qwen3vl applies interleaved mrope
+ if (sector % 3 == 1 && sector < 3 * sections[1]) {
+ ctx.rope_cache.position_select_index_host[i] = 1;
+ } else if (sector % 3 == 2 && sector < 3 * sections[2]) {
+ ctx.rope_cache.position_select_index_host[i] = 2;
+ } else if (sector % 3 == 0 && sector < 3 * sections[0]) {
+ ctx.rope_cache.position_select_index_host[i] = 0;
+ } else {
+ ctx.rope_cache.position_select_index_host[i] = 3;
+ }
+ } else {
+ if (sector >= sections[0] && sector < sec_w) {
+ ctx.rope_cache.position_select_index_host[i] = 1;
+ } else if (sector >= sec_w && sector < sec_e) {
+ ctx.rope_cache.position_select_index_host[i] = 2;
+ } else if (sector >= sec_e) {
+ ctx.rope_cache.position_select_index_host[i] = 3;
+ } else {
+ ctx.rope_cache.position_select_index_host[i] = 0;
+ }
+ }
+ }
+
+ if (ctx.rope_cache.position_select_index != nullptr) {
+ ACL_CHECK(aclrtFree(ctx.rope_cache.position_select_index));
+ }
+ ACL_CHECK(aclrtMalloc(&ctx.rope_cache.position_select_index, theta_scale_length * sizeof(int),
+ ACL_MEM_MALLOC_HUGE_FIRST));
+
+ ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.position_select_index, theta_scale_length * sizeof(int),
+ ctx.rope_cache.position_select_index_host, theta_scale_length * sizeof(int),
+ ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream()));
+ }
+
+ position_select_index_tensor = ggml_cann_create_tensor(ctx.rope_cache.position_select_index, ACL_INT32,
+ sizeof(int), theta_scale_ne, theta_scale_nb, 1);
+ }
+
+ // Step2: divide by freq_factors
+ ggml_cann_pool_alloc freq_fac_res_allocator(ctx.pool());
+ if (src2) {
+ freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float));
+ void * freq_fac_res_ptr = freq_fac_res_allocator.get();
+ acl_tensor_ptr acl_freq_factors_tensor =
+ ggml_cann_create_tensor(src2->data, ggml_cann_type_mapping(src2->type), ggml_type_size(src2->type),
+ theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
+ acl_tensor_ptr acl_freq_fac_res_tensor = ggml_cann_create_tensor(freq_fac_res_ptr, ACL_FLOAT, sizeof(float),
+ theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
+ aclnn_div(ctx, acl_theta_scale_tensor.get(), acl_freq_factors_tensor.get(), acl_freq_fac_res_tensor.get());
+ std::swap(acl_theta_scale_tensor, acl_freq_fac_res_tensor);
+ }
+
+ // Step3: prepare position_tensor
+ acl_tensor_ptr acl_position_tensor;
+ ggml_cann_pool_alloc mrope_position_acllocator(ctx.pool());
+ if (mrope_used) {
+ // Step3.1: select current position;
+ // position :
+ // pos1: [[0, 1 ,2 ,3 ],
+ // pos2: [4, 5 ,6 ,7 ],
+ // pos3: [8, 9 ,10,11],
+ // pos4: [12,13,14,15] ]
+ //
+ // select index = [0, 1, 2, 2, 1, 0]
+ //
+ // selected_tensor:
+ // [[0, 1 ,2 ,3 ],
+ // [4, 5 ,6 ,7 ],
+ // [8, 9 ,10,11],
+ // [8, 9 ,10,11],
+ // [4, 5 ,6 ,7 ],
+ // [0, 1 ,2 ,3 ]]
+ //
+        // transpose, from [seq_len, dims] to [dims, seq_len]
+        // [[0, 4, 8,  8,  4, 0],
+        //  [1, 5, 9,  9,  5, 1],
+        //  [2, 6, 10, 10, 6, 2],
+        //  [3, 7, 11, 11, 7, 3]]
+        //
+        // multiply by theta_scale_tensor
+        // [theta_scale^0, theta_scale^1, ..., theta_scale^n]
+
+ int64_t mrope_position_ne[] = { position_length, 4 };
+ size_t mrope_position_nb[] = { sizeof(int), position_length * sizeof(int) };
+ acl_tensor_ptr mrope_position =
+ ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
+ mrope_position_ne, mrope_position_nb, 2);
+
+ // selected position tensor's shape is a transpose of cache tensor.
+ int64_t selected_position_ne[] = { position_length, theta_scale_length };
+ size_t selected_position_nb[] = { sizeof(float), position_length * sizeof(float) };
+ mrope_position_acllocator.alloc(theta_scale_length * position_length * sizeof(float));
+ void * mrope_position_buffer = mrope_position_acllocator.get();
+ acl_position_tensor =
+ ggml_cann_create_tensor(mrope_position_buffer, ggml_cann_type_mapping(src1->type),
+ ggml_type_size(src1->type), selected_position_ne, selected_position_nb, 2);
+ GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, mrope_position.get(), 0, position_select_index_tensor.get(),
+ acl_position_tensor.get());
+
+ // transpose
+ int64_t transposed_ne[] = { position_length, 1, theta_scale_length, 1 };
+ size_t transposed_nb[GGML_MAX_DIMS];
+ transposed_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ transposed_nb[i] = transposed_nb[i - 1] * transposed_ne[i - 1];
+ }
+
+ std::swap(transposed_ne[0], transposed_ne[2]);
+ std::swap(transposed_nb[0], transposed_nb[2]);
+
+ acl_position_tensor =
+ ggml_cann_create_tensor(mrope_position_buffer, ggml_cann_type_mapping(src1->type),
+ ggml_type_size(src1->type), transposed_ne, transposed_nb, GGML_MAX_DIMS);
+
+ } else {
+ // auto bcast.
+ acl_position_tensor =
+ ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
+ position_ne, position_nb, GGML_MAX_DIMS);
+ }
+
+ // Step4: multiply by the position
+ int64_t theta_length = theta_scale_length * position_length;
+ ggml_cann_pool_alloc theta_allocator(ctx.pool(), theta_length * sizeof(float));
+ void * theta_buffer = theta_allocator.get();
+
+ acl_tensor_ptr acl_theta_tensor =
+ ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float), cache_ne, cache_nb, GGML_MAX_DIMS);
+ aclnn_mul(ctx, acl_position_tensor.get(), acl_theta_scale_tensor.get(), acl_theta_tensor.get());
+
+    // Step 5: calculate sin and cos.
+    // Allocate the sin/cos repeat caches once per device; later layers reuse them.
+ if (position_length > ctx.rope_cache.position_length) {
+ ctx.rope_cache.position_length = position_length;
+ if (ctx.rope_cache.sin_cache != nullptr) {
+ ACL_CHECK(aclrtFree(ctx.rope_cache.sin_cache));
+ }
+ if (ctx.rope_cache.cos_cache != nullptr) {
+ ACL_CHECK(aclrtFree(ctx.rope_cache.cos_cache));
+ }
+ int64_t repeat_theta_length = theta_scale_length * position_length * 2;
+ ACL_CHECK(
+ aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
+ ACL_CHECK(
+ aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
+ }
+
+ // sin/cos
+ ggml_cann_pool_alloc sin_allocator(ctx.pool(), theta_length * sizeof(float));
+ void * sin_buffer = sin_allocator.get();
+ acl_tensor_ptr acl_sin_tensor =
+ ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float), cache_ne, cache_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+ aclnn_sin(ctx, acl_theta_tensor.get(), acl_sin_tensor.get());
+
+ ggml_cann_pool_alloc cos_allocator(ctx.pool(), theta_length * sizeof(float));
+ void * cos_buffer = cos_allocator.get();
+ acl_tensor_ptr acl_cos_tensor =
+ ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float), cache_ne, cache_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+ aclnn_cos(ctx, acl_theta_tensor.get(), acl_cos_tensor.get());
+
+ if (ext_factor != 0) {
+ attn_factor *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+ }
+
+    // Step 6: multiply by attn_factor
+ if (attn_factor != 1) {
+ aclnn_muls(ctx, acl_sin_tensor.get(), attn_factor, nullptr, true);
+ aclnn_muls(ctx, acl_cos_tensor.get(), attn_factor, nullptr, true);
+ }
+
+ int64_t sin_reshape_ne[4] = { rope_dims, 1, dst->ne[2], 1 };
+ size_t sin_reshape_nb[GGML_MAX_DIMS];
+ sin_reshape_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
+ }
+ acl_tensor_ptr acl_sin_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
+ acl_tensor_ptr acl_cos_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
+
+    // Step 7: repeat / expand to the full rope_dims layout
+    if (is_neox) {
+        // tiled: [sinθ1, sinθ2, ..., sinθn, sinθ1, sinθ2, ..., sinθn]
+ int64_t repeatsArray[] = { 1, 1, 1, 2 };
+ aclnn_repeat(ctx, acl_sin_tensor.get(), acl_sin_repeat_tensor.get(), repeatsArray);
+ aclnn_repeat(ctx, acl_cos_tensor.get(), acl_cos_repeat_tensor.get(), repeatsArray);
+ } else {
+ int64_t num_repeats = 2;
+ int64_t dim = 3;
+ int64_t output_size = theta_scale_length * num_repeats;
+        // interleaved: [sinθ1, sinθ1, sinθ2, sinθ2, ..., sinθn, sinθn]
+ aclnn_repeat_interleave(ctx, acl_sin_tensor.get(), acl_sin_repeat_tensor.get(), dim, num_repeats, output_size);
+ aclnn_repeat_interleave(ctx, acl_cos_tensor.get(), acl_cos_repeat_tensor.get(), dim, num_repeats, output_size);
+ }
+
+ // Update cached value.
+ ctx.rope_cache.cached = true;
+ ctx.rope_cache.set(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor, is_neox,
+ indep_sects, mrope_used, is_imrope, sections);
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(const aclTensor * x,
+ const aclTensor * cos,
+ const aclTensor * sin,
+ int64_t mode,
+ const aclTensor * yOut,
+ uint64_t * workspaceSize,
+ aclOpExecutor ** executor);
+aclnnStatus aclnnRotaryPositionEmbedding(void * workspace,
+ uint64_t workspaceSize,
+ aclOpExecutor * executor,
+ aclrtStream stream);
+#ifdef __cplusplus
+}
+#endif
+
+void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0]; // input
+
+ // param
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+ int sections[4];
+ // const int n_past = ((int32_t *) dst->op_params)[0];
+ const int n_dims = ((int32_t *) dst->op_params)[1];
+ const int mode = ((int32_t *) dst->op_params)[2];
+ // const int n_ctx = ((int32_t *) dst->op_params)[3];
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+ memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int) * 4);
+
+ GGML_ASSERT(n_dims % 2 == 0);
+ GGML_ASSERT(n_dims <= ne00);
+
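+    // theta_scale = freq_base^(-2 / n_dims), so the cached powers theta_scale^i give the
+    // standard RoPE frequencies freq_base^(-2i / n_dims).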
+ const float theta_scale = powf(freq_base, -2.0f / n_dims);
+
+ float corr_dims[2];
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+ bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl applies interleaved mrope
+ // mrope_used means the GGML_ROPE_TYPE_MROPE bit is set.
+ // Note: this bit is also set for imrope and some vision modes,
+ // so mrope_used does NOT exclusively indicate pure mrope.
+ const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
+ const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+ if (mrope_used) {
+ GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
+ }
+
+ if (is_vision) {
+ GGML_ASSERT(n_dims == ne0 / 2);
+ }
+
+ if (is_imrope || mrope_used) {
+ is_neox = true;
+ }
+
+ int64_t rope_dims = n_dims;
+
+    // The RotaryPositionEmbedding operator does not support VISION mode directly,
+    // but VISION mode essentially only changes how theta_base is selected (as in mrope)
+    // and then repeats at the end in the same way as is_neox; RoPE is still applied
+    // across all dimensions.
+ if (is_vision) {
+ rope_dims = src0->ne[0];
+ }
+ int64_t tail_dims = ne00 - rope_dims;
+ bool has_tail = tail_dims > 0;
+
+ // init ctx.rope_cos/rope_sin cache
+ aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections,
+ mrope_used, is_imrope, is_vision, rope_dims);
+
+    // The sin/cos cache covers rope_dims values per position, so reshape with rope_dims.
+ int64_t sin_reshape_ne[4] = { rope_dims, 1, ne02, 1 };
+ size_t sin_reshape_nb[GGML_MAX_DIMS];
+ sin_reshape_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
+ }
+ acl_tensor_ptr acl_sin_reshape_tensor = ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
+ acl_tensor_ptr acl_cos_reshape_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+#ifdef ASCEND_310P
+ // Special ROPE operation for 310P
+
+ // roll input
+ void * input_roll_buffer;
+ acl_tensor_ptr acl_minus_one_tensor;
+ void * minus_one_scale_buffer = nullptr;
+ ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
+ ggml_cann_pool_alloc minus_one_scale_allocator(ctx.pool(), sizeof(float) * src0->ne[0]);
+ if (!is_neox) {
+ // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
+ input_roll_buffer = roll_allocator.get();
+ int64_t input_roll_ne[4] = { 2, src0->ne[1] * (src0->ne[0] / 2), src0->ne[2], src0->ne[3] };
+ size_t input_roll_nb[GGML_MAX_DIMS];
+ input_roll_nb[0] = ggml_type_size(src0->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
+ }
+ acl_tensor_ptr acl_input_roll_tensor =
+ ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
+ input_roll_ne, input_roll_nb, GGML_MAX_DIMS);
+ acl_tensor_ptr acl_input_tensor =
+ ggml_cann_create_tensor(src0->data, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
+ input_roll_ne, input_roll_nb, GGML_MAX_DIMS);
+
+ int64_t shifts[] = { 1 };
+ int64_t dims[] = { 3 };
+ aclnn_roll(ctx, acl_input_tensor.get(), acl_input_roll_tensor.get(), shifts, dims);
+
+ // init [-1, 1, -1, 1, ...]
+ minus_one_scale_buffer = minus_one_scale_allocator.get();
+
+ int64_t minus_one_ne[4] = { src0->ne[0], 1, 1, 1 };
+ size_t minus_one_nb[GGML_MAX_DIMS];
+ minus_one_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
+ }
+ acl_minus_one_tensor = aclnn_values(ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], minus_one_ne,
+ GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
+ int64_t dim = 3;
+ int64_t * index = new int64_t[src0->ne[0]];
+ for (int i = 0; i < src0->ne[0]; i++) {
+ index[i] = i / 2 * 2;
+ }
+ int64_t index_num = src0->ne[0];
+ float value = -1;
+ aclnn_index_fill_tensor(ctx, acl_minus_one_tensor.get(), dim, index, index_num, value);
+ } else {
+ // roll input: [q0,q1,q2,...] ->
+ // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
+ input_roll_buffer = roll_allocator.get();
+ acl_tensor_ptr acl_input_roll_tensor =
+ ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
+ src0->ne, src0->nb, GGML_MAX_DIMS);
+ acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(src0);
+
+ int64_t shifts[] = { src0->ne[0] / 2 };
+ int64_t dims[] = { 3 };
+ aclnn_roll(ctx, acl_input_tensor.get(), acl_input_roll_tensor.get(), shifts, dims);
+
+        // init [-1, -1, ..., -1, 1, 1, ..., 1] (first half -1, second half 1)
+ minus_one_scale_buffer = minus_one_scale_allocator.get();
+ int64_t minus_one_ne[4] = { src0->ne[0], 1, 1, 1 };
+ size_t minus_one_nb[GGML_MAX_DIMS];
+ minus_one_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
+ }
+ acl_minus_one_tensor = aclnn_values(ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], minus_one_ne,
+ GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
+ // -1 * first half
+ int64_t first_half_ne[4] = { src0->ne[0] / 2, 1, 1, 1 };
+ size_t first_half_nb[GGML_MAX_DIMS];
+ first_half_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
+ }
+ acl_tensor_ptr acl_first_half_tensor = ggml_cann_create_tensor(minus_one_scale_buffer, ACL_FLOAT, sizeof(float),
+ first_half_ne, first_half_nb, GGML_MAX_DIMS);
+ bool inplace = true;
+ float scale = -1;
+ aclnn_muls(ctx, acl_first_half_tensor.get(), scale, nullptr, inplace);
+ }
+
+ // TODO: n_dims < ne0
+ GGML_ASSERT(n_dims == src0->ne[0]);
+
+ // input * scale
+ ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), ggml_nbytes(src0));
+ void * input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
+ size_t input_nb[GGML_MAX_DIMS];
+ input_nb[0] = ggml_type_size(src0->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
+ }
+ acl_tensor_ptr acl_input_roll_mul_scale_tensor =
+ ggml_cann_create_tensor(input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
+ acl_tensor_ptr acl_input_roll_reshape_tensor =
+ ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
+ src0->ne, input_nb, GGML_MAX_DIMS);
+
+ aclnn_mul(ctx, acl_input_roll_reshape_tensor.get(), acl_minus_one_tensor.get(),
+ acl_input_roll_mul_scale_tensor.get());
+
+ // output
+ void * output_fp32_buffer;
+ if (src0->type == GGML_TYPE_F32) {
+ aclnn_mul(ctx, acl_src.get(), acl_cos_reshape_tensor.get());
+ aclnn_mul(ctx, acl_input_roll_mul_scale_tensor.get(), acl_sin_reshape_tensor.get());
+ aclnn_add(ctx, acl_src.get(), acl_input_roll_mul_scale_tensor.get(), acl_dst.get());
+ // TODO: ne0 != n_dims in mode2
+ } else if (src0->type == GGML_TYPE_F16) {
+ size_t input_fp32_nb[GGML_MAX_DIMS];
+ input_fp32_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
+ }
+ ggml_cann_pool_alloc fp32_allocator1(ctx.pool(), ggml_nelements(dst) * sizeof(float));
+ void * input_fp32_buffer1 = fp32_allocator1.get();
+ acl_tensor_ptr input_fp32_tensor1 = ggml_cann_create_tensor(input_fp32_buffer1, ACL_FLOAT, sizeof(float),
+ dst->ne, input_fp32_nb, GGML_MAX_DIMS);
+ ggml_cann_pool_alloc fp32_allocator2(ctx.pool(), ggml_nelements(dst) * sizeof(float));
+ void * input_fp32_buffer2 = fp32_allocator2.get();
+ acl_tensor_ptr input_fp32_tensor2 = ggml_cann_create_tensor(input_fp32_buffer2, ACL_FLOAT, sizeof(float),
+ dst->ne, input_fp32_nb, GGML_MAX_DIMS);
+
+ ggml_cann_pool_alloc fp32_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float));
+ output_fp32_buffer = fp32_allocator.get();
+ acl_tensor_ptr output_fp32_tensor = ggml_cann_create_tensor(output_fp32_buffer, ACL_FLOAT, sizeof(float),
+ dst->ne, input_fp32_nb, GGML_MAX_DIMS);
+ aclnn_mul(ctx, acl_src.get(), acl_cos_reshape_tensor.get(), input_fp32_tensor1.get());
+ aclnn_mul(ctx, acl_input_roll_mul_scale_tensor.get(), acl_sin_reshape_tensor.get(), input_fp32_tensor2.get());
+ aclnn_add(ctx, input_fp32_tensor1.get(), input_fp32_tensor2.get(), output_fp32_tensor.get());
+ aclnn_cast(ctx, output_fp32_tensor.get(), acl_dst.get(), ACL_FLOAT16);
+ }
+ return;
+#endif
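+    // RotaryPositionEmbedding mode: 0 selects the half-rotation (neox-style) pairing,
+    // 1 the interleaved (adjacent-pair) pairing, matching the repeat layout prepared above.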
+ int64_t acl_mode = is_neox ? 0 : 1;
+
+ // Pre-define head and tail dimensions for reuse
+ int64_t head_ne[GGML_MAX_DIMS] = { rope_dims, ne01, ne02, ne03 };
+ int64_t tail_ne[GGML_MAX_DIMS] = { tail_dims, ne01, ne02, ne03 };
+
+ // Step 1: Prepare trans tensors for F16 type conversion to F32 if needed
+ bool src_dst_need_trans = false;
+ ggml_cann_pool_alloc src_trans_allocator(ctx.pool());
+ ggml_cann_pool_alloc dst_trans_allocator(ctx.pool());
+ acl_tensor_ptr acl_src_trans_tensor;
+ acl_tensor_ptr acl_dst_trans_tensor;
+ void * src_trans_buffer = nullptr;
+ void * dst_trans_buffer = nullptr;
+ size_t src_dst_trans_nb[GGML_MAX_DIMS];
+ if (src0->type == GGML_TYPE_F16) {
+ src_dst_need_trans = true;
+ src_trans_buffer = src_trans_allocator.alloc(ggml_nelements(src0) * sizeof(float));
+ dst_trans_buffer = dst_trans_allocator.alloc(ggml_nelements(dst) * sizeof(float));
+
+ src_dst_trans_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ src_dst_trans_nb[i] = src_dst_trans_nb[i - 1] * src0->ne[i - 1];
+ }
+ acl_src_trans_tensor = ggml_cann_create_tensor(src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne,
+ src_dst_trans_nb, GGML_MAX_DIMS);
+ acl_dst_trans_tensor = ggml_cann_create_tensor(dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne,
+ src_dst_trans_nb, GGML_MAX_DIMS);
+ aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT);
+ }
+
+ // Step 2: Prepare head tensors for tail splitting if needed
+ acl_tensor_ptr acl_src_head;
+ acl_tensor_ptr acl_dst_head;
+ if (has_tail) {
+ // Create head views for RotaryPositionEmbedding (only first rope_dims dimensions)
+ // RotaryPositionEmbedding requires contiguous dst tensor, so we use a temporary buffer
+ if (src_dst_need_trans) {
+ // Use F32 trans tensor strides
+ acl_src_head = ggml_cann_create_tensor((char *) src_trans_buffer, ACL_FLOAT, sizeof(float), head_ne,
+ src_dst_trans_nb, GGML_MAX_DIMS);
+ } else {
+ // Use original F32 tensor strides
+ acl_src_head = ggml_cann_create_tensor((char *) src0->data, ACL_FLOAT, sizeof(float), head_ne, src0->nb,
+ GGML_MAX_DIMS);
+ }
+
+ int64_t head_elements = rope_dims * ne01 * ne02 * ne03;
+ ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float));
+ void * dst_head_contiguous_buffer = dst_head_contiguous_allocator.get();
+
+ size_t head_contiguous_nb[GGML_MAX_DIMS];
+ head_contiguous_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1];
+ }
+ acl_dst_head = ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne,
+ head_contiguous_nb, GGML_MAX_DIMS);
+ }
+
+ // Step 3: Execute RotaryPositionEmbedding
+ if (has_tail) {
+ // Rotate only the head portion (first rope_dims dimensions)
+ GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head.get(), acl_cos_reshape_tensor.get(),
+ acl_sin_reshape_tensor.get(), acl_mode, acl_dst_head.get());
+
+ // Copy head result from contiguous buffer back to destination tensor
+ if (src_dst_need_trans) {
+ acl_tensor_ptr acl_dst_head_target = ggml_cann_create_tensor(
+ (char *) dst_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, src_dst_trans_nb, GGML_MAX_DIMS);
+ cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get());
+ } else {
+ acl_tensor_ptr acl_dst_head_target =
+ ggml_cann_create_tensor((char *) dst->data, ACL_FLOAT, sizeof(float), head_ne, dst->nb, GGML_MAX_DIMS);
+ cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get());
+ }
+ } else if (src_dst_need_trans) {
+ // Rotate full tensor (no tail), using trans tensors
+ GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(),
+ acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get());
+ } else {
+ // Rotate full tensor (no tail), using original tensors
+ GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
+ acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get());
+ }
+
+ // Step 4: Copy unrotated tail portion from source to destination
+ if (has_tail) {
+ size_t src_tail_offset;
+ size_t dst_tail_offset;
+
+ auto copy_tail_device = [&](void * src_ptr, void * dst_ptr, aclDataType dtype, size_t elem_size,
+ size_t * nb_src_arr, size_t * nb_dst_arr) {
+ acl_tensor_ptr acl_src_tail =
+ ggml_cann_create_tensor(src_ptr, dtype, elem_size, tail_ne, nb_src_arr, GGML_MAX_DIMS);
+ acl_tensor_ptr acl_dst_tail =
+ ggml_cann_create_tensor(dst_ptr, dtype, elem_size, tail_ne, nb_dst_arr, GGML_MAX_DIMS);
+ cann_copy(ctx, acl_src_tail.get(), acl_dst_tail.get());
+ };
+
+ if (src_dst_need_trans) {
+ // Use F32 trans tensor strides and offsets
+ src_tail_offset = rope_dims * src_dst_trans_nb[0];
+ dst_tail_offset = rope_dims * src_dst_trans_nb[0];
+ copy_tail_device((char *) src_trans_buffer + src_tail_offset, (char *) dst_trans_buffer + dst_tail_offset,
+ ACL_FLOAT, sizeof(float), src_dst_trans_nb, src_dst_trans_nb);
+ } else {
+ // Use original tensor strides and offsets
+ src_tail_offset = rope_dims * nb00;
+ dst_tail_offset = rope_dims * nb0;
+ copy_tail_device((char *) src0->data + src_tail_offset, (char *) dst->data + dst_tail_offset,
+ ggml_cann_type_mapping(dst->type), ggml_element_size(dst), src0->nb, dst->nb);
+ }
+ }
+
+ // Step 5: Cast back to F16 if needed
+ if (src_dst_need_trans) {
+ aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16);
+ }
+}
+
+void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src.get(), 3, false, acl_dst.get());
+}
+
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
+
+ // stride
+ int64_t s0 = ((const int32_t *) (dst->op_params))[0];
+
+ acl_tensor_ptr acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
+ acl_tensor_ptr acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
+
+ // get base information of input and kernel
+ int64_t input_len = *(src1->ne);
+ int64_t dst_len = *(dst->ne);
+ int64_t kernel_size = *(src0->ne);
+
+ // set the max kernel size for each conv
+ int64_t max_kernel_size = 255;
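+    // The Convolution operator used below limits the 1D kernel length (assumed to be 255
+    // here), so a longer conv_transpose_1d kernel is split into chunks whose partial
+    // outputs are padded to the full length and accumulated into dst.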
+
+    // compute the number of kernel partitions
+    int64_t part_num = (kernel_size + max_kernel_size - 1) / max_kernel_size;
+
+ int64_t strideVal[1];
+ strideVal[0] = s0;
+ acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
+ int64_t paddingVal[] = { 0 };
+ acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
+ int64_t dilationVal[] = { 1 };
+ acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
+ bool transposed = true;
+ int64_t groups = 1;
+ int8_t cubeMathType = 0;
+
+#ifdef ASCEND_310P
+ cubeMathType = 1;
+#endif
+
+ auto weight_type = ggml_cann_type_mapping(src0->type);
+ auto dst_type = ggml_cann_type_mapping(dst->type);
+
+    // slice the kernel so that each partial convolution stays within the kernel-size limit
+ int64_t slice_dim = -1;
+ int64_t slice_start = 0;
+ int64_t slice_end = max_kernel_size;
+ int64_t slice_step = 1;
+ int64_t interval = max_kernel_size;
+
+ int64_t left_pad_len = dilationVal[0] * (max_kernel_size - 1) + 1 - 2 * paddingVal[0];
+ int64_t right_pad_len = 0;
+
+ acl_scalar_ptr alpha = nullptr;
+ float alphaValue = 1.0;
+ alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+ // set zero to destination
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
+
+ for (int k = 0; k < part_num; k++) {
+ // create part kernel tensor and slice from big kernel
+ slice_start = max_kernel_size * k;
+ if (k == part_num - 1) {
+ slice_end = kernel_size;
+ interval = kernel_size - max_kernel_size * k;
+ } else {
+ slice_end = max_kernel_size * (k + 1);
+ }
+
+ int64_t part_ne[4];
+ for (int i = 0; i < 4; i++) {
+ part_ne[i] = *(src0->ne + i);
+ }
+ part_ne[0] = interval;
+
+ size_t part_nb[4];
+ part_nb[0] = sizeof(weight_type);
+ for (int i = 1; i < 4; i++) {
+ part_nb[i] = part_nb[i - 1] * part_ne[i - 1];
+ }
+
+ ggml_cann_pool_alloc part_kernel_allocator;
+ part_kernel_allocator.alloc(ctx.pool(), part_nb[3]);
+ void * part_kernel_buf = part_kernel_allocator.get();
+
+ acl_tensor_ptr part_kernel = ggml_cann_create_tensor(part_kernel_buf, weight_type, ggml_element_size(src0),
+ part_ne, part_nb, 3, ACL_FORMAT_NCL);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Slice, acl_weight.get(), slice_dim, slice_start, slice_end, slice_step,
+ part_kernel.get());
+
+ // create the part conv result tensor
+ int64_t part_dst_ne[4];
+ for (int i = 0; i < 4; i++) {
+ part_dst_ne[i] = *(dst->ne + i);
+ }
+ part_dst_ne[0] = (input_len - 1) * strideVal[0] - 2 * paddingVal[0] + dilationVal[0] * (part_ne[0] - 1) + 1;
+
+ size_t part_dst_nb[4];
+ part_dst_nb[0] = sizeof(weight_type);
+ for (int i = 1; i < 4; i++) {
+ part_dst_nb[i] = part_dst_nb[i - 1] * part_dst_ne[i - 1];
+ }
+ ggml_cann_pool_alloc part_dst_allocator;
+ part_dst_allocator.alloc(ctx.pool(), part_dst_nb[3]);
+ void * part_dst_buf = part_dst_allocator.get();
+
+ acl_tensor_ptr acl_part_dst = ggml_cann_create_tensor(part_dst_buf, dst_type, ggml_element_size(dst),
+ part_dst_ne, part_dst_nb, 3, ACL_FORMAT_NCL);
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_part_dst.get());
+
+ // compute part conv transpose 1d
+ GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input.get(), part_kernel.get(), nullptr, stride.get(),
+ padding.get(), dilation.get(), transposed, padding.get(), groups, acl_part_dst.get(),
+ cubeMathType);
+
+        // compute where this partial result lands in the final output
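+        // In a transposed conv, kernel tap t contributes to output index i * s0 + t, so this
+        // chunk (taps [slice_start, slice_end)) spans outputs [slice_start, (input_len - 1) * s0 + slice_end);
+        // zero-pad it on both sides to the full dst length before accumulating into acl_dst.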
+ int64_t global_start = slice_start;
+ int64_t global_end = std::min((input_len - 1) * strideVal[0] + slice_end, dst_len);
+
+ left_pad_len = global_start;
+ right_pad_len = dst_len - global_end;
+
+ std::vector<int64_t> padDataVal = { left_pad_len, right_pad_len };
+ acl_int_array_ptr padData = ggml_cann_create_int_array(padDataVal.data(), 2);
+
+ acl_scalar_ptr pad_value = nullptr;
+ float pad_valueVal = 0.0;
+ pad_value = ggml_cann_create_scalar(&pad_valueVal, aclDataType::ACL_FLOAT);
+
+ int64_t conv_result_ne[4];
+ for (int i = 0; i < 4; i++) {
+ conv_result_ne[i] = *(dst->ne + i);
+ }
+
+ size_t conv_result_nb[4];
+ conv_result_nb[0] = sizeof(weight_type);
+ for (int i = 1; i < 4; i++) {
+ conv_result_nb[i] = conv_result_nb[i - 1] * conv_result_ne[i - 1];
+ }
+
+ ggml_cann_pool_alloc conv_result_allocator;
+ conv_result_allocator.alloc(ctx.pool(), conv_result_nb[3]);
+ void * conv_result_buf = conv_result_allocator.get();
+
+ acl_tensor_ptr conv_result = ggml_cann_create_tensor(conv_result_buf, dst_type, ggml_element_size(dst),
+ conv_result_ne, conv_result_nb, 3, ACL_FORMAT_NCL);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, conv_result.get());
+ GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_part_dst.get(), padData.get(), pad_value.get(),
+ conv_result.get());
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), conv_result.get(), alpha.get());
+ }
+}
+
+void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+
+ acl_tensor_ptr acl_input = ggml_cann_create_tensor(src0);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ float alphaValue = 1.0f;
+ acl_scalar_ptr alpha = nullptr;
+ alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
+
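+    // ELU(x) = x for x >= 0 and alpha * (exp(x) - 1) otherwise; the same 1.0f scalar is
+    // passed for alpha and, presumably, the scale and input-scale arguments of aclnnElu.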
+ GGML_CANN_CALL_ACLNN_OP(ctx, Elu, acl_input.get(), alpha.get(), alpha.get(), alpha.get(), acl_dst.get());
+}
+
+void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
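+    // Reduce over ACL dim 3, which (given the ggml -> ACL axis reversal in
+    // ggml_cann_create_tensor) corresponds to ggml's ne[0], keeping the reduced dim.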
+ int64_t reduceDimValue[] = { 3 };
+ acl_int_array_ptr reduceDim = ggml_cann_create_int_array(reduceDimValue, 1);
+ bool keepDim = true;
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mean, acl_src.get(), reduceDim.get(), keepDim, ACL_FLOAT, acl_dst.get());
+}
+
+void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ int32_t * opts = (int32_t *) dst->op_params;
+ int64_t paddingsArray[2] = { opts[0], opts[1] };
+ acl_int_array_ptr paddings = ggml_cann_create_int_array(paddingsArray, 2);
+
+ for (int64_t i = 0; i < src0->ne[3]; i++) {
+        acl_tensor_ptr acl_src =
+            ggml_cann_create_tensor((char *) src0->data + i * src0->nb[3], ggml_cann_type_mapping(src0->type),
+                                    ggml_element_size(src0), src0->ne, src0->nb, 3);
+
+        acl_tensor_ptr acl_dst =
+            ggml_cann_create_tensor((char *) dst->data + i * dst->nb[3], ggml_cann_type_mapping(dst->type),
+                                    ggml_element_size(dst), dst->ne, dst->nb, 3);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src.get(), paddings.get(), acl_dst.get());
+ }
+}
+
+void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
+
+ acl_tensor_ptr acl_self = ggml_cann_create_tensor(src0);
+ acl_tensor_ptr acl_other = ggml_cann_create_tensor(src1);
+
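+    // InplaceEqTensor overwrites src0 with an element-wise equality mask (1 where
+    // src0 == src1, 0 elsewhere); ggml_cann_sum then reduces that mask into dst.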
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self.get(), acl_other.get());
+
+ ggml_cann_sum(ctx, dst);
+}
+
+void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ float alphaValue = 0.0f;
+ acl_scalar_ptr alpha = nullptr;
+ alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
+
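+    // step(x): 1 where x > 0, 0 elsewhere, implemented as a greater-than against a zero scalar.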
+ GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src.get(), alpha.get(), acl_dst.get());
+}
+
+/**
+ * @brief Performs expert-specific matrix multiplication (MoE) with
+ * floating-point precision using the CANN backend.
+ *
+ * For each token (one slice along src1->ne[2]), the expert indices in `ids`
+ * select the corresponding weight matrices from src0 via IndexSelect; the
+ * token's activations are then multiplied against the gathered (transposed)
+ * weights with a batched matmul, and the result is stored in the destination
+ * tensor `dst`.
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor where the MoE multiplication result
+ * will be stored.
+ *
+ * @note This function handles F32/F16 weights; quantized weight types are
+ *       handled by ggml_cann_mul_mat_id_quant.
+ */
+static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ //dst [M, K, N, 1]
+ ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1] -> [D, M, K, 1]
+ ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1 -> [D, 1, K, 1]
+ ggml_tensor * ids = dst->src[2]; //ids [K, N]
+
+ GGML_ASSERT(src0->ne[3] == 1);
+ GGML_ASSERT(src1->ne[3] == 1);
+ GGML_ASSERT(dst->ne[3] == 1);
+
+ int64_t batch = src1->ne[2];
+ GGML_ASSERT(batch == ids->ne[1]);
+
+ ggml_cann_pool_alloc export_allocator(ctx.pool(), src0->ne[0] * src0->ne[1] * ids->ne[0] * ggml_element_size(src0));
+ void * export_ptr = export_allocator.get();
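+    // For each token i: gather its ids->ne[0] selected expert weight matrices with
+    // IndexSelect, view them transposed, and batch-matmul the token's activations
+    // against them into the corresponding dst slice.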
+ for (int64_t i = 0; i < batch; i++) {
+ acl_tensor_ptr select_index = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, i * ids->nb[1]);
+ acl_tensor_ptr export_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3);
+
+ int64_t select_export_ne[] = { src0->ne[0], src0->ne[1], ids->ne[0] };
+ size_t select_export_nb[3];
+ select_export_nb[0] = src0->nb[0];
+ for (int k = 1; k < 3; k++) {
+ select_export_nb[k] = select_export_nb[k - 1] * select_export_ne[k - 1];
+ }
+
+ acl_tensor_ptr select_export =
+ ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
+ select_export_ne, select_export_nb, 3);
+ GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, export_weight.get(), 0, select_index.get(), select_export.get());
+
+ int64_t select_transpose_ne[] = { select_export_ne[1], select_export_ne[0], select_export_ne[2] };
+ size_t select_transpose_nb[] = { select_export_nb[1], select_export_nb[0], select_export_nb[2] };
+ acl_tensor_ptr select_export_transpose =
+ ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
+ select_transpose_ne, select_transpose_nb, 3);
+
+ int64_t active_tensor_ne[] = { src1->ne[0], 1, src1->ne[1] };
+ size_t active_tensor_nb[] = { src1->nb[0], src1->nb[1], src1->nb[1] };
+ acl_tensor_ptr active_tensor =
+ ggml_cann_create_tensor(src1, active_tensor_ne, active_tensor_nb, 3, ACL_FORMAT_ND, i * src1->nb[2]);
+
+ int64_t dst_ne[] = { dst->ne[0], 1, dst->ne[1] };
+ size_t dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[1] };
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst_ne, dst_nb, 3, ACL_FORMAT_ND, i * dst->nb[2]);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, active_tensor.get(), select_export_transpose.get(), acl_dst.get(), 2);
+ }
+}
+
+/**
+ * @brief Performs quantized matrix multiplication for Mixture of Experts (MoE)
+ * models using the CANN backend.
+ *
+ * This function implements MUL_MAT_ID operation for quantized weight matrices
+ * (Q4_0 and Q8_0 formats). It selects expert-specific weight matrices based on
+ * the provided expert indices, and computes matrix multiplication using CANN's
+ * WeightQuantBatchMatmulV2 operator.
+ *
+ * The function performs the following steps:
+ * 1. Converts input/output tensors to F16 format if necessary
+ * 2. Uses IndexSelect to extract expert-specific weights and scales based on indices
+ * 3. Performs quantized matrix multiplication for each expert using WeightQuantBatchMatmulV2
+ * 4. Converts output back to the target type if needed
+ *
+ * Tensor shapes:
+ * - dst: [M, K, N, 1] - output tensor
+ * - src0: [D, M, A, 1] - quantized weight matrices (Q4_0 or Q8_0)
+ * - src1: [D, B, N, 1] - input activations (B = K for per-expert input, or B = 1 for broadcast)
+ * - ids: [K, N] - expert indices for routing
+ *
+ * @param ctx The CANN backend context for operation execution.
+ * @param dst The destination tensor where the multiplication result will be stored.
+ *
+ * @note Only Q4_0 and Q8_0 quantization formats are supported.
+ * @note The function handles automatic type conversion to/from F16 as needed by the hardware.
+ */
+static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ // dst: [M, K, N, 1]
+ // src0: [D, M, A, 1] - quantized weights
+ // src1: [D, B, N, 1] - input activations, B = K or B = 1
+ // ids: [K, N] - expert indices
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
+ ggml_tensor * ids = dst->src[2];
+
+ GGML_ASSERT(src0->ne[3] == 1);
+ GGML_ASSERT(src1->ne[3] == 1);
+ GGML_ASSERT(dst->ne[3] == 1);
+ GGML_ASSERT(src1->ne[2] == ids->ne[1]);
+
+ const int64_t n_batches = ids->ne[1];
+ const int64_t n_select_experts = ids->ne[0];
+ const enum ggml_type type = src0->type;
+
+ const int32_t group_size = QK8_0; // Both Q4_0 and Q8_0 use group size of 32
+ GGML_ASSERT(group_size == QK4_0);
+
+ // Calculate element size for quantized weights
+ const float weight_elem_size =
+ (type == GGML_TYPE_Q4_0) ? 0.5f :
+ (type == GGML_TYPE_Q8_0) ? 1.0f :
+ (GGML_ABORT("MUL_MAT_ID only supports Q4_0 and Q8_0"), 0.0f);
+
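+    // The Q4_0/Q8_0 device buffers are assumed to store all quantized weight bytes first,
+    // followed by one F16 scale per 32-element group.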
+ // Calculate scale offset in memory
+ const size_t weight_size = src0->ne[0] * src0->ne[1] * src0->ne[2] * weight_elem_size;
+ const size_t scale_elem_size = sizeof(uint16_t);
+ char * scale_data = (char *) src0->data + weight_size;
+
+ // Allocate buffers for selected expert weights and scales
+ const size_t selected_weight_size = src0->ne[0] * src0->ne[1] * n_select_experts * weight_elem_size;
+ ggml_cann_pool_alloc selected_weight_alloc(ctx.pool(), selected_weight_size);
+ void * selected_weight_buffer = selected_weight_alloc.get();
+
+ const size_t selected_scale_size = (src0->ne[0] / group_size) * src0->ne[1] * n_select_experts * scale_elem_size;
+ ggml_cann_pool_alloc selected_scale_alloc(ctx.pool(), selected_scale_size);
+ void * selected_scale_buffer = selected_scale_alloc.get();
+
+ // Helper lambda to allocate and cast tensor to F16 if needed
+ constexpr size_t f16_elem_size = sizeof(uint16_t);
+ auto prepare_f16_buffer = [&](ggml_tensor * tensor, ggml_cann_pool_alloc & allocator,
+ bool need_cast = false) -> void * {
+ if (tensor->type == GGML_TYPE_F16) {
+ return tensor->data;
+ }
+
+ size_t total_size = f16_elem_size;
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ total_size *= tensor->ne[i];
+ }
+ void * buffer = allocator.alloc(total_size);
+
+ if (need_cast == false) {
+ return buffer;
+ }
+
+ int64_t ne[GGML_MAX_DIMS];
+ size_t nb[GGML_MAX_DIMS] = { f16_elem_size };
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ ne[i] = tensor->ne[i];
+ if (i > 0) {
+ nb[i] = nb[i - 1] * ne[i - 1];
+ }
+ }
+
+ acl_tensor_ptr src_tensor = ggml_cann_create_tensor(tensor);
+ acl_tensor_ptr f16_tensor = ggml_cann_create_tensor(buffer, ACL_FLOAT16, f16_elem_size, ne, nb, GGML_MAX_DIMS);
+ aclnn_cast(ctx, src_tensor.get(), f16_tensor.get(), ACL_FLOAT16);
+
+ return buffer;
+ };
+
+ // Prepare input and output buffers
+ ggml_cann_pool_alloc input_alloc(ctx.pool());
+ void * input_buffer = prepare_f16_buffer(src1, input_alloc, true);
+
+ ggml_cann_pool_alloc output_alloc(ctx.pool());
+ void * output_buffer = prepare_f16_buffer(dst, output_alloc, false);
+
+ // Process each batch
+ for (int64_t batch_idx = 0; batch_idx < n_batches; batch_idx++) {
+ // Create index tensor for current batch
+ const size_t index_offset = batch_idx * ids->nb[1];
+ acl_tensor_ptr batch_indices = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, index_offset);
+
+ // Select quantized weights using expert indices
+ // Q4_0 stores 2 values per byte, Q8_0 stores 1 value per byte
+ const int64_t weight_d = (type == GGML_TYPE_Q4_0) ? src0->ne[0] / 2 : src0->ne[0];
+ const int64_t weight_m = src0->ne[1];
+ const int64_t weight_n_experts = src0->ne[2];
+
+ int64_t weight_ne[3] = { weight_d, weight_m, weight_n_experts };
+ size_t weight_nb[3] = { sizeof(int8_t), weight_d * sizeof(int8_t), weight_d * weight_m * sizeof(int8_t) };
+
+ acl_tensor_ptr all_weights =
+ ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb, 3);
+
+ int64_t selected_weight_ne[3] = { weight_d, weight_m, n_select_experts };
+ size_t selected_weight_nb[3] = { sizeof(int8_t), weight_d * sizeof(int8_t),
+ weight_d * weight_m * sizeof(int8_t) };
+
+ acl_tensor_ptr selected_weights = ggml_cann_create_tensor(selected_weight_buffer, ACL_INT8, sizeof(int8_t),
+ selected_weight_ne, selected_weight_nb, 3);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, all_weights.get(), 0, batch_indices.get(), selected_weights.get());
+
+ // Select scales using the same expert indices
+ const int64_t scale_d = src0->ne[0] / group_size;
+ int64_t scale_ne[3] = { scale_d, weight_m, weight_n_experts };
+ size_t scale_nb[3] = { scale_elem_size, scale_d * scale_elem_size, scale_d * weight_m * scale_elem_size };
+
+ acl_tensor_ptr all_scales =
+ ggml_cann_create_tensor(scale_data, ACL_FLOAT16, scale_elem_size, scale_ne, scale_nb, 3);
+
+ int64_t selected_scale_ne[3] = { scale_d, weight_m, n_select_experts };
+ size_t selected_scale_nb[3] = { scale_elem_size, scale_d * scale_elem_size,
+ scale_d * weight_m * scale_elem_size };
+
+ acl_tensor_ptr selected_scales = ggml_cann_create_tensor(selected_scale_buffer, ACL_FLOAT16, scale_elem_size,
+ selected_scale_ne, selected_scale_nb, 3);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, all_scales.get(), 0, batch_indices.get(), selected_scales.get());
+
+ // Process each expert for current batch
+ // IndexSelect output layout: [D, M, K] in contiguous format
+ // WeightQuantBatchMatmulV2 expects: [M, D] with row-major stride
+ for (int64_t expert_idx = 0; expert_idx < n_select_experts; expert_idx++) {
+ // Determine input offset: broadcast if src1->ne[1]==1, otherwise use per-expert input
+ const size_t input_offset =
+ (batch_idx * src1->ne[1] + (src1->ne[1] == 1 ? 0 : expert_idx)) * src1->ne[0] * f16_elem_size;
+ const size_t output_offset = (batch_idx * dst->ne[1] + expert_idx) * dst->ne[0] * f16_elem_size;
+
+ // Create weight view for current expert: [D, M, K] -> [M, D]
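+            // weight_elem_size is 0.5 bytes for Q4_0 (two nibbles per byte), so the view
+            // strides are kept as float and may be fractional; the byte offset below stays whole.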
+ int64_t weight_view_ne[2] = { weight_m, src0->ne[0] };
+ float weight_view_nb[2] = { src0->ne[0] * weight_elem_size, weight_elem_size };
+ const size_t weight_view_offset = expert_idx * selected_weight_nb[2];
+
+ acl_tensor_ptr weight_view =
+ ggml_cann_create_tensor(selected_weight_buffer, ggml_cann_type_mapping(type), weight_elem_size,
+ weight_view_ne, weight_view_nb, 2, ACL_FORMAT_ND, weight_view_offset);
+
+ // Create scale view for current expert: [D, M, K] -> [M, D]
+ int64_t scale_view_ne[2] = { weight_m, scale_d };
+ size_t scale_view_nb[2] = { selected_scale_nb[1], selected_scale_nb[0] };
+ const size_t scale_view_offset = expert_idx * selected_scale_nb[2];
+
+ acl_tensor_ptr scale_view =
+ ggml_cann_create_tensor(selected_scale_buffer, ACL_FLOAT16, scale_elem_size, scale_view_ne,
+ scale_view_nb, 2, ACL_FORMAT_ND, scale_view_offset);
+
+ // Create input activation tensor [D, 1]
+ int64_t input_ne[2] = { src1->ne[0], 1 };
+ size_t input_nb[2] = { f16_elem_size, src1->ne[0] * f16_elem_size };
+
+ acl_tensor_ptr input_tensor = ggml_cann_create_tensor(input_buffer, ACL_FLOAT16, f16_elem_size, input_ne,
+ input_nb, 2, ACL_FORMAT_ND, input_offset);
+
+ // Create output tensor [M, 1]
+ int64_t output_ne[2] = { dst->ne[0], 1 };
+ size_t output_nb[2] = { f16_elem_size, dst->ne[0] * f16_elem_size };
+
+ acl_tensor_ptr output_tensor = ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, f16_elem_size, output_ne,
+ output_nb, 2, ACL_FORMAT_ND, output_offset);
+
+ // Perform quantized matrix multiplication
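+            // WeightQuantBatchMatmulV2 dequantizes the packed weights on the fly
+            // using the per-group FP16 scales (group_size weights per scale) and
+            // multiplies them with the FP16 activations.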
+ GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, input_tensor.get(), weight_view.get(),
+ scale_view.get(), nullptr, nullptr, nullptr, nullptr, group_size,
+ output_tensor.get());
+ }
+ }
+
+ // Cast output back to original type if we used a temporary F16 buffer
+ if (dst->type != GGML_TYPE_F16) {
+ int64_t ne[GGML_MAX_DIMS];
+ size_t nb[GGML_MAX_DIMS] = { f16_elem_size };
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ ne[i] = dst->ne[i];
+ if (i > 0) {
+ nb[i] = nb[i - 1] * ne[i - 1];
+ }
+ }
+
+ acl_tensor_ptr f16_output =
+ ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, f16_elem_size, ne, nb, GGML_MAX_DIMS);
+ acl_tensor_ptr dst_tensor = ggml_cann_create_tensor(dst);
+
+ aclnn_cast(ctx, f16_output.get(), dst_tensor.get(), ggml_cann_type_mapping(dst->type));
+ }
+}
+
+void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ const enum ggml_type type = dst->src[0]->type;
+ switch (type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ ggml_cann_mul_mat_id_fp(ctx, dst);
+ break;
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q8_0:
+ ggml_cann_mul_mat_id_quant(ctx, dst);
+ break;
+ default:
+ GGML_ABORT("Unsupported type for mul_mat_id");
+ break;
+ }
+}
+
+void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0]; // q, fp32 | B, N, S, D (uncont) -> B, S, N, D (cont)
+ ggml_tensor * src1 = dst->src[1]; // k, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
+ ggml_tensor * src2 = dst->src[2]; // v, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
+ ggml_tensor * src3 = dst->src[3]; // mask, fp16
+
+ // B, N, S, D (uncont) -> B, S, N, D (cont)
+ int64_t src0_bsnd_ne[GGML_MAX_DIMS];
+ memcpy(src0_bsnd_ne, src0->ne, GGML_MAX_DIMS * sizeof(int64_t));
+ size_t src0_bsnd_nb[GGML_MAX_DIMS];
+ memcpy(src0_bsnd_nb, src0->nb, GGML_MAX_DIMS * sizeof(size_t));
+ int64_t src1_bsnd_ne[GGML_MAX_DIMS];
+ memcpy(src1_bsnd_ne, src1->ne, GGML_MAX_DIMS * sizeof(int64_t));
+ size_t src1_bsnd_nb[GGML_MAX_DIMS];
+ memcpy(src1_bsnd_nb, src1->nb, GGML_MAX_DIMS * sizeof(size_t));
+ int64_t src2_bsnd_ne[GGML_MAX_DIMS];
+ memcpy(src2_bsnd_ne, src2->ne, GGML_MAX_DIMS * sizeof(int64_t));
+ size_t src2_bsnd_nb[GGML_MAX_DIMS];
+ memcpy(src2_bsnd_nb, src2->nb, GGML_MAX_DIMS * sizeof(size_t));
+
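+    // Swap the ne/nb metadata of dims 1 and 2 so that Q/K/V are interpreted in
+    // BSND layout; only the tensor views change, no data is moved.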
+ auto transpose12 = [](int64_t * ne, size_t * nb) {
+ int64_t ne_tmp = ne[1];
+ size_t nb_tmp = nb[1];
+ ne[1] = ne[2];
+ nb[1] = nb[2];
+ ne[2] = ne_tmp;
+ nb[2] = nb_tmp;
+ };
+
+ transpose12(src0_bsnd_ne, src0_bsnd_nb);
+ transpose12(src1_bsnd_ne, src1_bsnd_nb);
+ transpose12(src2_bsnd_ne, src2_bsnd_nb);
+
+ float maxBias = 0.0f;
+ float scaleValue = 1.0f;
+ float logitSoftcap = 0.0f;
+ memcpy(&scaleValue, (float *) dst->op_params + 0, sizeof(float));
+ memcpy(&maxBias, (float *) dst->op_params + 1, sizeof(float));
+ memcpy(&logitSoftcap, (float *) dst->op_params + 2, sizeof(float));
+
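+    // FusedInferAttentionScoreV2 only covers the logit_softcap == 0 case;
+    // a non-zero softcap falls through to the unimplemented branch below.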
+ if (logitSoftcap == 0.0f) {
+ size_t faElemSize = sizeof(uint16_t);
+ auto faDataType = ACL_FLOAT16; //ACL_BF16;
+
+ acl_tensor_ptr acl_q_tensor = nullptr;
+ acl_tensor_ptr acl_k_tensor = nullptr;
+ acl_tensor_ptr acl_v_tensor = nullptr;
+
+ // Step 1: cast the src0 (Query) to fp16 if needed
+ ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
+ void * src0_f16_buffer = nullptr;
+
+ if (ggml_cann_type_mapping(src0->type) != faDataType) {
+ acl_tensor_ptr acl_src0_f32_tensor =
+ ggml_cann_create_tensor(src0, src0_bsnd_ne, src0_bsnd_nb, GGML_MAX_DIMS);
+ src0_f16_buffer = src0_f16_allocator.alloc(ggml_nelements(src0) * faElemSize);
+
+ int64_t * src0_f16_ne = src0_bsnd_ne;
+ size_t src0_f16_nb[GGML_MAX_DIMS];
+ src0_f16_nb[0] = sizeof(uint16_t);
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+ src0_f16_nb[i] = src0_f16_nb[i - 1] * src0_f16_ne[i - 1];
+ }
+
+ acl_q_tensor = ggml_cann_create_tensor(src0_f16_buffer, faDataType, faElemSize, src0_f16_ne, src0_f16_nb,
+ GGML_MAX_DIMS);
+ aclnn_cast(ctx, acl_src0_f32_tensor.get(), acl_q_tensor.get(), faDataType);
+ } else {
+ acl_q_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne, src0_bsnd_nb, GGML_MAX_DIMS);
+ }
+
+ // Step 2: create the acl tensors for src1 (Key), src2 (Value),
+ // and the direct output from FusedInferAttention
+
+ acl_k_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne, src1_bsnd_nb, GGML_MAX_DIMS);
+ acl_v_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne, src2_bsnd_nb, GGML_MAX_DIMS);
+
+ // Step 3: create the PSEShift tensor if needed
+ // this tensor is considered as mask (f16) in the llama.cpp
+ acl_tensor_ptr bcast_pse_tensor;
+ ggml_cann_pool_alloc bcast_pse_allocator(ctx.pool());
+ if (src3 != nullptr) {
+ // Construct the truncated pse tensor (common for prefill/decode)
+ int64_t trunc_pse_ne[GGML_MAX_DIMS] = {
+ src3->ne[0], // D
+ src0->ne[1], // S (number of Q tokens)
+ src3->ne[2], // mask N
+ src3->ne[3] // B
+ };
+ size_t * trunc_pse_nb = src3->nb;
+
+ acl_tensor_ptr acl_mask_f16_trunc_tensor = ggml_cann_create_tensor(
+ src3->data, ACL_FLOAT16, sizeof(uint16_t), trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS);
+
+ int64_t bcast_pse_ne[GGML_MAX_DIMS];
+ size_t bcast_pse_nb[GGML_MAX_DIMS];
+ bcast_pse_ne[0] = src3->ne[0]; // D
+ bcast_pse_ne[1] = src0->ne[1]; // S
+ bcast_pse_ne[2] = src0->ne[2]; // N (num_heads)
+ bcast_pse_ne[3] = src3->ne[3]; // B
+ if (maxBias == 0.0f) {
+            // When maxBias == 0.0f, use nb = 0 to avoid an explicit repeat (e.g. Qwen2):
+            // construct the bcast tensor by simulating a repeat on the head dimension with stride = 0.
+ bcast_pse_nb[0] = sizeof(uint16_t);
+ bcast_pse_nb[1] = bcast_pse_nb[0] * bcast_pse_ne[0];
+ bcast_pse_nb[2] = 0; // <---- the head dimension shares the same data
+ bcast_pse_nb[3] = src3->nb[3];
+
+ bcast_pse_tensor = ggml_cann_create_tensor(src3->data, ACL_FLOAT16, sizeof(uint16_t), bcast_pse_ne,
+ bcast_pse_nb, GGML_MAX_DIMS);
+
+ } else {
+ bcast_pse_nb[0] = sizeof(uint16_t);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1];
+ }
+
+ void * bcast_pse_buffer =
+ bcast_pse_allocator.alloc(ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t));
+
+ bcast_pse_tensor = ggml_cann_create_tensor(bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
+ bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);
+
+ int64_t repeats[] = { 1, src0->ne[2], 1, 1 };
+ aclnn_repeat(ctx, acl_mask_f16_trunc_tensor.get(), bcast_pse_tensor.get(), repeats);
+
+ // alibi
+ // Compute the slope if needed. Derived from ggml_cann_softmax().
+ const int64_t n_heads = src0->ne[2];
+ ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(uint16_t));
+ void * slope_buffer = slope_allocator.get();
+ aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias, GGML_TYPE_F16);
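+                // slope_buffer holds one ALiBi slope per head; multiplying the
+                // broadcast mask by these slopes below yields the per-head bias.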
+
+ int64_t slope_ne[] = { 1, 1, n_heads, 1 };
+ size_t slope_nb[GGML_MAX_DIMS];
+ slope_nb[0] = sizeof(uint16_t);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
+ }
+
+ acl_tensor_ptr slope_tensor = ggml_cann_create_tensor(slope_buffer, ACL_FLOAT16, sizeof(uint16_t),
+ slope_ne, slope_nb, GGML_MAX_DIMS);
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor.get(), slope_tensor.get());
+ }
+ }
+
+ // Step 4: set the inputs for FusedInferAttention.
+ acl_tensor_list_ptr acl_k_tensor_list = ggml_cann_create_tensor_list(acl_k_tensor);
+ acl_tensor_list_ptr acl_v_tensor_list = ggml_cann_create_tensor_list(acl_v_tensor);
+
+ int64_t numHeads = src0->ne[2]; // N
+ int64_t numKeyValueHeads = src1->ne[2];
+ // double scaleValue = 1 / sqrt(src0->ne[0]); // 1/sqrt(d)
+ int64_t preTokens = 65535;
+ int64_t nextTokens = 65535;
+ char layout[5] = { 'B', 'S', 'N', 'D', 0 };
+ int64_t sparseMode = 0;
+ int64_t innerPrecise = (src0->ne[1] == 1) ? 0 : 2;
+ int64_t blockSize = 0;
+ int64_t antiquantMode = 0;
+ bool softmaxLseFlag = false;
+ int64_t keyAntiquantMode = 0;
+ int64_t valueAntiquantMode = 0;
+
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+ acl_tensor_ptr fa_dst_tensor;
+ ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
+ if (dst->type == GGML_TYPE_F32) {
+ void * out_f16_buffer = out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
+
+ int64_t * out_f16_ne = src0_bsnd_ne;
+ size_t out_f16_nb[GGML_MAX_DIMS];
+ out_f16_nb[0] = faElemSize;
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+ out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
+ }
+
+ fa_dst_tensor =
+ ggml_cann_create_tensor(out_f16_buffer, faDataType, faElemSize, out_f16_ne, out_f16_nb, GGML_MAX_DIMS);
+ } else {
+ fa_dst_tensor = ggml_cann_create_tensor(dst);
+ }
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2, acl_q_tensor.get(), acl_k_tensor_list.get(),
+ acl_v_tensor_list.get(), // q, k, v
+ bcast_pse_tensor.get(), nullptr, // pse, mask
+ nullptr, nullptr, // actSeqLen, actSeqLenkv
+ nullptr, nullptr, // deqScale1, quantScale1
+ nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2
+ nullptr, nullptr, // antiquantScale, antiquantOffset
+ nullptr, // blockTable
+ nullptr, nullptr, // qPadSize, kvPadSize
+ nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset
+ nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset
+ nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen
+ numHeads, scaleValue, // heads, scaleValue
+ preTokens, nextTokens, // preTokens, nextTokens
+ layout, // inputLayout
+ numKeyValueHeads, // numKVHeads
+ sparseMode, innerPrecise, // sparseMode, innerPrecise
+ blockSize, antiquantMode, // blockSize, antiquantMode
+ softmaxLseFlag, // softmaxLseFlag
+ keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
+ fa_dst_tensor.get(), // attentionOut
+ nullptr // softmaxLse
+ );
+
+ if (dst->type == GGML_TYPE_F32) {
+        // Step 5: post-processing, cast the FP16 result back to f32
+ acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
+ aclnn_cast(ctx, fa_dst_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
+ }
+ } else {
+ GGML_ABORT("Function is not implemented.");
+ }
+}
+
+static void ggml_cann_out_prod_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0]; // weight
+ ggml_tensor * src1 = dst->src[1]; // input
+ GGML_TENSOR_BINARY_OP_LOCALS
+
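+    // dst is zeroed first; then, for each (i2, i3) slice, the outer products of
+    // the corresponding src1 and src0 rows are accumulated over i1. dps2/dps3
+    // implement broadcasting of src0 across dst's batch dimensions.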
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
+
+ const int64_t dps2 = ne2 / ne02;
+ const int64_t dps3 = ne3 / ne03;
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
+ for (int64_t i2 = 0; i2 < ne2; i2++) {
+ const int64_t i02 = i2 / dps2;
+ const int64_t i03 = i3 / dps3;
+
+ const int64_t i12 = i2;
+ const int64_t i13 = i3;
+ acl_tensor_ptr accumulator =
+ ggml_cann_create_tensor((char *) dst->data + i2 * nb2 + i3 * nb3, ggml_cann_type_mapping(dst->type),
+ ggml_type_size(dst->type), dst->ne, dst->nb, 2);
+
+ // The outer product needs to be accumulated in this dimension.
+ for (int64_t i1 = 0; i1 < ne11; i1++) {
+ acl_tensor_ptr acl_input = ggml_cann_create_tensor(
+ (char *) src1->data + i1 * nb11 + i12 * nb12 + i13 * nb13, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), src1->ne, src1->nb, 1);
+
+ acl_tensor_ptr acl_weight = ggml_cann_create_tensor(
+ (char *) src0->data + i1 * nb01 + i02 * nb02 + i03 * nb03, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), src0->ne, src0->nb, 1);
+
+ ggml_cann_pool_alloc output_allocator(ctx.pool());
+ void * output_buffer = output_allocator.alloc(ggml_nbytes(dst));
+ acl_tensor_ptr acl_out = ggml_cann_create_tensor(output_buffer, ggml_cann_type_mapping(dst->type),
+ ggml_type_size(dst->type), dst->ne, dst->nb, 2);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Ger, acl_input.get(), acl_weight.get(), acl_out.get());
+ float alpha_value = 1.0f;
+ aclScalar * alpha = aclCreateScalar(&alpha_value, ACL_FLOAT);
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, accumulator.get(), acl_out.get(), alpha);
+ }
+ }
+ }
+}
+
+void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+
+ const enum ggml_type type = src0->type;
+
+ switch (type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ ggml_cann_out_prod_fp(ctx, dst);
+ break;
+ default:
+            GGML_ABORT("Unsupported type for GGML_OP_OUT_PROD");
+ break;
+ }
+}
+
+void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0]; // conv_x
+ ggml_tensor * src1 = dst->src[1]; // conv1d.weight
+
+ // This op is currently defined only for F32 in ggml_cpu
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ // Shapes follow ggml_compute_forward_ssm_conv_f32
+ const int64_t nc = src1->ne[0]; // d_conv
+ const int64_t ncs = src0->ne[0]; // d_conv - 1 + n_t
+ const int64_t nr = src0->ne[1]; // d_inner
+ const int64_t n_s = src0->ne[2]; // n_seqs
+
+ const int64_t n_t = dst->ne[1]; // tokens per sequence
+
+ GGML_ASSERT(dst->ne[0] == nr); // dst: {d_inner, n_t, n_s}
+ GGML_ASSERT(src1->ne[1] == nr); // weight: {d_conv, d_inner}
+ GGML_ASSERT(ncs == nc - 1 + n_t); // conv_x: {d_conv - 1 + n_t, d_inner, n_s}
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+ GGML_ASSERT(src1->nb[0] == sizeof(float));
+
+ // --- Build CANN tensors ---
+
+ // 1) Input: conv_x as NCL
+ //
+ // src0->ne = { ncs, nr, n_s, 1 } // {L_in, C, N}
+ // Passing ACL_FORMAT_NCL here means:
+ // reversed dims -> [N, C, L_in] = [n_s, nr, ncs]
+ acl_tensor_ptr acl_x = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
+
+ // 2) Weights: depthwise conv kernel, view src1 as {K, 1, C}
+ //
+ // src1 original: ne = { nc, nr, 1, 1 } // [K, C, 1, 1]
+ // we want a view: ne_w = { nc, 1, nr } // [K, 1, C]
+ // so that reversed dims -> [C, 1, K] which matches
+ // [out_channels, in_channels/groups, kernel_size]
+ int64_t w_ne[GGML_MAX_DIMS] = { nc, 1, nr, 1 }; // [K, 1 input ch. per group, C groups]
+ // Layout: src1 data is [K, C] with
+ // offset(k, c) = k*nb0 + c*nb1
+ // We want offset_w(k, 0, c) = k*nb0 + c*nb1,
+ // so we can reuse nb0 and nb1, and set nb2 = nb1.
+ size_t w_nb[GGML_MAX_DIMS] = { src1->nb[0], src1->nb[1], src1->nb[1], src1->nb[3] }; // same as src1
+
+ acl_tensor_ptr acl_w = ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type),
+ ggml_type_size(src1->type), w_ne, w_nb, 3, ACL_FORMAT_NCL);
+
+ // 3) Output: dst is { d_inner, n_t, n_s } (CLN)
+ //
+ // We need an NCL view of the same buffer:
+ // desired NCL logical shape: { L_out = n_t, C = nr, N = n_s }
+ //
+ // Original CLN layout:
+ // dst->ne = { nr, n_t, n_s }
+ // dst->nb[0] = sizeof(float)
+ // dst->nb[1] = nr * sizeof(float)
+ // dst->nb[2] = nr * n_t * sizeof(float)
+ //
+ // We want offset_new(L, C, N) = offset_orig(C, L, N).
+ // Choose:
+ // nb_y[0] = nr * sizeof(float); // step in L
+ // nb_y[1] = sizeof(float); // step in C
+ // nb_y[2] = nr * n_t * sizeof(float); // step in N
+ int64_t y_ne[GGML_MAX_DIMS] = { n_t, nr, n_s, 1 }; // [L_out, C, N]
+ size_t y_nb[GGML_MAX_DIMS] = { dst->ne[0] * sizeof(float), sizeof(float), dst->ne[0] * dst->ne[1] * sizeof(float),
+ dst->nb[3] }; // [nr, 1, nr * n_t]
+
+ acl_tensor_ptr acl_y = ggml_cann_create_tensor(dst->data, ggml_cann_type_mapping(dst->type),
+ ggml_type_size(dst->type), y_ne, y_nb, 3, ACL_FORMAT_NCL);
+
+ // --- Conv1d parameters: depthwise, stride 1, no padding ("valid") ---
+ int64_t strideVal[1] = { 1 };
+ int64_t paddingVal[1] = { 0 };
+ int64_t dilationVal[1] = { 1 };
+
+ acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
+ acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
+ acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
+
+ const bool transposed = false;
+ const int64_t groups = nr; // depthwise: one group per inner dim
+ int8_t cubeMathType = 0;
+
+#ifdef ASCEND_310P
+ cubeMathType = 1;
+#endif
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, Convolution,
+ acl_x.get(), // input: N, C, L_in = ncs
+ acl_w.get(), // weight: [C, 1, K] with groups=nr
+ nullptr, // bias
+ stride.get(), padding.get(), dilation.get(), transposed,
+ padding.get(), // output padding (unused for non-transposed)
+ groups, acl_y.get(), cubeMathType);
+}
+
+void ggml_cann_op_add_rms_norm_fused(ggml_backend_cann_context & ctx,
+ ggml_tensor * add_node,
+ ggml_tensor * rms_norm_node) {
+ // Get the two input tensors for ADD operation
+ ggml_tensor * x1 = add_node->src[0];
+ ggml_tensor * x2 = add_node->src[1];
+
+ // Create ACL tensors for the two ADD inputs
+ acl_tensor_ptr acl_x1 = ggml_cann_create_tensor(x1);
+ acl_tensor_ptr acl_x2 = ggml_cann_create_tensor(x2);
+
+    // Get epsilon parameter from rms_norm_node
+ float eps;
+ memcpy(&eps, rms_norm_node->op_params, sizeof(float));
+
+ // Build gamma tensor (RMS normalization scaling factor)
+ // Gamma should match the normalized dimensions (last dimension of x1)
+ size_t acl_gamma_nb[GGML_MAX_DIMS];
+ acl_gamma_nb[0] = ggml_type_size(rms_norm_node->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ acl_gamma_nb[i] = acl_gamma_nb[i - 1] * x1->ne[i - 1];
+ }
+ acl_tensor_ptr acl_gamma =
+ get_cache_acl_tensor(ctx, &ctx.rms_norm_one_tensor_cache.cache, ctx.rms_norm_one_tensor_cache.size, x1->ne,
+ acl_gamma_nb, rms_norm_node->type,
+ 1, // dims - only the last dimension
+ 1.0f // value
+ );
+
+ // Build rstdOut tensor (output for normalized standard deviation)
+ // Shape should be the dimensions that are NOT normalized
+ int64_t acl_rstd_ne[] = { 1, x1->ne[1], x1->ne[2], x1->ne[3] };
+    size_t acl_rstd_nb[GGML_MAX_DIMS];
+    acl_rstd_nb[0] = sizeof(float);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
+    }
+ acl_tensor_ptr acl_rstd =
+ get_cache_acl_tensor(ctx, &ctx.rms_norm_zero_tensor_cache.cache, ctx.rms_norm_zero_tensor_cache.size,
+ acl_rstd_ne, acl_rstd_nb, GGML_TYPE_F32, GGML_MAX_DIMS,
+ 0.0f // value
+ );
+
+ acl_tensor_ptr acl_xout = ggml_cann_create_tensor(add_node);
+
+ // Create yOut tensor (final output after RMS normalization)
+ acl_tensor_ptr acl_yout = ggml_cann_create_tensor(rms_norm_node);
+
+ // Call fused ADD + RMS_NORM operator
+ GGML_CANN_CALL_ACLNN_OP(ctx, AddRmsNorm, acl_x1.get(), acl_x2.get(), acl_gamma.get(),
+ eps, // double type
+ acl_yout.get(), acl_rstd.get(), acl_xout.get());
+}
+
+void ggml_cann_gated_linear_attn(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * k = dst->src[0];
+ ggml_tensor * v = dst->src[1];
+ ggml_tensor * q = dst->src[2];
+ ggml_tensor * g = dst->src[3];
+ ggml_tensor * s = dst->src[4];
+
+ int64_t B = dst->src[4]->ne[1];
+ int64_t T = dst->src[0]->ne[2];
+ int64_t H = dst->src[0]->ne[1];
+ int64_t C = dst->ne[0];
+ int64_t D = C / H;
+ int64_t L = T / B;
+
+ int64_t ne_qkg[2] = { 1, D };
+ int64_t ne_s[2] = { D, D };
+ int64_t ne_st[2] = { ne_s[1], ne_s[0] };
+ int64_t ne_vo[2] = { D, 1 };
+ int64_t ne_q[1] = { D };
+ size_t nb_base = ggml_type_size(k->type);
+ size_t nb_qkg[2] = { nb_base, nb_base };
+ size_t nb_s[2] = { nb_base, D * nb_base };
+ size_t nb_st[2] = { nb_s[1], nb_s[0] };
+ size_t nb_vo[2] = { nb_base, D * nb_base };
+ size_t nb_q[1] = { nb_base };
+
+ const float scale = ggml_get_op_params_f32(dst, 0);
+
+ acl_tensor_ptr acl_s = ggml_cann_create_tensor(s, s->ne, s->nb, 2, ACL_FORMAT_ND);
+ acl_tensor_ptr new_state = ggml_cann_create_tensor(dst, s->ne, s->nb, 2, ACL_FORMAT_ND, (B * L * H * D) * nb_base);
+ cann_copy(ctx, acl_s.get(), new_state.get());
+
+ for (int64_t b = 0; b < B; b++) {
+ for (int64_t h = 0; h < H; h++) {
+ size_t s_offset = (b * (H * D * D) + h * (D * D)) * nb_base;
+ // D * D
+ acl_tensor_ptr acl_s_new =
+ ggml_cann_create_tensor(dst, ne_s, nb_s, 2, ACL_FORMAT_ND, (B * L * H * D) * nb_base + s_offset);
+ acl_tensor_ptr acl_s_new_t =
+ ggml_cann_create_tensor(dst, ne_st, nb_st, 2, ACL_FORMAT_ND, (B * L * H * D) * nb_base + s_offset);
+ for (int64_t l = 0; l < L; l++) {
+ size_t qkvgo_offset = (b * (L * H * D) + l * (H * D) + h * (D)) * nb_base;
+ // D * 1
+ acl_tensor_ptr acl_k = ggml_cann_create_tensor(k, ne_qkg, nb_qkg, 2, ACL_FORMAT_ND, qkvgo_offset);
+ acl_tensor_ptr acl_g = ggml_cann_create_tensor(g, ne_qkg, nb_qkg, 2, ACL_FORMAT_ND, qkvgo_offset);
+ // D
+ acl_tensor_ptr acl_q = ggml_cann_create_tensor(q, ne_q, nb_q, 1, ACL_FORMAT_ND, qkvgo_offset);
+ // 1 * D
+ acl_tensor_ptr acl_v = ggml_cann_create_tensor(v, ne_vo, nb_vo, 2, ACL_FORMAT_ND, qkvgo_offset);
+ // D
+ acl_tensor_ptr acl_o = ggml_cann_create_tensor(dst, ne_q, nb_q, 1, ACL_FORMAT_ND, qkvgo_offset);
+ // k ⊗ v
+ size_t buf_size = D * D * nb_base;
+ ggml_cann_pool_alloc buffer_allocator(ctx.pool(), buf_size);
+ acl_tensor_ptr tmp_tensor = ggml_cann_create_tensor(
+ buffer_allocator.get(), ggml_cann_type_mapping(k->type), nb_base, ne_s, nb_s, 2);
+ aclnn_mul(ctx, acl_k.get(), acl_v.get(), tmp_tensor.get());
+ //s_new = g ⊗ s_old + k ⊗ v
+ aclnn_mul(ctx, acl_s_new.get(), acl_g.get(), nullptr);
+ aclnn_add(ctx, acl_s_new.get(), tmp_tensor.get(), nullptr);
+ // compute output
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mv, acl_s_new_t.get(), acl_q.get(), acl_o.get(), 1);
+ aclnn_muls(ctx, acl_o.get(), scale, nullptr, true);
+ }
+ }
+ }
+}
diff --git a/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h b/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h
new file mode 100644
index 0000000..3effa1c
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h
@@ -0,0 +1,1119 @@
+/**
+ * Copyright (c) 2023-2026 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CANN_ACLNN_OPS
+#define CANN_ACLNN_OPS
+
+#include "acl_tensor.h"
+#include "common.h"
+
+#include <aclnnop/aclnn_abs.h>
+#include <aclnnop/aclnn_arange.h>
+#include <aclnnop/aclnn_argsort.h>
+#include <aclnnop/aclnn_cat.h>
+#include <aclnnop/aclnn_clamp.h>
+#include <aclnnop/aclnn_cos.h>
+#include <aclnnop/aclnn_exp.h>
+#include <aclnnop/aclnn_gelu.h>
+#include <aclnnop/aclnn_gelu_v2.h>
+#include <aclnnop/aclnn_hardsigmoid.h>
+#include <aclnnop/aclnn_hardswish.h>
+#include <aclnnop/aclnn_leaky_relu.h>
+#include <aclnnop/aclnn_log.h>
+#include <aclnnop/aclnn_logsoftmax.h>
+#include <aclnnop/aclnn_neg.h>
+#include <aclnnop/aclnn_norm.h>
+#include <aclnnop/aclnn_relu.h>
+#include <aclnnop/aclnn_sigmoid.h>
+#include <aclnnop/aclnn_sign.h>
+#include <aclnnop/aclnn_silu.h>
+#include <aclnnop/aclnn_sin.h>
+#include <aclnnop/aclnn_slice.h>
+#include <aclnnop/aclnn_sqrt.h>
+#include <aclnnop/aclnn_tanh.h>
+
+#include <functional>
+#include <unordered_set>
+
+/**
+ * @brief Repeats a ggml tensor along each dimension to match the dimensions
+ * of another tensor.
+ *
+ * @details This function repeats the elements of a source ggml tensor along
+ * each dimension to create a destination tensor with the specified
+ * dimensions. The operation is performed using the ACL backend and
+ * executed asynchronously on the device.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The ggml tensor representing the destination, which op is
+ * GGML_OP_REPEAT and specifies the desired dimensions.
+ */
+void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Applies the Leaky ReLU activation function to a tensor using the CANN
+ * backend.
+ *
+ * @details This function computes the Leaky ReLU activation for each element of
+ * the input tensor. The Leaky ReLU function allows a small gradient
+ * when the unit is not active (i.e., when the input is negative). The
+ * Leaky ReLU function is defined as:
+ * \f[
+ * \text{dst} = \max(0, src) + \text{negativeSlope} \cdot \min(0,
+ * src)
+ * \f]
+ * `negativeSlope` is in dst->params.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the result of the Leaky ReLU
+ * activation is stored, which op is `GGML_OP_LEAKY_RELU`
+ */
+void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Concatenates multiple tensors along a specified dimension using the
+ * CANN backend.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the result of the
+ *        concatenation is stored. dst->op is `GGML_OP_CONCAT`.
+ *
+ * @attention The number of tensors to concatenate is expected to be 2, and the
+ *            dimension used for concatenation defaults to 1.
+ */
+void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Generates a sequence of evenly spaced values within a specified
+ * interval for a ggml tensor using the CANN backend.
+ *
+ * @details This function creates a sequence of numbers over a specified
+ *          interval, starting from `start`, ending before `stop`, and
+ * incrementing by `step`. The sequence is stored in the destination
+ * tensor `dst`.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the generated sequence will be stored.
+ *        `start`, `stop` and `step` are in dst->op_params and dst->op is
+ * `GGML_OP_ARANGE`.
+ */
+void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Applies a clamp operation to the elements of a ggml tensor using the
+ * CANN backend.
+ *
+ * @details This function clamps the elements of the input tensor `src` to a
+ * specified range defined by `min` and `max` values. The result is
+ * stored in the destination tensor `dst`. The operation is defined as:
+ * \f[
+ * y = \max(\min(x, max\_value), min\_value)
+ * \f]
+ * where `x` is an element of the input tensor, and `y` is the
+ * corresponding element in the output tensor.
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the clamped values will be stored.
+ * dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params.
+ */
+void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Scales the elements of a ggml tensor by a constant factor using the
+ * CANN backend.
+ *
+ * @details This function multiplies each element of the input tensor `src` by
+ * a scaling factor `scale`, storing the result in the destination
+ * tensor `dst`. The operation is defined as:
+ * \f[
+ * dst = src \times scale
+ * \f]
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the scaled values will be stored.
+ * dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params.
+ */
+void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Sorts the elements of a ggml tensor and returns the indices that
+ * would sort the tensor using the CANN backend.
+ *
+ * @details This function performs an argsort operation on the input tensor
+ * `src`. It sorts the elements of `src` in either ascending or
+ *          descending order, depending on the requested order
+ *          (`GGML_SORT_ORDER_ASC` or `GGML_SORT_ORDER_DESC`), and returns
+ *          the indices that would sort the original tensor.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the sorted indices will be stored.
+ * dst->op is `GGML_OP_ARGSORT`.
+ */
+void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes the Layer Normalization for a ggml tensor using the CANN
+ * backend.
+ *
+ * @details This function applies the Layer Normalization operation on the
+ * input tensor `src` and stores the result in the destination tensor
+ * `dst`. Layer Normalization normalizes the features at each sample in
+ * a mini-batch independently. It is commonly used in neural networks
+ * to normalize the activations of a layer by adjusting and scaling
+ * the outputs.
+ * The operation is defined as:
+ * \f[
+ * \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
+ * \f]
+ *          `Var` defaults to dst->ne[0]. `eps` is in dst->params.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the normalized values will be stored.
+ * @attention `Var` defaults to dst->ne[0].
+ */
+void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes the L2 Normalization for a ggml tensor using the CANN
+ * backend.
+ *
+ * @details This function applies the L2 Normalization operation on the
+ * input tensor `src` and stores the result in the destination tensor
+ * `dst`. L2 Normalization scales the input tensor such that the
+ * L2 norm along the specified dimension equals 1. This operation
+ * is commonly used in neural networks for feature normalization
+ * and vector scaling.
+ * The operation is defined as:
+ * \f[
+ * \text{out} = \frac{x}{\sqrt{\sum{x^2}}}
+ * \f]
+ * The normalization is performed along the last dimension by default.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the normalized values will be stored.
+ * @attention The normalization is performed along the last dimension of the
+ * input tensor by default.
+ */
+void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes the Cross Entropy Loss for a ggml tensor using the CANN
+ * backend.
+ *
+ * @details This function computes the cross entropy loss between the predicted
+ * logits and target probability distributions. The operation follows
+ * the same computation pattern as the CPU implementation:
+ * 1. Applies log_softmax to the logits along the class dimension
+ * 2. Element-wise multiplication with target distributions
+ * 3. Summation along the class dimension to get per-sample losses
+ * 4. Global summation and scaling by -1/nr to get final loss
+ *
+ * The computation can be expressed as:
+ * \f[
+ * \text{loss} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{C} y_{ij} \cdot \log(\text{softmax}(x_{ij}))
+ * \f]
+ * where \f$N\f$ is the total number of samples, \f$C\f$ is the number
+ * of classes, \f$x\f$ are the logits, and \f$y\f$ are the target
+ * probability distributions.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the computed loss will be stored.
+ * This should be a scalar tensor containing the final loss value.
+ *
+ * @note This implementation computes cross entropy between probability
+ * distributions, not the typical classification cross entropy that
+ * expects class indices as targets. Both input tensors (src0 and src1)
+ * should have the same shape and represent probability distributions
+ * over the class dimension.
+ * @note The function expects two source tensors:
+ * - dst->src[0]: Logits tensor (before softmax)
+ * - dst->src[1]: Target probability distributions tensor
+ * @note The computation is performed using CANN backend operators including
+ * LogSoftmax, Mul, ReduceSum, and Muls for the final scaling.
+ */
+void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes the Group Normalization for a ggml tensor using the CANN
+ * backend.
+ *
+ * @details This function applies the Group Normalization operation on the input
+ * tensor `src` and stores the result in the destination tensor `dst`.
+ * Group Normalization divides the channels into groups and normalizes
+ * the features within each group across spatial locations.
+ * It is commonly used in convolutional neural networks to improve
+ * training stability and performance.
+ * The operation is defined as:
+ * \f[
+ * \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
+ * \f]
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the normalized values will be stored.
+ * `n_groups` is in dst->params, which split C channel to `n_groups`.
+ * dst->op is `GGML_OP_GROUP_NORM`.
+ *
+ * @attention eps defaults to 1e-6f.
+ */
+void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes the accumulation of tensors using the CANN backend.
+ *
+ * @details This function performs an accumulation operation on two tensors.
+ * Depending on the `inplace` flag, it either updates the destination
+ * tensor `dst` in place by adding `alpha * src1` to it, or it creates
+ * a new tensor as the result of `src0 + alpha * src1` and stores it in
+ * `dst`.
+ * The operation is defined as:
+ * \f[
+ * dst = src0 + alpha \times src1
+ * \f]
+ *          if `inplace` is `true`, `src0` is equal to `dst`.
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the accumulated values will be stored.
+ * `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`.
+ */
+void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes the sum of elements along the last dimension of a ggml tensor
+ * using the CANN backend.
+ *
+ * @details This function performs a reduction sum operation along the last
+ * dimension of the input tensor `src`. The result of the sum is stored
+ * in the destination tensor `dst`.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the reduced values will be stored.
+ * dst->op is `GGML_OP_SUM_ROWS`.
+ *
+ * @attention `reduce_dims` defaults to 3, which means the last dimension.
+ */
+void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes the sum of elements in a ggml tensor.
+ *
+ * @details This function performs a reduction sum over all elements of the
+ *          input tensor `src`. The result is stored in the destination
+ *          tensor `dst`.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the reduced value will be stored.
+ */
+void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Upsamples a ggml tensor using nearest neighbor interpolation using
+ * the CANN backend.
+ *
+ * @details This function performs upsampling of the input tensor `src` using
+ * nearest neighbor interpolation. The upsampling is applied to the
+ * height and width dimensions (last two dimensions) of the tensor. The
+ * result is stored in the destination tensor `dst`, which must have
+ * the appropriate dimensions for the upsampled output.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the upsampled values will be stored.
+ * dst->op is `GGML_OP_UPSCALE`.
+ */
+void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Pads a ggml tensor to match the dimensions of the destination tensor
+ * using the CANN backend.
+ *
+ * @details This function pads the input tensor `src` so that it matches the
+ * dimensions of the destination tensor `dst`. The amount of padding
+ * is calculated based on the difference in sizes between `src` and
+ * `dst` along each dimension. The padded tensor is stored in `dst`.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor, which specifies the target dimensions for
+ * padding. dst->op is `GGML_OP_PAD`.
+ */
+void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Executes a 2D pooling operation on a ggml tensor using the CANN
+ * backend.
+ *
+ * @details This function dispatches the execution of a 2D pooling operation on
+ * the input tensor `dst`. The type of pooling (average or max) is
+ * determined by the `op` parameter, which is read from the operation
+ * parameters of `dst`. The function supports average pooling
+ * (`GGML_OP_POOL_AVG`) and max pooling (`GGML_OP_POOL_MAX`). If an
+ * invalid operation is encountered, the function asserts a failure.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor on which the pooling operation is to be
+ * performed. dst->op is `GGML_OP_POOL_2D`.
+ */
+void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Duplicates a ggml tensor using the CANN backend.
+ *
+ * @details This function duplicates the contents of the source tensor `src` to
+ * the destination tensor `dst`. The function supports various tensor
+ * types and configurations, including handling of extra data, type
+ * conversions, and special cases for contiguous and non-contiguous
+ * tensors.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the duplicated data will be stored.
+ * dst->op is `GGML_OP_DUP`
+ *
+ * @attention Only FP16/FP32 are supported. Not supported when src and dst have
+ *            different shapes and dst is non-contiguous.
+ * @note This function needs to be simplified.
+ */
+void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes the Root Mean Square (RMS) normalization of a ggml tensor
+ * using the CANN backend.
+ *
+ * @details This function applies RMS normalization to the input tensor `src`
+ * and stores the result in the destination tensor `dst`. RMS
+ * normalization involves computing the root mean square of the input
+ * tensor along a specified dimension and then dividing each element of
+ * the tensor by this value, adjusted by a small epsilon value to
+ * prevent division by zero.
+ * The operation is defined as:
+ * \f[
+ * \text{RmsNorm}\left(x_i\right)=\frac{x_i}{\text{Rms}(\mathbf{x})} g_i,
+ *       \quad \text { where } \text{Rms}(\mathbf{x})=\sqrt{\frac{1}{n} \sum_{i=1}^n x_i^2+\text{eps}}
+ * \f]
+ * `eps` is in dst->op_params.
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the normalized values will be stored.
+ * dst->op is `GGML_OP_RMS_NORM`.
+ */
+void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Applies a diagonal mask to the tensor with a specified value.
+ *
+ * @details This function creates a mask tensor filled with ones, then applies
+ * an upper triangular and lower triangular operation to it based on
+ * the number of past elements specified. Afterward, it adds the masked
+ * tensor to the destination tensor in-place.
+ *
+ * @param ctx The backend CANN context used for operations.
+ * @param dst The destination tensor where the result will be stored. dst->op is
+ * `GGML_OP_DIAG_MASK`
+ * @param value The value to use for masking.
+ */
+void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value);
+
+/**
+ * @brief Performs an image-to-column transformation on the input tensor.
+ *
+ * @details This function takes an input tensor and applies an image-to-column
+ * operation, converting spatial dimensions into column-like
+ * structures suitable for convolutional operations. It supports both
+ * half-precision (F16) and single-precision (F32) floating-point data
+ * types.
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor that stores the result of the operation.
+ * dst->op is `GGML_OP_IM2COL`.
+ */
+void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes time step embeddings using sine and cosine functions.
+ *
+ * @details This function calculates time step embeddings by applying sine and
+ * cosine transformations to a given input tensor, which is typically
+ * used in temporal models like diffusion models or transformers to
+ * encode time information effectively.
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor where the result of the embedding operation
+ * will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
+ */
+void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+// @see ggml_cann_dup.
+void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes the softmax activation with optional masking.
+ *
+ * @details This function computes the softmax activation over the input tensor,
+ * optionally applying a mask and scaling factor. It supports both FP16
+ * and FP32 data types and can handle masking by broadcasting the mask
+ * across rows if necessary.
+ * The function performs the following steps:
+ * 1. Multiplies the input tensor by a scale factor.
+ * 2. Optionally casts the mask tensor to FP32 if it is in FP16 format.
+ * 3. Broadcasts the mask tensor if its dimensions do not match the
+ * input tensor's dimensions.
+ * 4. Adds the mask to the scaled input tensor.
+ * 5. Applies the softmax activation function along the specified
+ * dimension.
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor where the result will be stored. dst->op is
+ * `GGML_OP_SOFTMAX`.
+ */
+void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Extracts specific rows from a tensor based on indices.
+ *
+ * @details This function retrieves rows from a source tensor src0 according to
+ * the indices provided in another tensor src1 and stores the result in
+ * a destination tensor (\p dst).
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor where the extracted rows will be stored.
+ */
+void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Writes specific rows into a tensor at positions specified by indices.
+ *
+ * @details This function copies rows from a source tensor into a destination
+ * tensor (\p dst) at the positions indicated by the indices in another
+ * tensor.
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor where the specified rows will be updated.
+ */
+void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Executes matrix multiplication for the given tensor.
+ *
+ * @details This function performs matrix multiplication on the source tensors
+ * associated with the destination tensor. It supports matrix
+ *          associated with the destination tensor. It supports F32, F16,
+ *          and Q8_0 matrix multiplication.
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor for storing the result of the matrix
+ * multiplication. dst->op is `GGML_OP_MUL_MAT`.
+ */
+void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
+ *
+ * @details This function implements the RoPE mechanism, which is a method to
+ * encode positional information into sequence data, particularly
+ * useful in transformer models. It supports both F32 and F16 data
+ * types.
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor where the RoPE-transformed data will be
+ * stored. dst->op is `GGML_OP_ROPE`.
+ *
+ * @note The function currently does not support cases where the n_dims is less
+ * than the input tensor's first dimension.
+ * @note The function currently does not support cases where the freq_factors is
+ * not NULL.
+ * @note The function currently does not support cases where the ext_factor is
+ * not equal 0.
+ * @note The function currently does not support cases where the freq_scale is
+ * not equal 1.
+ */
+void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes the index of the maximum value along the specified dimension
+ * of a ggml tensor using the CANN backend.
+ *
+ * @details This function performs an argmax operation on the input tensor.
+ * It finds the index of the maximum value along the specified axis
+ * and stores these indices in the destination tensor `dst`. The
+ * operation is executed using the CANN backend for optimized performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the indices of the maximum values will
+ * be stored. dst->op is `GGML_OP_ARGMAX`.
+ */
+void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Adds two tensors element-wise and stores the result in a destination
+ * tensor.
+ *
+ * This function performs the operation:
+ * \f[
+ * dst = acl\_src0 + alpha \times acl\_src1
+ * \f]
+ * where alpha is a scalar value and defaults to 1.0f.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src0 The first source tensor.
+ * @param acl_src1 The second source tensor.
+ * @param acl_dst The destination tensor where the result will be stored.
+ */
+void aclnn_add(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src0,
+ aclTensor * acl_src1,
+ aclTensor * acl_dst = nullptr);
+
+/**
+ * @brief Subtracts two tensors element-wise and stores the result in a destination
+ * tensor.
+ *
+ * This function performs the operation:
+ * \f[
+ * dst = acl\_src0 - alpha \times acl\_src1
+ * \f]
+ * where alpha is a scalar value and defaults to 1.0f.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src0 The first source tensor.
+ * @param acl_src1 The second source tensor.
+ * @param acl_dst The destination tensor where the result will be stored.
+ */
+void aclnn_sub(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src0,
+ aclTensor * acl_src1,
+ aclTensor * acl_dst = nullptr);
+
+/**
+ * @brief Performs element-wise multiplication of two tensors and stores the
+ * result in a destination tensor.
+ *
+ * This function performs element-wise multiplication of the tensors `acl_src`
+ * and `acl_other` and stores the result in the destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ * \text {acl_dst }_i=\text {acl_src }_i \times \text {acl_other }_i
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The first tensor for element-wise multiplication.
+ * @param acl_other The second tensor for element-wise multiplication.
+ * @param acl_dst The destination tensor where the result will be stored.
+ */
+void aclnn_mul(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_other,
+ aclTensor * acl_dst = nullptr);
+
+/**
+ * @brief Element-wise division of two tensors, optionally in-place.
+ *
+ * This function divides each element of the source tensor `acl_src` by the
+ * corresponding element of `acl_other` and stores the result in the destination
+ * tensor `acl_dst`. If `acl_dst` is null, the operation is performed in-place
+ * on `acl_src`. The operation is defined as:
+ * \f[
+ * \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src Numerator tensor.
+ * @param acl_other Denominator tensor.
+ * @param acl_dst The destination tensor where the result will be stored, or
+ *                null to perform the division in-place on `acl_src`.
+ */
+void aclnn_div(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_other,
+ aclTensor * acl_dst = nullptr);
+
+/**
+ * @brief Applies element-wise cosine function to the elements of a tensor.
+ *
+ * This function computes the cosine of each element in the source tensor
+ * `acl_src` and stores the result in the destination tensor `acl_dst`. The
+ * operation is defined as: \f[ \text {acl_dst }_i=\cos \left(\text {acl_src
+ * }_i\right) \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor on which the cosine function will be
+ * applied.
+ * @param acl_dst The destination tensor where the cosine results will be
+ * stored.
+ */
+void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);
+
+/**
+ * @brief Applies element-wise sine function to the elements of a tensor.
+ *
+ * This function computes the sine of each element in the source tensor
+ * `acl_src` and stores the result in the destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ * \text {acl_dst }_i=\sin \left(\text {acl_src }_i\right)
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor on which the sine function will be applied.
+ * @param acl_dst The destination tensor where the sine results will be stored.
+ */
+void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);
+
+/**
+ * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
+ * output tensor.
+ *
+ * This function checks whether broadcasting is needed between `src0` and `src1`.
+ * If broadcasting is required, it calculates the proper shapes and creates
+ * ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors
+ * based on the original tensor shapes.
+ *
+ * @param src0 The first input tensor (reference shape).
+ * @param src1 The second input tensor (possibly broadcasted).
+ * @param dst The destination/output tensor.
+ * @param acl_src0 Output pointer to the created ACL tensor corresponding to src0.
+ * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
+ * @param acl_dst Output pointer to the created ACL tensor corresponding to dst.
+ */
+void bcast_shape(ggml_tensor * src0,
+ ggml_tensor * src1,
+ ggml_tensor * dst,
+ acl_tensor_ptr & acl_src0,
+ acl_tensor_ptr & acl_src1,
+ acl_tensor_ptr & acl_dst);
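+// Typical use (illustrative): binary element-wise ops where src1 is repeated
+// along the higher dimensions, e.g. src0 [D, H, T, 1] combined with src1
+// [D, 1, 1, 1].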
+
+/**
+ * @brief Computes the 1D transposed convolution (deconvolution) of a ggml
+ * tensor using the CANN backend.
+ *
+ * @details This function performs a 1D transposed convolution (also known as
+ * deconvolution) operation on the input tensor. The computed result is stored
+ * in the destination tensor `dst`. The operation is optimized using the CANN
+ * backend for improved performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the transposed convolution result
+ * will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
+ */
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
+ * using the CANN backend.
+ *
+ * @details This function performs an element-wise ELU activation on the input
+ * tensor.
+ * The result is written to the destination tensor `dst` in-place.
+ * The ELU function is defined as:
+ *
+ * \f[
+ * \text{ELU}(x) =
+ * \begin{cases}
+ * x, & \text{if } x > 0 \\
+ * \alpha \left( \exp(x) - 1 \right), & \text{if } x \leq 0
+ * \end{cases}
+ * \f]
+ *
+ * where α (alpha) is a hyperparameter, typically set to 1.0.
+ * This operation is optimized using the CANN backend for high-performance
+ * inference or training.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the ELU-activated result will be stored.
+ * dst->op is expected to be `GGML_OP_ELU`.
+ */
+void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes the mean of a ggml tensor element-wise using the CANN backend.
+ *
+ * @details This function calculates the mean of the input tensor along its
+ *          first dimension (i.e. per row), following `GGML_OP_MEAN` semantics.
+ *          The result is written to the destination tensor `dst`.
+ *
+ * This operation is optimized using the CANN backend for high-performance inference or training.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the mean result will be stored.
+ * dst->op is expected to be `GGML_OP_MEAN`.
+ */
+void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Applies 1D reflect padding to a ggml tensor using the CANN backend.
+ *
+ * @details This function performs 1D reflect padding on the input tensor.
+ * The amount of padding on each side is specified by parameters stored in `dst->op_params`.
+ * The operation reflects the values at the borders of the tensor to generate the padded output.
+ *
+ * This operation is optimized using the CANN backend for high-performance inference or training.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the padded result will be stored.
+ * dst->op is expected to be `GGML_OP_PAD_REFLECT_1D`.
+ */
+void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Counts the number of equal elements in two ggml tensors using the CANN backend.
+ *
+ * @details This function performs an element-wise comparison between two input tensors,
+ * and counts the number of positions where the elements are equal. The result is
+ * stored in the destination tensor `dst` as a scalar.
+ *
+ * The operation is optimized using the CANN backend, making it suitable for
+ * high-performance inference or training scenarios.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the result will be stored.
+ * dst->op is expected to be `GGML_OP_COUNT_EQUAL`.
+ */
+void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Applies the Step activation function to a ggml tensor using the CANN backend.
+ *
+ * @details This function applies a step function element-wise to the input tensor, where
+ * each element is transformed to 1.0 if it is greater than 0, and 0.0 otherwise.
+ * The result is stored in the destination tensor `dst`.
+ *
+ * This operation is accelerated using the CANN backend to improve runtime performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the result will be stored.
+ * dst->op is expected to be `GGML_OP_STEP`.
+ */
+void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Performs the Flash Attention extended operator using the CANN backend.
+ *
+ * @details This function implements the memory-efficient Flash Attention algorithm
+ * for computing scaled dot-product attention with hardware acceleration.
+ * The result is stored in the destination tensor `dst`.
+ *
+ * This operation is accelerated using the CANN backend to improve runtime performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the result will be stored.
+ * dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`.
+ */
+void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Forward Gated Linear Attention on the CANN backend.
+ *
+ * Expects dst->src[0..4] = {k, v, q, g, s} with shape conventions:
+ * k, v, q, g: [D] with outer dims T x H batched as ne[2]=T, ne[1]=H
+ * s: initial state [B, H, D, D], where B is batch and D=C/H
+ * dst holds both outputs (o) and updated state; a scale factor is read from op params.
+ *
+ * The kernel updates per time step l: S_new = g ⊗ S_old + k ⊗ v, then computes o = (S_new^T q) * scale.
+ *
+ * @param ctx Backend context providing stream/allocator utilities.
+ * @param dst Output tensor; src deps are k, v, q, g, s as above.
+ */
+void ggml_cann_gated_linear_attn(ggml_backend_cann_context & ctx, ggml_tensor * dst);
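+// A minimal sketch of the per-step recurrence described above, for one batch b
+// and head h (illustrative pseudo-code, not the exact kernel sequence):
+//
+//   S    = g[l] * S + outer(k[l], v[l]);   // D x D state update
+//   o[l] = scale * (transpose(S) * q[l]);  // D-dimensional output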
+
+/**
+ * @brief Launches an asynchronous task using the memory allocator.
+ *
+ * This macro submits an asynchronous task on the context's stream.
+ * The task uses memory allocated by the allocator. It is guaranteed
+ * that the memory will not be accessed by other tasks until this task
+ * completes, due to the sequential execution order within the same stream.
+ *
+ * @param CTX The CANN backend context that provides the memory pool and stream.
+ * @param OP_NAME aclnn operator name.
+ * @param args Additional arguments required by the task.
+ *
+ * @note
+ * Memory from the allocator will be "freed" immediately and can be
+ * reallocated to other pointers. However, it won't be accessed by any
+ * other task before this asynchronous task ends, because all tasks in the
+ * same stream are executed in queue order.
+ */
+
+# define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...) \
+ do { \
+ uint64_t workspaceSize = 0; \
+ aclOpExecutor * executor; \
+ void * workspaceAddr = nullptr; \
+ ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
+        /* the workspace should be allocated in the main thread to keep malloc order when using vmm. */ \
+ if (workspaceSize > 0) { \
+ ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize); \
+ workspaceAddr = workspace_allocator.get(); \
+ } \
+ ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream())); \
+ } while (0)
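+
+/*
+ * Illustrative usage only (not part of this header). Assuming `acl_src` and
+ * `acl_dst` are valid aclTensor handles, dispatching the aclnn Cos operator
+ * through the macro above could look like:
+ *
+ *     GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);
+ *
+ * which expands to aclnnCosGetWorkspaceSize(...) followed by aclnnCos(...), with
+ * the workspace drawn from ctx.pool() and the call queued on ctx.stream().
+ */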
+
+/**
+ * @brief Performs sparse expert-based matrix multiplication using the CANN backend.
+ *
+ * @details This function implements a MoE-style batched matrix multiplication, where each input token
+ * is routed to one or more experts, and each expert corresponds to a specific [D, M] weight matrix
+ * in the source tensor `src0`. The routing indices are provided via the `ids` tensor.
+ *
+ * For each token (from `src1`), the function selects the corresponding expert(s) as specified by `ids`,
+ * performs the matrix multiplication with the selected expert's weight submatrix (from `src0`),
+ * and stores the results in `dst`. This operation is optimized and executed on the CANN backend.
+ *
+ * Dimensions:
+ * - src0: [D, M, A, 1], where A is the number of experts
+ * - src1: [D, B, N, 1], where N is batch size and B is the slot count per sample
+ * - ids : [K, N], where K is the number of experts each token is routed to
+ * - dst : [M, K, N, 1], output tensor storing the result of expert × token multiplication
+ *
+ * The function handles two main modes:
+ * - If `ne12 == 1`, a simpler per-token loop is used.
+ * - TODO: If `ne12 > 1`, grouped multiplication and memory copying are used for efficiency.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the expert-weighted token outputs are stored.
+ * Expected to be of shape [M, K, N, 1].
+ */
+void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Performs fused ADD + RMS_NORM operation using the CANN backend.
+ *
+ * This function fuses the ADD and RMS_NORM operations into a single kernel call
+ * for better performance. It first adds two input tensors (x1 + x2), then applies
+ * RMS normalization to the result.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param add_node The ADD operation node, containing the two input tensors to be added.
+ * @param rms_norm_node The RMS_NORM operation node, containing the gamma weights
+ *                      and epsilon parameter.
+ */
+void ggml_cann_op_add_rms_norm_fused(ggml_backend_cann_context & ctx,
+ ggml_tensor * add_node,
+ ggml_tensor * rms_norm_node);
+
+/**
+ * @brief Check whether a tensor is a weight tensor for matrix multiplication.
+ *
+ * @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations,
+ * typically within neural network layers. The function maintains a static set of canonical weight
+ * naming suffixes from Transformer-based architectures. Uses substring matching to identify weight
+ * tensors even with hierarchical naming patterns.
+ *
+ * @param tensor Pointer to the target ggml_tensor object (const-qualified).
+ * @return true if the tensor name contains one of the known weight suffixes; false otherwise.
+ */
+static bool is_matmul_weight(const ggml_tensor * tensor) {
+ std::string name = ggml_get_name(tensor);
+ static const std::unordered_set<std::string> weight_suffixes{ "output.weight", "attn_q.weight",
+ "attn_k.weight", "attn_v.weight",
+ "attn_output.weight", "ffn_gate.weight",
+ "ffn_up.weight", "ffn_down.weight" };
+
+ for (const auto & suffix : weight_suffixes) {
+ if (name.find(suffix) != std::string::npos) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/**
+ * @brief Applies an element-wise operation to two input tensors using the CANN
+ * backend.
+ *
+ * This templated function takes a binary operator and applies it to the two
+ * source tensors associated with the destination tensor. The function handles
+ * broadcasting as needed.
+ *
+ * @tparam binary_op A callable object (e.g., lambda or function pointer) representing
+ *         the binary operation to be performed. It must take four arguments:
+ *         (ggml_backend_cann_context&, aclTensor*, aclTensor*, aclTensor*).
+ *
+ * @param ctx The CANN backend context used to manage execution and resources.
+ * @param dst The destination tensor.
+ */
+template <auto binary_op> void ggml_cann_binary_op(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
+
+ acl_tensor_ptr acl_src0, acl_src1, acl_dst;
+
+ // Need bcast
+ bcast_shape(src0, src1, dst, acl_src0, acl_src1, acl_dst);
+ binary_op(ctx, acl_src0.get(), acl_src1.get(), acl_dst.get());
+}
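+
+/*
+ * Illustrative usage only. Assuming a helper with the required signature, e.g.
+ *
+ *     void aclnn_mul(ggml_backend_cann_context & ctx,
+ *                    aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst);
+ *
+ * a broadcasting element-wise op could be dispatched as:
+ *
+ *     ggml_cann_binary_op<aclnn_mul>(ctx, dst);
+ */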
+
+/**
+ * @brief Applies a unary operation to an input tensor using the CANN backend.
+ *
+ * This templated function applies a unary operator to the source tensor of `dst`
+ * and stores the result in the destination tensor.
+ *
+ * @tparam unary_op A callable with the signature:
+ * void(ggml_backend_cann_context&, aclTensor *, aclTensor *)
+ * where the first aclTensor is the source and the second is the destination.
+ * @param ctx The CANN backend context for managing resources and execution.
+ * @param dst The destination tensor. Its src[0] is treated as the input tensor.
+ */
+template <void unary_op(ggml_backend_cann_context &, aclTensor *, aclTensor *)>
+void ggml_cann_op_unary(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+
+ unary_op(ctx, acl_src.get(), acl_dst.get());
+}
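+
+/*
+ * Illustrative usage only. Given a hypothetical helper
+ *
+ *     void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * src, aclTensor * dst);
+ *
+ * the template can be instantiated as ggml_cann_op_unary<aclnn_cos>(ctx, dst);
+ */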
+
+/**
+ * @brief Applies a unary operation to a ggml tensor using the CANN backend.
+ *
+ * @details This function applies a unary operation to the input tensor using
+ * a user-provided lambda or callable `unary_op`. The lambda receives the
+ * CANN backend context and two ACL tensors: the source and the destination.
+ *
+ * Internally, this function handles the conversion from GGML tensors to ACL tensors,
+ * calls the provided unary op, and manages resource cleanup. The input is assumed
+ * to be `dst->src[0]`, and the result is written to `dst`.
+ *
+ * This utility simplifies writing unary op wrappers by abstracting tensor preparation.
+ *
+ * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
+ * @param ctx The CANN context for operation execution.
+ * @param dst The destination ggml_tensor where the result will be stored.
+ * The input tensor is assumed to be `dst->src[0]`.
+ *
+ * @see GGML_CANN_CALL_OP_UNARY
+ */
+void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+ ggml_backend_cann_context & ctx,
+ ggml_tensor * dst);
+
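+/**
+ * @brief Performs the SSM (state-space model) convolution step using the CANN backend.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the result will be stored.
+ *            dst->op is expected to be `GGML_OP_SSM_CONV`.
+ */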
+void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Applies a gated (GLU-style) unary operation using the CANN backend.
+ *
+ * @details This function performs a gated activation such as GEGLU or ReGLU.
+ * It supports two input modes:
+ *
+ * 1. **Dual input mode**: `dst->src[0]` and `dst->src[1]` are both valid tensors.
+ * These are used directly as the value and gate tensors.
+ *
+ * 2. **Packed input mode**: Only `dst->src[0]` is valid, and it is assumed to
+ * contain a concatenation of value and gate along the first dimension. This tensor
+ * will be split into two equal halves to form the value and gate inputs.
+ *
+ * The function applies a user-provided unary operation (e.g., GELU) to the value tensor,
+ * then multiplies the result in-place with the gate tensor:
+ *
+ * @code
+ * dst = unary_op(value) * gate;
+ * @endcode
+ *
+ * The `swapped` parameter (from `dst->op_params[1]`) allows flipping the
+ * order of value/gate in the packed input case.
+ *
+ * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
+ * It receives (ctx, acl_value_tensor, acl_output_tensor).
+ * @param ctx The CANN context used for execution.
+ * @param dst The destination ggml_tensor. Source tensors are in `dst->src[0]` and optionally `src[1]`.
+ *
+ * @see GGML_CANN_CALL_OP_UNARY_GATED
+ */
+void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+ ggml_backend_cann_context & ctx,
+ ggml_tensor * dst);
+
+/**
+ * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
+ *
+ * This macro wraps the specified ACLNN unary operator name into a lambda expression,
+ * and passes it to `ggml_cann_op_unary`, which handles the common logic for executing
+ * unary ops in the CANN backend.
+ *
+ * Internally, this macro expands to a lambda like:
+ * @code
+ * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
+ * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
+ * };
+ * @endcode
+ *
+ * This lambda is then passed to `ggml_cann_op_unary`, which applies the operation.
+ *
+ * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
+ *
+ * @see ggml_cann_op_unary
+ * @see GGML_CANN_CALL_ACLNN_OP
+ */
+# define GGML_CANN_CALL_OP_UNARY(OP_NAME) \
+ do { \
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
+ GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
+ }; \
+ ggml_cann_op_unary(lambda, ctx, dst); \
+ } while (0)
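+
+/*
+ * Illustrative usage only (not part of this header): inside an op wrapper whose
+ * scope provides `ctx` and `dst`, a unary aclnn operator such as Exp could be
+ * dispatched with a single statement:
+ *
+ *     GGML_CANN_CALL_OP_UNARY(Exp);
+ */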
+
+/**
+ * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
+ *
+ * This macro wraps the specified ACLNN unary operator name into a lambda expression,
+ * and passes it to `ggml_cann_op_unary_gated`, which handles the common logic for
+ * executing gated unary ops in the CANN backend.
+ *
+ * Internally, this macro expands to a lambda like:
+ * @code
+ * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
+ * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
+ * };
+ * @endcode
+ *
+ * This lambda is then passed to `ggml_cann_op_unary_gated`, which applies the operation.
+ *
+ * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
+ *
+ * @see ggml_cann_op_unary_gated
+ * @see GGML_CANN_CALL_ACLNN_OP
+ */
+# define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME) \
+ do { \
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
+ GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
+ }; \
+ ggml_cann_op_unary_gated(lambda, ctx, dst); \
+ } while (0)
+
+/**
+ * @brief Performs outer product operation on two ggml tensors using the CANN backend.
+ *
+ * @details This function computes the outer product of two input tensors (src0 and src1)
+ * and stores the result in the destination tensor. The outer product operation is defined as:
+ * dst[i,j,k,l] = sum_m (src0[i,m,k,l] * src1[j,m,k,l])
+ *
+ * The function supports multiple data types, including F32 and F16. For floating-point
+ * types, it uses batch matrix multiplication for efficient computation.
+ *
+ * The implementation handles 4D tensor broadcasting and batch processing automatically.
+ *
+ * @param ctx The CANN backend context for operation execution and memory management.
+ * @param dst The destination ggml_tensor where the outer product result will be stored.
+ * The input tensors are assumed to be `dst->src[0]` and `dst->src[1]`.
+ *
+ * @see GGML_CANN_CALL_ACLNN_OP for CANN operator invocation
+ */
+void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+#endif // CANN_ACLNN_OPS
diff --git a/llama.cpp/ggml/src/ggml-cann/common.h b/llama.cpp/ggml/src/ggml-cann/common.h
new file mode 100644
index 0000000..0120f0d
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cann/common.h
@@ -0,0 +1,641 @@
+/*
+ * Copyright (c) 2023-2026 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CANN_COMMON_H
+#define CANN_COMMON_H
+
+#include "../ggml-impl.h"
+#include "../include/ggml-cann.h"
+#include "../include/ggml.h"
+
+#include <acl/acl.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <cstdio>
+#include <cstring>
+#include <functional>
+#include <iostream>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string>
+#include <thread>
+#include <vector>
+
+#define MATRIX_ROW_PADDING 512
+#define GGML_CANN_MAX_STREAMS 8
+
+/**
+ * @brief Handles CANN-related errors by printing an error message and
+ * terminating the program.
+ * @param stmt The statement that caused the error.
+ * @param func The function in which the error occurred.
+ * @param file The file in which the error occurred.
+ * @param line The line number at which the error occurred.
+ * @param msg The error message.
+ */
+[[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
+
+/**
+ * @brief Checks the result of a CANN function call and invokes the error
+ * handler if the call fails.
+ * @param stmt The CANN function call to check.
+ * @param success The success code that indicates the call was successful.
+ * @param error_fn The function to call to retrieve the error message.
+ */
+#define ACL_CHECK_GEN(stmt, success, error_fn) \
+ do { \
+ int err_code = (stmt); \
+ if (err_code != (success)) { \
+ ggml_cann_error(#stmt, __func__, __FILE__, __LINE__, error_fn()); \
+ } \
+ } while (0);
+
+#define ACL_CHECK(stmt) ACL_CHECK_GEN(stmt, 0, aclGetRecentErrMsg)
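+
+// Illustrative usage only: ACL_CHECK(aclrtSetDevice(0)); on failure this calls
+// ggml_cann_error() with the message returned by aclGetRecentErrMsg() and aborts.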
+
+/**
+ * @brief Contains information about CANN devices.
+ */
+struct ggml_cann_device_info {
+ /**
+ * @brief Number of CANN devices available.
+ */
+ int32_t device_count;
+
+ /**
+ * @brief Information about a single CANN device.
+ */
+ struct cann_device_info {
+ int cc; /**< Compute capability. */
+ size_t smpb; /**< Maximum shared memory per block. */
+ bool vmm; /**< Virtual memory support. */
+ size_t vmm_granularity; /**< Granularity of virtual memory. */
+ size_t total_vram; /**< Total video RAM available on the device. */
+ };
+
+ cann_device_info devices[GGML_CANN_MAX_DEVICES] = {}; /**< Array of CANN device information. */
+};
+
+const ggml_cann_device_info & ggml_cann_info();
+
+void ggml_cann_set_device(int32_t device);
+
+std::optional<std::string> get_env_as_lowercase(const std::string & name);
+bool parse_bool(const std::string & value);
+int parse_integer(const std::string & value);
+
+/**
+ * @brief Abstract base class for memory pools used by CANN.
+ */
+struct ggml_cann_pool {
+ /**
+ * @brief Virtual destructor for the memory pool.
+ */
+ virtual ~ggml_cann_pool() = default;
+
+ /**
+ * @brief Allocates memory from the pool.
+ *
+ * @param size The size of the memory block to allocate.
+ * @param actual_size Pointer to a variable where the actual allocated size
+ * will be stored.
+ * @return Pointer to the allocated memory block.
+ */
+ virtual void * alloc(size_t size, size_t * actual_size) = 0;
+
+ /**
+ * @brief Frees a previously allocated memory block.
+ *
+ * @param ptr Pointer to the memory block to free.
+ * @param size Size of the memory block to free.
+     * @note All CANN operators run asynchronously. Make sure the memory is
+     * still available until the operator has finished.
+ */
+ virtual void free(void * ptr, size_t size) = 0;
+};
+
+/**
+ * @brief RAII wrapper for managing memory allocations from a CANN memory pool.
+ */
+struct ggml_cann_pool_alloc {
+ ggml_cann_pool * pool = nullptr; /**< Pointer to the memory pool. */
+ void * ptr = nullptr; /**< Pointer to the allocated memory block. */
+ size_t actual_size = 0; /**< Actual size of the allocated memory block. */
+
+ /**
+ * @brief Default constructor.
+ */
+ ggml_cann_pool_alloc() = default;
+
+ /**
+ * @brief Constructor that initializes the memory pool.
+ * @param pool Reference to the memory pool.
+ */
+ explicit ggml_cann_pool_alloc(ggml_cann_pool & pool) : pool(&pool) {}
+
+ /**
+ * @brief Constructor that initializes the memory pool and allocates memory.
+ * @param pool Reference to the memory pool.
+ * @param size Size of the memory block to allocate.
+ */
+ ggml_cann_pool_alloc(ggml_cann_pool & pool, size_t size) : pool(&pool) { alloc(size); }
+
+ /**
+ * @brief Destructor that frees the allocated memory block.
+ */
+ ~ggml_cann_pool_alloc() {
+ if (ptr != nullptr) {
+ pool->free(ptr, actual_size);
+ }
+ }
+
+ /**
+ * @brief Allocates memory from the pool.
+ * @param size Size of the memory block to allocate.
+ * @return Pointer to the allocated memory block.
+ */
+ void * alloc(size_t size) {
+ GGML_ASSERT(pool != nullptr);
+ GGML_ASSERT(ptr == nullptr);
+ ptr = pool->alloc(size, &this->actual_size);
+ return ptr;
+ }
+
+ /**
+ * @brief Allocates memory from a specific memory pool.
+ * @param pool Reference to the memory pool.
+ * @param size Size of the memory block to allocate.
+ * @return Pointer to the allocated memory block.
+ */
+ void * alloc(ggml_cann_pool & pool, size_t size) {
+ this->pool = &pool;
+ return alloc(size);
+ }
+
+ /**
+ * @brief Gets the pointer to the allocated memory block.
+ * @return Pointer to the allocated memory block.
+ */
+ void * get() { return ptr; }
+
+ // Deleted copy constructor
+ ggml_cann_pool_alloc(const ggml_cann_pool_alloc &) = delete;
+
+ // Deleted move constructor
+ ggml_cann_pool_alloc(ggml_cann_pool_alloc &&) = delete;
+
+ // Deleted copy assignment operator
+ ggml_cann_pool_alloc & operator=(const ggml_cann_pool_alloc &) = delete;
+
+ // Deleted move assignment operator
+ ggml_cann_pool_alloc & operator=(ggml_cann_pool_alloc &&) = delete;
+};
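+
+/*
+ * Illustrative usage only: a scratch buffer drawn from the backend context's
+ * pool is released automatically when the allocator leaves scope, e.g.
+ *
+ *     ggml_cann_pool_alloc scratch(ctx.pool(), n_bytes);
+ *     void * buf = scratch.get();   // valid until `scratch` is destroyed
+ */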
+
+#ifdef USE_ACL_GRAPH
+struct ggml_graph_node_properties {
+ // dst tensor
+ void * node_address;
+ int64_t ne[GGML_MAX_DIMS];
+ size_t nb[GGML_MAX_DIMS];
+
+ // src tensor
+ void * src_address[GGML_MAX_SRC];
+ int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
+ size_t src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];
+
+ // op
+ ggml_op node_op;
+ int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+
+ /**
+ * @brief Check if a ggml tensor node matches this property set.
+ *
+ * This function compares all relevant fields (address, op type, shape, source inputs, op params)
+ * to determine whether the current node matches these previously recorded properties.
+ *
+ * @param node The current ggml tensor node.
+ * @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
+ */
+ bool has_matching_properties(ggml_tensor * node) {
+ if (node->data != this->node_address && node->op != GGML_OP_VIEW) {
+ return false;
+ }
+
+ if (node->op != this->node_op) {
+ return false;
+ }
+
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ if (node->ne[i] != this->ne[i]) {
+ return false;
+ }
+ if (node->nb[i] != this->nb[i]) {
+ return false;
+ }
+ }
+
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (node->src[i]) {
+ if (node->src[i]->data != this->src_address[i] && node->op != GGML_OP_VIEW) {
+ return false;
+ }
+
+ for (int d = 0; d < GGML_MAX_DIMS; d++) {
+ if (node->src[i]->ne[d] != this->src_ne[i][d]) {
+ return false;
+ }
+ if (node->src[i]->nb[d] != this->src_nb[i][d]) {
+ return false;
+ }
+ }
+ } else {
+ if (this->src_address[i] != nullptr) {
+ return false;
+ }
+ }
+ }
+
+ if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) {
+ return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
+ }
+ return true;
+ }
+};
+
+struct ggml_cann_graph {
+ ~ggml_cann_graph() {
+ if (graph != nullptr) {
+ ACL_CHECK(aclmdlRIDestroy(graph));
+ }
+ }
+
+ aclmdlRI graph = nullptr;
+
+ std::vector<ggml_graph_node_properties> ggml_graph_properties;
+
+ /**
+ * @brief Create a new CANN graph from a ggml computation graph.
+ *
+ * This function creates a new ggml_cann_graph object and fills its node properties
+ * (operation type, dimensions, strides, input sources, and operation parameters)
+ * based on the current ggml computation graph.
+ *
+ * Each node in the ggml graph is mapped to a property entry in the new CANN graph:
+ * - node address
+ * - operation type
+ * - shape (ne) and strides (nb)
+ * - source tensor addresses
+ * - operation parameters
+ *
+ * @param cgraph The current ggml computation graph.
+ * @return Pointer to the newly created ggml_cann_graph object.
+ */
+ static ggml_cann_graph * create_from_cgraph(ggml_cgraph * cgraph) {
+ ggml_cann_graph * new_graph = new ggml_cann_graph();
+ new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+
+ for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
+ ggml_tensor * node = cgraph->nodes[node_idx];
+ auto & prop = new_graph->ggml_graph_properties[node_idx];
+
+ prop.node_address = node->data;
+ prop.node_op = node->op;
+
+ std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
+ std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
+
+ for (int src = 0; src < GGML_MAX_SRC; ++src) {
+ if (node->src[src]) {
+ prop.src_address[src] = node->src[src]->data;
+ std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]);
+ std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]);
+ } else {
+ prop.src_address[src] = nullptr;
+ std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0);
+ std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0);
+ }
+ }
+
+ memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
+ }
+
+ return new_graph;
+ }
+
+ /**
+ * @brief Check whether this CANN graph matches the given ggml computation graph.
+ *
+ * This function compares the number of nodes and each node's properties
+ * (operation type, dimensions, strides, inputs, and operation parameters)
+ * to determine whether this CANN graph matches the given ggml graph.
+ *
+ * @param cgraph The current ggml computation graph.
+ * @return true if this CANN graph matches the ggml graph; false otherwise.
+ */
+ bool matches_cgraph(ggml_cgraph * cgraph) {
+ if (this->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
+ return false;
+ }
+
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
+ if (!this->ggml_graph_properties[i].has_matching_properties(cgraph->nodes[i])) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+};
+
+/**
+ * @brief LRU cache for managing ggml_cann_graph objects.
+ *
+ * This class maintains a list of shared_ptr to ggml_cann_graph objects
+ * and enforces a maximum capacity. It provides methods to push new graphs,
+ * move existing graphs to the front (most recently used), and clear the cache.
+ */
+struct ggml_cann_graph_lru_cache {
+ size_t capacity; /**< Maximum number of graphs in the cache. */
+
+ std::list<ggml_cann_graph *> cache_list; /**< List storing cached graphs as raw pointers. */
+
+ ggml_cann_graph_lru_cache() { capacity = parse_integer(get_env_as_lowercase("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12")); }
+
+ /**
+ * @brief Push a new graph to the front of the cache.
+ * If the cache exceeds capacity, the least recently used graph is deleted.
+ * @param new_node Pointer to the new ggml_cann_graph to cache.
+ * Ownership is transferred to the cache (cache will delete it).
+ */
+ void push(ggml_cann_graph * new_node) {
+ if (cache_list.size() >= capacity) {
+ ggml_cann_graph * old = cache_list.back();
+ cache_list.pop_back();
+ delete old; // free the old graph
+ }
+ cache_list.push_front(new_node);
+ }
+
+ /**
+ * @brief Clear all graphs from the cache (also frees memory).
+ */
+ void clear() {
+ for (auto ptr : cache_list) {
+ delete ptr;
+ }
+ cache_list.clear();
+ }
+
+ /**
+ * @brief Destructor that clears the cache and frees all cached graphs.
+ */
+ ~ggml_cann_graph_lru_cache() { clear(); }
+
+ /**
+ * @brief Find a cached CANN graph that matches the given ggml graph and move it to front.
+ *
+ * This function iterates through the cached CANN graphs stored in the LRU cache and
+     * compares them against the given ggml computation graph. If a matching graph is found,
+     * it is promoted to the front of the LRU cache and the function returns true. Otherwise,
+     * it returns false.
+ *
+ * @param cgraph The current ggml computation graph.
+ * @return true if found; false otherwise.
+ */
+ bool find_and_move_to_front(ggml_cgraph * cgraph) {
+ for (auto & graph_ptr : this->cache_list) {
+ if (graph_ptr->matches_cgraph(cgraph)) {
+ cache_list.remove(graph_ptr);
+ cache_list.push_front(graph_ptr);
+ return true;
+ }
+ }
+ return false;
+ }
+};
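+
+/*
+ * Illustrative control flow only (assumed usage, not taken verbatim from the
+ * implementation): before launching a graph, the backend can probe the cache
+ * and record a new graph on a miss:
+ *
+ *     if (!ctx.graph_lru_cache.find_and_move_to_front(cgraph)) {
+ *         ctx.graph_lru_cache.push(ggml_cann_graph::create_from_cgraph(cgraph));
+ *         // ... capture and execute the new ACL graph ...
+ *     }
+ */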
+#endif // USE_ACL_GRAPH
+
+struct ggml_cann_rope_cache {
+ ~ggml_cann_rope_cache() {
+ if (theta_scale_cache) {
+ ACL_CHECK(aclrtFree(theta_scale_cache));
+ }
+ if (sin_cache) {
+ ACL_CHECK(aclrtFree(sin_cache));
+ }
+ if (cos_cache) {
+ ACL_CHECK(aclrtFree(cos_cache));
+ }
+ if (position_select_index) {
+ ACL_CHECK(aclrtFree(position_select_index));
+ }
+ if (theta_scale_exp_host) {
+ free(theta_scale_exp_host);
+ }
+ if (position_select_index_host) {
+ free(position_select_index_host);
+ }
+ if (yarn_ramp_cache) {
+ ACL_CHECK(aclrtFree(yarn_ramp_cache));
+ }
+ }
+
+ bool equal(int64_t theta_scale_length,
+ int64_t position_length,
+ float ext_factor,
+ float theta_scale,
+ float freq_scale,
+ float attn_factor,
+ bool is_neox,
+ bool indep_sects,
+ bool mrope_used,
+ bool is_imrope,
+ int sections[4]) {
+ return this->theta_scale_length == theta_scale_length && this->position_length == position_length &&
+ this->ext_factor == ext_factor && this->theta_scale == theta_scale && this->freq_scale == freq_scale &&
+ this->attn_factor == attn_factor && this->is_neox == is_neox && this->indep_sects == indep_sects &&
+ this->mrope_used == mrope_used && this->is_imrope == is_imrope && this->sections[0] == sections[0] &&
+ this->sections[1] == sections[1] && this->sections[2] == sections[2] && this->sections[3] == sections[3];
+ }
+
+ void set(int64_t theta_scale_length,
+ int64_t position_length,
+ float ext_factor,
+ float theta_scale,
+ float freq_scale,
+ float attn_factor,
+ bool is_neox,
+ bool indep_sects,
+ bool mrope_used,
+ bool is_imrope,
+ int sections[4]) {
+ this->theta_scale_length = theta_scale_length;
+ this->position_length = position_length;
+ this->ext_factor = ext_factor;
+ this->theta_scale = theta_scale;
+ this->freq_scale = freq_scale;
+ this->attn_factor = attn_factor;
+ this->is_neox = is_neox;
+ this->indep_sects = indep_sects;
+ this->mrope_used = mrope_used;
+ this->is_imrope = is_imrope;
+ this->sections[0] = sections[0];
+ this->sections[1] = sections[1];
+ this->sections[2] = sections[2];
+ this->sections[3] = sections[3];
+ }
+
+    // memory caches, prepared before inference.
+ void * theta_scale_cache = nullptr;
+ float * theta_scale_exp_host = nullptr;
+ int * position_select_index_host = nullptr;
+ void * position_select_index = nullptr;
+ void * yarn_ramp_cache = nullptr;
+ // sin/cos cache, used only to accelerate first layer on each device
+ void * sin_cache = nullptr;
+ void * cos_cache = nullptr;
+ // Properties to check before reusing the sincos cache
+ int64_t theta_scale_length = 0;
+ int64_t position_length = 0;
+ bool cached = false;
+ float ext_factor = 0.0f;
+ float theta_scale = 0.0f;
+ float freq_scale = 0.0f;
+ float attn_factor = 0.0f;
+ bool is_neox = false;
+ bool indep_sects = false;
+ bool mrope_used = false;
+ int sections[4] = { 0, 0, 0, 0 };
+ bool is_imrope = false;
+};
+
+struct ggml_cann_tensor_cache {
+ ~ggml_cann_tensor_cache() {
+ if (cache != nullptr) {
+ ACL_CHECK(aclrtFree(cache));
+ }
+ }
+
+ void * cache = nullptr;
+ int64_t size = 0;
+};
+
+/**
+ * @brief Context for managing CANN backend operations.
+ */
+struct ggml_backend_cann_context {
+ int32_t device; /**< Device ID. */
+ std::string name; /**< Name of the device. */
+ std::string description; /**< Description of the device. */
+ aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
+#ifdef USE_ACL_GRAPH
+ /// Cached CANN ACL graph used for executing the current ggml computation graph.
+ ggml_cann_graph_lru_cache graph_lru_cache;
+ bool acl_graph_mode = true;
+#endif
+ bool async_mode;
+ // Rope Cache
+ ggml_cann_rope_cache rope_cache;
+ // Constant Pool
+ ggml_cann_tensor_cache rms_norm_one_tensor_cache;
+ ggml_cann_tensor_cache rms_norm_zero_tensor_cache;
+
+ aclrtStream streams[GGML_CANN_MAX_STREAMS] = { nullptr }; /**< Array of streams for the device. */
+
+ /**
+ * @brief Constructor for initializing the context with a given device.
+ * @param device Device ID.
+ */
+ explicit ggml_backend_cann_context(int device) : device(device), name("CANN" + std::to_string(device)) {
+ ggml_cann_set_device(device);
+ description = aclrtGetSocName();
+
+#ifdef USE_ACL_GRAPH
+ acl_graph_mode = parse_bool(get_env_as_lowercase("GGML_CANN_ACL_GRAPH").value_or("on"));
+ GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n", __func__, device, acl_graph_mode ? "GRAPH" : "EAGER",
+ acl_graph_mode ? "acl graph enabled" : "acl graph disabled");
+#endif
+ }
+
+ /**
+ * @brief Destructor for cleaning up resources.
+ */
+ ~ggml_backend_cann_context() {
+ ggml_cann_set_device(device);
+ if (copy_event != nullptr) {
+ ACL_CHECK(aclrtDestroyEvent(copy_event));
+ }
+ for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) {
+ if (streams[i] != nullptr) {
+ ACL_CHECK(aclrtDestroyStream(streams[i]));
+ }
+ }
+ }
+
+ /**
+ * @brief Get or create a stream for a given index.
+ * @param stream Index of the stream.
+ * @return The stream corresponding to the given index.
+ */
+ aclrtStream stream(int stream) {
+ if (streams[stream] == nullptr) {
+ // If the device is not set here, destroying the stream later may cause a mismatch
+ // between the thread contexts where the stream was created and destroyed.
+            // However, the observed device_id, thread_id, and stream were all consistent, so the exact cause is unclear.
+ ACL_CHECK(aclrtSetDevice(device));
+ ACL_CHECK(aclrtCreateStream(&streams[stream]));
+ }
+ return streams[stream];
+ }
+
+ /**
+ * @brief Get or create the default stream (index 0).
+ * @return The default stream.
+ */
+ aclrtStream stream() { return stream(0); }
+
+ // TODO: each stream should have a memory pool.
+ std::unique_ptr<ggml_cann_pool> mem_pool; /**< Memory pool for the device. */
+
+ /**
+ * @brief Create a new memory pool for a given device.
+ * @param device Device ID.
+ * @return A unique pointer to the new memory pool.
+ */
+ static std::unique_ptr<ggml_cann_pool> new_pool_for_device(int device);
+
+ /**
+ * @brief Get or create the memory pool for the context.
+ * @return Reference to the memory pool.
+ */
+ ggml_cann_pool & pool() {
+ if (mem_pool == nullptr) {
+ mem_pool = new_pool_for_device(device);
+ }
+ return *mem_pool;
+ }
+};
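+
+/*
+ * Illustrative usage only: obtaining the lazily created default stream from a
+ * context and waiting for its queued work to finish, assuming `ctx` is a
+ * ggml_backend_cann_context:
+ *
+ *     aclrtStream s = ctx.stream();          // stream 0, created on first use
+ *     ACL_CHECK(aclrtSynchronizeStream(s));  // block until queued tasks complete
+ */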
+
+#endif // CANN_COMMON_H
diff --git a/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp b/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp
new file mode 100644
index 0000000..3f3de9f
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp
@@ -0,0 +1,2881 @@
+/*
+ * Copyright (c) 2023-2026 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "ggml-cann.h"
+
+#include "ggml-backend-impl.h"
+#include "ggml-cann/aclnn_ops.h"
+#include "ggml-cann/common.h"
+#include "ggml-impl.h"
+#include "ggml.h"
+
+#include <acl/acl.h>
+#include <aclnnop/aclnn_trans_matmul_weight.h>
+#include <stdarg.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <mutex>
+#include <optional>
+#include <queue>
+#include <unordered_map>
+#include <unordered_set>
+
+#define GGML_COMMON_DECL_C
+
+#include "ggml-common.h"
+
+#define GGML_CANN_NAME "CANN"
+
+/**
+ * @brief Handles CANN errors by printing an error message and aborting.
+ *
+ * @param stmt The statement that caused the error.
+ * @param func The function in which the error occurred.
+ * @param file The file in which the error occurred.
+ * @param line The line number where the error occurred.
+ * @param msg The error message.
+ */
+[[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
+ int32_t id = -1;
+ aclrtGetDevice(&id);
+
+ GGML_LOG_ERROR("CANN error: %s\n", msg);
+ GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
+ GGML_LOG_ERROR(" %s\n", stmt);
+ // abort with GGML_ASSERT to get a stack trace
+ GGML_ABORT("CANN error");
+}
+
+// Thread-local variable to record the current device of this thread.
+thread_local int g_current_cann_device = -1;
+
+/**
+ * @brief Set the CANN device to be used.
+ *
+ * @param device The target device ID to set.
+ */
+void ggml_cann_set_device(const int32_t device) {
+ // int current_device = -1;
+ // Note: In some CANN versions, if no device has been set yet,
+ // aclrtGetDevice(&current_device) may return 0 by default.
+ // aclrtGetDevice(&current_device);
+
+ // If the current device is already the target one, no need to switch.
+ if (device == g_current_cann_device) {
+ return;
+ }
+
+ // Switch to the new device.
+ ACL_CHECK(aclrtSetDevice(device));
+
+ // Update the global device record.
+ g_current_cann_device = device;
+}
+
+/**
+ * @brief Get the value of the specified environment variable (name), converted to lowercase.
+ * @return The lowercase value as a std::string if the variable is set; std::nullopt otherwise.
+ */
+std::optional<std::string> get_env_as_lowercase(const std::string & name) {
+ const char * val = std::getenv(name.c_str());
+ if (!val) {
+ return std::nullopt;
+ }
+ std::string res = std::string(val);
+ std::transform(res.begin(), res.end(), res.begin(), ::tolower);
+ return res;
+}
+
+/**
+ * @brief Check whether a string value represents an enabled/true setting.
+ */
+bool parse_bool(const std::string & value) {
+ static const std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
+ return valid_values.find(value) != valid_values.end();
+}
+
+/**
+ * @brief Parse a string as an integer, returning 0 if invalid.
+ *
+ * This function attempts to convert the input string `value` to an `int`.
+ * If the string is not a valid integer or is out of the `int` range,
+ * it returns 0.
+ *
+ * @param value The string to parse.
+ * @return The parsed integer, or 0 if conversion fails.
+ */
+int parse_integer(const std::string & value) {
+ try {
+ return std::stoi(value);
+ } catch (...) {
+ return 0;
+ }
+}
+
+/**
+ * @brief Initialize the CANN device information.
+ *
+ * This function initializes the CANN device information by obtaining the
+ * device count and setting the memory allocation granularity for each device.
+ *
+ * @return A structure containing the device information.
+ */
+static ggml_cann_device_info ggml_cann_init() {
+ ggml_cann_device_info info = {};
+
+ aclError err = aclrtGetDeviceCount((uint32_t *) &info.device_count);
+
+ if (err != ACL_SUCCESS) {
+ GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", __func__, aclGetRecentErrMsg());
+ return info;
+ }
+
+ GGML_ASSERT(info.device_count <= GGML_CANN_MAX_DEVICES);
+
+ for (int id = 0; id < info.device_count; ++id) {
+ aclrtPhysicalMemProp prop = {};
+ prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
+ prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
+ prop.memAttr = ACL_HBM_MEM_HUGE;
+ prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
+ prop.location.id = id;
+ prop.reserve = 0;
+ err = aclrtMemGetAllocationGranularity(&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
+ &info.devices[id].vmm_granularity);
+ info.devices[id].vmm = err == ACL_SUCCESS;
+
+ size_t free, total;
+ ggml_backend_cann_get_device_memory(id, &free, &total);
+ info.devices[id].total_vram = free;
+ }
+
+ // TODO: add more device info later.
+ return info;
+}
+
+/**
+ * @brief Retrieve the CANN device information.
+ *
+ * This function returns a reference to a structure containing the CANN device
+ * information. The device information is initialized once and reused on
+ * subsequent calls.
+ *
+ * @return A reference to the structure containing the device information.
+ */
+const ggml_cann_device_info & ggml_cann_info() {
+ static ggml_cann_device_info info = ggml_cann_init();
+ return info;
+}
+
+//#define DEBUG_CANN_MALLOC
+/**
+ * @brief A pool of CANN buffers (priority segment buffer).
+ *
+ * This class manages a pool of CANN buffers for a specific device.
+ */
+struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
+ /**
+ * @brief The maximum reuse margin for a buffer.
+ */
+ static const size_t max_reuse_margin = 1ull << 22; // 4MB
+
+ /**
+ * @brief The minimum free margin for a buffer.
+ */
+ static const size_t min_free_margin = 1ull << 20; // 1MB
+
+ /**
+ * @brief The alignment for buffer allocation.
+ */
+ static const size_t alignment = 128;
+
+ /**
+ * @brief The device ID associated with this buffer pool.
+ */
+ int device;
+
+ /**
+ * @brief Whether to disable clean during buffer allocation.
+ */
+ bool disable_clean = false;
+
+ /**
+ * @brief Structure representing a CANN buffer.
+ */
+ struct ggml_cann_buffer {
+ void * ptr = nullptr; ///< Pointer to the buffer.
+ size_t size = 0; ///< Size of the buffer.
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
+
+ bool operator>(const ggml_cann_buffer & other) const { return size > other.size; }
+ };
+
+ /**
+ * @brief Array of CANN buffers in the pool.
+ */
+ std::unordered_map<void *, size_t> buffer_pool;
+ std::priority_queue<ggml_cann_buffer, std::vector<ggml_cann_buffer>, std::greater<>> free_buffers;
+
+ /**
+ * @brief Total size of all buffers in the pool.
+ */
+ size_t pool_size = 0;
+
+ /**
+ * @brief Constructor to initialize the buffer pool for a specific device.
+ *
+ * @param device The device ID to associate with this buffer pool.
+ */
+ explicit ggml_cann_pool_buf_prio(int device) : device(device) {
+ disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
+ }
+
+ /**
+ * @brief Destructor to free all buffers in the pool.
+ */
+ ~ggml_cann_pool_buf_prio() {
+ ggml_cann_set_device(device);
+ for (auto & [b_ptr, b_size] : buffer_pool) {
+ aclrtFree(b_ptr);
+ pool_size -= b_size;
+ }
+ buffer_pool.clear();
+ GGML_ASSERT(pool_size == 0);
+ }
+
+ /**
+ * @brief Allocate a buffer of the given size.
+ *
+ * @param size The size of the buffer to allocate.
+ * @param actual_size A pointer to a variable to receive the actual size of
+ * the allocated buffer.
+ * @return A pointer to the allocated buffer.
+ */
+ void * alloc(size_t size, size_t * actual_size) override {
+ size = GGML_PAD(size, alignment);
+ if (size == 0) {
+ size = alignment;
+ }
+
+ void * ptr = nullptr;
+ auto now = std::chrono::steady_clock::now();
+
+ std::vector<ggml_cann_buffer> free_buffers_rest;
+ free_buffers_rest.reserve(free_buffers.size());
+ while (!free_buffers.empty()) {
+ auto b = free_buffers.top();
+ free_buffers.pop();
+
+ if (b.size >= size) {
+ // reuse the buffer if the size is enough
+ const size_t margin = b.size - size;
+ if (margin <= max_reuse_margin) {
+ *actual_size = b.size;
+ ptr = b.ptr;
+#ifdef DEBUG_CANN_MALLOC
+ GGML_LOG_INFO(
+ "cann pool[%d]: reused %p, "
+ "pool_size = %5u MB, "
+ "size = %5u MB, "
+ "margin = %5u MB\n",
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
+#endif
+ break;
+ }
+ }
+
+ bool should_clean = !disable_clean && b.size > min_free_margin &&
+ std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
+ if (should_clean) {
+                // free the buffer if it has been idle long enough and is large enough to be worth releasing
+ ACL_CHECK(aclrtFree(b.ptr));
+ pool_size -= b.size;
+ buffer_pool.erase(b.ptr);
+#ifdef DEBUG_CANN_MALLOC
+ GGML_LOG_INFO(
+ "cann pool[%d]: clean %p, "
+ "pool_size = %5u MB, "
+ "size = %5u MB\n",
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
+#endif
+ continue;
+ }
+ free_buffers_rest.push_back(b);
+ }
+ for (ggml_cann_buffer & b : free_buffers_rest) {
+ free_buffers.push(std::move(b));
+ }
+
+#ifdef DEBUG_CANN_MALLOC
+ GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device,
+ (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
+#endif
+ if (ptr != nullptr) {
+ return ptr;
+ }
+
+ // allocate a new buffer if no buffer can be reused
+ ggml_cann_set_device(device);
+ ACL_CHECK(aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+ *actual_size = size;
+ pool_size += size;
+#ifdef DEBUG_CANN_MALLOC
+ GGML_LOG_INFO(
+ "cann pool[%d]: allocate %p, "
+ "pool_size = %5u MB, "
+ "size = %5u MB\n",
+ device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576));
+#endif
+ buffer_pool.emplace(ptr, size);
+ return ptr;
+ }
+
+ /**
+ * @brief Free a buffer and return it to the pool.
+ *
+ * @param ptr Pointer to the buffer to free.
+ * @param size Size of the buffer to free.
+ */
+ void free(void * ptr, size_t size) override {
+ GGML_UNUSED(size);
+ auto it = buffer_pool.find(ptr);
+ if (it == buffer_pool.end()) {
+ GGML_ABORT("cann pool[%d]: buffer %p not found in pool\n", device, ptr);
+ }
+
+ auto now = std::chrono::steady_clock::now();
+ free_buffers.emplace(ggml_cann_buffer{ ptr, it->second, now });
+#ifdef DEBUG_CANN_MALLOC
+ GGML_LOG_INFO(
+ "cann pool[%d]: return %p, "
+ "pool_size = %5u MB\n",
+ device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
+#endif
+ }
+};
+
+/**
+ * @brief A pool of CANN buffers (segment buffer).
+ *
+ * This class manages a pool of CANN buffers for a specific device.
+ */
+struct ggml_cann_pool_buf : public ggml_cann_pool {
+ /**
+ * @brief The maximum reuse margin for a buffer.
+ */
+ static const size_t max_reuse_margin = 1ull << 22; // 4MB
+
+ /**
+ * @brief The minimum free margin for a buffer.
+ */
+ static const size_t min_free_margin = 1ull << 20; // 1MB
+
+ /**
+ * @brief The alignment for buffer allocation.
+ */
+ static const size_t alignment = 128;
+
+ /**
+ * @brief The maximum number of buffers in the pool.
+ */
+ static const int MAX_BUFFERS = 256;
+
+ /**
+ * @brief The device ID associated with this buffer pool.
+ */
+ int device;
+
+ /**
+ * @brief Whether to disable clean during buffer allocation.
+ */
+ bool disable_clean = false;
+
+ /**
+ * @brief Structure representing a CANN buffer.
+ */
+ struct ggml_cann_buffer {
+ void * ptr = nullptr; ///< Pointer to the buffer memory.
+ size_t size = 0; ///< Size of the buffer.
+ bool used = false; ///< Whether the buffer is currently in use.
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
+ };
+
+ /**
+ * @brief Array of CANN buffers in the pool.
+ */
+ ggml_cann_buffer buffer_pool[MAX_BUFFERS] = {};
+
+ /**
+ * @brief Total size of all buffers in the pool.
+ */
+ size_t pool_size = 0;
+
+ /**
+ * @brief Constructor to initialize the buffer pool for a specific device.
+ *
+ * @param device The device ID to associate with this buffer pool.
+ */
+ explicit ggml_cann_pool_buf(int device) : device(device) {
+ disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
+ }
+
+ /**
+ * @brief Destructor to free all buffers in the pool.
+ */
+ ~ggml_cann_pool_buf() {
+ ggml_cann_set_device(device);
+ for (int i = 0; i < MAX_BUFFERS; ++i) {
+ ggml_cann_buffer & b = buffer_pool[i];
+ if (b.ptr != nullptr) {
+ aclrtFree(b.ptr);
+ pool_size -= b.size;
+ }
+ }
+ GGML_ASSERT(pool_size == 0);
+ }
+
+ /**
+ * @brief Allocate a buffer of the given size.
+ *
+ * @param size The size of the buffer to allocate.
+ * @param actual_size A pointer to a variable to receive the actual size of
+ * the allocated buffer.
+ * @return A pointer to the allocated buffer.
+ */
+ void * alloc(size_t size, size_t * actual_size) override {
+ size = GGML_PAD(size, alignment);
+ if (size == 0) {
+ size = alignment;
+ }
+
+ void * ptr = nullptr;
+ auto now = std::chrono::steady_clock::now();
+
+ int i = 0;
+ for (; i < MAX_BUFFERS; ++i) {
+ ggml_cann_buffer & b = buffer_pool[i];
+ if (b.ptr == nullptr) {
+ break;
+ }
+ if (b.used) {
+ continue;
+ }
+ if (b.size >= size) {
+ // reuse the buffer if the size is enough
+ const size_t margin = b.size - size;
+ if (margin <= max_reuse_margin) {
+ *actual_size = b.size;
+ b.used = true;
+ ptr = b.ptr;
+#ifdef DEBUG_CANN_MALLOC
+ GGML_LOG_INFO(
+ "cann pool[%d]: reused %p, "
+ "pool_size = %5u MB, "
+ "size = %5u MB, "
+ "margin = %5u MB\n",
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
+#endif
+ break;
+ }
+ }
+
+ bool should_clean = !disable_clean && b.size > min_free_margin &&
+ std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
+ if (should_clean) {
+                // free the buffer if it has been idle long enough and is large enough to be worth releasing
+ ACL_CHECK(aclrtFree(b.ptr));
+ pool_size -= b.size;
+#ifdef DEBUG_CANN_MALLOC
+ GGML_LOG_INFO(
+ "cann pool[%d]: clean %p, "
+ "pool_size = %5u MB, "
+ "size = %5u MB\n",
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
+#endif
+ b.ptr = nullptr;
+ }
+ }
+ if (ptr != nullptr) {
+ return ptr;
+ }
+
+ if (i < MAX_BUFFERS) {
+ // allocate a new buffer if no buffer can be reused
+ ggml_cann_buffer & b = buffer_pool[i];
+ ggml_cann_set_device(device);
+ ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+ pool_size += size;
+ *actual_size = size;
+ b.size = size;
+ b.used = true;
+ if (i >= MAX_BUFFERS - 8) {
+ GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
+ }
+#ifdef DEBUG_CANN_MALLOC
+ GGML_LOG_INFO(
+ "cann pool[%d]: allocate %p, "
+ "pool_size = %5u MB, "
+ "size = %5u MB\n",
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
+#endif
+ return b.ptr;
+ }
+
+ GGML_ABORT("cann pool[%d]: slots full\n", device);
+ }
+
+ /**
+ * @brief Free a buffer and return it to the pool.
+ *
+ * @param ptr Pointer to the buffer to free.
+ * @param size Size of the buffer to free.
+ */
+ void free(void * ptr, size_t size) override {
+ GGML_UNUSED(size);
+ for (int i = 0; i < MAX_BUFFERS; ++i) {
+ ggml_cann_buffer & b = buffer_pool[i];
+ if (b.ptr != ptr) {
+ continue;
+ }
+ b.used = false;
+ b.last_used = std::chrono::steady_clock::now();
+#ifdef DEBUG_CANN_MALLOC
+ GGML_LOG_INFO(
+ "cann pool[%d]: return %p, "
+ "pool_size = %5u MB\n",
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
+#endif
+ return;
+ }
+ GGML_ABORT("cann pool[%d]: slots full\n", device);
+ }
+};
+
+/**
+ * @brief A pool of CANN buffers with virtual memory.
+ *
+ * This class manages a pool of CANN buffers with virtual memory for a specific
+ * device.
+ */
+struct ggml_cann_pool_vmm : public ggml_cann_pool {
+ /**
+     * @brief The maximum size of the virtual memory pool (set to the device's total VRAM).
+ */
+ size_t max_size;
+
+ /**
+ * @brief The device ID associated with this buffer pool.
+ */
+ int device;
+
+ /**
+ * @brief Pointer to the start of the virtual memory pool.
+ */
+ void * pool_addr = 0;
+
+ /**
+ * @brief Amount of virtual memory used in the pool.
+ */
+ size_t pool_used = 0;
+
+ /**
+ * @brief Total size of the virtual memory pool.
+ */
+ size_t pool_size = 0;
+
+ /**
+ * @brief Allocation granularity for the virtual memory pool.
+ */
+ size_t granularity;
+
+ /**
+ * @brief Handles for the physical memory allocated.
+ */
+ std::vector<aclrtDrvMemHandle> handles;
+
+ /**
+ * @brief Offsets for the mapped memory regions.
+ */
+ std::vector<void *> map_offsets;
+
+ /**
+ * @brief Constructor to initialize the buffer pool with virtual memory for
+ * a specific device.
+ *
+ * @param device The device ID to associate with this buffer pool.
+ */
+ explicit ggml_cann_pool_vmm(int device) : device(device) {
+ auto dev = ggml_cann_info().devices[device];
+ granularity = dev.vmm_granularity;
+ max_size = dev.total_vram;
+ }
+
+ /**
+ * @brief Destructor to free all buffers in the virtual memory pool.
+ */
+ ~ggml_cann_pool_vmm() {
+ if (pool_addr != 0) {
+ for (auto & offset : map_offsets) {
+ ACL_CHECK(aclrtUnmapMem(offset));
+ }
+ for (auto & handle : handles) {
+ ACL_CHECK(aclrtFreePhysical(handle));
+ }
+ ACL_CHECK(aclrtReleaseMemAddress(pool_addr));
+ }
+ }
+
+ /**
+ * @brief Allocate a buffer of the given size in the virtual memory pool.
+ *
+ * @param size The size of the buffer to allocate.
+ * @param actual_size A pointer to a variable to receive the actual size of
+ * the allocated buffer.
+ * @return A pointer to the allocated buffer.
+ */
+ void * alloc(size_t size, size_t * actual_size) override {
+ // round up the allocation size to the alignment to ensure that all
+ // allocations are aligned for all data types
+ const size_t alignment = 128;
+ size = GGML_PAD(size, alignment);
+ if (size == 0) {
+ size = alignment;
+ }
+
+ size_t avail = pool_size - pool_used;
+
+ if (size > avail) {
+ // round up to the next multiple of the granularity
+ size_t reserve_size = size - avail;
+ reserve_size = GGML_PAD(reserve_size, granularity);
+
+ GGML_ASSERT(pool_size + reserve_size <= max_size);
+
+ // allocate more physical memory
+ aclrtPhysicalMemProp prop = {};
+ prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
+ prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
+ prop.memAttr = ACL_HBM_MEM_HUGE;
+ prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
+ prop.location.id = device;
+ prop.reserve = 0;
+ aclrtDrvMemHandle handle;
+ ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0));
+
+ // reserve virtual address space (if not already reserved)
+ if (pool_addr == 0) {
+ ACL_CHECK(aclrtReserveMemAddress(&pool_addr, max_size, 0, NULL, 1));
+ }
+
+ // map at the end of the pool
+ ACL_CHECK(aclrtMapMem((char *) pool_addr + pool_size, reserve_size, 0, handle, 0));
+
+ handles.push_back(handle);
+ map_offsets.push_back((char *) pool_addr + pool_size);
+
+ // add to the pool
+ pool_size += reserve_size;
+
+#ifdef DEBUG_CANN_MALLOC
+ GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", device,
+ (unsigned long long) (pool_size / 1024 / 1024),
+ (unsigned long long) (reserve_size / 1024 / 1024));
+#endif
+ }
+
+ GGML_ASSERT(pool_addr != 0);
+
+ void * ptr = (void *) ((char *) pool_addr + pool_used);
+ *actual_size = size;
+ pool_used += size;
+
+#ifdef DEBUG_CANN_MALLOC
+ GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size,
+ (unsigned long long) ptr);
+#endif
+ return ptr;
+ }
+
+ /**
+ * @brief Free a buffer and return it to the virtual memory pool.
+ *
+ * @param ptr Pointer to the buffer to free.
+ * @param size Size of the buffer to free.
+ */
+ void free(void * ptr, size_t size) override {
+#ifdef DEBUG_CANN_MALLOC
+ GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size,
+ (unsigned long long) ptr);
+#endif
+
+ pool_used -= size;
+
+ // all deallocations must be in reverse order of the allocations
+ GGML_ASSERT(ptr == (void *) ((char *) pool_addr + pool_used));
+ }
+};
+
+/**
+ * @brief Create a new CANN pool for a specific device.
+ *
+ * Factory method to create a new CANN pool object based on the device type.
+ *
+ * @param device The device ID for which to create the pool.
+ * @return A unique pointer to the created CANN pool.
+ */
+std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(int device) {
+ std::string mem_pool_type = get_env_as_lowercase("GGML_CANN_MEM_POOL").value_or("");
+
+ if (mem_pool_type == "prio") {
+ GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
+ }
+
+ if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
+ GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
+ }
+
+ GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
+}
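+
+// Selection summary (derived from the logic above): GGML_CANN_MEM_POOL=prio selects the
+// priority-queue buffer pool, GGML_CANN_MEM_POOL=leg forces the legacy fixed-slot buffer
+// pool, and otherwise the vmm pool is used whenever the device supports virtual memory.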
+
+// cann buffer
+/**
+ * @brief Context for managing a CANN buffer associated with a specific device.
+ *
+ * This structure holds information about a CANN buffer, including the device
+ * ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID.
+ */
+struct ggml_backend_cann_buffer_context {
+ int32_t device; ///< The device ID associated with this buffer context.
+ void * dev_ptr = nullptr; ///< Pointer to the device memory allocated for the buffer.
+
+ /**
+ * @brief Constructor to initialize the CANN buffer context.
+ *
+ * @param device The device ID associated with this buffer context.
+ * @param dev_ptr Pointer to the device memory allocated for the buffer.
+ */
+ ggml_backend_cann_buffer_context(int32_t device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
+
+ /**
+ * @brief Destructor to free the device memory allocated for the buffer.
+ */
+ ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
+};
+
+// cann buffer type
+/**
+ * @brief Structure representing context information for a specific backend
+ * buffer type.
+ */
+struct ggml_backend_cann_buffer_type_context {
+ int32_t device; /**< Device identifier associated with the buffer context. */
+ std::string name; /**< Name associated with the buffer context. */
+};
+
+/**
+ * @brief Retrieves the name associated with a CANN buffer type.
+ *
+ * This function returns the descriptive name associated with the specified
+ * CANN buffer type context.
+ *
+ * @param buft Pointer to the buffer type context.
+ * @return Const pointer to the C-style string containing the name.
+ */
+static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
+
+ return buft_ctx->name.c_str();
+}
+
+/**
+ * @brief Checks if the backend buffer type is associated with the CANN backend.
+ *
+ * This function checks whether the provided backend buffer type is associated
+ * with the CANN backend based on the comparison of its name retrieval function
+ * pointer.
+ *
+ * @param buft Pointer to the backend buffer type to check.
+ * @return bool Returns true if the buffer type is associated with the CANN
+ * backend, otherwise false.
+ */
+static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
+ return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
+}
+
+/**
+ * @brief Free resources associated with a CANN buffer.
+ *
+ * This function frees the resources associated with a CANN buffer, including
+ * its context.
+ *
+ * @param buffer The CANN buffer to free.
+ */
+static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
+ delete ctx;
+}
+
+/**
+ * @brief Retrieve the base pointer of a CANN buffer.
+ *
+ * This function returns the base pointer of a CANN buffer, which points to the
+ * device memory allocated for the buffer.
+ *
+ * @param buffer The CANN buffer whose base pointer is to be retrieved.
+ * @return A pointer to the base of the device memory allocated for the buffer.
+ */
+static void * ggml_backend_cann_buffer_get_base(ggml_backend_buffer_t buffer) {
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
+ return ctx->dev_ptr;
+}
+
+/**
+ * @brief Transform quantized Q4.0 tensor data into a format suitable for CANN
+ * processing.
+ *
+ * This function transforms quantized Q4.0 tensor data into a format suitable
+ * for CANN processing. It extracts quantization values and scales from the
+ * source data and prepares them in a format expected by CANN operations.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source data in Q4.0 format.
+ * @param dst Pointer to the destination buffer where transformed data will be
+ * stored.
+ */
+static void ggml_backend_cann_transform_q4_0(ggml_tensor * tensor, const void * src, void * dst) {
+ int64_t n_elems = ggml_nelements(tensor);
+ int64_t groups = n_elems / QK4_0;
+ size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
+
+ uint8_t * quant_offset = (uint8_t *) dst;
+ uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
+
+ for (int i = 0; i < groups; i++) {
+ const block_q4_0 * group = (const block_q4_0 *) ((const char *) src + i * sizeof(block_q4_0));
+ *scale_offset = group->d;
+ scale_offset++;
+
+ // 0-15
+ for (int j = 0; j < QK4_0 / 2; j += 2) {
+ (*quant_offset) = (group->qs[j] & 0x0F);
+ (*quant_offset) |= ((group->qs[j + 1] << 4));
+ quant_offset++;
+ }
+
+ // 16-31
+ for (int j = 0; j < QK4_0 / 2; j += 2) {
+ (*quant_offset) = (group->qs[j] >> 4);
+ (*quant_offset) |= (group->qs[j + 1] & 0xF0);
+ quant_offset++;
+ }
+ }
+
+    // convert each unsigned 4-bit value (uint4b_t) to signed int4b_t by subtracting 8,
+    // i.e. flip the high bit of both nibbles (XOR with 0x88)
+ for (quant_offset = (uint8_t *) dst; quant_offset < (uint8_t *) dst + quant_bytes; quant_offset++) {
+ (*quant_offset) ^= 0x88;
+ }
+}
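+
+// Resulting destination layout (sketch derived from the code above): all 4-bit
+// quants are packed first, followed by the per-group fp16 scales:
+//   [ quants: n_elems / 2 bytes ][ scales: (n_elems / QK4_0) * sizeof(uint16_t) bytes ]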
+
+/**
+ * @brief Transform CANN processed data back into quantized Q4.0 format.
+ *
+ * This function transforms CANN processed data back into quantized Q4.0 format.
+ * It reverses the transformation performed by
+ * ggml_backend_cann_transform_q4_0(), converting the data back into its
+ * original quantized form.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source buffer containing transformed data.
+ * @param dst Pointer to the destination buffer where the Q4.0 formatted data
+ * will be stored.
+ */
+static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor * tensor, void * src, void * dst) {
+ int64_t n_elems = ggml_nelements(tensor);
+ int64_t groups = n_elems / QK4_0;
+ size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
+
+ uint8_t * quant_offset = (uint8_t *) src;
+ uint16_t * scale_offset = (uint16_t *) ((char *) src + quant_bytes);
+
+ for (; quant_offset < (uint8_t *) src + quant_bytes; quant_offset++) {
+ (*quant_offset) ^= 0x88;
+ }
+ quant_offset = (uint8_t *) src;
+
+ for (int i = 0; i < groups; i++) {
+ block_q4_0 * group = (block_q4_0 *) ((char *) dst + i * sizeof(block_q4_0));
+ group->d = *scale_offset;
+ scale_offset++;
+
+ // 0-15
+ for (int j = 0; j < QK4_0 / 2; j += 2) {
+ group->qs[j] = ((*quant_offset) & 0x0F);
+ group->qs[j + 1] = ((*quant_offset) >> 4);
+ quant_offset++;
+ }
+
+ // 16-31
+ for (int j = 0; j < QK4_0 / 2; j += 2) {
+ group->qs[j] |= ((*quant_offset) << 4);
+ group->qs[j + 1] |= ((*quant_offset) & 0xF0);
+ quant_offset++;
+ }
+ }
+}
+
+/**
+ * @brief Transform quantized Q8.0 tensor data into a format suitable for CANN
+ * processing.
+ *
+ * This function transforms quantized Q8.0 tensor data into a format suitable
+ * for CANN processing. It extracts quantization values and scales from the
+ * source data and prepares them in a format expected by CANN operations.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source data in Q8.0 format.
+ * @param dst Pointer to the destination buffer where transformed data will be
+ * stored.
+ */
+static void ggml_backend_cann_transform_q8_0(ggml_tensor * tensor, const void * src, void * dst) {
+ int64_t n_elems = ggml_nelements(tensor);
+ int64_t groups = n_elems / QK8_0;
+ size_t quant_bytes = n_elems * sizeof(uint8_t);
+
+ uint8_t * quant_offset = (uint8_t *) dst;
+ uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
+
+ for (int i = 0; i < groups; i++) {
+ const block_q8_0 * group = (const block_q8_0 *) ((const char *) src + i * sizeof(block_q8_0));
+ *scale_offset = group->d;
+ scale_offset++;
+ size_t group_quant_size = QK8_0 * sizeof(uint8_t);
+ memcpy(quant_offset, group->qs, group_quant_size);
+ quant_offset += group_quant_size;
+ }
+}
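+
+// Resulting destination layout (sketch derived from the code above):
+//   [ quants: n_elems bytes ][ scales: (n_elems / QK8_0) * sizeof(uint16_t) bytes ]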
+
+/**
+ * @brief Transform CANN processed data back into quantized Q8.0 format.
+ *
+ * This function transforms CANN processed data back into quantized Q8.0 format.
+ * It reverses the transformation performed by
+ * ggml_backend_cann_transform_q8_0(), converting the data back into its
+ * original quantized form.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source buffer containing transformed data.
+ * @param dst Pointer to the destination buffer where the Q8.0 formatted data
+ * will be stored.
+ */
+static void ggml_backend_cann_transform_back_q8_0(const ggml_tensor * tensor, const void * src, void * dst) {
+ int64_t n_elems = ggml_nelements(tensor);
+ int64_t groups = n_elems / QK8_0;
+ size_t quant_bytes = n_elems * sizeof(uint8_t);
+
+ const uint8_t * quant_offset = (const uint8_t *) src;
+ const uint16_t * scale_offset = (const uint16_t *) ((const char *) src + quant_bytes);
+
+ for (int i = 0; i < groups; i++) {
+ block_q8_0 * group = (block_q8_0 *) ((char *) dst + i * sizeof(block_q8_0));
+ group->d = *scale_offset;
+ scale_offset++;
+ size_t group_quant_size = QK8_0 * sizeof(uint8_t);
+ memcpy(group->qs, quant_offset, group_quant_size);
+ quant_offset += group_quant_size;
+ }
+}
+
+/**
+ * @brief Transform tensor data based on its type for CANN processing.
+ *
+ * This function transforms tensor data based on its quantization type for CANN
+ * processing. It dispatches the transformation based on the tensor's type to
+ * specialized functions handling Q4.0 and Q8.0 formats.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source data to be transformed.
+ * @param dst Pointer to the destination buffer where transformed data will be
+ * stored.
+ */
+static void ggml_backend_cann_transform(ggml_tensor * tensor, const void * src, void * dst) {
+ switch (tensor->type) {
+ case GGML_TYPE_Q4_0:
+ ggml_backend_cann_transform_q4_0(tensor, src, dst);
+ break;
+ case GGML_TYPE_Q8_0:
+ ggml_backend_cann_transform_q8_0(tensor, src, dst);
+ break;
+ default:
+ break;
+ }
+}
+
+/**
+ * @brief Transform CANN processed data back into tensor data based on its type.
+ *
+ * This function transforms CANN processed data back into tensor data based on
+ * its quantization type for Q4.0 and Q8.0 formats. It dispatches the
+ * transformation based on the tensor's type to specialized functions.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source data containing CANN processed data.
+ * @param dst Pointer to the destination buffer where transformed tensor data
+ * will be stored.
+ */
+static void ggml_backend_cann_transform_back(const ggml_tensor * tensor, void * src, void * dst) {
+ switch (tensor->type) {
+ case GGML_TYPE_Q4_0:
+ ggml_backend_cann_transform_back_q4_0(tensor, src, dst);
+ break;
+ case GGML_TYPE_Q8_0:
+ ggml_backend_cann_transform_back_q8_0(tensor, src, dst);
+ break;
+ default:
+ break;
+ }
+}
+
+/**
+ * @brief Check if transformation is needed for a given tensor type.
+ *
+ * This function checks if transformation is needed for a given tensor type
+ * to prepare data for CANN processing.
+ *
+ * @param type The tensor type to check.
+ * @return true if transformation is needed, false otherwise.
+ */
+static bool need_transform(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q8_0:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/**
+ * @brief Initialize a tensor using data from a CANN buffer.
+ *
+ * This function initializes a tensor using data from a CANN buffer.
+ * It handles special cases such as views and quantization.
+ *
+ * @param buffer The CANN buffer from which to initialize the tensor.
+ * @param tensor Pointer to the tensor to be initialized.
+ */
+static enum ggml_status ggml_backend_cann_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ if (tensor->view_src != NULL && tensor->view_offs == 0) {
+ GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
+ return GGML_STATUS_SUCCESS;
+ }
+
+ // TODO: cann backend doesn't support quantized yet. Just leave the code
+ // here.
+ if (ggml_is_quantized(tensor->type)) {
+ // Initialize padding to 0 to avoid possible NaN values
+ size_t original_size = ggml_nbytes(tensor);
+ size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+
+ if (padded_size > original_size && tensor->view_src == nullptr) {
+ size_t memset_size = padded_size - original_size;
+ ACL_CHECK(aclrtMemset((char *) tensor->data + original_size, memset_size, 0, memset_size));
+ }
+ }
+ return GGML_STATUS_SUCCESS;
+}
+
+/**
+ * @brief Workspace for caching NZ buffers per device.
+ *
+ * This struct manages a device buffer used in NZ computations. It supports
+ * allocation, reallocation, and clearing of cached memory. The struct is
+ * designed to be used with a global array, one per device.
+ */
+struct ggml_cann_nz_workspace {
+ void * ptr; // Pointer to allocated device buffer
+ size_t allocated; // Size of currently allocated buffer in bytes
+
+ /**
+ * @brief Constructor. Initializes the workspace with no allocated memory.
+ */
+ ggml_cann_nz_workspace() : ptr(nullptr), allocated(0) {}
+
+ /**
+ * @brief Free cached memory and reset the workspace.
+ *
+ * If a buffer has been allocated, this function releases it using
+ * aclrtFree and resets internal state.
+ */
+ void clear() {
+ if (ptr) {
+ ACL_CHECK(aclrtFree(ptr));
+ ptr = nullptr;
+ allocated = 0;
+ }
+ }
+
+ /**
+ * @brief Allocate or reallocate the workspace buffer.
+ *
+ * If the requested size is larger than the currently allocated size,
+ * the old buffer will be freed and a new buffer of the requested size
+ * will be allocated on the device.
+ *
+ * @param new_size Size in bytes to allocate for the workspace.
+ */
+ void realloc(size_t new_size) {
+ if (new_size > allocated) {
+ clear();
+ ACL_CHECK(aclrtMalloc(&ptr, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
+ allocated = new_size;
+ }
+ }
+
+ /**
+ * @brief Get the device buffer pointer.
+ *
+ * @return Pointer to the allocated buffer, or nullptr if not allocated.
+ */
+ void * get() const { return ptr; }
+};
+
+/**
+ * @brief Global array of NZ workspaces, one per device.
+ */
+static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
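+
+// Usage sketch (illustrative sizes, not part of this patch): the workspace is
+// grow-only between clear() calls and is released at the start of each graph
+// compute:
+//   g_nz_workspaces[dev].realloc(1 << 20); // allocates 1 MiB on the device
+//   g_nz_workspaces[dev].realloc(4096);    // no-op, buffer is already large enough
+//   g_nz_workspaces[dev].clear();          // frees the buffer and resets state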
+
+/**
+ * @brief Convert tensor weights to NZ format using Ascend CANN API.
+ *
+ * This function creates a transposed tensor descriptor and performs the
+ * TransMatmulWeight operation. Converting tensor formats can significantly
+ * improve performance on certain hardware.
+ *
+ * @param tensor Pointer to the input ggml_tensor containing the weights.
+ * @param offset Byte offset within the tensor data buffer where weights start.
+ * @param device device id.
+ *
+ * @note The workspace buffer used in this function is managed globally and reused
+ * across calls. This reduces overhead from repeated memory allocation and deallocation.
+ */
+static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device) {
+ acl_tensor_ptr weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, offset);
+ uint64_t workspaceSize = 0;
+ aclOpExecutor * executor;
+
+ // TransMatmulWeight
+ ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed.get(), &workspaceSize, &executor));
+ // Avoid frequent malloc/free of the workspace.
+ g_nz_workspaces[device].realloc(workspaceSize);
+
+ void * g_nz_workspace = g_nz_workspaces[device].get();
+
+ ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
+}
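+
+// Call-site sketch (mirrors ggml_backend_cann_buffer_set_tensor below): the
+// conversion runs only for matmul weights and only when GGML_CANN_WEIGHT_NZ is
+// enabled, e.g.
+//   if (weight_to_nz && is_matmul_weight(tensor)) {
+//       weight_format_to_nz(tensor, offset, device);
+//   }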
+
+// TODO: need to handle tensors that have padding.
+/**
+ * @brief Set tensor data in a CANN buffer.
+ *
+ * This function sets tensor data in a CANN buffer, handling transformations
+ * if needed based on the tensor's type.
+ *
+ * @param buffer The CANN buffer where the tensor data will be set.
+ * @param tensor Pointer to the tensor whose data will be set.
+ * @param data Pointer to the source data to be copied into the tensor.
+ * @param offset Offset in the source data from where to start copying.
+ * @param size Size of the data to be copied, in bytes.
+ */
+static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
+ ggml_tensor * tensor,
+ const void * data,
+ size_t offset,
+ size_t size) {
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
+
+ ggml_cann_set_device(ctx->device);
+    // TODO: refer to cann(#6017); it uses the thread's default stream.
+ // For acl, synchronous functions use this default stream.
+ // Why aclrtSynchronizeDevice?
+
+ // Only check env once.
+ static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
+ if (!need_transform(tensor->type)) {
+ ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
+ if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
+ GGML_ASSERT(tensor->ne[2] == 1);
+ GGML_ASSERT(tensor->ne[3] == 1);
+ weight_format_to_nz(tensor, offset, ctx->device);
+ }
+ } else {
+ void * transform_buffer = malloc(size);
+ ggml_backend_cann_transform(tensor, data, transform_buffer);
+
+ ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
+ free(transform_buffer);
+ }
+}
+
+/**
+ * @brief Get tensor data from a CANN buffer.
+ *
+ * This function retrieves tensor data from a CANN buffer, handling
+ * transformations if needed based on the tensor's type.
+ *
+ * @param buffer The CANN buffer from which to retrieve tensor data.
+ * @param tensor Pointer to the tensor whose data will be retrieved.
+ * @param data Pointer to the destination buffer where the tensor data will be
+ * copied.
+ * @param offset Offset in the destination buffer where to start copying.
+ * @param size Size of the data to be copied, in bytes.
+ */
+static void ggml_backend_cann_buffer_get_tensor(ggml_backend_buffer_t buffer,
+ const ggml_tensor * tensor,
+ void * data,
+ size_t offset,
+ size_t size) {
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
+
+ ggml_cann_set_device(ctx->device);
+
+ if (!need_transform(tensor->type)) {
+ ACL_CHECK(aclrtMemcpy(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
+ } else {
+ void * transform_buffer = malloc(size);
+ ACL_CHECK(aclrtMemcpy(transform_buffer, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
+ ggml_backend_cann_transform_back(tensor, transform_buffer, data);
+ free(transform_buffer);
+ }
+}
+
+/**
+ * @brief Copy tensor data between CANN buffers if possible.
+ *
+ * This function copies tensor data between CANN buffers if the source and
+ * destination buffers are CANN buffers and they meet the necessary conditions
+ * (same device or devices can access each other).
+ *
+ * @param buffer The destination CANN buffer where the tensor data will be
+ * copied.
+ * @param src Pointer to the source tensor whose data will be copied.
+ * @param dst Pointer to the destination tensor where the data will be copied.
+ * @return true if the copy operation succeeded, false otherwise.
+ */
+static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
+ const ggml_tensor * src,
+ ggml_tensor * dst) {
+ if (ggml_backend_buft_is_cann(src->buffer->buft)) {
+ ggml_backend_cann_buffer_context * src_ctx = (ggml_backend_cann_buffer_context *) src->buffer->context;
+ ggml_backend_cann_buffer_context * dst_ctx = (ggml_backend_cann_buffer_context *) buffer->context;
+
+ size_t memcpy_size = ggml_nbytes(src);
+ // Same device.
+ if (src_ctx->device == dst_ctx->device) {
+ ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
+ ACL_MEMCPY_DEVICE_TO_DEVICE));
+ return true;
+ } else {
+#ifdef ASCEND_310P
+ // TODO: Support 310p P2P copy
+ return false;
+#endif
+            // Different devices, but peer access may be possible.
+ int32_t canAccessPeer = 0;
+ ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, dst_ctx->device));
+ if (canAccessPeer) {
+ ggml_cann_set_device(src_ctx->device);
+ ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0));
+ ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
+ ACL_MEMCPY_DEVICE_TO_DEVICE));
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/**
+ * @brief Clear a CANN buffer by setting all its memory to a specified value.
+ *
+ * This function clears a CANN buffer by setting all its memory to a specified
+ * value.
+ *
+ * @param buffer The CANN buffer to be cleared.
+ * @param value The value to which each byte in the buffer will be set.
+ */
+static void ggml_backend_cann_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
+
+ ggml_cann_set_device(ctx->device);
+ ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size));
+}
+
+/**
+ * @brief Interface for a CANN buffer in the backend.
+ *
+ * This structure defines function pointers to operations that can be performed
+ * on a CANN buffer within the backend.
+ */
+static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
+ /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_cann_buffer_get_base,
+ /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
+ /* .memset_tensor = */ NULL,
+ /* .set_tensor = */ ggml_backend_cann_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_cann_buffer_get_tensor,
+ /* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor,
+ /* .clear = */ ggml_backend_cann_buffer_clear,
+ /* .reset = */ NULL,
+};
+
+/**
+ * @brief Allocates a new CANN buffer of the specified type and size.
+ *
+ * This function allocates a new CANN buffer on the specified device with the
+ * given size.
+ *
+ * @param buft Pointer to the buffer type context.
+ * @param size Size in bytes of the buffer to allocate.
+ * @return Pointer to the allocated buffer, or nullptr if allocation fails.
+ */
+static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
+
+ ggml_cann_set_device(buft_ctx->device);
+
+ const size_t alignment = 128;
+ size = GGML_PAD(size, alignment);
+ if (size == 0) {
+ size = alignment;
+ }
+ void * dev_ptr;
+ aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
+ if (err != ACL_SUCCESS) {
+ GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", __func__,
+ size / 1024.0 / 1024.0, buft_ctx->device, aclGetRecentErrMsg());
+ return nullptr;
+ }
+
+ ggml_backend_cann_buffer_context * ctx = new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
+
+ return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size);
+}
+
+/**
+ * @brief Retrieves the memory alignment requirement for CANN buffers of this
+ * type.
+ *
+ * This function returns the alignment requirement in bytes for memory allocated
+ * by the CANN buffer type.
+ *
+ * @param buft Pointer to the buffer type context (unused in this
+ * implementation).
+ * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
+ * buffers).
+ */
+static size_t ggml_backend_cann_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+ return 128;
+
+ GGML_UNUSED(buft);
+}
+
+/**
+ * @brief Calculates the allocation size required for a tensor in a CANN buffer.
+ *
+ * Computes the total allocation size needed for storing the tensor's data in a
+ * CANN buffer, considering any necessary padding or adjustments for quantized
+ * types.
+ *
+ * @param buft Pointer to the buffer type context (unused in this
+ * implementation).
+ * @param tensor Pointer to the tensor for which the allocation size is
+ * calculated.
+ * @return The total allocation size in bytes required for the tensor in the
+ * CANN buffer.
+ */
+static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
+ const ggml_tensor * tensor) {
+ size_t size = ggml_nbytes(tensor);
+ int64_t ne0 = tensor->ne[0];
+
+ // Only check env once.
+ static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
+
+    // the last row must be at least 32 bytes, because every single op deals
+    // with at least 32 bytes.
+ // TODO: quantized type?
+ // int64_t line_size = ne0 * ggml_element_size(tensor);
+ // int64_t line_size_align_32 = (line_size + 31) & ~31;
+ // size += (line_size_align_32 - line_size);
+ if (ggml_is_quantized(tensor->type)) {
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+ }
+ } else if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
+        // NZ-format weights do not support quantized types yet.
+        // When an ND tensor is transformed to NZ, its size may change.
+ int64_t shape[] = { tensor->ne[1], tensor->ne[0] };
+ GGML_ASSERT(tensor->ne[2] == 1);
+ GGML_ASSERT(tensor->ne[3] == 1);
+ const aclIntArray * acl_shape = aclCreateIntArray(shape, 2);
+ size_t new_size;
+ ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape, ggml_cann_type_mapping(tensor->type), &new_size));
+ ACL_CHECK(aclDestroyIntArray(acl_shape));
+ size = std::max(size, new_size);
+ }
+
+ return size;
+
+ GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_cann_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ return false;
+
+ GGML_UNUSED(buft);
+}
+
+/**
+ * @brief Interface for managing CANN buffer types in the GGML backend.
+ *
+ * Provides function pointers for allocating, querying properties, and managing
+ * memory for CANN buffer types in the GGML backend.
+ */
+static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
+ /* .get_name = */ ggml_backend_cann_buffer_type_name,
+ /* .alloc_buffer = */ ggml_backend_cann_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cann_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ ggml_backend_cann_buffer_type_get_alloc_size,
+ /* .is_host = */ ggml_backend_cann_buffer_type_is_host,
+};
+
+/**
+ * @brief Retrieves the CANN buffer type for a specified device.
+ *
+ * This function initializes and returns the buffer type interface associated
+ * with the given device. It ensures thread-safe access using a mutex.
+ *
+ * @param device The device index for which to retrieve the buffer type.
+ * @return A pointer to the buffer type interface for the specified device, or
+ * nullptr if the device index is out of range.
+ */
+ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device) {
+ static std::mutex mutex;
+ std::lock_guard<std::mutex> lock(mutex);
+
+ if (device >= ggml_backend_cann_get_device_count()) {
+ return nullptr;
+ }
+
+ static ggml_backend_buffer_type ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
+
+ static bool ggml_backend_cann_buffer_type_initialized = false;
+
+ if (!ggml_backend_cann_buffer_type_initialized) {
+ for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
+ ggml_backend_cann_buffer_types[i] = {
+ /* .iface = */ ggml_backend_cann_buffer_type_interface,
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
+ /* .context = */
+ new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i) },
+ };
+ }
+ ggml_backend_cann_buffer_type_initialized = true;
+ }
+
+ return &ggml_backend_cann_buffer_types[device];
+}
+
+/**
+ * @brief Retrieves the name associated with a CANN host buffer type.
+ *
+ * This function returns the descriptive name associated with the specified
+ * CANN host buffer type context.
+ *
+ * @param buft Pointer to the host buffer type context.
+ * @return Const pointer to the C-style string containing the name.
+ */
+static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+ return "CANN_Host";
+
+ GGML_UNUSED(buft);
+}
+
+/**
+ * @brief Retrieves the name associated with a CANN host buffer.
+ *
+ * This function returns the descriptive name associated with the specified
+ * CANN host buffer context.
+ *
+ * @param buffer Pointer to the host buffer.
+ * @return Const pointer to the C-style string containing the name.
+ */
+static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
+ return "CANN_Host";
+
+ GGML_UNUSED(buffer);
+}
+
+/**
+ * @brief Free resources associated with a CANN host buffer.
+ *
+ * This function frees the resources associated with a CANN host buffer, including
+ * its context.
+ *
+ * @param buffer The CANN host buffer to free.
+ */
+static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
+ ACL_CHECK(aclrtFreeHost(buffer->context));
+}
+
+/**
+ * @brief Allocates a new CANN host buffer of the specified size.
+ *
+ * This function allocates a new CANN host buffer with the given size.
+ * @param size Size in bytes of the host buffer to allocate.
+ * @return Pointer to the allocated host buffer, or nullptr if allocation fails.
+ */
+static void * ggml_cann_host_malloc(size_t size) {
+ if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
+ return nullptr;
+ }
+
+ const size_t alignment = 128;
+ size = GGML_PAD(size, alignment);
+ if (size == 0) {
+ size = alignment;
+ }
+
+ void * hostPtr = nullptr;
+ aclError err = aclrtMallocHost((void **) &hostPtr, size);
+ if (err != ACL_SUCCESS) {
+ GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0,
+ aclGetRecentErrMsg());
+ return nullptr;
+ }
+ return hostPtr;
+}
+
+/**
+ * @brief Allocates a new CANN host buffer of the specified type and size.
+ *
+ * @param buft Pointer to the host buffer type context.
+ * @param size Size in bytes of the host buffer to allocate.
+ * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
+ */
+static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+ size_t size) {
+ void * hostPtr = ggml_cann_host_malloc(size);
+
+ if (hostPtr == nullptr) {
+ // fallback to cpu buffer
+ return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+ }
+
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
+ buffer->buft = buft;
+ buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
+
+ return buffer;
+}
+
+/**
+ * @brief Retrieves the CANN host (pinned) buffer type.
+ *
+ * This function returns a buffer type that reuses the CPU buffer interface but
+ * allocates pinned memory via aclrtMallocHost for faster host-device transfers.
+ *
+ * @return A pointer to the CANN host buffer type interface.
+ */
+ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
+ static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
+ /* .iface = */ {
+ /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
+ /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+ },
+ /* .device = */
+ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
+ /* .context = */ nullptr,
+ };
+
+ return &ggml_backend_cann_buffer_type_host;
+}
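+
+// Note: setting the GGML_CANN_NO_PINNED environment variable disables pinned
+// allocations; ggml_cann_host_malloc then returns nullptr and the host buffer
+// type silently falls back to a plain CPU buffer.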
+
+/**
+ * @brief Computes the forward operation for a given tensor using CANN
+ * operations.
+ *
+ * This function selects the appropriate CANN operation based on the type of
+ * operation specified in the tensor and performs the computation.
+ *
+ * @param ctx The CANN context containing necessary resources and
+ * configurations.
+ * @param dst The destination tensor where the result of the computation will be
+ * stored.
+ * @return true if the computation was successful; false otherwise.
+ */
+static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct ggml_tensor * dst) {
+ switch (dst->op) {
+ case GGML_OP_REPEAT:
+ ggml_cann_repeat(ctx, dst);
+ break;
+ case GGML_OP_GET_ROWS:
+ ggml_cann_get_rows(ctx, dst);
+ break;
+ case GGML_OP_SET_ROWS:
+ ggml_cann_set_rows(ctx, dst);
+ break;
+ case GGML_OP_DUP:
+ ggml_cann_dup(ctx, dst);
+ break;
+ case GGML_OP_ADD:
+ case GGML_OP_ADD1:
+ ggml_cann_binary_op<aclnn_add>(ctx, dst);
+ break;
+ case GGML_OP_SUB:
+ ggml_cann_binary_op<aclnn_sub>(ctx, dst);
+ break;
+ case GGML_OP_ACC:
+ ggml_cann_acc(ctx, dst);
+ break;
+ case GGML_OP_MUL:
+ ggml_cann_binary_op<aclnn_mul>(ctx, dst);
+ break;
+ case GGML_OP_DIV:
+ ggml_cann_binary_op<aclnn_div>(ctx, dst);
+ break;
+ case GGML_OP_UNARY:
+ switch (ggml_get_unary_op(dst)) {
+ case GGML_UNARY_OP_ABS:
+ GGML_CANN_CALL_OP_UNARY(Abs);
+ break;
+ case GGML_UNARY_OP_NEG:
+ GGML_CANN_CALL_OP_UNARY(Neg);
+ break;
+ case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_ERF:
+                    // aclnnGelu internally uses the erf-based formulation.
+ GGML_CANN_CALL_OP_UNARY(Gelu);
+ break;
+ case GGML_UNARY_OP_SILU:
+ GGML_CANN_CALL_OP_UNARY(Silu);
+ break;
+ case GGML_UNARY_OP_GELU_QUICK:
+ {
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
+ };
+ ggml_cann_op_unary(lambda, ctx, dst);
+ }
+ break;
+ case GGML_UNARY_OP_TANH:
+ GGML_CANN_CALL_OP_UNARY(Tanh);
+ break;
+ case GGML_UNARY_OP_RELU:
+ GGML_CANN_CALL_OP_UNARY(Relu);
+ break;
+ case GGML_UNARY_OP_SIGMOID:
+ GGML_CANN_CALL_OP_UNARY(Sigmoid);
+ break;
+ case GGML_UNARY_OP_HARDSIGMOID:
+ GGML_CANN_CALL_OP_UNARY(Hardsigmoid);
+ break;
+ case GGML_UNARY_OP_HARDSWISH:
+ GGML_CANN_CALL_OP_UNARY(Hardswish);
+ break;
+ case GGML_UNARY_OP_EXP:
+ GGML_CANN_CALL_OP_UNARY(Exp);
+ break;
+ case GGML_UNARY_OP_ELU:
+ ggml_cann_elu(ctx, dst);
+ break;
+ case GGML_UNARY_OP_SGN:
+ GGML_CANN_CALL_OP_UNARY(Sign);
+ break;
+ case GGML_UNARY_OP_STEP:
+ ggml_cann_step(ctx, dst);
+ break;
+ default:
+ return false;
+ }
+ break;
+ case GGML_OP_GLU:
+ switch (ggml_get_glu_op(dst)) {
+ case GGML_GLU_OP_REGLU:
+ GGML_CANN_CALL_OP_UNARY_GATED(Relu);
+ break;
+ case GGML_GLU_OP_GEGLU:
+ case GGML_GLU_OP_GEGLU_ERF:
+                // aclnnGelu internally uses the erf-based formulation.
+ GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
+ break;
+ case GGML_GLU_OP_SWIGLU:
+ GGML_CANN_CALL_OP_UNARY_GATED(Silu);
+ break;
+ case GGML_GLU_OP_GEGLU_QUICK:
+ {
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
+ };
+ ggml_cann_op_unary_gated(lambda, ctx, dst);
+ }
+ break;
+ default:
+ return false;
+ }
+ break;
+ case GGML_OP_NORM:
+ ggml_cann_norm(ctx, dst);
+ break;
+ case GGML_OP_GROUP_NORM:
+ ggml_cann_group_norm(ctx, dst);
+ break;
+ case GGML_OP_L2_NORM:
+ ggml_cann_l2_norm(ctx, dst);
+ break;
+ case GGML_OP_CROSS_ENTROPY_LOSS:
+ ggml_cann_cross_entropy_loss(ctx, dst);
+ break;
+ case GGML_OP_CONCAT:
+ ggml_cann_concat(ctx, dst);
+ break;
+ case GGML_OP_UPSCALE:
+ ggml_cann_upsample_nearest2d(ctx, dst);
+ break;
+ case GGML_OP_PAD:
+ ggml_cann_pad(ctx, dst);
+ break;
+ case GGML_OP_ARANGE:
+ ggml_cann_arange(ctx, dst);
+ break;
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ ggml_cann_timestep_embedding(ctx, dst);
+ break;
+ case GGML_OP_LEAKY_RELU:
+ ggml_cann_leaky_relu(ctx, dst);
+ break;
+ case GGML_OP_RMS_NORM:
+ ggml_cann_rms_norm(ctx, dst);
+ break;
+ case GGML_OP_MUL_MAT:
+ ggml_cann_mul_mat(ctx, dst);
+ break;
+ case GGML_OP_MUL_MAT_ID:
+ ggml_cann_mul_mat_id(ctx, dst);
+ break;
+ case GGML_OP_SCALE:
+ ggml_cann_scale(ctx, dst);
+ break;
+ case GGML_OP_SQR:
+ GGML_ASSERT(dst->src[1] == nullptr);
+ dst->src[1] = dst->src[0];
+ ggml_cann_binary_op<aclnn_mul>(ctx, dst);
+ break;
+ case GGML_OP_SQRT:
+ GGML_CANN_CALL_OP_UNARY(Sqrt);
+ break;
+ case GGML_OP_CLAMP:
+ ggml_cann_clamp(ctx, dst);
+ break;
+ case GGML_OP_CPY:
+ ggml_cann_cpy(ctx, dst);
+ break;
+ case GGML_OP_CONT:
+ ggml_cann_dup(ctx, dst);
+ break;
+ case GGML_OP_NONE:
+ case GGML_OP_RESHAPE:
+ case GGML_OP_VIEW:
+ case GGML_OP_PERMUTE:
+ case GGML_OP_TRANSPOSE:
+ break;
+ case GGML_OP_DIAG_MASK_INF:
+ ggml_cann_diag_mask(ctx, dst, -INFINITY);
+ break;
+ case GGML_OP_SOFT_MAX:
+ ggml_cann_softmax(ctx, dst);
+ break;
+ case GGML_OP_ROPE:
+ ggml_cann_rope(ctx, dst);
+ break;
+ case GGML_OP_IM2COL:
+ ggml_cann_im2col(ctx, dst);
+ break;
+ case GGML_OP_POOL_2D:
+ ggml_cann_pool2d(ctx, dst);
+ break;
+ case GGML_OP_SUM:
+ ggml_cann_sum(ctx, dst);
+ break;
+ case GGML_OP_SUM_ROWS:
+ ggml_cann_sum_rows(ctx, dst);
+ break;
+ case GGML_OP_ARGSORT:
+ ggml_cann_argsort(ctx, dst);
+ break;
+ case GGML_OP_ARGMAX:
+ ggml_cann_argmax(ctx, dst);
+ break;
+ case GGML_OP_COS:
+ ggml_cann_op_unary<aclnn_cos>(ctx, dst);
+ break;
+ case GGML_OP_SIN:
+ ggml_cann_op_unary<aclnn_sin>(ctx, dst);
+ break;
+ case GGML_OP_CONV_TRANSPOSE_1D:
+ ggml_cann_conv_transpose_1d(ctx, dst);
+ break;
+ case GGML_OP_LOG:
+ GGML_CANN_CALL_OP_UNARY(Log);
+ break;
+ case GGML_OP_MEAN:
+ ggml_cann_mean(ctx, dst);
+ break;
+ case GGML_OP_PAD_REFLECT_1D:
+ ggml_cann_pad_reflect_1d(ctx, dst);
+ break;
+ case GGML_OP_COUNT_EQUAL:
+ ggml_cann_count_equal(ctx, dst);
+ break;
+ case GGML_OP_FLASH_ATTN_EXT:
+ ggml_cann_flash_attn_ext(ctx, dst);
+ break;
+ case GGML_OP_OUT_PROD:
+ ggml_cann_out_prod(ctx, dst);
+ break;
+ case GGML_OP_GATED_LINEAR_ATTN:
+ ggml_cann_gated_linear_attn(ctx, dst);
+ break;
+ case GGML_OP_SSM_CONV:
+ ggml_cann_ssm_conv(ctx, dst);
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+// backend
+/**
+ * @brief Retrieves the name associated with the CANN backend.
+ *
+ * This function returns the name assigned to the CANN backend, which is stored
+ * in the context of the provided backend structure.
+ *
+ * @param backend Pointer to the CANN backend structure.
+ * @return A pointer to a constant string representing the backend name.
+ */
+static const char * ggml_backend_cann_name(ggml_backend_t backend) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+
+ return cann_ctx->name.c_str();
+}
+
+/**
+ * @brief Frees resources associated with the CANN backend.
+ *
+ * This function releases resources associated with the CANN backend context
+ * and resets the device associated with the backend to its initial state.
+ *
+ * @param backend Pointer to the CANN backend structure to be freed.
+ */
+static void ggml_backend_cann_free(ggml_backend_t backend) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+ ACL_CHECK(aclrtSynchronizeDevice());
+ ACL_CHECK(aclrtResetDevice(cann_ctx->device));
+
+ delete cann_ctx;
+ delete backend;
+}
+
+/**
+ * @brief Sets tensor data asynchronously in the CANN backend.
+ *
+ * This function asynchronously sets tensor data in the CANN backend.
+ *
+ * @param backend Pointer to the CANN backend structure.
+ * @param tensor Pointer to the tensor structure to set data for.
+ * @param data Pointer to the host data to copy to the tensor.
+ * @param offset Offset in bytes within the tensor data.
+ * @param size Size of the data to copy in bytes.
+ */
+static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
+ ggml_tensor * tensor,
+ const void * data,
+ size_t offset,
+ size_t size) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
+ GGML_ASSERT(!ggml_is_quantized(tensor->type));
+
+ ACL_CHECK(aclrtMemcpyAsync((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE,
+ cann_ctx->stream()));
+}
+
+/**
+ * @brief Gets tensor data asynchronously in the CANN backend.
+ *
+ * This function asynchronously gets tensor data in the CANN backend.
+ *
+ * @param backend Pointer to the CANN backend structure.
+ * @param tensor Pointer to the tensor structure to get data from.
+ * @param data Pointer to the host buffer that receives the tensor data.
+ * @param offset Offset in bytes within the tensor data.
+ * @param size Size of the data to copy in bytes.
+ */
+static void ggml_backend_cann_get_tensor_async(ggml_backend_t backend,
+ const ggml_tensor * tensor,
+ void * data,
+ size_t offset,
+ size_t size) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
+ GGML_ASSERT(!ggml_is_quantized(tensor->type));
+
+ ACL_CHECK(aclrtMemcpyAsync(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST,
+ cann_ctx->stream()));
+}
+
+/**
+ * @brief Asynchronously copies tensor data between CANN backends.
+ *
+ * This function copies tensor data asynchronously between two CANN backends. It
+ * checks if both tensors reside in CANN buffers and whether the devices support
+ * peer-to-peer access for direct copying. If not, it returns false.
+ *
+ * @param backend_src Pointer to the source CANN backend structure.
+ * @param backend_dst Pointer to the destination CANN backend structure.
+ * @param src Pointer to the source tensor to copy data from.
+ * @param dst Pointer to the destination tensor to copy data to.
+ * @return true if the copy operation succeeds, false otherwise.
+ */
+static bool ggml_backend_cann_cpy_tensor_async(ggml_backend_t backend_src,
+ ggml_backend_t backend_dst,
+ const ggml_tensor * src,
+ ggml_tensor * dst) {
+ GGML_ASSERT(ggml_backend_is_cann(backend_src) || ggml_backend_is_cann(backend_dst));
+
+ GGML_ASSERT(!is_matmul_weight((const ggml_tensor *) src));
+
+ if (!ggml_backend_buft_is_cann(src->buffer->buft) || !ggml_backend_buft_is_cann(dst->buffer->buft)) {
+ return false;
+ }
+
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
+
+ ggml_backend_cann_context * cann_ctx_src = (ggml_backend_cann_context *) backend_src->context;
+ ggml_backend_cann_context * cann_ctx_dst = (ggml_backend_cann_context *) backend_dst->context;
+
+ size_t copy_size = ggml_nbytes(dst);
+ if (copy_size == 0) {
+ return true;
+ }
+ if (backend_src != backend_dst) {
+#ifdef ASCEND_310P
+ // TODO: Support 310p P2P copy
+ return false;
+#endif
+ ggml_backend_cann_buffer_context * buf_ctx_src = (ggml_backend_cann_buffer_context *) buf_src->context;
+ ggml_backend_cann_buffer_context * buf_ctx_dst = (ggml_backend_cann_buffer_context *) buf_dst->context;
+
+ GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device);
+ GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device);
+
+ int32_t canAccessPeer = 0;
+ ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, cann_ctx_dst->device));
+ if (!canAccessPeer) {
+ return false;
+ }
+
+        // peer access must be enabled in both directions for aclrtMemcpyAsync between devices.
+ ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
+ ggml_cann_set_device(cann_ctx_src->device);
+ ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
+
+ // wait for task_queue empty to keep task order.
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
+ cann_ctx_src->stream()));
+ // record event on src stream after the copy
+ // TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream
+ // if (!cann_ctx_src->copy_event) {
+ // ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
+ // }
+ // ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
+
+ // // wait on dst stream for the copy to complete
+ // ggml_cann_set_device(cann_ctx_dst->device);
+ // ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
+ ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
+ } else {
+ // src and dst are on the same backend
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
+ cann_ctx_dst->stream()));
+ }
+
+ return true;
+}
+
+/**
+ * @brief Synchronizes a CANN backend.
+ *
+ * This function synchronizes the specified CANN backend by waiting for all
+ * operations in its associated stream to complete.
+ *
+ * @param backend Pointer to the CANN backend structure to synchronize.
+ */
+static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+ ggml_cann_set_device(cann_ctx->device);
+ ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
+}
+
+/**
+ * @brief Check if CANN backend can fuse the specified operation sequence
+ *
+ * This function determines whether an operation sequence starting from the specified node
+ * can be fused into an optimized operation in the CANN backend. Operation fusion can reduce
+ * memory access overhead and improve computational efficiency.
+ *
+ * @param cgraph Pointer to the computation graph
+ * @param node_idx Index of the starting node in the computation graph
+ * @param ops Sequence of operation types to check for fusion
+ * @return true if the operations can be fused
+ * @return false if the operations cannot be fused
+ */
+static bool ggml_cann_can_fuse(const struct ggml_cgraph * cgraph,
+ int node_idx,
+ std::initializer_list<enum ggml_op> ops) {
+ if (!ggml_can_fuse(cgraph, node_idx, ops)) {
+ return false;
+ }
+
+ // CANN backend supports fusing ADD + RMS_NORM operations
+ if ((ops.size() == 2) && ops.begin()[0] == GGML_OP_ADD && ops.begin()[1] == GGML_OP_RMS_NORM) {
+ ggml_tensor * add_node = cgraph->nodes[node_idx];
+ // TODO: support broadcast for ADD + RMS_NORM
+ if (add_node->src[0]->ne[0] != add_node->src[1]->ne[0] || add_node->src[0]->ne[1] != add_node->src[1]->ne[1] ||
+ add_node->src[0]->ne[2] != add_node->src[1]->ne[2] || add_node->src[0]->ne[3] != add_node->src[1]->ne[3]) {
+ return false;
+ }
+ return true;
+ }
+
+ return false;
+}
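+
+// Fusion pattern sketch (illustrative ggml graph, not part of this patch): with
+// GGML_CANN_OPERATOR_FUSION enabled, a node pair such as
+//   t = ggml_add(ctx, a, b);          // a and b must have identical shapes (no broadcast)
+//   y = ggml_rms_norm(ctx, t, eps);
+// is dispatched to ggml_cann_op_add_rms_norm_fused instead of two separate ops.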
+
+/**
+ * @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
+ *
+ * If CANN graph execution is enabled and graph capture is required, this function begins
+ * graph capture, runs the graph, ends capture, and stores the captured graph.
+ *
+ * Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
+ *
+ * @param cann_ctx The CANN backend context.
+ * @param cgraph The ggml computation graph.
+ * @param use_cann_graph Whether to use CANN graph execution.
+ * @param cann_graph_capture_required Whether graph capture is needed due to graph changes.
+ */
+static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx,
+ ggml_cgraph * cgraph,
+ bool use_cann_graph,
+ bool cann_graph_capture_required) {
+#ifdef USE_ACL_GRAPH
+ if (use_cann_graph && cann_graph_capture_required) { // Begin CANN graph capture
+ ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
+ }
+#endif // USE_ACL_GRAPH
+    // Only perform op-by-op execution if CANN graphs are not enabled, or we are capturing the graph.
+    // With CANN graphs, the actual execution is performed by the graph launch below.
+ static bool opt_fusion = parse_bool(get_env_as_lowercase("GGML_CANN_OPERATOR_FUSION").value_or(""));
+
+ if (!use_cann_graph || cann_graph_capture_required) {
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+ if (opt_fusion) {
+ if (ggml_cann_can_fuse(cgraph, i, { GGML_OP_ADD, GGML_OP_RMS_NORM })) {
+ ggml_cann_op_add_rms_norm_fused(*cann_ctx, node, cgraph->nodes[i + 1]);
+ i++;
+ continue;
+ }
+ }
+
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE ||
+ node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ continue;
+ }
+
+ if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+ continue;
+ }
+
+ bool ok = ggml_cann_compute_forward(*cann_ctx, node);
+ if (!ok) {
+ GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ }
+ GGML_ASSERT(ok);
+ }
+ }
+
+#ifdef USE_ACL_GRAPH
+ if (use_cann_graph) {
+ GGML_ASSERT(!cann_ctx->graph_lru_cache.cache_list.empty());
+ ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
+
+ if (cann_graph_capture_required) { // End CANN graph capture
+ ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
+ }
+
+ // Execute CANN graph
+ ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
+ }
+#endif // USE_ACL_GRAPH
+}
+
+/**
+ * @brief Computes a computational graph using a CANN backend.
+ *
+ * This function computes the operations defined in the computational graph
+ * using the specified CANN backend.
+ *
+ * @param backend Pointer to the CANN backend structure to use for computation.
+ * @param cgraph Pointer to the computational graph structure containing nodes
+ * representing operations to be computed.
+ * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
+ * completes successfully, otherwise an appropriate error status.
+ */
+static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+ ggml_cann_set_device(cann_ctx->device);
+ g_nz_workspaces[cann_ctx->device].clear();
+
+    // calculate the rope cache for the first layer on the current device.
+ cann_ctx->rope_cache.cached = false;
+
+ bool graph_capture_required = false;
+#ifdef USE_ACL_GRAPH
+ bool use_cann_graph = true;
+
+ static bool prefill_use_graph = parse_bool(get_env_as_lowercase("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
+ if (!prefill_use_graph) {
+ // Do not use acl_graph for prefill.
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+            // TODO: Optimize here. Currently, seq_len can only be
+            // obtained from FA's input.
+ if (node->op == GGML_OP_FLASH_ATTN_EXT) {
+ // Q -> src[0], shape: [B, S, N, D]
+ use_cann_graph = (node->src[0]->ne[1] == 1);
+ break;
+ }
+ }
+ }
+
+ if (!cann_ctx->acl_graph_mode) {
+ use_cann_graph = false;
+ }
+
+ if (use_cann_graph) {
+ // If no matching graph is found, the graph needs to be recaptured.
+ graph_capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph);
+ if (graph_capture_required) {
+ // If no matching graph is found, add a new ACL graph.
+ ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
+ cann_ctx->graph_lru_cache.push(new_graph);
+ }
+ }
+#else
+ bool use_cann_graph = false;
+#endif // USE_ACL_GRAPH
+ evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, graph_capture_required);
+
+ return GGML_STATUS_SUCCESS;
+}
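+
+// Runtime knobs referenced above (the exact accepted strings depend on
+// parse_bool; "on" is the default used elsewhere in this file):
+//   GGML_CANN_OPERATOR_FUSION=on     enable ADD + RMS_NORM fusion during dispatch
+//   GGML_CANN_PREFILL_USE_GRAPH=on   also capture prefill graphs (seq_len > 1)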
+
+/**
+ * @brief Checks if the CANN backend supports a specific operation.
+ *
+ * This function checks whether the specified operation is supported by the
+ * CANN backend.
+ *
+ * @param dev Pointer to the CANN device to check support for the operation.
+ * @param op Pointer to the tensor representing the operation to check.
+ * @return bool Returns true if the operation is supported by the backend,
+ * otherwise false.
+ */
+static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+ switch (op->op) {
+ case GGML_OP_UNARY:
+ switch (ggml_get_unary_op(op)) {
+ case GGML_UNARY_OP_ABS:
+ case GGML_UNARY_OP_NEG:
+ case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_SILU:
+ case GGML_UNARY_OP_RELU:
+ case GGML_UNARY_OP_SIGMOID:
+ case GGML_UNARY_OP_HARDSIGMOID:
+ case GGML_UNARY_OP_HARDSWISH:
+ case GGML_UNARY_OP_GELU_QUICK:
+ case GGML_UNARY_OP_TANH:
+ case GGML_UNARY_OP_EXP:
+ case GGML_UNARY_OP_ELU:
+ case GGML_UNARY_OP_SGN:
+ case GGML_UNARY_OP_STEP:
+ case GGML_UNARY_OP_GELU_ERF:
+ return true;
+ default:
+ return false;
+ }
+ case GGML_OP_GLU:
+ switch (ggml_get_glu_op(op)) {
+ case GGML_GLU_OP_REGLU:
+ case GGML_GLU_OP_GEGLU:
+ case GGML_GLU_OP_SWIGLU:
+ case GGML_GLU_OP_GEGLU_ERF:
+ case GGML_GLU_OP_GEGLU_QUICK:
+ return true;
+ default:
+ return false;
+ }
+ break;
+ case GGML_OP_MUL_MAT:
+ {
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
+ return true;
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q4_0:
+#ifdef ASCEND_310P
+                // Q4_0 and Q8_0 per-group quantization is not supported on 310P devices
+ return false;
+#endif
+ // only support contiguous for quantized types.
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+ default:
+ return false;
+ }
+ }
+ case GGML_OP_MUL_MAT_ID:
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
+ return true;
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q4_0:
+#ifdef ASCEND_310P
+            // Q4_0 and Q8_0 per-group quantization is not supported on 310P devices
+ return false;
+#endif
+ // only support contiguous for quantized types.
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+ default:
+ return false;
+ }
+ // embedding
+ case GGML_OP_GET_ROWS:
+ {
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ case GGML_TYPE_Q8_0:
+ return true;
+ default:
+ return false;
+ }
+ }
+ break;
+ case GGML_OP_SET_ROWS:
+ {
+ switch (op->type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ return true;
+ default:
+ return false;
+ }
+ }
+ break;
+ case GGML_OP_CPY:
+ {
+ ggml_tensor * src = op->src[0];
+ if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
+ (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) {
+ // only support F32 and F16.
+ return false;
+ }
+ return true;
+ }
+ break;
+ case GGML_OP_CONT:
+ {
+ // TODO: support GGML_TYPE_BF16
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ return true;
+ default:
+ return false;
+ }
+ }
+ case GGML_OP_ROPE:
+ {
+ if (op->src[0]->ne[0] > 896) {
+ return false;
+ }
+#ifdef ASCEND_310P
+ // TODO: Support rope_dim < ne00(dim)
+ if (op->src[0]->ne[0] != op->op_params[1]) {
+ return false;
+ }
+ if (!ggml_is_contiguous(op->src[0])) {
+ return false;
+ }
+#endif
+ return true;
+ }
+ case GGML_OP_UPSCALE:
+ {
+            // aclnnUpsampleNearest2dGetWorkspaceSize does not support the case where
+            // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] are not equal
+ if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
+ return false;
+ }
+ if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
+ return false;
+ }
+ if (op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS) {
+ return false;
+ }
+ return true;
+ }
+ case GGML_OP_POOL_2D:
+ {
+ const int32_t * opts = (const int32_t *) op->op_params;
+#ifdef ASCEND_310P
+ enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
+ if (opt == GGML_OP_POOL_MAX) {
+ return false;
+ }
+#endif
+ const int k0 = opts[1];
+ const int k1 = opts[2];
+ const int p0 = opts[5];
+ const int p1 = opts[6];
+ // value of paddingH should be at most half of kernelH
+ // value of paddingW should be at most half of kernelW
+ return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
+ }
+ case GGML_OP_SUM:
+ return ggml_is_contiguous_rows(op->src[0]);
+ case GGML_OP_L2_NORM:
+ case GGML_OP_CROSS_ENTROPY_LOSS:
+ case GGML_OP_DUP:
+ case GGML_OP_IM2COL:
+ case GGML_OP_CONCAT:
+ case GGML_OP_REPEAT:
+ case GGML_OP_NONE:
+ case GGML_OP_RESHAPE:
+ case GGML_OP_VIEW:
+ case GGML_OP_PERMUTE:
+ case GGML_OP_TRANSPOSE:
+ case GGML_OP_NORM:
+ case GGML_OP_ADD:
+ case GGML_OP_ADD1:
+ case GGML_OP_SUB:
+ case GGML_OP_MUL:
+ case GGML_OP_DIV:
+ case GGML_OP_RMS_NORM:
+ case GGML_OP_SQR:
+ case GGML_OP_SQRT:
+ case GGML_OP_CLAMP:
+ case GGML_OP_DIAG_MASK_INF:
+ case GGML_OP_SUM_ROWS:
+ case GGML_OP_ARGSORT:
+ case GGML_OP_ACC:
+ case GGML_OP_GROUP_NORM:
+ return true;
+ case GGML_OP_PAD:
+ // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
+ return ggml_get_op_params_i32(op, 8) == 0;
+ case GGML_OP_ARANGE:
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_LEAKY_RELU:
+ case GGML_OP_ARGMAX:
+ case GGML_OP_COS:
+ case GGML_OP_SIN:
+ case GGML_OP_LOG:
+ case GGML_OP_MEAN:
+ case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_COUNT_EQUAL:
+ case GGML_OP_GATED_LINEAR_ATTN:
+ return true;
+ case GGML_OP_OUT_PROD:
+ {
+#ifdef ASCEND_310P
+            // Ger is not supported on 310P devices
+ return false;
+#endif
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
+ return true;
+ default:
+ return false;
+ }
+ }
+ case GGML_OP_CONV_TRANSPOSE_1D:
+ return true;
+ case GGML_OP_SCALE:
+ float bias;
+ memcpy(&bias, (const float *) (op->op_params) + 1, sizeof(float));
+ return bias == 0.0f; // TODO: support bias != 0.0f
+ case GGML_OP_SOFT_MAX:
+ // TODO: support attention sinks [TAG_ATTN_SINKS]
+ if (op->src[2]) {
+ return false;
+ }
+ return true;
+ case GGML_OP_FLASH_ATTN_EXT:
+ {
+#ifdef ASCEND_310P
+            // FA is not supported on 310P devices
+ return false;
+#endif
+ // derived from [ggml-cuda.cu]
+ if (op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16) {
+ return false;
+ }
+ if (op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 &&
+ op->src[1]->type != GGML_TYPE_BF16) {
+ return false;
+ }
+ if (op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16) {
+ return false;
+ }
+ // TODO: support attention sinks [TAG_ATTN_SINKS]
+ if (op->src[4]) {
+ return false;
+ }
+ if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
+ // different head sizes of K and V are not supported yet
+ return false;
+ }
+ if (op->src[0]->ne[0] % 16 != 0) {
+                // TODO: add padding to support this case
+ return false;
+ }
+ float logitSoftcap = 0.0f;
+ memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float));
+ if (logitSoftcap != 0.0f) {
+ return false;
+ }
+ return true;
+ }
+ case GGML_OP_SSM_CONV:
+ return true;
+ default:
+ return false;
+ }
+
+ GGML_UNUSED(dev);
+}
+
+/**
+ * @brief Records an event on the CANN backend stream.
+ *
+ * This function records the given event on the ACL runtime stream associated
+ * with the backend context.
+ *
+ * @param event Pointer to the event structure to be recorded.
+ */
+static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+ ACL_CHECK(aclrtRecordEvent((aclrtEvent) event->context, cann_ctx->stream()));
+}
+
+/**
+ * @brief Waits for a recorded event to complete on the CANN backend stream.
+ *
+ * This function makes the given backend wait for the event to complete on its
+ * ACL runtime stream.
+ *
+ * @param backend Pointer to the backend structure.
+ * @param event Pointer to the event structure that the backend needs to wait
+ * for.
+ */
+static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+ if (ggml_backend_is_cann(backend)) {
+ ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent) event->context));
+ } else {
+ GGML_ABORT("fatal error");
+ }
+}
+
+/**
+ * @brief Structure defining the interface for the CANN backend.
+ *
+ * This structure contains function pointers for various operations
+ * supported by the CANN backend, including name retrieval, memory
+ * management, tensor operations, synchronization, and event handling.
+ */
+static const ggml_backend_i ggml_backend_cann_interface = {
+ /* .get_name = */ ggml_backend_cann_name,
+ /* .free = */ ggml_backend_cann_free,
+ /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
+ /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
+ /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
+ /* .synchronize = */ ggml_backend_cann_synchronize,
+ /* .graph_plan_create = */ NULL,
+ /* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
+ /* .graph_plan_compute = */ NULL,
+ /* .graph_compute = */ ggml_backend_cann_graph_compute,
+ /* .event_record = */ ggml_backend_cann_event_record,
+ /* .event_wait = */ ggml_backend_cann_event_wait,
+ /* .graph_optimize = */ NULL,
+};
+
+/**
+ * @brief Return the hardcoded GUID for the CANN backend.
+ *
+ * This function returns a static GUID which uniquely identifies the CANN
+ * backend.
+ *
+ * @return A pointer to the static GUID.
+ */
+static ggml_guid_t ggml_backend_cann_guid() {
+ static ggml_guid guid = { 0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
+ 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64 };
+ return &guid;
+}
+
+// backend device
+struct ggml_backend_cann_device_context {
+ int device;
+ std::string name;
+ std::string description;
+ int op_offload_min_batch_size;
+};
+
+static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
+ return ctx->name.c_str();
+}
+
+static const char * ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
+ return ctx->description.c_str();
+}
+
+static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
+ ggml_backend_cann_get_device_memory(ctx->device, free, total);
+}
+
+static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
+ GGML_UNUSED(dev);
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
+}
+
+static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+ props->name = ggml_backend_cann_device_get_name(dev);
+ props->description = ggml_backend_cann_device_get_description(dev);
+ props->type = ggml_backend_cann_device_get_type(dev);
+ ggml_backend_cann_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
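+    // pinned host buffers are enabled unless the GGML_CANN_NO_PINNED environment variable is set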
+ bool host_buffer = getenv("GGML_CANN_NO_PINNED") == nullptr;
+
+ props->caps = {
+ /* .async = */ false,
+ /* .host_buffer = */ host_buffer,
+ /* .buffer_from_host_ptr = */ false,
+ /* .events = */ true,
+ };
+}
+
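+// creates a backend instance for this device; the params string is currently unused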
+static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
+ GGML_UNUSED(params);
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
+ return ggml_backend_cann_init(ctx->device);
+}
+
+/**
+ * @brief Checks whether a CANN device supports a specific backend buffer type.
+ *
+ * This function determines whether the given buffer type is supported by
+ * comparing the device of the buffer type with the device of the backend
+ * device context. It returns true only if the buffer type is a CANN buffer
+ * type and both refer to the same device.
+ *
+ * @param dev Pointer to the CANN backend device.
+ * @param buft Pointer to the backend buffer type to check.
+ * @return bool Returns true if the device supports the buffer type,
+ *              otherwise false.
+ */
+static bool ggml_backend_cann_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+ if (ggml_backend_buft_is_cann(buft)) {
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
+ return buft_ctx->device == dev_ctx->device;
+ }
+ return false;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
+ return ggml_backend_cann_buffer_type(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+ GGML_UNUSED(dev);
+ return ggml_backend_cann_host_buffer_type();
+}
+
+/**
+ * @brief Determines if a tensor operation should be offloaded to the CANN
+ * backend.
+ *
+ * This function checks if a given tensor operation should be offloaded to the
+ * CANN backend based on the operation type and the size of the tensor. It
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
+ * equal to the device's configured minimum batch size and the operation is not
+ * GGML_OP_GET_ROWS.
+ *
+ * @param dev Pointer to the CANN backend device.
+ * @param op Pointer to the tensor operation to check.
+ * @return bool Returns true if the operation should be offloaded, otherwise
+ * false.
+ */
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+ return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
+}
+
+/**
+ * @brief Creates a new event for the CANN backend device.
+ *
+ * This function initializes a new event for the CANN backend by setting the
+ * device and creating an ACL runtime event. The created event is then wrapped
+ * in a ggml_backend_event structure and returned.
+ *
+ * @param dev Pointer to the CANN backend device.
+ * @return ggml_backend_event_t Returns a pointer to the new event structure.
+ */
+static ggml_backend_event_t ggml_backend_cann_device_event_new(ggml_backend_dev_t dev) {
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
+
+ ggml_cann_set_device(dev_ctx->device);
+
+ aclrtEvent event;
+ ACL_CHECK(aclrtCreateEvent(&event));
+
+ return new ggml_backend_event{
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), dev_ctx->device),
+ /* .context = */ event,
+ };
+}
+
+/**
+ * @brief Frees a CANN backend event.
+ *
+ * This function destroys the ACL runtime event associated with the given CANN
+ * backend event and then deletes the event structure itself.
+ *
+ * @param dev Pointer to the CANN backend device (unused).
+ * @param event Pointer to the event structure to be freed.
+ */
+static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
+ ACL_CHECK(aclrtDestroyEvent((aclrtEvent) event->context));
+
+ delete event;
+ GGML_UNUSED(dev);
+}
+
+/**
+ * @brief Synchronizes the given event on the CANN backend.
+ *
+ * This function waits for the specified event to complete on the ACL runtime.
+ *
+ * @param dev Pointer to the CANN backend device (unused).
+ * @param event Pointer to the event structure to be synchronized.
+ */
+static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
+ ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent) event->context));
+
+ GGML_UNUSED(dev);
+}
+
+static const ggml_backend_device_i ggml_backend_cann_device_interface = {
+ /* .get_name = */ ggml_backend_cann_device_get_name,
+ /* .get_description = */ ggml_backend_cann_device_get_description,
+ /* .get_memory = */ ggml_backend_cann_device_get_memory,
+ /* .get_type = */ ggml_backend_cann_device_get_type,
+ /* .get_props = */ ggml_backend_cann_device_get_props,
+ /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
+ /* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
+ /* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
+ /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
+ /* .supports_op = */ ggml_backend_cann_supports_op,
+ /* .supports_buft = */ ggml_backend_cann_supports_buft,
+ /* .offload_op = */ ggml_backend_cann_offload_op,
+ /* .event_new = */ ggml_backend_cann_device_event_new,
+ /* .event_free = */ ggml_backend_cann_device_event_free,
+ /* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
+};
+
+// backend reg
+struct ggml_backend_cann_reg_context {
+ std::vector<ggml_backend_dev_t> devices;
+};
+
+static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
+ GGML_UNUSED(reg);
+ return GGML_CANN_NAME;
+}
+
+static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
+ return ctx->devices.size();
+}
+
+static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
+ GGML_ASSERT(index < ctx->devices.size());
+ return ctx->devices[index];
+}
+
+static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+ GGML_UNUSED(reg);
+ GGML_UNUSED(name);
+ // reserved for future use
+ return nullptr;
+}
+
+static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
+ /* .get_name = */ ggml_backend_cann_reg_get_name,
+ /* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
+ /* .get_device = */ ggml_backend_cann_reg_get_device,
+ /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
+};
+
+// backend registry; the CANN device list is initialized only once
+ggml_backend_reg_t ggml_backend_cann_reg() {
+ static ggml_backend_reg reg;
+ static bool initialized = false;
+
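+    // one-time initialization of the device registry, guarded by a function-local mutex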
+ {
+ static std::mutex mutex;
+ std::lock_guard<std::mutex> lock(mutex);
+ if (!initialized) {
+ aclInit(nullptr);
+ ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
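+            // minimum batch size for offloading ops to the device, overridable via GGML_OP_OFFLOAD_MIN_BATCH (default 32)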
+ const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
+
+ for (int i = 0; i < ggml_cann_info().device_count; i++) {
+ ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
+ dev_ctx->description = aclrtGetSocName();
+ dev_ctx->device = i;
+ dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+ dev_ctx->op_offload_min_batch_size = min_batch_size;
+ ggml_cann_set_device(i);
+ ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
+ /* .reg = */ &reg,
+ /* .context = */ dev_ctx };
+ ctx->devices.push_back(dev);
+ }
+
+ reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_cann_reg_interface,
+ /* .context = */ ctx };
+ }
+
+ initialized = true;
+ }
+
+ return &reg;
+}
+
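+// creates a CANN backend bound to the given device id (public API)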
+ggml_backend_t ggml_backend_cann_init(int32_t device) {
+ aclInit(nullptr);
+ if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
+ GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
+ return nullptr;
+ }
+
+ ggml_backend_cann_context * ctx = new ggml_backend_cann_context(device);
+ if (ctx == nullptr) {
+ GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+ return nullptr;
+ }
+ ggml_cann_set_device(ctx->device);
+ ggml_backend_t cann_backend =
+ new ggml_backend{ /* .guid = */ ggml_backend_cann_guid(),
+ /* .interface = */ ggml_backend_cann_interface,
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
+ /* .context = */ ctx };
+
+ return cann_backend;
+}
+
+bool ggml_backend_is_cann(ggml_backend_t backend) {
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
+}
+
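+// returns the number of CANN devices available to the runtime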
+int32_t ggml_backend_cann_get_device_count() {
+ return ggml_cann_info().device_count;
+}
+
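+// writes the SoC name of the given device into the caller-provided buffer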
+void ggml_backend_cann_get_device_description(int32_t device, char * description, size_t description_size) {
+ ggml_cann_set_device(device);
+ const char * soc_name = aclrtGetSocName();
+ snprintf(description, description_size, "%s", soc_name);
+}
+
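+// queries the free and total HBM memory of the given device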
+void ggml_backend_cann_get_device_memory(int32_t device, size_t * free, size_t * total) {
+ ggml_cann_set_device(device);
+ ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)