/*
 * Copyright (c) 2023-2026 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "aclnn_ops.h"

#include "ggml-impl.h"
#include "ggml.h"

#include <aclnnop/aclnn_add.h>
#include <aclnnop/aclnn_add_rms_norm.h>
#include <aclnnop/aclnn_addcdiv.h>
#include <aclnnop/aclnn_argmax.h>
#include <aclnnop/aclnn_avgpool2d.h>
#include <aclnnop/aclnn_batch_matmul.h>
#include <aclnnop/aclnn_cast.h>
#include <aclnnop/aclnn_clamp.h>
#include <aclnnop/aclnn_constant_pad_nd.h>
#include <aclnnop/aclnn_convolution.h>
#include <aclnnop/aclnn_copy.h>
#include <aclnnop/aclnn_div.h>
#include <aclnnop/aclnn_elu.h>
#include <aclnnop/aclnn_embedding.h>
#include <aclnnop/aclnn_eq_tensor.h>
#include <aclnnop/aclnn_exp.h>
#include <aclnnop/aclnn_fill_scalar.h>
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
#include <aclnnop/aclnn_ger.h>
#include <aclnnop/aclnn_group_norm.h>
#include <aclnnop/aclnn_grouped_matmul_v3.h>
#include <aclnnop/aclnn_gt_scalar.h>
#include <aclnnop/aclnn_im2col.h>
#include <aclnnop/aclnn_index_copy.h>
#include <aclnnop/aclnn_index_fill_tensor.h>
#include <aclnnop/aclnn_index_select.h>
#include <aclnnop/aclnn_layer_norm.h>
#include <aclnnop/aclnn_log.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/aclnn_max_pool.h>
#include <aclnnop/aclnn_mean.h>
#include <aclnnop/aclnn_mm.h>
#include <aclnnop/aclnn_mul.h>
#include <aclnnop/aclnn_mv.h>
#include <aclnnop/aclnn_permute.h>
#include <aclnnop/aclnn_pow.h>
#include <aclnnop/aclnn_pow_tensor_tensor.h>
#include <aclnnop/aclnn_reduce_sum.h>
#include <aclnnop/aclnn_reflection_pad1d.h>
#include <aclnnop/aclnn_repeat.h>
#include <aclnnop/aclnn_repeat_interleave.h>
#include <aclnnop/aclnn_rms_norm.h>
#include <aclnnop/aclnn_roll.h>
#include <aclnnop/aclnn_softmax.h>
#include <aclnnop/aclnn_sub.h>
#include <aclnnop/aclnn_sum.h>
#include <aclnnop/aclnn_threshold.h>
#include <aclnnop/aclnn_tril.h>
#include <aclnnop/aclnn_triu.h>
#include <aclnnop/aclnn_upsample_nearest_2d.h>
#include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
#include <aclnnop/aclnn_zero.h>
#include <float.h>

#include <cmath>
#include <cstring>
#include <exception>
#include <vector>

#define GGML_COMMON_DECL_C

#include "../ggml-common.h"

void bcast_shape(ggml_tensor *    src0,
                 ggml_tensor *    src1,
                 ggml_tensor *    dst,
                 acl_tensor_ptr & acl_src0,
                 acl_tensor_ptr & acl_src1,
                 acl_tensor_ptr & acl_dst) {
    GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_can_repeat(src1, src0));
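    // Illustrative broadcast case (comment only): with src0->ne = {16, 8, 4, 4}
    // and src1->ne = {16, 8, 1, 1}, src1 must be expanded along the two slowest
    // ggml dimensions before the element-wise op, so the branch below builds
    // broadcast-compatible views instead of plain ones.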
    // Need bcast
    if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
        BCAST_SHAPE(src0, src1)
        acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
        acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
        acl_dst  = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
    } else {
        acl_src0 = ggml_cann_create_tensor(src0);
        acl_src1 = ggml_cann_create_tensor(src1);
        acl_dst  = ggml_cann_create_tensor(dst);
    }
}

void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
                        ggml_backend_cann_context &                                                ctx,
                        ggml_tensor *                                                              dst) {
    ggml_tensor * src = dst->src[0];

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

    unary_op(ctx, acl_src.get(), acl_dst.get());
}

void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
                              ggml_backend_cann_context &                                                ctx,
                              ggml_tensor *                                                              dst) {
    ggml_tensor * src0 = dst->src[0];
    ggml_tensor * src1 = dst->src[1];

    GGML_ASSERT(ggml_is_contiguous_1(src0));
    GGML_ASSERT(ggml_is_contiguous_1(dst));
    const int32_t swapped = ggml_get_op_params_i32(dst, 1);

    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
    acl_tensor_ptr acl_src0, acl_src1;
    if (src1) {
        GGML_ASSERT(ggml_is_contiguous_1(src1));
        GGML_ASSERT(src0->type == src1->type);

        acl_src0 = ggml_cann_create_tensor(src0);
        acl_src1 = ggml_cann_create_tensor(src1);
    } else {
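        // Single-input gated case: src0 packs [value | gate] side by side along
        // dim 0, so build two half-width views of src0; `swapped` selects which
        // half acts as the gate.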
        int64_t ne[] = { src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3] };
        size_t  nb[] = { src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3] };
        acl_src0     = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
        acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
        if (swapped) {
            std::swap(acl_src0, acl_src1);
        }
    }

    unary_op(ctx, acl_src0.get(), acl_dst.get());
    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst.get(), acl_src1.get());
}

/**
 * @brief Repeats elements of a tensor along each dimension according to the
 * specified repeat array.
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src The source tensor to be repeated.
 * @param acl_dst The destination tensor after repeating.
 * @param repeat_array The array specifying the number of repetitions along each
 * dimension.
 */
static void aclnn_repeat(ggml_backend_cann_context & ctx,
                         aclTensor *                 acl_src,
                         aclTensor *                 acl_dst,
                         int64_t *                   repeat_array) {
    // repeat tensor along each dim with repeat_array
    acl_int_array_ptr repeats = ggml_cann_create_int_array(repeat_array, GGML_MAX_DIMS);

    GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_src, repeats.get(), acl_dst);
}

/**
 * @brief Casts the data type of a source tensor to a destination tensor.
 *
 * This function casts the data type of the source tensor `acl_src` to the
 * specified data type `cast_data_type` and stores the result in the destination
 * tensor `acl_dst`.
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src The source tensor whose data type will be cast.
 * @param acl_dst The destination tensor where the cast result will be stored.
 * @param cast_data_type The target data type to which the source tensor will be
 * cast.
 */
static void aclnn_cast(ggml_backend_cann_context & ctx,
                       aclTensor *                 acl_src,
                       aclTensor *                 acl_dst,
                       aclDataType                 cast_data_type) {
    GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src, cast_data_type, acl_dst);
}

void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src = dst->src[0];
    GGML_ASSERT(ggml_can_repeat(src, dst));

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

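    // ACL dimension order is the reverse of ggml's, so the repeat counts run
    // from ne[3] down to ne[0]. E.g. (illustrative) repeating src {2, 3, 1, 1}
    // into dst {2, 6, 1, 1} yields repeatsArray = {1, 1, 2, 1}.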
    int64_t repeatsArray[] = { dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2], dst->ne[1] / src->ne[1],
                               dst->ne[0] / src->ne[0] };

    aclnn_repeat(ctx, acl_src.get(), acl_dst.get(), repeatsArray);
}

void aclnn_add(ggml_backend_cann_context & ctx, aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst) {
    float          alphaValue = 1.0f;
    acl_scalar_ptr alpha      = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
    if (acl_dst != nullptr) {
        GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha.get(), acl_dst);
    } else {
        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_src0, acl_src1, alpha.get());
    }
}

void aclnn_sub(ggml_backend_cann_context & ctx, aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst) {
    float          alphaValue = 1.0f;
    acl_scalar_ptr alpha      = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
    if (acl_dst != nullptr) {
        GGML_CANN_CALL_ACLNN_OP(ctx, Sub, acl_src0, acl_src1, alpha.get(), acl_dst);
    } else {
        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSub, acl_src0, acl_src1, alpha.get());
    }
}

void aclnn_mul(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_other, aclTensor * acl_dst) {
    if (acl_dst != nullptr) {
        GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_src, acl_other, acl_dst);
    } else {
        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_src, acl_other);
    }
}

void aclnn_div(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_other, aclTensor * acl_dst) {
    if (acl_dst != nullptr) {
        GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src, acl_other, acl_dst);
    } else {
        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDiv, acl_src, acl_other);
    }
}

/**
 * @brief Multiplies elements of a tensor by a scalar value, optionally
 * in-place.
 *
 * This function multiplies each element of the source tensor `acl_src` by the
 * scalar `scale` and stores the result in the destination tensor `acl_dst`. If
 * `inplace` is true, `acl_dst` will not be used and the operation is performed
 *  in-place on `acl_src`.
 * The operation is defined as:
 * \f[
 *     \text {acl_dst }_i=\text {acl_src }_i \times \text {scale}
 * \f]
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src The source tensor whose elements will be multiplied.
 * @param scale The scalar value by which each element of `acl_src` will be
 *  multiplied.
 * @param acl_dst The destination tensor where the result will be stored if
 * `inplace` is false.
 * @param inplace Flag indicating whether to perform the operation in-place on
 * `acl_src`.
 */
static void aclnn_muls(ggml_backend_cann_context & ctx,
                       aclTensor *                 acl_src,
                       float                       scale,
                       aclTensor *                 acl_dst,
                       bool                        inplace) {
    acl_scalar_ptr acl_scale = ggml_cann_create_scalar(&scale, aclDataType::ACL_FLOAT);
    if (inplace) {
        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_src, acl_scale.get());
    } else {
        GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, acl_scale.get(), acl_dst);
    }
}

void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src = dst->src[0];

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

    float negative_slope;
    memcpy(&negative_slope, dst->op_params, sizeof(float));
    acl_scalar_ptr acl_negative_slope = ggml_cann_create_scalar(&negative_slope, aclDataType::ACL_FLOAT);

    GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src.get(), acl_negative_slope.get(), acl_dst.get());
}

/**
 * @brief Concatenates a list of tensors along a specified dimension and stores
 * the result in a destination tensor.
 *
 * @param ctx The context for the CANN backend operations.
 * @param tensorList The list of tensors to be concatenated.
 * @param acl_dst The destination tensor where the concatenated result will be
 * stored.
 * @param concat_dim The dimension along which the tensors will be concatenated.
 */
static void aclnn_concat(ggml_backend_cann_context & ctx,
                         aclTensorList *             tensorList,
                         aclTensor *                 acl_dst,
                         int64_t                     concat_dim) {
    GGML_CANN_CALL_ACLNN_OP(ctx, Cat, tensorList, concat_dim, acl_dst);
}

void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor *  src0     = dst->src[0];
    ggml_tensor *  src1     = dst->src[1];
    acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
    acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
    acl_tensor_ptr acl_dst  = ggml_cann_create_tensor(dst);

    const int32_t dim = ggml_get_op_params_i32(dst, 0);

    GGML_ASSERT(dim >= 0 && dim < 4);
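    // ggml orders dims fastest-first (ne[0] is the innermost), while ACL counts
    // from the slowest dim, so ggml dim `dim` maps to ACL dim `3 - dim`.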
    int32_t acl_dim = 3 - dim;

    acl_tensor_list_ptr tensor_list = ggml_cann_create_tensor_list(acl_src0, acl_src1);
    aclnn_concat(ctx, tensor_list.get(), acl_dst.get(), acl_dim);
}

/**
 * @brief Creates a tensor with values starting from `start`, incremented by
 * `step`, and ending before `stop`.
 *
 * This function performs the operation:
 * \f[
 *    \text {out }_{i+1}=\text {out }_i+\text {step}
 * \f]
 * the range is [start, stop).
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_dst The destination tensor where the values will be stored.
 * @param start The starting value of the range.
 * @param stop The ending value of the range (exclusive).
 * @param step The step size between consecutive values.
 * @param n_elements The number of elements in the destination tensor.
 */
static void aclnn_arange(ggml_backend_cann_context & ctx,
                         aclTensor *                 acl_dst,
                         float                       start,
                         float                       stop,
                         float                       step,
                         int64_t                     n_elements) {
    int64_t steps = (int64_t) std::ceil((stop - start) / step);
    GGML_ASSERT(n_elements == steps);

    acl_scalar_ptr acl_start = ggml_cann_create_scalar(&start, aclDataType::ACL_FLOAT);
    acl_scalar_ptr acl_end   = ggml_cann_create_scalar(&stop, aclDataType::ACL_FLOAT);
    acl_scalar_ptr acl_step  = ggml_cann_create_scalar(&step, aclDataType::ACL_FLOAT);

    GGML_CANN_CALL_ACLNN_OP(ctx, Arange, acl_start.get(), acl_end.get(), acl_step.get(), acl_dst);
}

void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

    int64_t n_elements = ggml_nelements(dst);
    float   start;
    float   stop;
    float   step;
    memcpy(&start, (float *) dst->op_params + 0, sizeof(float));
    memcpy(&stop, (float *) dst->op_params + 1, sizeof(float));
    memcpy(&step, (float *) dst->op_params + 2, sizeof(float));

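    // E.g. (illustrative) start = 0, stop = 5, step = 1 fills dst with
    // {0, 1, 2, 3, 4}; n_elements must equal ceil((stop - start) / step).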
    aclnn_arange(ctx, acl_dst.get(), start, stop, step, n_elements);
}

void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src = dst->src[0];

    float min;
    float max;
    memcpy(&min, dst->op_params, sizeof(float));
    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

    acl_scalar_ptr acl_min = ggml_cann_create_scalar(&min, aclDataType::ACL_FLOAT);
    acl_scalar_ptr acl_max = ggml_cann_create_scalar(&max, aclDataType::ACL_FLOAT);

    GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_src.get(), acl_min.get(), acl_max.get(), acl_dst.get());
}

void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src = dst->src[0];

    // scale factor
    float v;
    memcpy(&v, dst->op_params, sizeof(float));

    acl_scalar_ptr scale   = ggml_cann_create_scalar(&v, aclDataType::ACL_FLOAT);
    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

    GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src.get(), scale.get(), acl_dst.get());
}

void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor *        src   = dst->src[0];
    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

    acl_tensor_ptr       acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr       acl_dst = ggml_cann_create_tensor(dst);
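    // The Argsort call below produces int64 indices, so sort into a temporary
    // int64 buffer first and cast the result to dst->type afterwards.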
    ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
    void *               buffer = temp_buffer_allocator.get();
    acl_tensor_ptr       tmp_tensor =
        ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne, dst->nb, GGML_MAX_DIMS);
    GGML_CANN_CALL_ACLNN_OP(ctx, Argsort, acl_src.get(), -1, (order == GGML_SORT_ORDER_DESC ? true : false),
                            tmp_tensor.get());
    GGML_CANN_CALL_ACLNN_OP(ctx, Cast, tmp_tensor.get(), ggml_cann_type_mapping(dst->type), acl_dst.get());
}

void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src = dst->src[0];

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    std::vector<int64_t> normData = { dst->ne[0] };
    acl_int_array_ptr    norm     = ggml_cann_create_int_array(normData.data(), normData.size());
    GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src.get(), norm.get(), nullptr, nullptr, eps, acl_dst.get(), nullptr,
                            nullptr);
}

void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src = dst->src[0];

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

    size_t               type_size = ggml_type_size(src->type);
    int64_t              n_bytes   = src->ne[3] * src->ne[2] * src->ne[1] * type_size;
    ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes);
    void *               buffer = temp_buffer_allocator.get();

    int64_t div_ne[] = { 1, src->ne[1], src->ne[2], src->ne[3] };
    size_t  div_nb[GGML_MAX_DIMS];
    div_nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
        div_nb[i] = div_nb[i - 1] * div_ne[i - 1];
    }
    acl_tensor_ptr acl_div = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, div_ne, div_nb, GGML_MAX_DIMS);

    std::vector<int64_t> norm_dims  = { 3 };
    acl_int_array_ptr    dims_array = ggml_cann_create_int_array(norm_dims.data(), norm_dims.size());

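    // dst = src / ||src||_2 per row: Norm reduces over the innermost dim
    // (ggml ne[0], ACL dim 3) with keepdim, then Div broadcasts the result
    // back over that dim.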
    float          p_value  = 2.0f;
    acl_scalar_ptr p_scalar = ggml_cann_create_scalar(&p_value, aclDataType::ACL_FLOAT);
    GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_div.get());
    GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div.get(), acl_dst.get());
}

void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0];
    ggml_tensor * src1 = dst->src[1];

    const int64_t nc = src0->ne[0];
    const int64_t nr = ggml_nrows(src0);
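    // Descriptive summary of the steps below:
    //   loss = -1/nr * sum_{i,c} labels[i][c] * log_softmax(logits)[i][c]
    // i.e. the mean cross entropy over the nr rows of nc classes each.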

    int64_t logits_ne[] = { nc, nr };
    size_t  logits_nb[2];
    logits_nb[0]              = ggml_type_size(src0->type);
    logits_nb[1]              = logits_nb[0] * logits_ne[0];
    acl_tensor_ptr acl_logits = ggml_cann_create_tensor(src0->data, ACL_FLOAT, sizeof(float), logits_ne, logits_nb, 2);

    size_t               log_softmax_type_size = sizeof(float);
    int64_t              log_softmax_n_bytes   = nr * nc * log_softmax_type_size;
    ggml_cann_pool_alloc log_softmax_allocator(ctx.pool(), log_softmax_n_bytes);
    void *               log_softmax_buffer = log_softmax_allocator.get();

    int64_t log_softmax_ne[] = { nc, nr };
    size_t  log_softmax_nb[2];
    log_softmax_nb[0]              = log_softmax_type_size;
    log_softmax_nb[1]              = log_softmax_nb[0] * log_softmax_ne[0];
    acl_tensor_ptr acl_log_softmax = ggml_cann_create_tensor(log_softmax_buffer, ACL_FLOAT, log_softmax_type_size,
                                                             log_softmax_ne, log_softmax_nb, 2);

    GGML_CANN_CALL_ACLNN_OP(ctx, LogSoftmax, acl_logits.get(), 1, acl_log_softmax.get());

    int64_t labels_ne[] = { nc, nr };
    size_t  labels_nb[2];
    labels_nb[0]              = ggml_type_size(src1->type);
    labels_nb[1]              = labels_nb[0] * labels_ne[0];
    acl_tensor_ptr acl_labels = ggml_cann_create_tensor(src1->data, ACL_FLOAT, sizeof(float), labels_ne, labels_nb, 2);

    size_t               mul_type_size = sizeof(float);
    int64_t              mul_n_bytes   = nr * nc * mul_type_size;
    ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_n_bytes);
    void *               mul_buffer = mul_allocator.get();

    int64_t mul_ne[] = { nc, nr };
    size_t  mul_nb[2];
    mul_nb[0]                     = mul_type_size;
    mul_nb[1]                     = mul_nb[0] * mul_ne[0];
    acl_tensor_ptr acl_mul_result = ggml_cann_create_tensor(mul_buffer, ACL_FLOAT, mul_type_size, mul_ne, mul_nb, 2);

    GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_log_softmax.get(), acl_labels.get(), acl_mul_result.get());

    size_t               sum_per_sample_type_size = sizeof(float);
    int64_t              sum_per_sample_n_bytes   = nr * sum_per_sample_type_size;
    ggml_cann_pool_alloc sum_per_sample_allocator(ctx.pool(), sum_per_sample_n_bytes);
    void *               sum_per_sample_buffer = sum_per_sample_allocator.get();

    int64_t sum_per_sample_ne[] = { nr };
    size_t  sum_per_sample_nb[1];
    sum_per_sample_nb[0]              = sum_per_sample_type_size;
    acl_tensor_ptr acl_sum_per_sample = ggml_cann_create_tensor(
        sum_per_sample_buffer, ACL_FLOAT, sum_per_sample_type_size, sum_per_sample_ne, sum_per_sample_nb, 1);

    std::vector<int64_t> sum_dims   = { 1 };
    acl_int_array_ptr    dims_array = ggml_cann_create_int_array(sum_dims.data(), sum_dims.size());
    bool                 keep_dims  = false;

    GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_mul_result.get(), dims_array.get(), keep_dims, ACL_FLOAT,
                            acl_sum_per_sample.get());

    size_t               total_sum_type_size = sizeof(float);
    int64_t              total_sum_n_bytes   = 1 * total_sum_type_size;
    ggml_cann_pool_alloc total_sum_allocator(ctx.pool(), total_sum_n_bytes);
    void *               total_sum_buffer = total_sum_allocator.get();

    int64_t total_sum_ne[] = { 1 };
    size_t  total_sum_nb[1];
    total_sum_nb[0] = total_sum_type_size;

    acl_tensor_ptr acl_total_sum =
        ggml_cann_create_tensor(total_sum_buffer, ACL_FLOAT, total_sum_type_size, total_sum_ne, total_sum_nb, 1);

    std::vector<int64_t> total_sum_dims    = { 0 };
    acl_int_array_ptr total_sum_dims_array = ggml_cann_create_int_array(total_sum_dims.data(), total_sum_dims.size());

    GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_sum_per_sample.get(), total_sum_dims_array.get(), keep_dims, ACL_FLOAT,
                            acl_total_sum.get());

    float          value        = -1.0f / static_cast<float>(nr);
    acl_scalar_ptr scale_factor = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
    acl_tensor_ptr acl_dst =
        ggml_cann_create_tensor(dst->data, ACL_FLOAT, sizeof(float), total_sum_ne, total_sum_nb, 1);

    GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_total_sum.get(), scale_factor.get(), acl_dst.get());
}

void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src = dst->src[0];

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

    int n_groups = dst->op_params[0];

    float eps;
    memcpy(&eps, dst->op_params + 1, sizeof(float));

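    // ggml stores image-like shapes as {W, H, C, N} in ne[0..3], so the
    // GroupNorm call receives N = ne[3], C = ne[2] and HxW = ne[1] * ne[0].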
    int64_t N   = src->ne[3];
    int64_t C   = src->ne[2];
    int64_t HxW = src->ne[1] * src->ne[0];

    size_t  type_size = ggml_type_size(src->type);
    int64_t ne[]      = { n_groups, N };
    size_t  nb[]      = { type_size, type_size * n_groups };
    size_t  n_bytes   = N * n_groups;

    ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2);
    void *               buffer       = temp_buffer_allocator.get();
    acl_tensor_ptr       acl_mean_out = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
    acl_tensor_ptr       acl_rstd_out =
        ggml_cann_create_tensor((char *) buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);

    GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src.get(), nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst.get(),
                            acl_mean_out.get(), acl_rstd_out.get());
}

void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0];
    ggml_tensor * src1 = dst->src[1];

    size_t nb1     = ((int32_t *) dst->op_params)[0];
    size_t nb2     = ((int32_t *) dst->op_params)[1];
    size_t nb3     = ((int32_t *) dst->op_params)[2];
    size_t offset  = ((int32_t *) dst->op_params)[3];
    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];

    size_t param_nb[] = { ggml_element_size(src0), nb1, nb2, nb3 };
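    // View dst with the byte strides and offset packed in op_params so that the
    // accumulation only touches the region that overlaps src1.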

    acl_tensor_ptr acl_dst  = ggml_cann_create_tensor(dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
    acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);

    acl_scalar_ptr alpha      = nullptr;
    float          alphaValue = 1.0f;
    alpha                     = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);

    if (!inplace) {
        size_t cpy_size = ggml_nbytes(dst);
        ACL_CHECK(
            aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
        acl_tensor_ptr acl_src0 =
            ggml_cann_create_tensor(src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);

        GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0.get(), acl_src1.get(), alpha.get(), acl_dst.get());
    } else {
        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), acl_src1.get(), alpha.get());
    }
}

/**
 * @brief Performs sum reduction on a given tensor along specified dimensions.
 *
 * This function reduces the input tensor by summing along the specified dimensions.
 *
 * @param ctx The context for the CANN backend operations.
 * @param dst The destination tensor where the reduced result will be stored.
 * @param dim An array of dimension indices.
 * @param dim_size The number of dimensions.
 */
static void aclnn_reduce_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst, int64_t * dim, size_t dim_size) {
    GGML_ASSERT(dst->ne[0] == 1);
    ggml_tensor *     src         = dst->src[0];
    acl_tensor_ptr    acl_src     = ggml_cann_create_tensor(src);
    acl_tensor_ptr    acl_dst     = ggml_cann_create_tensor(dst);
    acl_int_array_ptr reduce_dims = ggml_cann_create_int_array(dim, dim_size);

    GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src.get(), reduce_dims.get(), true, ggml_cann_type_mapping(dst->type),
                            acl_dst.get());
}

void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    int64_t reduce_dims[] = { 3 };
    aclnn_reduce_sum(ctx, dst, reduce_dims, 1);
}

void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    int64_t reduce_dims[] = { 0, 1, 2, 3 };
    aclnn_reduce_sum(ctx, dst, reduce_dims, 4);
}

void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor *  src     = dst->src[0];
    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);

    std::vector<int64_t> output_size{ dst->ne[1], dst->ne[0] };
    acl_int_array_ptr    output_size_array = ggml_cann_create_int_array(output_size.data(), 2);

    GGML_CANN_CALL_ACLNN_OP(ctx, UpsampleNearest2d, acl_src.get(), output_size_array.get(), acl_dst.get());
}

/**
 * @brief Pads a tensor with a specified value along each dimension.
 *
 * This function performs padding of the source tensor `acl_src` and stores the
 * result in the destination tensor `acl_dst`. The padding values for each
 * dimension are specified in the `paddings` array.
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src The source tensor to be padded.
 * @param acl_dst The destination tensor where the padded result will be stored.
 * @param paddings An array specifying the padding values for each dimension.
 * The size of the array should be twice the number of dimensions of the tensor.
 * @param value The value to be used for padding. The default value is 0.0.
 */
static void aclnn_pad(ggml_backend_cann_context & ctx,
                      aclTensor *                 acl_src,
                      aclTensor *                 acl_dst,
                      int64_t *                   paddings,
                      float                       value = 0.0f) {
    acl_int_array_ptr acl_pad   = ggml_cann_create_int_array(paddings, GGML_MAX_DIMS * 2);
    acl_scalar_ptr    acl_value = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);

    GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_src, acl_pad.get(), acl_value.get(), acl_dst);
}

void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor *  src     = dst->src[0];
    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

    // padding: each value in the array is the amount of padding to add, and its
    // position selects the side that gets padded:
    // [dim0.front, dim0.behind, dim1.front, dim1.behind,
    //  dim2.front, dim2.behind, dim3.front, dim3.behind]
    const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
    const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
    const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
    const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
    const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
    const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
    const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
    const int32_t rp3 = ggml_get_op_params_i32(dst, 7);

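    // E.g. (illustrative) lp0 = 1, rp0 = 2 with all other entries 0 pads one
    // element before and two elements after the fastest-varying dimension.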
    int64_t paddings[] = { lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3 };
    aclnn_pad(ctx, acl_src.get(), acl_dst.get(), paddings);
}

/**
 * @brief Performs 2D average pooling on the input tensor and stores the result
 * in the destination tensor.
 *
 * This function performs average pooling on the source tensor and stores the
 * result in the destination tensor. The pooling parameters (kernel size,
 * strides, padding) are specified in the `op_params` of the destination tensor.
 *
 * @param ctx The context for the CANN backend operations.
 * @param dst The destination tensor where the result will be stored. The source
 * tensor is referenced by `dst->src[0]`.
 */
static void ggml_cann_avg_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src = dst->src[0];
    GGML_ASSERT(src->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);

    const int32_t * opts = (const int32_t *) dst->op_params;
    const int       k0   = opts[1];
    const int       k1   = opts[2];
    const int       s0   = opts[3];
    const int       s1   = opts[4];
    const int       p0   = opts[5];
    const int       p1   = opts[6];

    std::vector<int64_t> kernel_dims      = { k1, k0 };
    std::vector<int64_t> stride_dims      = { s1, s0 };
    std::vector<int64_t> padding_avg_dims = { p1, p0 };  // (padH, padW)

    acl_int_array_ptr kernel_size  = ggml_cann_create_int_array(kernel_dims.data(), 2);
    acl_int_array_ptr strides      = ggml_cann_create_int_array(stride_dims.data(), 2);
    acl_int_array_ptr paddings_avg = ggml_cann_create_int_array(padding_avg_dims.data(), 2);

    bool    ceil_mode         = false;
    bool    count_include_pad = true;
    int64_t divisor_override  = 0;
    int8_t  cube_math_type    = 0;
#ifdef ASCEND_310P
    cube_math_type = 1;
#endif

    GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src.get(), kernel_size.get(), strides.get(), paddings_avg.get(),
                            ceil_mode, count_include_pad, divisor_override, cube_math_type, acl_dst.get());
}

/**
 * @brief Performs 2D max pooling on the input tensor and stores the result in
 * the destination tensor.
 *
 * This function performs max pooling on the source tensor and stores the result
 * in the destination tensor. The pooling parameters (kernel size, strides,
 * padding) are specified in the `op_params` of the destination tensor.
 *
 * @param ctx The context for the CANN backend operations.
 * @param dst The destination tensor where the result will be stored. The source
 * tensor is referenced by `dst->src[0]`.
 */
static void ggml_cann_max_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src = dst->src[0];
    GGML_ASSERT(src->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);

    const int32_t * opts = (const int32_t *) dst->op_params;
    const int       k0   = opts[1];
    const int       k1   = opts[2];
    const int       s0   = opts[3];
    const int       s1   = opts[4];
    const int       p0   = opts[5];
    const int       p1   = opts[6];

    int64_t temp_ne[] = { src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2], src->ne[3] };
    size_t  temp_nb[GGML_MAX_DIMS];

    temp_nb[0] = ggml_element_size(src);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1];
    }

    ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
    void *               buffer = temp_buffer_allocator.get();
    acl_tensor_ptr tmp_tensor   = ggml_cann_create_tensor(buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
                                                          GGML_MAX_DIMS, ACL_FORMAT_NCHW);

    // pad: see padding in ggml_cann_pad()
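    // Padded cells are filled with -FLT_MAX so they can never win the max.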
    int64_t paddings[] = { p0, p0, p1, p1, 0, 0, 0, 0 };
    float   value      = -FLT_MAX;
    aclnn_pad(ctx, acl_src.get(), tmp_tensor.get(), paddings, value);

    // max_pool
    std::vector<int64_t> kernel_dims      = { k1, k0 };
    std::vector<int64_t> stride_dims      = { s1, s0 };
    // padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end]
    std::vector<int64_t> padding_max_dims = { 0, 0, 0, 0 };
    std::vector<int64_t> dilation_size    = { 1, 1 };
    acl_int_array_ptr    kernel_size      = ggml_cann_create_int_array(kernel_dims.data(), 2);
    acl_int_array_ptr    strides          = ggml_cann_create_int_array(stride_dims.data(), 2);
    acl_int_array_ptr    paddings_max     = ggml_cann_create_int_array(padding_max_dims.data(), 4);
    acl_int_array_ptr    dilations        = ggml_cann_create_int_array(dilation_size.data(), 2);

    bool    ceil_mode = false;
    int64_t auto_pads = 0;
    GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor.get(), kernel_size.get(), strides.get(), auto_pads,
                            paddings_max.get(), dilations.get(), ceil_mode, acl_dst.get());
}

void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    const int32_t *   opts = (const int32_t *) dst->op_params;
    enum ggml_op_pool op   = static_cast<ggml_op_pool>(opts[0]);
    switch (op) {
        case GGML_OP_POOL_AVG:
            ggml_cann_avg_pool2d(ctx, dst);
            break;
        case GGML_OP_POOL_MAX:
            ggml_cann_max_pool2d(ctx, dst);
            break;
        case GGML_OP_POOL_COUNT:
            GGML_ABORT("fatal error");
            break;
    }
}

/**
 * @brief Copies data from the source tensor to the destination tensor.
 *
 * This function copies data from the source tensor `acl_src` to the destination
 * tensor `acl_dst`.
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src The source tensor from which data will be copied.
 * @param acl_dst The destination tensor where the data will be copied to.
 */
static void cann_copy(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst, acl_src);
}

void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0];

    if (ggml_are_same_shape(src0, dst)) {
        acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
        acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
        if (dst->type == src0->type) {
            cann_copy(ctx, acl_src.get(), acl_dst.get());
        } else {
            aclnn_cast(ctx, acl_src.get(), acl_dst.get(), ggml_cann_type_mapping(dst->type));
        }
    } else {
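        // Shapes differ: make src0 contiguous if necessary, then view the
        // contiguous data with dst's shape before copying or casting.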
        void *               src_trans_buffer = src0->data;
        ggml_cann_pool_alloc src_buffer_allocator;
        if (!ggml_is_contiguous(src0)) {
            acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
            src_buffer_allocator.alloc(ctx.pool(), ggml_nelements(src0) * ggml_type_size(src0->type));
            src_trans_buffer = src_buffer_allocator.get();
            size_t src_trans_nb[GGML_MAX_DIMS];
            src_trans_nb[0] = ggml_type_size(src0->type);
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
            }
            acl_tensor_ptr src_trans_tensor =
                ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(src0->type),
                                        ggml_type_size(src0->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
            cann_copy(ctx, acl_src.get(), src_trans_tensor.get());
        }

        size_t src_reshape_nb[GGML_MAX_DIMS];
        src_reshape_nb[0] = ggml_type_size(src0->type);
        for (int i = 1; i < GGML_MAX_DIMS; i++) {
            src_reshape_nb[i] = src_reshape_nb[i - 1] * dst->ne[i - 1];
        }

        acl_tensor_ptr trans_acl_src =
            ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
                                    dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
        acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

        if (dst->type == src0->type) {
            cann_copy(ctx, trans_acl_src.get(), acl_dst.get());
        } else {
            aclnn_cast(ctx, trans_acl_src.get(), acl_dst.get(), ggml_cann_type_mapping(dst->type));
        }
    }
}

/**
 * @brief Creates an ACL tensor initialized with zeros using a provided buffer.
 *
 * This function initializes a tensor with zeros using the specified buffer and
 * tensor parameters.
 *
 * @param ctx The context for the CANN backend operations.
 * @param buffer The buffer to be used for the tensor data.
 * @param n_bytes The size of the buffer in bytes.
 * @param ne An array specifying the extents (sizes) of each dimension of the
 * tensor.
 * @param dims The number of dimensions of the tensor.
 * @param type The data type of the tensor.
 * @param type_size The size of each element in the tensor data type.
 * @return A tensor smart pointer initialized with zeros.
 */
static acl_tensor_ptr aclnn_zero(ggml_backend_cann_context & ctx,
                                 void *                      buffer,
                                 size_t                      n_bytes,
                                 int64_t *                   ne,
                                 int64_t                     dims,
                                 aclDataType                 type,
                                 size_t                      type_size) {
    size_t nb[GGML_MAX_DIMS];
    nb[0] = type_size;
    for (int i = 1; i < dims; i++) {
        nb[i] = nb[i - 1] * ne[i - 1];
    }

    acl_tensor_ptr zero = ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero.get());
    return zero;
    GGML_UNUSED(n_bytes);
}

/**
 * @brief Creates an ACL tensor initialized with value using a provided buffer.
 *
 * This function initializes a tensor with value using the specified buffer and
 * tensor parameters.
 *
 * @param ctx The context for the CANN backend operations.
 * @param buffer The buffer to be used for the tensor data.
 * @param n_bytes The size of the buffer in bytes.
 * @param ne An array specifying the extents (sizes) of each dimension of the
 * tensor.
 * @param dims The number of dimensions of the tensor.
 * @param type The data type of the tensor.
 * @param type_size The size of each element in the tensor data type.
 * @param value The value to be used for initializing the tensor (default
 * is 1.0).
 * @return A tensor smart pointer initialized with value.
 */
static acl_tensor_ptr aclnn_values(ggml_backend_cann_context & ctx,
                                   void *                      buffer,
                                   size_t                      n_bytes,
                                   int64_t *                   ne,
                                   int64_t                     dims,
                                   aclDataType                 type,
                                   size_t                      type_size,
                                   float                       value = 1.0f) {
    acl_tensor_ptr acl_tensor = aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
    float          alpha_host = 1.0f;
    acl_scalar_ptr alpha      = ggml_cann_create_scalar(&alpha_host, aclDataType::ACL_FLOAT);
    acl_scalar_ptr other      = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_tensor.get(), other.get(), alpha.get());
    return acl_tensor;
}

/**
 * @brief Fills a tensor with a scalar value.
 *
 * This function fills the destination tensor `acl_dst` with the scalar value
 * `scalar`.
 *
 * @param ctx The context for the CANN backend operations.
 * @param scalar The scalar value used to fill the tensor.
 * @param acl_dst The destination tensor to be filled with the scalar value.
 */
static void aclnn_fill_scalar(ggml_backend_cann_context & ctx, float scalar, aclTensor * acl_dst) {
    acl_scalar_ptr acl_scalar = ggml_cann_create_scalar(&scalar, aclDataType::ACL_FLOAT);
    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar.get());
}

/**
 * @brief Get or expand a cached tensor filled with a scalar value.
 *
 * This function manages cached device memory for tensors. If the current
 * cache size is insufficient for the requested tensor shape, the old memory will
 * be released and new memory will be allocated. The allocated buffer is
 * initialized with the given scalar value using CANN operations.
 * Finally, an aclTensor object is created from the cached memory and returned.
 *
 * @param ctx           The CANN backend context that manages device memory.
 * @param buffer        A pointer to the cached device buffer (will be allocated
 *                      or reallocated if necessary).
 * @param cache_element The current number of cached elements. This will be
 *                      updated when the cache is expanded.
 * @param ne            The tensor shape array (number of elements in each dimension).
 * @param nb            The stride size for each dimension.
 * @param dtype         Data type of cached tensor.
 * @param dims          The number of tensor dimensions.
 * @param value         The scalar value used to fill the tensor (supports zero
 *                      initialization via memset or arbitrary values via fill_scalar).
 * @return              A tensor smart pointer created from the cached buffer.
 */
static acl_tensor_ptr get_cache_acl_tensor(ggml_backend_cann_context & ctx,
                                           void **                     buffer,
                                           int64_t &                   cache_element,
                                           int64_t *                   ne,
                                           size_t *                    nb,
                                           ggml_type                   dtype,
                                           int64_t                     dims,
                                           float                       value) {
    // Calculate total number of elements
    int64_t n_element = 1;
    for (int i = 0; i < dims; i++) {
        n_element *= ne[i];
    }
    size_t size = n_element * ggml_type_size(dtype);

    // Allocate or expand cache if needed
    if (cache_element < n_element) {
        if (*buffer != nullptr) {
            aclrtFree(*buffer);
            *buffer = nullptr;
        }

        ACL_CHECK(aclrtMalloc(buffer, size, ACL_MEM_MALLOC_HUGE_FIRST));
        cache_element = n_element;

        // Initialize cache
        int64_t        pool_ne[1] = { n_element };
        size_t         pool_nb[1] = { ggml_type_size(dtype) };
        acl_tensor_ptr acl_value =
            ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), pool_ne, pool_nb, 1);
        aclnn_fill_scalar(ctx, value, acl_value.get());
    }

    return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), ne, nb, dims);
}

void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src = dst->src[0];

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    // build gamma.
    size_t acl_gamma_nb[GGML_MAX_DIMS];
    // gamma has the same type as dst.
    acl_gamma_nb[0] = ggml_type_size(dst->type);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
    }
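    // The all-ones gamma lives in a per-context cache and is only reallocated
    // when a larger element count is requested (see get_cache_acl_tensor above).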
    acl_tensor_ptr acl_gamma = get_cache_acl_tensor(
        ctx, &ctx.rms_norm_one_tensor_cache.cache, ctx.rms_norm_one_tensor_cache.size, src->ne, acl_gamma_nb, dst->type,
        1,    // dims
        1.0f  // value
    );

    // build rstd.
    int64_t acl_rstd_ne[] = { src->ne[1], src->ne[2], src->ne[3] };
    size_t  acl_rstd_nb[GGML_MAX_DIMS - 1];
    // rstd will always be F32.
    acl_rstd_nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
        acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
    }
    acl_tensor_ptr acl_rstd =
        get_cache_acl_tensor(ctx, &ctx.rms_norm_zero_tensor_cache.cache, ctx.rms_norm_zero_tensor_cache.size,
                             acl_rstd_ne, acl_rstd_nb, GGML_TYPE_F32, GGML_MAX_DIMS - 1,
                             0.0f  // value
        );

    GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src.get(), acl_gamma.get(), eps, acl_dst.get(), acl_rstd.get());
}

// TODO: performance is low.
void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value) {
    ggml_tensor * src = dst->src[0];

    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

    const int n_past = ((int32_t *) dst->op_params)[0];

    ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src));
    void *               buffer = one_tensor_allocator.get();

    acl_tensor_ptr mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type),
                                                         ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS);

    aclnn_fill_scalar(ctx, value, mask_tensor.get());

    float          alphaValue = 1.0f;
    acl_scalar_ptr alpha      = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);

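    // Triu keeps `value` only in the masked (future) region, Tril keeps the
    // visible part of src, and the final add combines them into dst.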
1093    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceTriu, mask_tensor.get(), n_past + 1);
1094    GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src.get(), n_past + 1, acl_dst.get());
1095    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), mask_tensor.get(), alpha.get());
1096}
1097
1098/**
1099 * @brief Permutes the dimensions of a tensor according to a specified order.
1100 *
1101 * This function permutes the dimensions of the source tensor `acl_src`
1102 * according to the order specified in the `new_dim` array and stores the result
1103 * in the destination tensor `acl_dst`.
1104 *
1105 * @param ctx The context for the CANN backend operations.
1106 * @param acl_src The source tensor whose dimensions will be permuted.
1107 * @param acl_dst The destination tensor where the permuted result will be
1108 * stored.
1109 * @param new_dim An array specifying the new order of dimensions for the
1110 * tensor.
1111 * @param dims The number of dimensions in the tensor.
1112 */
1113static void aclnn_permute(ggml_backend_cann_context & ctx,
1114                          aclTensor *                 acl_src,
1115                          aclTensor *                 acl_dst,
1116                          int64_t *                   new_dim,
1117                          uint64_t                    dims) {
1118    acl_int_array_ptr acl_dims = ggml_cann_create_int_array(new_dim, dims);
1119    GGML_CANN_CALL_ACLNN_OP(ctx, Permute, acl_src, acl_dims.get(), acl_dst);
1120}
1121
1122static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context & ctx,
1123                                             ggml_tensor *               dst,
1124                                             ggml_tensor *               src1,
1125                                             aclTensor *                 tmp_cast_tensor,
1126                                             aclTensor *                 tmp_im2col_tensor) {
1127    // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
1128    int64_t        dst_ne[] = { dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3] };
1129    size_t         dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[3] };
1130    acl_tensor_ptr acl_dst  = ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
1131
1132    int64_t permute_dim[] = { 0, 2, 1 };
1133    if (src1->type != dst->type) {
1134        aclnn_permute(ctx, tmp_cast_tensor, acl_dst.get(), permute_dim, 3);
1135    } else {
1136        aclnn_permute(ctx, tmp_im2col_tensor, acl_dst.get(), permute_dim, 3);
1137    }
1138}
1139
1140static void ggml_cann_im2col_1d_post_process(ggml_backend_cann_context &  ctx,
1141                                             ggml_tensor *                dst,
1142                                             ggml_tensor *                src1,
1143                                             aclTensor *                  tmp_cast_tensor,
1144                                             aclTensor *                  tmp_im2col_tensor,
1145                                             const std::vector<int64_t> & im2col_op_params) {
1146    // get params
1147    const int64_t KH             = im2col_op_params[0];
1148    const int64_t KW             = im2col_op_params[1];
1149    const int64_t IW             = im2col_op_params[2];
1150    const int64_t IC             = im2col_op_params[3];
1151    const int64_t N              = im2col_op_params[4];
1152    const int64_t OH             = im2col_op_params[5];
1153    const int64_t OW             = im2col_op_params[6];
1154    const int64_t s0             = im2col_op_params[7];
1155    const int64_t p0             = im2col_op_params[8];
1156    const int64_t d0             = im2col_op_params[9];
1157    const int64_t n_bytes_factor = im2col_op_params[10];
1158
1159    // Permute: [N, IC * KH * KW, OW * OH] ->
1160    // [N, OW * OH * n_bytes_factor, IC * KH * KW]
1161    ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
1162    tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
1163    void * tmp_permute_buffer = tmp_permute_allocator.get();
1164
1165    int64_t tmp_permute_ne[] = { IC * KH * KW, OW * OH * n_bytes_factor, N };
1166    size_t  tmp_permute_nb[GGML_MAX_DIMS - 1];
1167    tmp_permute_nb[0] = ggml_type_size(dst->type);
1168    for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1169        tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
1170    }
1171
1172    acl_tensor_ptr tmp_permute_tensor =
1173        ggml_cann_create_tensor(tmp_permute_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1174                                tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1175
1176    int64_t permute_dim[] = { 0, 2, 1 };
1177    if (src1->type != dst->type) {
1178        aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor.get(), permute_dim, 3);
1179    } else {
1180        aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor.get(), permute_dim, 3);
1181    }
1182
1183    // number of times the kernel moves in W dimension
1184    const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
1185    size_t    offset;
1186    void *    cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
1187
    // memory copy with offsets to restore the 1D im2col result from the 2D one
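    // Note: with the forced 2D im2col (p1 = 1), the first n_step_w output
    // positions appear to come from the padded top row, so the copies below
    // start at an offset of IC * KH * KW * n_step_w elements into the
    // permuted buffer.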
1189    if (IC > 1) {
1190        offset          = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
1191        size_t cpy_size = KH * KW * ggml_type_size(dst->type);
1192
1193        for (int c = 0; c < IC; c++) {
1194            cur_permute_buffer = (char *) tmp_permute_buffer + offset + KH * KW * c * ggml_type_size(dst->type);
1195            cur_dst_buffer     = (char *) dst->data + c * KH * KW * n_step_w * ggml_type_size(dst->type);
1196
1197            for (int i = 0; i < n_step_w; i++) {
1198                ACL_CHECK(aclrtMemcpyAsync(cur_dst_buffer, cpy_size, cur_permute_buffer, cpy_size,
1199                                           ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
1200                cur_dst_buffer     = (char *) cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
1201                cur_permute_buffer = (char *) cur_permute_buffer + KH * KW * IC * ggml_type_size(dst->type);
1202            }
1203        }
1204    } else {
1205        offset = KH * KW * n_step_w * ggml_type_size(dst->type);  // equal to ggml_nbytes(dst)
1206        ACL_CHECK(aclrtMemcpyAsync(dst->data, offset, (char *) tmp_permute_buffer + offset, offset,
1207                                   ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
1208    }
1209}
1210
1211void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1212    ggml_tensor * src0 = dst->src[0];  // kernel
1213    ggml_tensor * src1 = dst->src[1];  // input
1214
1215    GGML_TENSOR_BINARY_OP_LOCALS;
1216
    // aclnnIm2col only works on 2D. Set s1, p1, d1 to 1 to perform 2D
1218    // im2col and do post-processing to restore it to 1D.
1219    const bool    is_2D = ((const int32_t *) (dst->op_params))[6] == 1;
1220    const int32_t s0    = ((const int32_t *) (dst->op_params))[0];
1221    const int32_t s1    = is_2D ? ((const int32_t *) (dst->op_params))[1] : 1;
1222    const int32_t p0    = ((const int32_t *) (dst->op_params))[2];
1223    const int32_t p1    = is_2D ? ((const int32_t *) (dst->op_params))[3] : 1;
1224    const int32_t d0    = ((const int32_t *) (dst->op_params))[4];
1225    const int32_t d1    = is_2D ? ((const int32_t *) (dst->op_params))[5] : 1;
1226
1227    const int64_t N  = ne13;
1228    const int64_t IC = ne12;
1229    const int64_t KH = ne01;
1230    const int64_t KW = ne00;
1231    const int64_t IW = ne10;
1232
1233    const int64_t OH = is_2D ? ne2 : 1;
1234    const int64_t OW = ne1;
1235
    // memory allocation is increased to 3x when is_2D == false
1237    const int64_t n_bytes_factor = is_2D ? 1 : 3;
1238
1239    // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
1240    acl_tensor_ptr acl_src1        = ggml_cann_create_tensor(src1);
1241    int64_t        tmp_im2col_ne[] = { OW * OH * n_bytes_factor, IC * KH * KW, N };
1242    size_t         tmp_im2col_nb[GGML_MAX_DIMS - 1];
1243
1244    tmp_im2col_nb[0] = ggml_type_size(src1->type);
1245    for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1246        tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1];
1247    }
1248
    // Calculate im2col.
    // The result stays in src1's type, so even if dst is f16 the temporary
    // buffer is f32: allocate src1's element size times dst's element count.
1252    ggml_cann_pool_alloc im2col_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
1253    void *               tmp_im2col_buffer = im2col_allocator.get();
1254
1255    acl_tensor_ptr tmp_im2col_tensor =
1256        ggml_cann_create_tensor(tmp_im2col_buffer, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
1257                                tmp_im2col_ne, tmp_im2col_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1258
1259    std::vector<int64_t> kernel_dims   = { KH, KW };
1260    std::vector<int64_t> dilation_size = { d1, d0 };
1261    std::vector<int64_t> padding_dims  = { p1, p0 };
1262    std::vector<int64_t> stride_dims   = { s1, s0 };
1263    acl_int_array_ptr    kernel_size   = ggml_cann_create_int_array(kernel_dims.data(), 2);
1264    acl_int_array_ptr    dilations     = ggml_cann_create_int_array(dilation_size.data(), 2);
1265    acl_int_array_ptr    paddings      = ggml_cann_create_int_array(padding_dims.data(), 2);
1266    acl_int_array_ptr    strides       = ggml_cann_create_int_array(stride_dims.data(), 2);
1267    GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1.get(), kernel_size.get(), dilations.get(), paddings.get(),
1268                            strides.get(), tmp_im2col_tensor.get());
1269
1270    // Cast if dst is f16.
1271    acl_tensor_ptr       tmp_cast_tensor;
1272    ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
1273    void *               tmp_cast_buffer = nullptr;
1274    if (src1->type != dst->type) {
1275        tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
1276        tmp_cast_buffer = tmp_cast_allocator.get();
1277        size_t temp_cast_nb[GGML_MAX_DIMS - 1];
1278        temp_cast_nb[0] = ggml_type_size(dst->type);
1279        for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1280            temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
1281        }
1282
1283        tmp_cast_tensor =
1284            ggml_cann_create_tensor(tmp_cast_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1285                                    tmp_im2col_ne, temp_cast_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1286        aclnn_cast(ctx, tmp_im2col_tensor.get(), tmp_cast_tensor.get(), ggml_cann_type_mapping(dst->type));
1287    }
1288
1289    // post-processing
1290    if (is_2D) {
1291        ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor.get(), tmp_im2col_tensor.get());
1292    } else {
1293        std::vector<int64_t> im2col_op_params = { KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor };
1294        ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor.get(), tmp_im2col_tensor.get(),
1295                                         im2col_op_params);
1296    }
1297}
1298
1299/**
1300 * @brief Applies element-wise exponential function to the elements of a tensor.
1301 *
1302 * This function computes the exponential of each element in the source tensor
1303 * `acl_src` and stores the result back into the same tensor.
1304 * The operation is defined as:
1305 * \f[
1306 *     \text {acl_src }_i=e^{acl\_src_i}
1307 * \f]
1308 *
1309 * @param ctx The context for the CANN backend operations.
1310 * @param acl_src The tensor on which the exponential function will be applied.
1311 */
1312static void aclnn_exp(ggml_backend_cann_context & ctx, aclTensor * acl_src) {
1313    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceExp, acl_src);
1314}
1315
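/**
 * @brief Applies the cosine function to a tensor.
 *
 * Computes the element-wise cosine of `acl_src`. If `acl_dst` is nullptr the
 * operation is performed in place on `acl_src`; otherwise the result is
 * written to `acl_dst`.
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src The source tensor.
 * @param acl_dst The destination tensor, or nullptr for in-place operation.
 */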
1316void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
1317    if (acl_dst == nullptr) {
1318        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCos, acl_src);
1319    } else {
1320        GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);
1321    }
1322}
1323
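/**
 * @brief Applies the sine function to a tensor.
 *
 * Computes the element-wise sine of `acl_src`. If `acl_dst` is nullptr the
 * operation is performed in place on `acl_src`; otherwise the result is
 * written to `acl_dst`.
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src The source tensor.
 * @param acl_dst The destination tensor, or nullptr for in-place operation.
 */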
1324void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
1325    if (acl_dst == nullptr) {
1326        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSin, acl_src);
1327    } else {
1328        GGML_CANN_CALL_ACLNN_OP(ctx, Sin, acl_src, acl_dst);
1329    }
1330}
1331
1332void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1333    const ggml_tensor * src = dst->src[0];
1334
1335    GGML_ASSERT(src->type == GGML_TYPE_F32);
1336    GGML_ASSERT(dst->type == GGML_TYPE_F32);
1337
1338    const int dim        = dst->op_params[0];
1339    const int max_period = dst->op_params[1];
1340    int       half       = dim / 2;
1341
1342    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
1343
1344    // arange: [0, ..., half)
1345    float   start             = 0;
1346    float   stop              = half;
1347    float   step              = 1;
1348    int64_t n_elements_arange = half;
1349    int64_t tmp_arange_ne[]   = { half };
    size_t  tmp_arange_nb[]   = { ggml_type_size(dst->type) };

    ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * ggml_type_size(dst->type));
1353    void *               tmp_arange_buffer = arange_allocator.get();
1354    acl_tensor_ptr       tmp_arange_tensor =
1355        ggml_cann_create_tensor(tmp_arange_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1356                                tmp_arange_ne, tmp_arange_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1357
1358    aclnn_arange(ctx, tmp_arange_tensor.get(), start, stop, step, n_elements_arange);
1359
    // freq: freq[i] = exp(-log(max_period) * i / half)
1361    float freq_param = -logf(max_period) / half;
1362    bool  inplace    = true;
1363    aclnn_muls(ctx, tmp_arange_tensor.get(), freq_param, nullptr, inplace);
1364    aclnn_exp(ctx, tmp_arange_tensor.get());
1365
1366    // permute: src [0,1,2,3]->[0,1,3,2]
1367    int64_t tmp_permute_ne[] = { src->ne[1], src->ne[0], src->ne[2], src->ne[3] };
1368    size_t  tmp_permute_nb[GGML_MAX_DIMS];
1369    tmp_permute_nb[0] = ggml_type_size(src->type);
1370    for (int i = 1; i < GGML_MAX_DIMS; i++) {
1371        tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
1372    }
1373
1374    ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
1375    void *               tmp_permute_buffer = permute_allocator.get();
1376    acl_tensor_ptr       tmp_permute_tensor =
1377        ggml_cann_create_tensor(tmp_permute_buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type),
1378                                tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
1379    int64_t permute_dim[] = { 0, 1, 3, 2 };
1380    int64_t num_dims      = 4;
1381    aclnn_permute(ctx, acl_src.get(), tmp_permute_tensor.get(), permute_dim, num_dims);
1382
1383    // timestep * freq
1384    int64_t tmp_mul_ne[] = { src->ne[1] * half, src->ne[0], src->ne[2], src->ne[3] };
1385    size_t  tmp_mul_nb[GGML_MAX_DIMS];
1386    tmp_mul_nb[0] = ggml_type_size(src->type);
1387    for (int i = 1; i < GGML_MAX_DIMS; i++) {
1388        tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1];
1389    }
1390
1391    int mul_nelements = src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
1392
1393    ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
1394    void *               tmp_mul_buffer = mul_allocator.get();
1395    acl_tensor_ptr       tmp_mul_tensor =
1396        ggml_cann_create_tensor(tmp_mul_buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type),
1397                                tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
1398    aclnn_mul(ctx, tmp_permute_tensor.get(), tmp_arange_tensor.get(), tmp_mul_tensor.get());
1399
1400    // cos
1401    ggml_cann_pool_alloc cos_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
1402    void *               tmp_cos_buffer = cos_allocator.get();
1403    acl_tensor_ptr       tmp_cos_tensor =
1404        ggml_cann_create_tensor(tmp_cos_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1405                                tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
1406
1407    aclnn_cos(ctx, tmp_mul_tensor.get(), tmp_cos_tensor.get());
1408
1409    // sin
1410    ggml_cann_pool_alloc sin_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
1411    void *               tmp_sin_buffer = sin_allocator.get();
1412    acl_tensor_ptr       tmp_sin_tensor =
1413        ggml_cann_create_tensor(tmp_sin_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1414                                tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
1415
1416    aclnn_sin(ctx, tmp_mul_tensor.get(), tmp_sin_tensor.get());
1417
1418    // concat
1419    int64_t             concat_dim  = 3;
1420    acl_tensor_ptr      acl_dst     = ggml_cann_create_tensor(dst);
1421    acl_tensor_list_ptr tensor_list = ggml_cann_create_tensor_list(tmp_cos_tensor, tmp_sin_tensor);
1422    aclnn_concat(ctx, tensor_list.get(), acl_dst.get(), concat_dim);
1423}
1424
1425/**
1426 * @brief Raises each element of a tensor to the power of the corresponding
1427 * element in another tensor.
1428 *
1429 * This function computes the element-wise power of the destination tensor
1430 * `acl_dst` raised to the power of the exponent tensor `acl_exp`.
1431 * The operation is defined as:
1432 * \f[
1433 *     \text {acl_dst }_i=acl\_dst_i^{\text {acl_exp }_i}
1434 * \f]
1435 *
1436 * @param ctx The context for the CANN backend operations.
1437 * @param acl_dst The destination tensor, which also serves as the base tensor.
1438 * @param acl_exp The exponent tensor, each element of which is used to raise
1439 * the corresponding element in the destination tensor.
1440 */
1441static void aclnn_pow_tensor_tensor(ggml_backend_cann_context & ctx, aclTensor * acl_dst, aclTensor * acl_exp) {
1442    GGML_CANN_CALL_ACLNN_OP(ctx, InplacePowTensorTensor, acl_dst, acl_exp);
1443}
1444
1445/**
1446 * @brief Generate a range of values and apply a scalar base exponentiation.
1447 *
1448 * This function creates an evenly spaced sequence from `start` to `stop` (exclusive),
1449 * with step size `step`, stores it in a temporary buffer, and then computes:
1450 *
1451 * @f[
1452 * slope[i] = m^{\left( start + i \cdot step \right)}, \quad 0 \le i < size
1453 * @f]
1454 *
1455 * The results are written to the provided @p slope_buffer.
1456 *
1457 * @param ctx           CANN backend context for memory allocation and operator execution.
1458 * @param slope_buffer  Pointer to the output buffer (float array) for the computed slope values.
1459 * @param m             Scalar base for the exponentiation.
1460 * @param size          Number of elements in the generated sequence.
1461 * @param start         Starting exponent offset.
1462 * @param stop          Stopping exponent offset (exclusive).
1463 * @param step          Step size for the exponent increment.
1464 * @param dtype         Data type for slope tensor.
1465 */
1466static void aclnn_get_slope_inner(ggml_backend_cann_context & ctx,
1467                                  void *                      slope_buffer,
1468                                  float                       m,
1469                                  int64_t                     size,
1470                                  float                       start,
1471                                  float                       stop,
1472                                  float                       step,
1473                                  ggml_type                   dtype) {
1474    aclDataType acl_type  = ggml_cann_type_mapping(dtype);
1475    size_t      type_size = ggml_type_size(dtype);
1476
1477    int64_t ne[] = { size };
1478    size_t  nb[] = { type_size };
1479
1480    ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * type_size);
1481    void *               arange_buffer = arange_allocator.get();
1482
1483    acl_tensor_ptr arange_tensor = ggml_cann_create_tensor(arange_buffer, acl_type, type_size, ne, nb, 1);
1484    aclnn_arange(ctx, arange_tensor.get(), start, stop, step, size);
1485
1486    acl_tensor_ptr slope_tensor = ggml_cann_create_tensor(slope_buffer, acl_type, type_size, ne, nb, 1);
1487
1488    acl_scalar_ptr sc = ggml_cann_create_scalar(&m, aclDataType::ACL_FLOAT);
1489
1490    GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc.get(), arange_tensor.get(), slope_tensor.get());
1491}
1492
1493/**
1494 * @brief Compute slope values for multiple attention heads based on ALiBi bias parameters.
1495 *
1496 * This function generates slope values for each attention head according to the ALiBi
1497 * (Attention with Linear Biases) method. It splits the computation into two ranges depending
1498 * on whether the head index is less than @p n_head_log2 or not, and uses different base values
1499 * (`m0` and `m1`) for the exponentiation.
1500 *
1501 * @f[
1502 * slope[h] =
1503 * \begin{cases}
1504 * m_0^{(h + 1)}, & h < n\_head\_log2 \\
1505 * m_1^{\left( 2 \cdot (h - n\_head\_log2) + 1 \right)}, & h \geq n\_head\_log2
1506 * \end{cases}
1507 * \quad , \quad \text{if } max\_bias > 0
1508 * @f]
1509 *
1510 * If @p max_bias <= 0, all slope values are set to 1.0.
1511 *
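 * For example, with n_head = 8 and max_bias = 8.0: n_head_log2 = 8,
 * m0 = 2^{-1}, and the resulting slopes are 0.5, 0.25, 0.125, ..., 2^{-8}.
 *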
1512 * @param ctx           CANN backend context for memory allocation and operator execution.
1513 * @param n_head        Total number of attention heads.
1514 * @param slope_buffer  Pointer to the output buffer (float array) for storing slopes.
1515 * @param max_bias      Maximum bias value for slope computation.
1516 * @param dtype         Data type for slope tensor.
1517 *
1518*/
1519static void aclnn_get_slope(ggml_backend_cann_context & ctx,
1520                            int64_t                     n_head,
1521                            void *                      slope_buffer,
1522                            float                       max_bias,
1523                            ggml_type                   dtype) {
1524    const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
1525
1526    float m0 = powf(2.0f, -(max_bias) / n_head_log2);
1527    float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
1528
1529    // const float slope = (max_bias > 0.0f) ?
1530    //                          h < n_head_log2 ?
1531    //                              powf(m0, h + 1) :
1532    //                              powf(m1, 2*(h - n_head_log2) + 1) :
1533    //                          1.0f;
1534    // arange1
1535    float start = 0 + 1;
1536    float end   = (n_head_log2 - 1) + 1;
1537    float step  = 1;
1538    float count = n_head_log2;
1539    // end needs to be +1 because aclnn uses a left-closed, right-open interval.
1540    aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step, dtype);
1541    if (n_head_log2 < n_head) {
1542        // arange2
1543        start = 2 * (n_head_log2 - n_head_log2) + 1;
1544        end   = 2 * ((n_head - 1) - n_head_log2) + 1;
1545        step  = 2;
1546        count = n_head - n_head_log2;
1547        aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * sizeof(float), m1, count, start, end + 1, step,
1548                              dtype);
1549    }
1550}
1551
1552/**
1553 * @brief Add ALiBi (Attention with Linear Biases) positional biases to the attention mask.
1554 *
1555 * This function computes the ALiBi slopes for each attention head (if max_bias > 0),
1556 * multiplies them with the attention mask to produce bias tensors, and adds these biases
1557 * to the destination tensor (@p dst).
1558 *
1559 * The function performs necessary broadcasting of the mask and slope tensors to match
1560 * the shape of the destination tensor, then applies element-wise multiplication and addition
1561 * using CANN operators.
1562 *
1563 * @param ctx         CANN backend context for memory management and operator execution.
1564 * @param mask        Input attention mask tensor, assumed to be contiguous.
1565 * @param dst         Destination tensor to which ALiBi biases will be added.
1566 * @param dst_ptr     Pointer to the memory of the destination tensor.
1567 * @param max_bias    Maximum bias value controlling the slope scaling.
1568 *
1569 * @note
1570 * - Write data into dst_ptr using only the shape information of the dst tensor.
1571 * - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting.
1572 */
1573static void aclnn_add_alibi(ggml_backend_cann_context & ctx,
1574                            ggml_tensor *               mask,
1575                            ggml_tensor *               dst,
1576                            void *                      dst_ptr,
1577                            float                       max_bias) {
1578    void * slope_buffer = nullptr;
1579    void * bias_buffer  = nullptr;
1580
1581    if (max_bias > 0.0f) {
1582        int64_t              n_heads = dst->ne[2];
1583        ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
1584        slope_buffer = slope_allocator.get();
1585        ggml_cann_pool_alloc bias_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
1586        bias_buffer = bias_allocator.get();
1587        aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias, GGML_TYPE_F32);
1588    }
1589
    // broadcast shapes for mask, slope and dst
1591    int64_t nr2 = dst->ne[2] / mask->ne[2];
1592    int64_t nr3 = dst->ne[3] / mask->ne[3];
1593
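    // View the tensors with GGML_MAX_DIMS + 2 = 6 dims: ne[2] and ne[3] are
    // each split into a (mask dim, repeat factor) pair so that the mask and
    // slope broadcast against dst.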
1594    // broadcast the mask across rows
1595    int64_t mask_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], 1, mask->ne[3], 1 };
    size_t  mask_nb[] = { mask->nb[0], mask->nb[1], mask->nb[2],
                          mask->nb[2], mask->nb[3], mask->nb[3] };
1598
1599    int64_t dst_ne[] = { dst->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], nr3 };
    size_t  dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[2],
                         dst->nb[2], dst->nb[3], dst->nb[3] };
1602
    // slope is a 1-dim tensor; slope.ne2 == dst.ne2
1604    int64_t slope_ne[] = { 1, 1, mask->ne[2], nr2, 1, 1 };
1605    size_t  slope_nb[GGML_MAX_DIMS + 2];
1606    slope_nb[0] = sizeof(float);
1607    for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
1608        slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
1609    }
1610
1611    acl_tensor_ptr acl_slope =
1612        ggml_cann_create_tensor(slope_buffer, ACL_FLOAT, sizeof(float), slope_ne, slope_nb, GGML_MAX_DIMS + 2);
1613    acl_tensor_ptr acl_mask = ggml_cann_create_tensor(mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
1614
1615    // write data into dst_ptr using only the shape information of the dst tensor.
1616    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst_ptr, ggml_cann_type_mapping(dst->type),
1617                                                     ggml_type_size(dst->type), dst_ne, dst_nb, GGML_MAX_DIMS + 2);
1618
1619    if (max_bias > 0.0f) {
1620        int64_t bias_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], 1 };
1621        size_t  bias_nb[GGML_MAX_DIMS + 2];
1622        bias_nb[0] = sizeof(float);
1623        for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
1624            bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1];
1625        }
1626        acl_tensor_ptr bias_tensor =
1627            ggml_cann_create_tensor(bias_buffer, ACL_FLOAT, sizeof(float), bias_ne, bias_nb, GGML_MAX_DIMS + 2);
1628
1629        aclnn_mul(ctx, acl_slope.get(), acl_mask.get(), bias_tensor.get());
1630        aclnn_add(ctx, acl_dst.get(), bias_tensor.get());
1631    } else {
1632        aclnn_add(ctx, acl_dst.get(), acl_mask.get());
1633    }
1634}
1635
1636void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1637    ggml_cann_dup(ctx, dst);
1638}
1639
1640/**
1641 * @brief Applies the softmax function to a tensor along a specified dimension.
1642 *
1643 * This function computes the softmax of the source tensor `acl_src` along the
1644 * specified dimension `dim` and stores the result in the destination tensor
1645 * `acl_dst`.
1646 *
1647 * @param ctx The context for the CANN backend operations.
1648 * @param acl_src The source tensor on which the softmax function will be
1649 * applied.
1650 * @param dim The dimension along which the softmax function will be computed.
1651 * @param acl_dst The destination tensor where the softmax results will be
1652 * stored.
1653 */
1654static void aclnn_softmax(ggml_backend_cann_context & ctx, aclTensor * acl_src, int64_t dim, aclTensor * acl_dst) {
1655    GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
1656}
1657
1658void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1659    ggml_tensor * src0 = dst->src[0];
1660    ggml_tensor * src1 = dst->src[1];  // mask
1661
1662    acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
1663    acl_tensor_ptr acl_dst  = ggml_cann_create_tensor(dst);
1664
1665    float scale    = 1.0f;
1666    float max_bias = 0.0f;
1667
1668    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
1669    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
1670
    // multiply the input by scale
1672    acl_scalar_ptr       acl_scale = ggml_cann_create_scalar(&scale, aclDataType::ACL_FLOAT);
1673    ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0));
1674    void *               src_tensor_buffer = src_tensor_allocator.get();
1675    acl_tensor_ptr       softmax_tensor = ggml_cann_create_tensor(src_tensor_buffer, ggml_cann_type_mapping(src0->type),
1676                                                                  ggml_element_size(src0), src0->ne, src0->nb, GGML_MAX_DIMS);
1677
1678    aclnn_muls(ctx, acl_src0.get(), scale, softmax_tensor.get(), false);
1679
1680    // mask
1681    if (src1) {
1682        aclnn_add_alibi(ctx, src1, src0, src_tensor_buffer, max_bias);
1683    }
1684    // softmax
1685    aclnn_softmax(ctx, softmax_tensor.get(), 3, acl_dst.get());
1686}
1687
1688/**
1689 * @brief Performs index select operation on a 4D tensor using the CANN backend.
1690 *
1691 * This function applies the `IndexSelect` operation along a specific dimension
1692 * of the source tensor (`src_buffer`) using the indices from the index tensor (`index`).
1693 * It iterates over the last two dimensions of the source tensor, creates the corresponding
1694 * CANN tensors for the source, index, and output slices, and executes the `IndexSelect`
1695 * operation for each slice.
1696 *
1697 * @param ctx The context for CANN backend operations.
1698 * @param src_buffer The source buffer containing the 4D input tensor data.
1699 * @param src_ne The dimensions of the source tensor.
1700 * @param src_nb The strides (byte offsets) of the source tensor.
1701 * @param dst_buffer The destination buffer where the output tensor data will be written.
1702 * @param dst_ne The dimensions of the destination tensor.
1703 * @param dst_nb The strides (byte offsets) of the destination tensor.
1704 * @param index The index tensor specifying the indices to select from the source tensor.
1705 * @param type The data type of the source and destination tensors.
1706 */
1707static void aclnn_index_select_4d(ggml_backend_cann_context & ctx,
1708                                  void *                      src_buffer,
1709                                  int64_t *                   src_ne,
1710                                  size_t *                    src_nb,
1711                                  void *                      dst_buffer,
1712                                  int64_t *                   dst_ne,
1713                                  size_t *                    dst_nb,
1714                                  ggml_tensor *               index,
1715                                  ggml_type                   type) {
1716    for (int64_t i = 0; i < src_ne[3]; i++) {
1717        for (int64_t j = 0; j < src_ne[2]; j++) {
1718            // src
1719            acl_tensor_ptr acl_src_tensor =
1720                ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
1721                                        ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
1722
1723            // index
1724            acl_tensor_ptr acl_index = ggml_cann_create_tensor(
1725                (char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
1726                ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
1727
1728            // out
1729            acl_tensor_ptr acl_out =
1730                ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
1731                                        ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
1732            GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor.get(), 0, acl_index.get(), acl_out.get());
1733        }
1734    }
1735}
1736
1737/**
1738 * @brief Performs inplace index copy operation on a 4D tensor using the CANN backend.
1739 *
1740 * This function applies the `IndexCopy` operation along a specific dimension of the
1741 * destination tensor (`dst_buffer`) by copying elements from the source tensor (`src_buffer`)
1742 * to positions specified by the index tensor (`index`).
1743 * It iterates over the last two dimensions of the tensors, creates the corresponding
1744 * CANN tensors for source, index, and destination slices, and performs the index copy
1745 * operation for each slice.
1746 *
1747 * @param ctx The context for CANN backend operations.
1748 * @param src_buffer The source buffer containing the 4D input tensor data to be copied.
1749 * @param src_ne The dimensions of the source tensor.
1750 * @param src_nb The strides (byte offsets) of the source tensor.
1751 * @param dst_buffer The destination buffer where values will be copied to.
1752 * @param dst_ne The dimensions of the destination tensor.
1753 * @param dst_nb The strides (byte offsets) of the destination tensor.
1754 * @param index The index tensor specifying target positions in the destination tensor.
1755 * @param type The data type of the source and destination tensors.
1756 */
1757static void aclnn_index_copy_4d(ggml_backend_cann_context & ctx,
1758                                void *                      src_buffer,
1759                                int64_t *                   src_ne,
1760                                size_t *                    src_nb,
1761                                void *                      dst_buffer,
1762                                int64_t *                   dst_ne,
1763                                size_t *                    dst_nb,
1764                                ggml_tensor *               index,
1765                                ggml_type                   type) {
1766    for (int64_t i = 0; i < src_ne[3]; i++) {
1767        for (int64_t j = 0; j < src_ne[2]; j++) {
1768            // src
1769            acl_tensor_ptr acl_src_tensor =
1770                ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
1771                                        ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
1772
1773            // index
1774            acl_tensor_ptr acl_index = ggml_cann_create_tensor(
1775                (char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
1776                ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
1777
1778            // out
1779            acl_tensor_ptr acl_out =
1780                ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
1781                                        ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
1782            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out.get(), 0, acl_index.get(), acl_src_tensor.get());
1783        }
1784    }
1785}
1786
1787void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1788    ggml_tensor * src0 = dst->src[0];  // src
1789    ggml_tensor * src1 = dst->src[1];  // index
1790
1791    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1792
1793    switch (src0->type) {
1794        case GGML_TYPE_F16:
1795        case GGML_TYPE_F32:
1796            if (src0->type == dst->type) {
1797                aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1,
1798                                      dst->type);
1799            } else {
1800                acl_tensor_ptr       acl_src0 = ggml_cann_create_tensor(src0);
1801                ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
1802                void *               src_trans_buffer = src_buffer_allocator.get();
1803                size_t               src_trans_nb[GGML_MAX_DIMS];
1804                src_trans_nb[0] = dst->nb[0];
1805                for (int i = 1; i < GGML_MAX_DIMS; i++) {
1806                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
1807                }
1808                acl_tensor_ptr src_trans_tensor =
1809                    ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(dst->type),
1810                                            ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
1811                aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
1812                aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
1813                                      dst->type);
1814            }
1815            break;
1816        case GGML_TYPE_Q8_0:
1817            {
                // Dequantize Q8_0: view the weights as [..., n_blocks, QK8_0] and the
                // scales as [..., n_blocks, 1], then add 1 dim and broadcast-multiply.
1819                size_t  weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], dequant_nb[GGML_MAX_DIMS + 1];
1820                int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], *dequant_ne;
1821                int64_t scale_offset = 0;
1822                // [3,4,5,64] -> [3,4,5,2,32]
1823                weight_ne[0]         = QK8_0;
1824                weight_ne[1]         = src0->ne[0] / QK8_0;
1825                weight_nb[0]         = sizeof(int8_t);
1826                weight_nb[1]         = weight_nb[0] * weight_ne[0];
1827                for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
1828                    weight_ne[i] = src0->ne[i - 1];
1829                    weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
1830                }
1831                // [3,4,5,64] -> [3,4,5,2,1]
1832                scale_ne[0] = 1;
1833                scale_ne[1] = src0->ne[0] / QK8_0;
1834                scale_nb[0] = sizeof(uint16_t);
1835                scale_nb[1] = scale_nb[0] * scale_ne[0];
1836                for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
1837                    scale_ne[i] = src0->ne[i - 1];
1838                    scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
1839                }
1840                // [3,4,5,64] -> [3,4,5,2,32]
1841                dequant_ne    = weight_ne;
1842                dequant_nb[0] = ggml_type_size(dst->type);
1843                for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
1844                    dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
1845                }
1846                scale_offset = ggml_nelements(src0) * sizeof(int8_t);
1847                ggml_cann_pool_alloc dequant_buffer_allocator(ctx.pool(),
1848                                                              ggml_nelements(src0) * ggml_type_size(dst->type));
1849                acl_tensor_ptr       acl_weight_tensor = ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t),
1850                                                                                 weight_ne, weight_nb, GGML_MAX_DIMS + 1);
1851                acl_tensor_ptr       acl_scale_tensor =
1852                    ggml_cann_create_tensor(src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
1853                                            GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
1854                acl_tensor_ptr dequant_tensor =
1855                    ggml_cann_create_tensor(dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type),
1856                                            ggml_type_size(dst->type), dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
1857                aclnn_mul(ctx, acl_weight_tensor.get(), acl_scale_tensor.get(), dequant_tensor.get());
1858                dequant_nb[0] = ggml_type_size(dst->type);
1859                dequant_ne    = src0->ne;
1860                for (int i = 1; i < GGML_MAX_DIMS; i++) {
1861                    dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
1862                }
1863                aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(), dequant_ne, dequant_nb, dst->data, dst->ne,
1864                                      dst->nb, src1, dst->type);
1865                break;
1866            }
1867        default:
1868            GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
1869            break;
1870    }
1871}
1872
1873void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1874    ggml_tensor * src0 = dst->src[0];  // src
1875    ggml_tensor * src1 = dst->src[1];  // index
1876
1877    switch (dst->type) {
1878        case GGML_TYPE_F32:
1879            {
1880                aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1, dst->type);
1881                break;
1882            }
1883        case GGML_TYPE_F16:
1884            {
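                // cast src0 to the destination type (f16) before the index
                // copy so the source and destination element types match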
1885                acl_tensor_ptr       acl_src0 = ggml_cann_create_tensor(src0);
1886                ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
1887                void *               src_trans_buffer = src_buffer_allocator.get();
1888                size_t               src_trans_nb[GGML_MAX_DIMS];
1889                src_trans_nb[0] = sizeof(uint16_t);
1890                for (int i = 1; i < GGML_MAX_DIMS; i++) {
1891                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
1892                }
1893                acl_tensor_ptr src_trans_tensor = ggml_cann_create_tensor(
1894                    src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
1895                aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
1896                aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
1897                                    dst->type);
1898                break;
1899            }
1900        default:
1901            GGML_ABORT("Unsupported tensor type for GGML_OP_SET_ROWS");
1902            break;
1903    }
1904}
1905
1906/**
1907 * @brief Repeats elements of a tensor along a specified dimension.
1908 *
1909 * This function repeats each element of the source tensor `acl_src` a specified
1910 * number of times (`repeats`) along the specified dimension `dim` and stores
1911 * the result in the destination tensor `acl_dst`.
1912 *
1913 * @param ctx The context for the CANN backend operations.
1914 * @param acl_src The source tensor whose elements will be repeated.
1915 * @param acl_dst The destination tensor where the repeated elements will be
1916 * stored.
1917 * @param dim The dimension along which the elements will be repeated.
1918 * @param repeats The number of times each element will be repeated.
1919 * @param output_size The size of the output tensor.
1920 */
1921static void aclnn_repeat_interleave(ggml_backend_cann_context & ctx,
1922                                    aclTensor *                 acl_src,
1923                                    aclTensor *                 acl_dst,
1924                                    int64_t                     dim,
1925                                    int64_t                     repeats,
1926                                    int64_t                     output_size) {
1927    GGML_CANN_CALL_ACLNN_OP(ctx, RepeatInterleaveIntWithDim, acl_src, repeats, dim, output_size, acl_dst);
1928}
1929
1930/**
1931 * @brief Performs matrix multiplication with floating-point precision on
1932 * tensors using the CANN backend.
1933 *
1934 * This function performs matrix multiplication of the input tensor and the
1935 * weight tensor, handling broadcasting and transposing as needed, and stores
1936 * the result in the destination tensor `dst`.
1937 *
1938 * @param ctx The context for the CANN backend operations.
1939 * @param dst The destination tensor where the result of the matrix
1940 * multiplication will be stored.
1941 */
1942static void ggml_cann_mat_mul_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1943    ggml_tensor * weight = dst->src[0];  // weight
1944    ggml_tensor * input  = dst->src[1];  // input
1945
    // When weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize broadcasts
    // automatically; otherwise the weight needs to be repeated.
1948    BCAST_MUL_MAT_SHAPE(input, weight, dst);
1949
1950    int64_t n_dims = bcast_dims;
1951    if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
1952        if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
1953            n_dims = 2;
1954        } else if (bcast_input_ne[2] == 1) {
1955            n_dims = 3;
1956        }
1957    }
1958
1959    acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
1960    int64_t        transpose_ne[]   = { bcast_weight_ne[1], bcast_weight_ne[0], bcast_weight_ne[2],
1961                                        bcast_weight_ne[3], bcast_weight_ne[4], bcast_weight_ne[5] };
1962    size_t         transpose_nb[]   = { bcast_weight_nb[1], bcast_weight_nb[0], bcast_weight_nb[2],
1963                                        bcast_weight_nb[3], bcast_weight_nb[4], bcast_weight_nb[5] };
1964    acl_tensor_ptr acl_weight_tensor;
1965
1966    // Only check env once.
1967    static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
1968    if (weight_to_nz && is_matmul_weight(weight)) {
1969        acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
1970    } else {
1971        acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
1972    }
1973    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
1974
1975    switch (n_dims) {
1976        case 2:
1977            GGML_CANN_CALL_ACLNN_OP(ctx, Mm, acl_input_tensor.get(), acl_weight_tensor.get(), acl_dst.get(), 2);
1978            break;
1979        case 3:
1980            GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, acl_input_tensor.get(), acl_weight_tensor.get(), acl_dst.get(),
1981                                    2);
1982            break;
1983        default:
            // ALLOW_FP32_DOWN_PRECISION: when the input is fp32,
            // Atlas A2 will convert it to HFLOAT32.
1986            GGML_CANN_CALL_ACLNN_OP(ctx, Matmul, acl_input_tensor.get(), acl_weight_tensor.get(), acl_dst.get(), 1);
1987            break;
1988    }
1989}
1990
1991/**
1992 * @brief Performs matrix multiplication with quantized weights and
1993 * floating-point inputs using the CANN backend.
1994 *
1995 * This function performs matrix multiplication of the input tensor `src1` and
1996 * the weight tensor `src0`, handling broadcasting, transposing, and
1997 * quantization as needed, and stores the result in the destination tensor
1998 * `dst`.
1999 *
2000 * @param ctx The context for the CANN backend operations.
2001 * @param dst The destination tensor where the result of the matrix
2002 * multiplication will be stored.
2003 */
2004static void ggml_cann_mul_mat_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst, const enum ggml_type type) {
2005    ggml_tensor * src0 = dst->src[0];  // weight
2006    ggml_tensor * src1 = dst->src[1];  // input
2007
    // The shape of the weight is NCHW.
    // Matrix multiplication uses the HW dims;
    // NC is regarded as the batch.
    // The weight needs to be transposed.
2012    float weight_elem_size;
2013    if (type == GGML_TYPE_Q4_0) {
2014        weight_elem_size = float(sizeof(uint8_t)) / 2;
2015    } else if (type == GGML_TYPE_Q8_0) {
2016        weight_elem_size = float(sizeof(uint8_t));
2017    } else {
2018        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
2019    }
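    // For Q4_0 each element occupies half a byte, so element sizes and
    // strides are tracked as floats here.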
2020    float  weight_nb[]   = { src0->ne[0] * weight_elem_size, weight_elem_size };
2021    size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
2022    size_t weight_size   = weight_stride * src0->ne[2] * src0->ne[3];
2023
    // Scales are stored at the end of the weights and also need to be transposed.
2025    size_t scale_elem_size = sizeof(uint16_t);
2026    size_t scale_nb[]      = { src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size };
2027    size_t scale_stride    = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
2028    char * scale_offset    = (char *) src0->data + weight_size;
2029
2030    // input
2031    size_t               input_elem_size = sizeof(uint16_t);
2032    int64_t              input_ne[]      = { src1->ne[0], src1->ne[1] };
2033    size_t               input_nb[]      = { input_elem_size, input_ne[0] * input_elem_size };
2034    size_t               input_stride    = input_ne[0] * input_ne[1] * input_elem_size;
2035    ggml_cann_pool_alloc input_alloctor(ctx.pool());
2036    void *               input_buffer = src1->data;
2037
    // cast input to f16 if needed
2039    if (src1->type != GGML_TYPE_F16) {
2040        acl_tensor_ptr acl_src1_tensor = ggml_cann_create_tensor(src1);
2041        input_buffer                   = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
2042
2043        int64_t * input_cast_ne = src1->ne;
2044        size_t    input_cast_nb[GGML_MAX_DIMS];
2045        input_cast_nb[0] = sizeof(uint16_t);
2046        for (int i = 1; i < GGML_MAX_DIMS; i++) {
2047            input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
2048        }
2049
2050        acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(input_buffer, ACL_FLOAT16, input_elem_size,
2051                                                                  input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
2052        aclnn_cast(ctx, acl_src1_tensor.get(), acl_input_tensor.get(), ACL_FLOAT16);
2053    }
2054
2055    // output
2056    size_t               output_elem_size = sizeof(uint16_t);
2057    size_t               output_nb[]      = { output_elem_size, dst->ne[0] * output_elem_size };
2058    ggml_cann_pool_alloc output_allocator(ctx.pool());
2059    void *               output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
2060    size_t               output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
2061
    // aclnn: call WeightQuantBatchMatmulV2 per (n, c) batch, splitting
    // src0->ne[1] into chunks of at most max_elem_size elements
2063    int64_t              max_elem_size = 65535;
2064    int64_t              split_size    = (src0->ne[1] / max_elem_size) + 1;
2065    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
2066    for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
2067        for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
2068            int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
2069            int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
2070
2071            int64_t batch1 = (n1 * src1->ne[2]) + c1;
2072            int64_t batch0 = (n0 * src0->ne[2]) + c0;
2073
2074            acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(
2075                (char *) input_buffer + batch1 * input_stride, ACL_FLOAT16, input_elem_size, input_ne, input_nb, 2);
2076
2077            // first split
2078            int64_t weight_ne_offset = 0;
2079            int64_t weight_ne[2]     = { max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0] };
2080            int64_t scale_ne_offset  = 0;
2081            int64_t scale_ne[2]      = { weight_ne[0], weight_ne[1] / QK8_0 };
2082            int64_t output_ne_offset = 0;
2083            int64_t output_ne[2]     = { weight_ne[0], dst->ne[1] };
2084
2085            acl_tensor_ptr acl_weight_tensor =
2086                ggml_cann_create_tensor((char *) src0->data + batch0 * weight_stride, ggml_cann_type_mapping(type),
2087                                        weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
2088            acl_tensor_ptr acl_scale_tensor =
2089                ggml_cann_create_tensor(scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size, scale_ne,
2090                                        scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset);
2091            acl_tensor_ptr acl_output_tensor =
2092                ggml_cann_create_tensor((char *) output_buffer + batch1 * output_stride, ACL_FLOAT16, output_elem_size,
2093                                        output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset);
2094            int64_t antiquantGroupSize = 0;
2095            if (src0->ne[0] > QK8_0) {
2096                antiquantGroupSize = QK8_0;
2097            }
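            // use per-block (QK8_0) dequantization groups when a row spans
            // more than one quantization block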
2098            GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor.get(), acl_weight_tensor.get(),
2099                                    acl_scale_tensor.get(), nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
2100                                    acl_output_tensor.get());
2101
2102            // other splits
2103            for (int64_t split = 1; split < split_size; split++) {
2104                weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
2105                weight_ne[0] =
2106                    max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
2107                scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
2108                scale_ne[0] = weight_ne[0];
2109                output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
2110                output_ne[0] = weight_ne[0];
2111
2112                acl_weight_tensor =
2113                    ggml_cann_create_tensor((char *) src0->data + batch0 * weight_stride, ggml_cann_type_mapping(type),
2114                                            weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
2115                acl_scale_tensor =
2116                    ggml_cann_create_tensor(scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size,
2117                                            scale_ne, scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset);
2118                acl_output_tensor =
2119                    ggml_cann_create_tensor((char *) output_buffer + batch1 * output_stride, ACL_FLOAT16,
2120                                            output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset);
2121                GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor.get(), acl_weight_tensor.get(),
2122                                        acl_scale_tensor.get(), nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
2123                                        acl_output_tensor.get());
2124            }
2125        }
2126    }
2127
2128    // cast out
2129    if (dst->type != GGML_TYPE_F16) {
2130        int64_t * output_cast_ne = dst->ne;
2131        size_t    output_cast_nb[GGML_MAX_DIMS];
2132        output_cast_nb[0] = sizeof(uint16_t);
2133        for (int i = 1; i < GGML_MAX_DIMS; i++) {
2134            output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
2135        }
2136
2137        acl_tensor_ptr acl_output_tensor = ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
2138                                                                   output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
2139        acl_tensor_ptr acl_dst_tensor    = ggml_cann_create_tensor(dst);
2140        aclnn_cast(ctx, acl_output_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
2141    }
2142}
2143
2144void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2145    const enum ggml_type type = dst->src[0]->type;
2146    switch (type) {
2147        case GGML_TYPE_F32:
2148        case GGML_TYPE_F16:
2149            ggml_cann_mat_mul_fp(ctx, dst);
2150            break;
2151        case GGML_TYPE_Q4_0:
2152        case GGML_TYPE_Q8_0:
2153            ggml_cann_mul_mat_quant(ctx, dst, type);
2154            break;
2155        default:
2156            GGML_ABORT("Unsupported type for mul_mat");
2157            break;
2158    }
2159}
2160
2161/**
2162 * @brief Rolls the elements of a tensor along a specified dimension.
2163 *
2164 * This function rolls the elements of the source tensor `acl_src` by the
2165 * specified shifts `shifts` along the specified dimensions `dims`, and stores
2166 * the result in the destination tensor `acl_dst`.
2167 *
2168 * @param ctx The context for the CANN backend operations.
2169 * @param acl_src The source tensor whose elements will be rolled.
2170 * @param acl_dst The destination tensor where the rolled elements will be
2171 * stored.
2172 * @param shifts An array specifying the number of positions by which elements
2173 * are shifted.
2174 * @param dims An array specifying the dimensions along which elements are
2175 * shifted.
2176 */
2177static void aclnn_roll(ggml_backend_cann_context & ctx,
2178                       aclTensor *                 acl_src,
2179                       aclTensor *                 acl_dst,
2180                       int64_t *                   shifts,
2181                       int64_t *                   dims) {
2182    acl_int_array_ptr acl_shifts = ggml_cann_create_int_array(shifts, 1);
2183    acl_int_array_ptr acl_dims   = ggml_cann_create_int_array(dims, 1);
2184    GGML_CANN_CALL_ACLNN_OP(ctx, Roll, acl_src, acl_shifts.get(), acl_dims.get(), acl_dst);
2185}
2186
2187/**
2188 * @brief Fills specified positions of a tensor with a scalar value.
2189 *
2190 * This function fills the positions in the source tensor `acl_src` specified by
2191 * `index` along the dimension `dim` with the scalar value `value`.
2192 *
2193 * @param ctx The context for the CANN backend operations.
2194 * @param acl_src The source tensor where the positions will be filled.
2195 * @param dim The dimension along which the positions are specified.
2196 * @param index An array specifying the positions to be filled.
2197 * @param index_num The number of positions specified in the index array.
2198 * @param value The scalar value used to fill the specified positions.
2199 */
2200static void aclnn_index_fill_tensor(ggml_backend_cann_context & ctx,
2201                                    aclTensor *                 acl_src,
2202                                    int64_t                     dim,
2203                                    int64_t *                   index,
2204                                    int64_t                     index_num,
2205                                    float                       value) {
2206    acl_int_array_ptr acl_index = ggml_cann_create_int_array(index, index_num);
2207    acl_scalar_ptr    acl_value = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
2208    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexFillTensor, acl_src, dim, acl_index.get(), acl_value.get());
2209}
2210
2211/**
2212 * @brief Initializes and caches all intermediate tensors required for RoPE
2213 *        (Rotary Position Embedding), including support for Yarn, mRoPE,
 *        i-mRoPE, Neox repeat strategy, independent sectors, frequency factors,
2215 *        and multi-section rotary groups.
2216 *
2217 * This function computes and caches the per-dimension ฮธ coefficients used for
2218 * Q/K rotary embedding. The cache is shared across layers, and recomputed only
2219 * when any dependent parameter changes.
2220 *
2221 * The function now supports:
 *   - Yarn RoPE extrapolation (via @p corr_dims and @p ext_factor)
2223 *   - Per-dimension independent sector exponent rules (indep_sects + sections[])
2224 *   - Multi-section RoPE (mRoPE) index mapping (mrope_used + is_imrope)
2225 *   - Frequency factor division (src2)
2226 *   - Neox / normal repeat expansion modes
2227 *
2228 * @param ctx                CANN backend context, containing memory pool,
2229 *                           cached buffers, and runtime stream.
2230 * @param dst                Destination ggml_tensor whose computation
2231 *                           depends on RoPE (typically Qcur or Kcur).
2232 * @param corr_dims          [low, high] Yarn correction range.
2233 * @param ext_factor         Yarn extrapolation strength. 0 = disabled.
 * @param theta_scale        Base multiplier for the per-dimension θ exponent.
2235 * @param freq_scale         Global frequency scaling factor.
2236 * @param attn_factor        Optional scaling applied to sin/cos (if needed).
 * @param is_neox            Whether to use the Neox-style (half-split) dimension pairing.
2238 * @param sections           4-way sector sizes for independent-section RoPE
2239 *                           and multi-section mRoPE (t/h/w/e).
2240 * @param mrope_used         Whether to enable multi-section rotary embedding.
2241 * @param is_imrope          Whether to apply interleaved mRoPE rules.
2242 * @param indep_sects        Whether each dimension runs independent exponent
2243 *                           resets based on @p sections.
 * @param rope_dims          Number of leading head dimensions that are rotated; the remaining
 *                           dimensions are passed through unchanged.
 */
2245static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
2246                                  ggml_tensor *               dst,
2247                                  float *                     corr_dims,
2248                                  float                       ext_factor,
2249                                  float                       theta_scale,
2250                                  float                       freq_scale,
2251                                  float                       attn_factor,
2252                                  bool                        is_neox,
2253                                  int                         sections[4],
2254                                  bool                        mrope_used,
2255                                  bool                        is_imrope,
2256                                  bool                        indep_sects,
2257                                  int64_t                     rope_dims) {
2258    ggml_tensor * src1 = dst->src[1];  // position
2259    ggml_tensor * src2 = dst->src[2];  // freq_factors
2260
2261    int64_t theta_scale_length = rope_dims / 2;
2262    int64_t position_length    = dst->ne[2];
2263
2264    // TODO: check theta_scale_length and position_length.
2265    if (src2 == nullptr && ctx.rope_cache.cached &&
2266        ctx.rope_cache.equal(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor,
2267                             is_neox, indep_sects, mrope_used, is_imrope, sections)) {
2268        // use cache.
2269        return;
2270    }
2271
    // Step 0: calculate the tensor shapes.
2273    int64_t theta_scale_ne[] = { theta_scale_length, 1, 1, 1 };
2274    size_t  theta_scale_nb[] = { sizeof(float), theta_scale_length * sizeof(float), theta_scale_length * sizeof(float),
2275                                 theta_scale_length * sizeof(float) };
2276
2277    GGML_ASSERT(src1->type == GGML_TYPE_I32);
2278    int64_t position_ne[] = { 1, 1, position_length, 1 };
2279    size_t  position_nb[] = { sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), sizeof(int32_t) * position_length };
2280
2281    int64_t cache_ne[] = { theta_scale_length, 1, position_length, 1 };
2282    size_t  cache_nb[GGML_MAX_DIMS];
2283    cache_nb[0] = sizeof(float);
2284    for (int i = 1; i < GGML_MAX_DIMS; i++) {
2285        cache_nb[i] = cache_nb[i - 1] * cache_ne[i - 1];
2286    }
2287
    // Step 1: compute the coefficient of theta. During cache_init, aside from
    // (1) multiplying by the position,
    // (2) dividing by freq_factors,
    // (3) computing the sine and cosine,
    // the remaining inputs generally do not change between calls, so this part of the
    // result can be computed once and cached.
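    // Concretely, for dimension pair i the rotation angle is
    //   theta(i, pos) = pos * freq_scale * theta_scale^i / freq_factors[i]
    // (when ext_factor != 0, the Yarn ramp mix replaces the plain freq_scale factor).
    // The position- and freq_factor-independent part of this product is what gets cached below.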
2294
    // Step 1.1: prepare the theta_scale exponents. If these exponents change, theta_scale_tensor must be rebuilt.
2296    acl_tensor_ptr acl_theta_scale_tensor;
2297    bool           theta_scale_updated = false;
2298    if (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.theta_scale != theta_scale ||
2299        ctx.rope_cache.indep_sects != indep_sects) {
2300        theta_scale_updated = true;
2301        if (ctx.rope_cache.theta_scale_exp_host != nullptr) {
2302            free(ctx.rope_cache.theta_scale_exp_host);
2303        }
2304        ctx.rope_cache.theta_scale_exp_host = (float *) malloc(theta_scale_length * sizeof(float));
2305        GGML_ASSERT(ctx.rope_cache.theta_scale_exp_host != nullptr);
2306        if (!indep_sects) {
2307            ctx.rope_cache.theta_scale_exp_host[0] = 1;
2308            for (int i = 1; i < theta_scale_length; i++) {
2309                ctx.rope_cache.theta_scale_exp_host[i] = ctx.rope_cache.theta_scale_exp_host[i - 1] * theta_scale;
2310            }
2311        } else {
2312            int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
2313            int sec_w     = sections[1] + sections[0];
2314            int sec_e     = sections[2] + sec_w;
2315
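            // With independent sections the exponent restarts from theta_scale^0 at the start of each
            // t/h/w/e section, instead of growing monotonically across the whole rotary dimension.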
2316            ctx.rope_cache.theta_scale_exp_host[0] = 1;
2317            for (int i = 1; i < theta_scale_length; i++) {
2318                int sector = i % sect_dims;
2319                if (sector == 0 || sector == sections[0] || sector == sec_w || sector == sec_e) {
2320                    ctx.rope_cache.theta_scale_exp_host[i] = 1;
2321                    continue;
2322                }
2323                ctx.rope_cache.theta_scale_exp_host[i] = ctx.rope_cache.theta_scale_exp_host[i - 1] * theta_scale;
2324            }
2325        }
2326
2327        if (ctx.rope_cache.theta_scale_cache != nullptr) {
2328            ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
2329        }
2330        ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
2331                              ACL_MEM_MALLOC_HUGE_FIRST));
2332
2333        ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
2334                                   ctx.rope_cache.theta_scale_exp_host, theta_scale_length * sizeof(float),
2335                                   ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream()));
2336    }
2337    acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
2338                                                     theta_scale_ne, theta_scale_nb, 1);
2339
    // Step 1.2: prepare the rope_yarn_ramp. If this part changes, theta_scale_tensor must be updated as well.
2341    // TODO: acl_yarn_ramp_tensor use rope cache.
2342    bool           yarn_ramp_tensor_updated = false;
2343    acl_tensor_ptr acl_yarn_ramp_tensor;
2344    if (ext_factor != 0 && (theta_scale_updated || ctx.rope_cache.theta_scale_length != theta_scale_length ||
2345                            ctx.rope_cache.freq_scale != freq_scale)) {
2346        yarn_ramp_tensor_updated = true;
2347        if (ctx.rope_cache.yarn_ramp_cache != nullptr) {
2348            ACL_CHECK(aclrtFree(ctx.rope_cache.yarn_ramp_cache));
2349        }
2350        ACL_CHECK(aclrtMalloc(&ctx.rope_cache.yarn_ramp_cache, theta_scale_length * sizeof(float),
2351                              ACL_MEM_MALLOC_HUGE_FIRST));
2352        // -rope_yarn_ramp
2353        // const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
2354        // return MIN(1, MAX(0, y)) - 1;
2355        acl_yarn_ramp_tensor      = ggml_cann_create_tensor(ctx.rope_cache.yarn_ramp_cache, ACL_FLOAT, sizeof(float),
2356                                                            theta_scale_ne, theta_scale_nb, 1);
2357        float          zero_value = 0, one_value = 1;
2358        float          denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
2359        acl_scalar_ptr low              = ggml_cann_create_scalar(&corr_dims[0], aclDataType::ACL_FLOAT);
2360        acl_scalar_ptr zero             = ggml_cann_create_scalar(&zero_value, aclDataType::ACL_FLOAT);
2361        acl_scalar_ptr one              = ggml_cann_create_scalar(&one_value, aclDataType::ACL_FLOAT);
2362        acl_scalar_ptr denom_safe       = ggml_cann_create_scalar(&denom_safe_value, aclDataType::ACL_FLOAT);
2363        acl_scalar_ptr ext_factor_sc    = ggml_cann_create_scalar(&ext_factor, aclDataType::ACL_FLOAT);
2364
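        // The op sequence below evaluates, element-wise for i = 0 .. theta_scale_length - 1:
        //   (min(1, max(0, (i - low) / max(0.001, high - low))) - 1) * ext_factor
        // leaving the tensor holding the negated Yarn ramp mix (-ramp_mix).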
2365        aclnn_arange(ctx, acl_yarn_ramp_tensor.get(), 0, theta_scale_length, 1, theta_scale_length);
2366        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor.get(), low.get(), one.get());
2367        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor.get(), denom_safe.get());
2368        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceThreshold, acl_yarn_ramp_tensor.get(), zero.get(), zero.get());
2369        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceClampMax, acl_yarn_ramp_tensor.get(), one.get());
2370        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor.get(), one.get(), one.get());
2371        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), ext_factor_sc.get());
2372
2373        // theta_interp = freq_scale * theta_extrap;
2374        // theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
2375        // theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
2376        // theta = freq_scale * theta_extrap - freq_scale * theta_extrap * ramp_mix + theta_extrap * ramp_mix;
2377        // theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
2378        //
        // we cache (freq_scale - freq_scale * ramp_mix + ramp_mix); since the tensor built above holds the
        // negated ramp_mix, this is evaluated below as freq_scale + (freq_scale - 1) * (negated ramp_mix)
2381        float          freq_scale_1    = freq_scale - 1;
2382        acl_scalar_ptr freq_scale_sc   = ggml_cann_create_scalar(&freq_scale, aclDataType::ACL_FLOAT);
2383        acl_scalar_ptr freq_scale_1_sc = ggml_cann_create_scalar(&freq_scale_1, aclDataType::ACL_FLOAT);
2384        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), freq_scale_1_sc.get());
2385        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor.get(), freq_scale_sc.get(), one.get());
2386    } else {
2387        acl_yarn_ramp_tensor = ggml_cann_create_tensor(ctx.rope_cache.yarn_ramp_cache, ACL_FLOAT, sizeof(float),
2388                                                       theta_scale_ne, theta_scale_nb, 1);
2389    }
2390    // Step 1.3: update theta_scale_tensor according to ext_factor or freq_scale.
2391    if (ext_factor != 0) {
2392        if (theta_scale_updated || yarn_ramp_tensor_updated) {
2393            theta_scale_updated = true;
2394            aclnn_mul(ctx, acl_theta_scale_tensor.get(), acl_yarn_ramp_tensor.get());
2395        }
2396    } else {
2397        if (freq_scale != 1 && (ctx.rope_cache.freq_scale != freq_scale || theta_scale_updated)) {
2398            theta_scale_updated = true;
2399            aclnn_muls(ctx, acl_theta_scale_tensor.get(), freq_scale, nullptr, true);
2400        }
2401    }
2402
2403    // Nothing changed, use cache.
2404    if (!theta_scale_updated) {
2405        acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
2406                                                         theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2407    }
2408
2409    // Step 1.4: prepare select index if mrope
2410    acl_tensor_ptr position_select_index_tensor;
2411    if (mrope_used) {
2412        if (ctx.rope_cache.sections[0] != sections[0] || ctx.rope_cache.sections[1] != sections[1] ||
2413            ctx.rope_cache.sections[2] != sections[2] || ctx.rope_cache.sections[3] != sections[3] ||
2414            ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.is_imrope != is_imrope) {
2415            if (ctx.rope_cache.position_select_index_host != nullptr) {
2416                free(ctx.rope_cache.position_select_index_host);
2417            }
2418            ctx.rope_cache.position_select_index_host = (int *) malloc(theta_scale_length * sizeof(int));
2419            GGML_ASSERT(ctx.rope_cache.position_select_index_host != nullptr);
2420            int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
2421            int sec_w     = sections[1] + sections[0];
2422            int sec_e     = sections[2] + sec_w;
2423            // t,h,w,e
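            // For each rotary dimension i, position_select_index_host[i] records which of the four
            // position rows (0 = t, 1 = h, 2 = w, 3 = e) provides the position value for that dimension.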
2424            for (int i = 0; i < theta_scale_length; i++) {
2425                int sector = i % sect_dims;
2426
                if (is_imrope) {  // qwen3vl applies interleaved mrope
2428                    if (sector % 3 == 1 && sector < 3 * sections[1]) {
2429                        ctx.rope_cache.position_select_index_host[i] = 1;
2430                    } else if (sector % 3 == 2 && sector < 3 * sections[2]) {
2431                        ctx.rope_cache.position_select_index_host[i] = 2;
2432                    } else if (sector % 3 == 0 && sector < 3 * sections[0]) {
2433                        ctx.rope_cache.position_select_index_host[i] = 0;
2434                    } else {
2435                        ctx.rope_cache.position_select_index_host[i] = 3;
2436                    }
2437                } else {
2438                    if (sector >= sections[0] && sector < sec_w) {
2439                        ctx.rope_cache.position_select_index_host[i] = 1;
2440                    } else if (sector >= sec_w && sector < sec_e) {
2441                        ctx.rope_cache.position_select_index_host[i] = 2;
2442                    } else if (sector >= sec_e) {
2443                        ctx.rope_cache.position_select_index_host[i] = 3;
2444                    } else {
2445                        ctx.rope_cache.position_select_index_host[i] = 0;
2446                    }
2447                }
2448            }
2449
2450            if (ctx.rope_cache.position_select_index != nullptr) {
2451                ACL_CHECK(aclrtFree(ctx.rope_cache.position_select_index));
2452            }
2453            ACL_CHECK(aclrtMalloc(&ctx.rope_cache.position_select_index, theta_scale_length * sizeof(int),
2454                                  ACL_MEM_MALLOC_HUGE_FIRST));
2455
2456            ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.position_select_index, theta_scale_length * sizeof(int),
2457                                       ctx.rope_cache.position_select_index_host, theta_scale_length * sizeof(int),
2458                                       ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream()));
2459        }
2460
2461        position_select_index_tensor = ggml_cann_create_tensor(ctx.rope_cache.position_select_index, ACL_INT32,
2462                                                               sizeof(int), theta_scale_ne, theta_scale_nb, 1);
2463    }
2464
    // Step 2: divide by freq_factors
2466    ggml_cann_pool_alloc freq_fac_res_allocator(ctx.pool());
2467    if (src2) {
2468        freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float));
2469        void *         freq_fac_res_ptr = freq_fac_res_allocator.get();
2470        acl_tensor_ptr acl_freq_factors_tensor =
2471            ggml_cann_create_tensor(src2->data, ggml_cann_type_mapping(src2->type), ggml_type_size(src2->type),
2472                                    theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2473        acl_tensor_ptr acl_freq_fac_res_tensor = ggml_cann_create_tensor(freq_fac_res_ptr, ACL_FLOAT, sizeof(float),
2474                                                                         theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2475        aclnn_div(ctx, acl_theta_scale_tensor.get(), acl_freq_factors_tensor.get(), acl_freq_fac_res_tensor.get());
2476        std::swap(acl_theta_scale_tensor, acl_freq_fac_res_tensor);
2477    }
2478
    // Step 3: prepare the position tensor.
    acl_tensor_ptr       acl_position_tensor;
    ggml_cann_pool_alloc mrope_position_allocator(ctx.pool());
    if (mrope_used) {
        // Step 3.1: select the current positions.
2484        // position :
2485        // pos1: [[0, 1 ,2 ,3 ],
2486        // pos2:  [4, 5 ,6 ,7 ],
2487        // pos3:  [8, 9 ,10,11],
2488        // pos4:  [12,13,14,15] ]
2489        //
2490        // select index = [0, 1, 2, 2, 1, 0]
2491        //
2492        // selected_tensor:
2493        // [[0, 1 ,2 ,3 ],
2494        //  [4, 5 ,6 ,7 ],
2495        //  [8, 9 ,10,11],
2496        //  [8, 9 ,10,11],
2497        //  [4, 5 ,6 ,7 ],
2498        //  [0, 1 ,2 ,3 ]]
2499        //
2500        // transpose, from [seq_len:dims] to [dims:seq_len]
2501        // [0, 4, 8 ,8 ,4, 0],
2502        // [1, 5, 9, 9, 5, 1],
2503        // [2, 6, 10,10,6 ,2],
        // [3, 7, 11,11,7 ,3 ]]
2505        //
        // multiply by theta_scale_tensor
2507        // [theta_scale^0, theta_scale^1, ..., theta_scale ^ n]
2508
2509        int64_t        mrope_position_ne[] = { position_length, 4 };
2510        size_t         mrope_position_nb[] = { sizeof(int), position_length * sizeof(int) };
2511        acl_tensor_ptr mrope_position =
2512            ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
2513                                    mrope_position_ne, mrope_position_nb, 2);
2514
2515        // selected position tensor's shape is a transpose of cache tensor.
2516        int64_t selected_position_ne[] = { position_length, theta_scale_length };
2517        size_t  selected_position_nb[] = { sizeof(float), position_length * sizeof(float) };
        mrope_position_allocator.alloc(theta_scale_length * position_length * sizeof(float));
        void * mrope_position_buffer = mrope_position_allocator.get();
2520        acl_position_tensor =
2521            ggml_cann_create_tensor(mrope_position_buffer, ggml_cann_type_mapping(src1->type),
2522                                    ggml_type_size(src1->type), selected_position_ne, selected_position_nb, 2);
2523        GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, mrope_position.get(), 0, position_select_index_tensor.get(),
2524                                acl_position_tensor.get());
2525
2526        // transpose
2527        int64_t transposed_ne[] = { position_length, 1, theta_scale_length, 1 };
2528        size_t  transposed_nb[GGML_MAX_DIMS];
2529        transposed_nb[0] = sizeof(float);
2530        for (int i = 1; i < GGML_MAX_DIMS; i++) {
2531            transposed_nb[i] = transposed_nb[i - 1] * transposed_ne[i - 1];
2532        }
2533
2534        std::swap(transposed_ne[0], transposed_ne[2]);
2535        std::swap(transposed_nb[0], transposed_nb[2]);
2536
2537        acl_position_tensor =
2538            ggml_cann_create_tensor(mrope_position_buffer, ggml_cann_type_mapping(src1->type),
2539                                    ggml_type_size(src1->type), transposed_ne, transposed_nb, GGML_MAX_DIMS);
2540
2541    } else {
2542        // auto bcast.
2543        acl_position_tensor =
2544            ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
2545                                    position_ne, position_nb, GGML_MAX_DIMS);
2546    }
2547
    // Step 4: multiply by the position
2549    int64_t              theta_length = theta_scale_length * position_length;
2550    ggml_cann_pool_alloc theta_allocator(ctx.pool(), theta_length * sizeof(float));
2551    void *               theta_buffer = theta_allocator.get();
2552
2553    acl_tensor_ptr acl_theta_tensor =
2554        ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float), cache_ne, cache_nb, GGML_MAX_DIMS);
2555    aclnn_mul(ctx, acl_position_tensor.get(), acl_theta_scale_tensor.get(), acl_theta_tensor.get());
2556
    // Step 5: calculate sin/cos.
    // (Re)allocate the shared sin/cos device caches when the position length grows; later layers on the
    // same device reuse them, so only the first layer pays the allocation cost.
2559    if (position_length > ctx.rope_cache.position_length) {
2560        ctx.rope_cache.position_length = position_length;
2561        if (ctx.rope_cache.sin_cache != nullptr) {
2562            ACL_CHECK(aclrtFree(ctx.rope_cache.sin_cache));
2563        }
2564        if (ctx.rope_cache.cos_cache != nullptr) {
2565            ACL_CHECK(aclrtFree(ctx.rope_cache.cos_cache));
2566        }
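        // The caches hold the fully expanded tables (rope_dims = 2 * theta_scale_length values per
        // position), hence the extra factor of 2.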
2567        int64_t repeat_theta_length = theta_scale_length * position_length * 2;
2568        ACL_CHECK(
2569            aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
2570        ACL_CHECK(
2571            aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
2572    }
2573
2574    // sin/cos
2575    ggml_cann_pool_alloc sin_allocator(ctx.pool(), theta_length * sizeof(float));
2576    void *               sin_buffer = sin_allocator.get();
2577    acl_tensor_ptr       acl_sin_tensor =
2578        ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float), cache_ne, cache_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
2579    aclnn_sin(ctx, acl_theta_tensor.get(), acl_sin_tensor.get());
2580
2581    ggml_cann_pool_alloc cos_allocator(ctx.pool(), theta_length * sizeof(float));
2582    void *               cos_buffer = cos_allocator.get();
2583    acl_tensor_ptr       acl_cos_tensor =
2584        ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float), cache_ne, cache_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
2585    aclnn_cos(ctx, acl_theta_tensor.get(), acl_cos_tensor.get());
2586
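    // Yarn additionally scales the sin/cos magnitude by 1 + 0.1 * ln(1 / freq_scale) when extrapolation
    // is enabled; fold that into attn_factor before the multiplication below.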
2587    if (ext_factor != 0) {
2588        attn_factor *= 1.0f + 0.1f * logf(1.0f / freq_scale);
2589    }
2590
    // Step 6: multiply by attn_factor
2592    if (attn_factor != 1) {
2593        aclnn_muls(ctx, acl_sin_tensor.get(), attn_factor, nullptr, true);
2594        aclnn_muls(ctx, acl_cos_tensor.get(), attn_factor, nullptr, true);
2595    }
2596
2597    int64_t sin_reshape_ne[4] = { rope_dims, 1, dst->ne[2], 1 };
2598    size_t  sin_reshape_nb[GGML_MAX_DIMS];
2599    sin_reshape_nb[0] = sizeof(float);
2600    for (int i = 1; i < GGML_MAX_DIMS; i++) {
2601        sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
2602    }
2603    acl_tensor_ptr acl_sin_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
2604                                                                   sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2605    acl_tensor_ptr acl_cos_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
2606                                                                   sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2607
    // Step 7: repeat
2609    if (is_neox) {
        // tiled: [sinθ1, sinθ2, ..., sinθn, sinθ1, sinθ2, ..., sinθn]
2611        int64_t repeatsArray[] = { 1, 1, 1, 2 };
2612        aclnn_repeat(ctx, acl_sin_tensor.get(), acl_sin_repeat_tensor.get(), repeatsArray);
2613        aclnn_repeat(ctx, acl_cos_tensor.get(), acl_cos_repeat_tensor.get(), repeatsArray);
2614    } else {
2615        int64_t num_repeats = 2;
2616        int64_t dim         = 3;
2617        int64_t output_size = theta_scale_length * num_repeats;
        // interleaved: [sinθ1, sinθ1, sinθ2, sinθ2, ..., sinθn, sinθn]
2619        aclnn_repeat_interleave(ctx, acl_sin_tensor.get(), acl_sin_repeat_tensor.get(), dim, num_repeats, output_size);
2620        aclnn_repeat_interleave(ctx, acl_cos_tensor.get(), acl_cos_repeat_tensor.get(), dim, num_repeats, output_size);
2621    }
2622
2623    // Update cached value.
2624    ctx.rope_cache.cached = true;
2625    ctx.rope_cache.set(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor, is_neox,
2626                       indep_sects, mrope_used, is_imrope, sections);
2627}
2628
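// The RotaryPositionEmbedding entry points are declared manually below (no aclnnop header for this
// operator is included in this file) so that GGML_CANN_CALL_ACLNN_OP can resolve them in ggml_cann_rope.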
2629#ifdef __cplusplus
2630extern "C" {
2631#endif
2632aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(const aclTensor * x,
2633                                                         const aclTensor * cos,
2634                                                         const aclTensor * sin,
2635                                                         int64_t           mode,
2636                                                         const aclTensor * yOut,
2637                                                         uint64_t *        workspaceSize,
2638                                                         aclOpExecutor **  executor);
2639aclnnStatus aclnnRotaryPositionEmbedding(void *          workspace,
2640                                         uint64_t        workspaceSize,
2641                                         aclOpExecutor * executor,
2642                                         aclrtStream     stream);
2643#ifdef __cplusplus
2644}
2645#endif
2646
2647void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2648    ggml_tensor * src0 = dst->src[0];  // input
2649
2650    // param
2651    float     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
2652    int       sections[4];
2653    // const int n_past     = ((int32_t *) dst->op_params)[0];
2654    const int n_dims     = ((int32_t *) dst->op_params)[1];
2655    const int mode       = ((int32_t *) dst->op_params)[2];
2656    // const int n_ctx      = ((int32_t *) dst->op_params)[3];
2657    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
2658
2659    GGML_TENSOR_UNARY_OP_LOCALS
2660
2661    memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
2662    memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
2663    memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
2664    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
2665    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
2666    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
2667    memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int) * 4);
2668
2669    GGML_ASSERT(n_dims % 2 == 0);
2670    GGML_ASSERT(n_dims <= ne00);
2671
2672    const float theta_scale = powf(freq_base, -2.0f / n_dims);
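    // Standard RoPE angle for dimension pair i: theta_i(pos) = pos * freq_base^(-2i/n_dims) = pos * theta_scale^i.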
2673
2674    float corr_dims[2];
2675    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
2676
2677    bool       is_neox    = mode & GGML_ROPE_TYPE_NEOX;
    const bool is_imrope  = mode == GGML_ROPE_TYPE_IMROPE;  // qwen3vl applies interleaved mrope
2679    // mrope_used means the GGML_ROPE_TYPE_MROPE bit is set.
2680    // Note: this bit is also set for imrope and some vision modes,
2681    // so mrope_used does NOT exclusively indicate pure mrope.
2682    const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
2683    const bool is_vision  = mode == GGML_ROPE_TYPE_VISION;
2684
2685    if (mrope_used) {
2686        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
2687    }
2688
2689    if (is_vision) {
2690        GGML_ASSERT(n_dims == ne0 / 2);
2691    }
2692
2693    if (is_imrope || mrope_used) {
2694        is_neox = true;
2695    }
2696
2697    int64_t rope_dims = n_dims;
2698
    // The RotaryPositionEmbedding operator does not support the VISION mode directly, but VISION only
    // changes how theta_base is selected (as in mrope) and then repeats at the end in the same way as
    // is_neox; RoPE is still applied across all dimensions, so rope_dims is simply extended to the full
    // head size.
2703    if (is_vision) {
2704        rope_dims = src0->ne[0];
2705    }
2706    int64_t tail_dims = ne00 - rope_dims;
2707    bool    has_tail  = tail_dims > 0;
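    // Only the first rope_dims elements of each head are rotated; any remaining tail_dims elements are
    // copied through unchanged in Step 4 below.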
2708
2709    // init ctx.rope_cos/rope_sin cache
2710    aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections,
2711                          mrope_used, is_imrope, is_vision, rope_dims);
2712
    // The cache is generated with rope_dims entries per position, so reshape with rope_dims.
2714    int64_t sin_reshape_ne[4] = { rope_dims, 1, ne02, 1 };
2715    size_t  sin_reshape_nb[GGML_MAX_DIMS];
2716    sin_reshape_nb[0] = sizeof(float);
2717    for (int i = 1; i < GGML_MAX_DIMS; i++) {
2718        sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
2719    }
2720    acl_tensor_ptr acl_sin_reshape_tensor = ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
2721                                                                    sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2722    acl_tensor_ptr acl_cos_reshape_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
2723                                                                    sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2724
2725    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
2726    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
2727#ifdef ASCEND_310P
2728    // Special ROPE operation for 310P
2729
2730    // roll input
2731    void *               input_roll_buffer;
2732    acl_tensor_ptr       acl_minus_one_tensor;
2733    void *               minus_one_scale_buffer = nullptr;
2734    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
2735    ggml_cann_pool_alloc minus_one_scale_allocator(ctx.pool(), sizeof(float) * src0->ne[0]);
2736    if (!is_neox) {
2737        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
2738        input_roll_buffer        = roll_allocator.get();
2739        int64_t input_roll_ne[4] = { 2, src0->ne[1] * (src0->ne[0] / 2), src0->ne[2], src0->ne[3] };
2740        size_t  input_roll_nb[GGML_MAX_DIMS];
2741        input_roll_nb[0] = ggml_type_size(src0->type);
2742        for (int i = 1; i < GGML_MAX_DIMS; i++) {
2743            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
2744        }
2745        acl_tensor_ptr acl_input_roll_tensor =
2746            ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
2747                                    input_roll_ne, input_roll_nb, GGML_MAX_DIMS);
2748        acl_tensor_ptr acl_input_tensor =
2749            ggml_cann_create_tensor(src0->data, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
2750                                    input_roll_ne, input_roll_nb, GGML_MAX_DIMS);
2751
2752        int64_t shifts[] = { 1 };
2753        int64_t dims[]   = { 3 };
2754        aclnn_roll(ctx, acl_input_tensor.get(), acl_input_roll_tensor.get(), shifts, dims);
2755
2756        // init [-1, 1, -1, 1, ...]
2757        minus_one_scale_buffer = minus_one_scale_allocator.get();
2758
2759        int64_t minus_one_ne[4] = { src0->ne[0], 1, 1, 1 };
2760        size_t  minus_one_nb[GGML_MAX_DIMS];
2761        minus_one_nb[0] = sizeof(float);
2762        for (int i = 1; i < GGML_MAX_DIMS; i++) {
2763            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
2764        }
2765        acl_minus_one_tensor = aclnn_values(ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], minus_one_ne,
2766                                            GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
        int64_t              dim = 3;
        // std::vector instead of a raw new[] so the index buffer cannot leak.
        std::vector<int64_t> index(src0->ne[0]);
        for (int i = 0; i < src0->ne[0]; i++) {
            index[i] = i / 2 * 2;
        }
        int64_t index_num = src0->ne[0];
        float   value     = -1;
        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor.get(), dim, index.data(), index_num, value);
2775    } else {
2776        // roll input: [q0,q1,q2,...] ->
2777        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
2778        input_roll_buffer = roll_allocator.get();
2779        acl_tensor_ptr acl_input_roll_tensor =
2780            ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
2781                                    src0->ne, src0->nb, GGML_MAX_DIMS);
2782        acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(src0);
2783
2784        int64_t shifts[] = { src0->ne[0] / 2 };
2785        int64_t dims[]   = { 3 };
2786        aclnn_roll(ctx, acl_input_tensor.get(), acl_input_roll_tensor.get(), shifts, dims);
2787
        // init [-1, -1, -1, 1, 1, 1, ...]
2789        minus_one_scale_buffer  = minus_one_scale_allocator.get();
2790        int64_t minus_one_ne[4] = { src0->ne[0], 1, 1, 1 };
2791        size_t  minus_one_nb[GGML_MAX_DIMS];
2792        minus_one_nb[0] = sizeof(float);
2793        for (int i = 1; i < GGML_MAX_DIMS; i++) {
2794            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
2795        }
2796        acl_minus_one_tensor     = aclnn_values(ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], minus_one_ne,
2797                                                GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
2798        // -1 * first half
2799        int64_t first_half_ne[4] = { src0->ne[0] / 2, 1, 1, 1 };
2800        size_t  first_half_nb[GGML_MAX_DIMS];
2801        first_half_nb[0] = sizeof(float);
2802        for (int i = 1; i < GGML_MAX_DIMS; i++) {
2803            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
2804        }
2805        acl_tensor_ptr acl_first_half_tensor = ggml_cann_create_tensor(minus_one_scale_buffer, ACL_FLOAT, sizeof(float),
2806                                                                       first_half_ne, first_half_nb, GGML_MAX_DIMS);
2807        bool           inplace               = true;
2808        float          scale                 = -1;
2809        aclnn_muls(ctx, acl_first_half_tensor.get(), scale, nullptr, inplace);
2810    }
2811
2812    // TODO: n_dims < ne0
2813    GGML_ASSERT(n_dims == src0->ne[0]);
2814
2815    // input * scale
2816    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), ggml_nbytes(src0));
2817    void *               input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
2818    size_t               input_nb[GGML_MAX_DIMS];
2819    input_nb[0] = ggml_type_size(src0->type);
2820    for (int i = 1; i < GGML_MAX_DIMS; i++) {
2821        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
2822    }
2823    acl_tensor_ptr acl_input_roll_mul_scale_tensor =
2824        ggml_cann_create_tensor(input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
2825                                ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
2826    acl_tensor_ptr acl_input_roll_reshape_tensor =
2827        ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
2828                                src0->ne, input_nb, GGML_MAX_DIMS);
2829
2830    aclnn_mul(ctx, acl_input_roll_reshape_tensor.get(), acl_minus_one_tensor.get(),
2831              acl_input_roll_mul_scale_tensor.get());
2832
2833    // output
2834    void * output_fp32_buffer;
2835    if (src0->type == GGML_TYPE_F32) {
2836        aclnn_mul(ctx, acl_src.get(), acl_cos_reshape_tensor.get());
2837        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor.get(), acl_sin_reshape_tensor.get());
2838        aclnn_add(ctx, acl_src.get(), acl_input_roll_mul_scale_tensor.get(), acl_dst.get());
2839        // TODO: ne0 != n_dims in mode2
2840    } else if (src0->type == GGML_TYPE_F16) {
2841        size_t input_fp32_nb[GGML_MAX_DIMS];
2842        input_fp32_nb[0] = sizeof(float);
2843        for (int i = 1; i < GGML_MAX_DIMS; i++) {
2844            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
2845        }
2846        ggml_cann_pool_alloc fp32_allocator1(ctx.pool(), ggml_nelements(dst) * sizeof(float));
2847        void *               input_fp32_buffer1 = fp32_allocator1.get();
2848        acl_tensor_ptr       input_fp32_tensor1 = ggml_cann_create_tensor(input_fp32_buffer1, ACL_FLOAT, sizeof(float),
2849                                                                          dst->ne, input_fp32_nb, GGML_MAX_DIMS);
2850        ggml_cann_pool_alloc fp32_allocator2(ctx.pool(), ggml_nelements(dst) * sizeof(float));
2851        void *               input_fp32_buffer2 = fp32_allocator2.get();
2852        acl_tensor_ptr       input_fp32_tensor2 = ggml_cann_create_tensor(input_fp32_buffer2, ACL_FLOAT, sizeof(float),
2853                                                                          dst->ne, input_fp32_nb, GGML_MAX_DIMS);
2854
2855        ggml_cann_pool_alloc fp32_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float));
2856        output_fp32_buffer                = fp32_allocator.get();
2857        acl_tensor_ptr output_fp32_tensor = ggml_cann_create_tensor(output_fp32_buffer, ACL_FLOAT, sizeof(float),
2858                                                                    dst->ne, input_fp32_nb, GGML_MAX_DIMS);
2859        aclnn_mul(ctx, acl_src.get(), acl_cos_reshape_tensor.get(), input_fp32_tensor1.get());
2860        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor.get(), acl_sin_reshape_tensor.get(), input_fp32_tensor2.get());
2861        aclnn_add(ctx, input_fp32_tensor1.get(), input_fp32_tensor2.get(), output_fp32_tensor.get());
2862        aclnn_cast(ctx, output_fp32_tensor.get(), acl_dst.get(), ACL_FLOAT16);
2863    }
2864    return;
2865#endif
2866    int64_t acl_mode = is_neox ? 0 : 1;
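    // Mode selects the operator's pairing layout: 0 is used for neox-style (rotate-half) pairs, 1 for
    // interleaved (adjacent-element) pairs, matching the sin/cos layouts prepared in the cache.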
2867
2868    // Pre-define head and tail dimensions for reuse
2869    int64_t head_ne[GGML_MAX_DIMS] = { rope_dims, ne01, ne02, ne03 };
2870    int64_t tail_ne[GGML_MAX_DIMS] = { tail_dims, ne01, ne02, ne03 };
2871
2872    // Step 1: Prepare trans tensors for F16 type conversion to F32 if needed
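    // For F16 tensors the rotation runs in F32: cast up here, rotate below, then cast back to F16 in Step 5.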
2873    bool                 src_dst_need_trans = false;
2874    ggml_cann_pool_alloc src_trans_allocator(ctx.pool());
2875    ggml_cann_pool_alloc dst_trans_allocator(ctx.pool());
2876    acl_tensor_ptr       acl_src_trans_tensor;
2877    acl_tensor_ptr       acl_dst_trans_tensor;
2878    void *               src_trans_buffer = nullptr;
2879    void *               dst_trans_buffer = nullptr;
2880    size_t               src_dst_trans_nb[GGML_MAX_DIMS];
2881    if (src0->type == GGML_TYPE_F16) {
2882        src_dst_need_trans = true;
2883        src_trans_buffer   = src_trans_allocator.alloc(ggml_nelements(src0) * sizeof(float));
2884        dst_trans_buffer   = dst_trans_allocator.alloc(ggml_nelements(dst) * sizeof(float));
2885
2886        src_dst_trans_nb[0] = sizeof(float);
2887        for (int i = 1; i < GGML_MAX_DIMS; i++) {
2888            src_dst_trans_nb[i] = src_dst_trans_nb[i - 1] * src0->ne[i - 1];
2889        }
2890        acl_src_trans_tensor = ggml_cann_create_tensor(src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne,
2891                                                       src_dst_trans_nb, GGML_MAX_DIMS);
2892        acl_dst_trans_tensor = ggml_cann_create_tensor(dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne,
2893                                                       src_dst_trans_nb, GGML_MAX_DIMS);
2894        aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT);
2895    }
2896
    // Step 2: Prepare head tensors for tail splitting if needed
    acl_tensor_ptr       acl_src_head;
    acl_tensor_ptr       acl_dst_head;
    // Declared at function scope so the contiguous head buffer stays alive through the
    // RotaryPositionEmbedding call and the copy-back in Step 3.
    ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool());
2900    if (has_tail) {
2901        // Create head views for RotaryPositionEmbedding (only first rope_dims dimensions)
2902        // RotaryPositionEmbedding requires contiguous dst tensor, so we use a temporary buffer
2903        if (src_dst_need_trans) {
2904            // Use F32 trans tensor strides
2905            acl_src_head = ggml_cann_create_tensor((char *) src_trans_buffer, ACL_FLOAT, sizeof(float), head_ne,
2906                                                   src_dst_trans_nb, GGML_MAX_DIMS);
2907        } else {
2908            // Use original F32 tensor strides
2909            acl_src_head = ggml_cann_create_tensor((char *) src0->data, ACL_FLOAT, sizeof(float), head_ne, src0->nb,
2910                                                   GGML_MAX_DIMS);
2911        }
2912
        int64_t head_elements              = rope_dims * ne01 * ne02 * ne03;
        void *  dst_head_contiguous_buffer = dst_head_contiguous_allocator.alloc(head_elements * sizeof(float));
2916
2917        size_t head_contiguous_nb[GGML_MAX_DIMS];
2918        head_contiguous_nb[0] = sizeof(float);
2919        for (int i = 1; i < GGML_MAX_DIMS; i++) {
2920            head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1];
2921        }
2922        acl_dst_head = ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne,
2923                                               head_contiguous_nb, GGML_MAX_DIMS);
2924    }
2925
2926    // Step 3: Execute RotaryPositionEmbedding
2927    if (has_tail) {
2928        // Rotate only the head portion (first rope_dims dimensions)
2929        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head.get(), acl_cos_reshape_tensor.get(),
2930                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst_head.get());
2931
2932        // Copy head result from contiguous buffer back to destination tensor
2933        if (src_dst_need_trans) {
2934            acl_tensor_ptr acl_dst_head_target = ggml_cann_create_tensor(
2935                (char *) dst_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, src_dst_trans_nb, GGML_MAX_DIMS);
2936            cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get());
2937        } else {
2938            acl_tensor_ptr acl_dst_head_target =
2939                ggml_cann_create_tensor((char *) dst->data, ACL_FLOAT, sizeof(float), head_ne, dst->nb, GGML_MAX_DIMS);
2940            cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get());
2941        }
2942    } else if (src_dst_need_trans) {
2943        // Rotate full tensor (no tail), using trans tensors
2944        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(),
2945                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get());
2946    } else {
2947        // Rotate full tensor (no tail), using original tensors
2948        GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
2949                                acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get());
2950    }
2951
2952    // Step 4: Copy unrotated tail portion from source to destination
2953    if (has_tail) {
2954        size_t src_tail_offset;
2955        size_t dst_tail_offset;
2956
2957        auto copy_tail_device = [&](void * src_ptr, void * dst_ptr, aclDataType dtype, size_t elem_size,
2958                                    size_t * nb_src_arr, size_t * nb_dst_arr) {
2959            acl_tensor_ptr acl_src_tail =
2960                ggml_cann_create_tensor(src_ptr, dtype, elem_size, tail_ne, nb_src_arr, GGML_MAX_DIMS);
2961            acl_tensor_ptr acl_dst_tail =
2962                ggml_cann_create_tensor(dst_ptr, dtype, elem_size, tail_ne, nb_dst_arr, GGML_MAX_DIMS);
2963            cann_copy(ctx, acl_src_tail.get(), acl_dst_tail.get());
2964        };
2965
2966        if (src_dst_need_trans) {
2967            // Use F32 trans tensor strides and offsets
2968            src_tail_offset = rope_dims * src_dst_trans_nb[0];
2969            dst_tail_offset = rope_dims * src_dst_trans_nb[0];
2970            copy_tail_device((char *) src_trans_buffer + src_tail_offset, (char *) dst_trans_buffer + dst_tail_offset,
2971                             ACL_FLOAT, sizeof(float), src_dst_trans_nb, src_dst_trans_nb);
2972        } else {
2973            // Use original tensor strides and offsets
2974            src_tail_offset = rope_dims * nb00;
2975            dst_tail_offset = rope_dims * nb0;
2976            copy_tail_device((char *) src0->data + src_tail_offset, (char *) dst->data + dst_tail_offset,
2977                             ggml_cann_type_mapping(dst->type), ggml_element_size(dst), src0->nb, dst->nb);
2978        }
2979    }
2980
2981    // Step 5: Cast back to F16 if needed
2982    if (src_dst_need_trans) {
2983        aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16);
2984    }
2985}
2986
2987void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2988    ggml_tensor * src0 = dst->src[0];
2989
2990    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
2991    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
2992
2993    GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src.get(), 3, false, acl_dst.get());
2994}
2995
2996void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2997    ggml_tensor * src0 = dst->src[0];
2998    ggml_tensor * src1 = dst->src[1];
2999
3000    // stride
3001    int64_t s0 = ((const int32_t *) (dst->op_params))[0];
3002
3003    acl_tensor_ptr acl_input  = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
3004    acl_tensor_ptr acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
3005    acl_tensor_ptr acl_dst    = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
3006
3007    // get base information of input and kernel
3008    int64_t input_len   = *(src1->ne);
3009    int64_t dst_len     = *(dst->ne);
3010    int64_t kernel_size = *(src0->ne);
3011
3012    // set the max kernel size for each conv
3013    int64_t max_kernel_size = 255;
3014
    // compute the number of kernel partitions (ceil division)
    int64_t part_num = (kernel_size + max_kernel_size - 1) / max_kernel_size;
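    // Each partition is convolved on its own; the partial result is zero-padded to the full output
    // length and accumulated into dst inside the loop below.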
3018
3019    int64_t strideVal[1];
3020    strideVal[0]                    = s0;
3021    acl_int_array_ptr stride        = ggml_cann_create_int_array(strideVal, 1);
3022    int64_t           paddingVal[]  = { 0 };
3023    acl_int_array_ptr padding       = ggml_cann_create_int_array(paddingVal, 1);
3024    int64_t           dilationVal[] = { 1 };
3025    acl_int_array_ptr dilation      = ggml_cann_create_int_array(dilationVal, 1);
3026    bool              transposed    = true;
3027    int64_t           groups        = 1;
3028    int8_t            cubeMathType  = 0;
3029
3030#ifdef ASCEND_310P
3031    cubeMathType = 1;
3032#endif
3033
3034    auto weight_type = ggml_cann_type_mapping(src0->type);
3035    auto dst_type    = ggml_cann_type_mapping(dst->type);
3036
3037    // slice the kernel to make each conv available
3038    int64_t slice_dim   = -1;
3039    int64_t slice_start = 0;
3040    int64_t slice_end   = max_kernel_size;
3041    int64_t slice_step  = 1;
3042    int64_t interval    = max_kernel_size;
3043
    // pad lengths are recomputed for every partition inside the loop below
    int64_t left_pad_len  = 0;
    int64_t right_pad_len = 0;
3046
    float          alphaValue = 1.0f;
    acl_scalar_ptr alpha      = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
3050
3051    // set zero to destination
3052    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
3053
3054    for (int k = 0; k < part_num; k++) {
3055        // create part kernel tensor and slice from big kernel
3056        slice_start = max_kernel_size * k;
3057        if (k == part_num - 1) {
3058            slice_end = kernel_size;
3059            interval  = kernel_size - max_kernel_size * k;
3060        } else {
3061            slice_end = max_kernel_size * (k + 1);
3062        }
3063
3064        int64_t part_ne[4];
3065        for (int i = 0; i < 4; i++) {
3066            part_ne[i] = *(src0->ne + i);
3067        }
3068        part_ne[0] = interval;
3069
3070        size_t part_nb[4];
        // nb[0] must be the element size of the data (sizeof(weight_type) would be the size of the
        // aclDataType enum, not of a weight element).
        part_nb[0] = ggml_element_size(src0);
3072        for (int i = 1; i < 4; i++) {
3073            part_nb[i] = part_nb[i - 1] * part_ne[i - 1];
3074        }
3075
3076        ggml_cann_pool_alloc part_kernel_allocator;
3077        part_kernel_allocator.alloc(ctx.pool(), part_nb[3]);
3078        void * part_kernel_buf = part_kernel_allocator.get();
3079
3080        acl_tensor_ptr part_kernel = ggml_cann_create_tensor(part_kernel_buf, weight_type, ggml_element_size(src0),
3081                                                             part_ne, part_nb, 3, ACL_FORMAT_NCL);
3082
3083        GGML_CANN_CALL_ACLNN_OP(ctx, Slice, acl_weight.get(), slice_dim, slice_start, slice_end, slice_step,
3084                                part_kernel.get());
3085
3086        // create the part conv result tensor
3087        int64_t part_dst_ne[4];
3088        for (int i = 0; i < 4; i++) {
3089            part_dst_ne[i] = *(dst->ne + i);
3090        }
3091        part_dst_ne[0] = (input_len - 1) * strideVal[0] - 2 * paddingVal[0] + dilationVal[0] * (part_ne[0] - 1) + 1;
3092
3093        size_t part_dst_nb[4];
        part_dst_nb[0] = ggml_element_size(dst);
3095        for (int i = 1; i < 4; i++) {
3096            part_dst_nb[i] = part_dst_nb[i - 1] * part_dst_ne[i - 1];
3097        }
3098        ggml_cann_pool_alloc part_dst_allocator;
3099        part_dst_allocator.alloc(ctx.pool(), part_dst_nb[3]);
3100        void * part_dst_buf = part_dst_allocator.get();
3101
3102        acl_tensor_ptr acl_part_dst = ggml_cann_create_tensor(part_dst_buf, dst_type, ggml_element_size(dst),
3103                                                              part_dst_ne, part_dst_nb, 3, ACL_FORMAT_NCL);
3104        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_part_dst.get());
3105
3106        // compute part conv transpose 1d
3107        GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input.get(), part_kernel.get(), nullptr, stride.get(),
3108                                padding.get(), dilation.get(), transposed, padding.get(), groups, acl_part_dst.get(),
3109                                cubeMathType);
3110
3111        // compute the position of part result in final result
3112        int64_t global_start = slice_start;
3113        int64_t global_end   = std::min((input_len - 1) * strideVal[0] + slice_end, dst_len);
3114
3115        left_pad_len  = global_start;
3116        right_pad_len = dst_len - global_end;
3117
3118        std::vector<int64_t> padDataVal = { left_pad_len, right_pad_len };
3119        acl_int_array_ptr    padData    = ggml_cann_create_int_array(padDataVal.data(), 2);
3120
        float          pad_valueVal = 0.0f;
        acl_scalar_ptr pad_value    = ggml_cann_create_scalar(&pad_valueVal, aclDataType::ACL_FLOAT);
3124
3125        int64_t conv_result_ne[4];
3126        for (int i = 0; i < 4; i++) {
3127            conv_result_ne[i] = *(dst->ne + i);
3128        }
3129
3130        size_t conv_result_nb[4];
        conv_result_nb[0] = ggml_element_size(dst);
3132        for (int i = 1; i < 4; i++) {
3133            conv_result_nb[i] = conv_result_nb[i - 1] * conv_result_ne[i - 1];
3134        }
3135
3136        ggml_cann_pool_alloc conv_result_allocator;
3137        conv_result_allocator.alloc(ctx.pool(), conv_result_nb[3]);
3138        void * conv_result_buf = conv_result_allocator.get();
3139
3140        acl_tensor_ptr conv_result = ggml_cann_create_tensor(conv_result_buf, dst_type, ggml_element_size(dst),
3141                                                             conv_result_ne, conv_result_nb, 3, ACL_FORMAT_NCL);
3142
3143        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, conv_result.get());
3144        GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_part_dst.get(), padData.get(), pad_value.get(),
3145                                conv_result.get());
3146        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), conv_result.get(), alpha.get());
3147    }
3148}
3149
3150void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3151    ggml_tensor * src0 = dst->src[0];
3152
3153    acl_tensor_ptr acl_input = ggml_cann_create_tensor(src0);
3154    acl_tensor_ptr acl_dst   = ggml_cann_create_tensor(dst);
3155
    float          alphaValue = 1.0f;
    acl_scalar_ptr alpha      = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
3159
3160    GGML_CANN_CALL_ACLNN_OP(ctx, Elu, acl_input.get(), alpha.get(), alpha.get(), alpha.get(), acl_dst.get());
3161}
3162
3163void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3164    ggml_tensor * src0 = dst->src[0];
3165
3166    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
3167    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
3168
3169    int64_t           reduceDimValue[] = { 3 };
3170    acl_int_array_ptr reduceDim        = ggml_cann_create_int_array(reduceDimValue, 1);
3171    bool              keepDim          = true;
3172
3173    GGML_CANN_CALL_ACLNN_OP(ctx, Mean, acl_src.get(), reduceDim.get(), keepDim, ACL_FLOAT, acl_dst.get());
3174}
3175
3176void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3177    ggml_tensor *     src0             = dst->src[0];
3178    int32_t *         opts             = (int32_t *) dst->op_params;
3179    int64_t           paddingsArray[2] = { opts[0], opts[1] };
3180    acl_int_array_ptr paddings         = ggml_cann_create_int_array(paddingsArray, 2);
3181
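    // Apply ReflectionPad1d slice by slice along the outermost (batch) dimension, using 3-D views.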
    for (int64_t i = 0; i < src0->ne[3]; i++) {
        // Offset each slice by its byte stride nb[3] (ne[3] is an element count, not a byte offset).
        acl_tensor_ptr acl_src =
            ggml_cann_create_tensor((char *) src0->data + i * src0->nb[3], ggml_cann_type_mapping(src0->type),
                                    ggml_element_size(src0), src0->ne, src0->nb, 3);

        acl_tensor_ptr acl_dst =
            ggml_cann_create_tensor((char *) dst->data + i * dst->nb[3], ggml_cann_type_mapping(dst->type),
                                    ggml_element_size(dst), dst->ne, dst->nb, 3);
3190
3191        GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src.get(), paddings.get(), acl_dst.get());
3192    }
3193}
3194
3195void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3196    ggml_tensor * src0 = dst->src[0];
3197    ggml_tensor * src1 = dst->src[1];
3198
3199    acl_tensor_ptr acl_self  = ggml_cann_create_tensor(src0);
3200    acl_tensor_ptr acl_other = ggml_cann_create_tensor(src1);
3201
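    // InplaceEqTensor turns src0 into an element-wise equality mask (1 where src0 == src1, 0 elsewhere);
    // ggml_cann_sum then reduces that mask into dst to produce the count.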
3202    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self.get(), acl_other.get());
3203
3204    ggml_cann_sum(ctx, dst);
3205}
3206
3207void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3208    ggml_tensor * src0 = dst->src[0];
3209
3210    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
3211    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
3212
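    // step(x): 1 where x > 0, 0 otherwise, expressed as a greater-than comparison against the scalar 0.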
    float          alphaValue = 0.0f;
    acl_scalar_ptr alpha      = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
3216
3217    GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src.get(), alpha.get(), acl_dst.get());
3218}
3219
3220/**
3221 * @brief Performs expert-specific matrix multiplication (MoE) with
3222 * floating-point precision using the CANN backend.
3223 *
 * This function executes a matrix multiplication tailored for Mixture of
 * Experts (MoE) models: for every token, the expert weight matrices selected
 * by the routing indices in `dst->src[2]` are gathered and multiplied with
 * that token's activations, and the result is stored in the destination
 * tensor `dst`.
3230 *
3231 * @param ctx The context for executing CANN backend operations.
3232 * @param dst The destination tensor where the MoE multiplication result
3233 * will be stored.
3234 *
3235 * @note This function assumes floating-point data types and is designed for
3236 * MoE architectures, possibly involving sparse expert routing.
3237 */
3238static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3239    //dst   [M, K, N, 1]
3240    ggml_tensor * src0 = dst->src[0];  //src0	[D, M, A, 1]  -> [D, M, K, 1]
3241    ggml_tensor * src1 = dst->src[1];  //src1	[D, B, N, 1], B = K or B = 1 -> [D, 1, K, 1]
3242    ggml_tensor * ids  = dst->src[2];  //ids	[K, N]
3243
3244    GGML_ASSERT(src0->ne[3] == 1);
3245    GGML_ASSERT(src1->ne[3] == 1);
3246    GGML_ASSERT(dst->ne[3] == 1);
3247
3248    int64_t batch = src1->ne[2];
3249    GGML_ASSERT(batch == ids->ne[1]);
3250
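    // For each token i: gather its selected expert weight matrices from src0 with IndexSelect (driven by
    // the ids row for that token), then batch-multiply the token's activations against the gathered,
    // transposed weights.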
3251    ggml_cann_pool_alloc export_allocator(ctx.pool(), src0->ne[0] * src0->ne[1] * ids->ne[0] * ggml_element_size(src0));
3252    void *               export_ptr = export_allocator.get();
3253    for (int64_t i = 0; i < batch; i++) {
3254        acl_tensor_ptr select_index  = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, i * ids->nb[1]);
3255        acl_tensor_ptr export_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3);
3256
3257        int64_t select_export_ne[] = { src0->ne[0], src0->ne[1], ids->ne[0] };
3258        size_t  select_export_nb[3];
3259        select_export_nb[0] = src0->nb[0];
3260        for (int k = 1; k < 3; k++) {
3261            select_export_nb[k] = select_export_nb[k - 1] * select_export_ne[k - 1];
3262        }
3263
3264        acl_tensor_ptr select_export =
3265            ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
3266                                    select_export_ne, select_export_nb, 3);
3267        GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, export_weight.get(), 0, select_index.get(), select_export.get());
3268
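            // View the gathered weights as transposed ([M, D] per expert) by swapping ne/nb;
            // no data movement is needed, BatchMatMul consumes the strided view directly.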
3269        int64_t        select_transpose_ne[] = { select_export_ne[1], select_export_ne[0], select_export_ne[2] };
3270        size_t         select_transpose_nb[] = { select_export_nb[1], select_export_nb[0], select_export_nb[2] };
3271        acl_tensor_ptr select_export_transpose =
3272            ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
3273                                    select_transpose_ne, select_transpose_nb, 3);
3274
3275        int64_t        active_tensor_ne[] = { src1->ne[0], 1, src1->ne[1] };
3276        size_t         active_tensor_nb[] = { src1->nb[0], src1->nb[1], src1->nb[1] };
3277        acl_tensor_ptr active_tensor =
3278            ggml_cann_create_tensor(src1, active_tensor_ne, active_tensor_nb, 3, ACL_FORMAT_ND, i * src1->nb[2]);
3279
3280        int64_t        dst_ne[] = { dst->ne[0], 1, dst->ne[1] };
3281        size_t         dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[1] };
3282        acl_tensor_ptr acl_dst  = ggml_cann_create_tensor(dst, dst_ne, dst_nb, 3, ACL_FORMAT_ND, i * dst->nb[2]);
3283
3284        GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, active_tensor.get(), select_export_transpose.get(), acl_dst.get(), 2);
3285    }
3286}
3287
3288/**
3289 * @brief Performs quantized matrix multiplication for Mixture of Experts (MoE)
3290 * models using the CANN backend.
3291 *
3292 * This function implements MUL_MAT_ID operation for quantized weight matrices
3293 * (Q4_0 and Q8_0 formats). It selects expert-specific weight matrices based on
3294 * the provided expert indices, and computes matrix multiplication using CANN's
3295 * WeightQuantBatchMatmulV2 operator.
3296 *
3297 * The function performs the following steps:
3298 * 1. Converts input/output tensors to F16 format if necessary
3299 * 2. Uses IndexSelect to extract expert-specific weights and scales based on indices
3300 * 3. Performs quantized matrix multiplication for each expert using WeightQuantBatchMatmulV2
3301 * 4. Converts output back to the target type if needed
3302 *
3303 * Tensor shapes:
3304 * - dst:  [M, K, N, 1] - output tensor
3305 * - src0: [D, M, A, 1] - quantized weight matrices (Q4_0 or Q8_0)
3306 * - src1: [D, B, N, 1] - input activations (B = K for per-expert input, or B = 1 for broadcast)
3307 * - ids:  [K, N] - expert indices for routing
3308 *
3309 * @param ctx The CANN backend context for operation execution.
3310 * @param dst The destination tensor where the multiplication result will be stored.
3311 *
3312 * @note Only Q4_0 and Q8_0 quantization formats are supported.
3313 * @note The function handles automatic type conversion to/from F16 as needed by the hardware.
3314 */
3315static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3316    // dst:  [M, K, N, 1]
3317    // src0: [D, M, A, 1] - quantized weights
3318    // src1: [D, B, N, 1] - input activations, B = K or B = 1
3319    // ids:  [K, N] - expert indices
3320    ggml_tensor * src0 = dst->src[0];
3321    ggml_tensor * src1 = dst->src[1];
3322    ggml_tensor * ids  = dst->src[2];
3323
3324    GGML_ASSERT(src0->ne[3] == 1);
3325    GGML_ASSERT(src1->ne[3] == 1);
3326    GGML_ASSERT(dst->ne[3] == 1);
3327    GGML_ASSERT(src1->ne[2] == ids->ne[1]);
3328
3329    const int64_t        n_batches        = ids->ne[1];
3330    const int64_t        n_select_experts = ids->ne[0];
3331    const enum ggml_type type             = src0->type;
3332
3333    const int32_t group_size = QK8_0;  // Both Q4_0 and Q8_0 use group size of 32
3334    GGML_ASSERT(group_size == QK4_0);
3335
3336    // Calculate element size for quantized weights
3337    const float weight_elem_size =
3338        (type == GGML_TYPE_Q4_0) ? 0.5f :
3339        (type == GGML_TYPE_Q8_0) ? 1.0f :
3340                                   (GGML_ABORT("MUL_MAT_ID only supports Q4_0 and Q8_0"), 0.0f);
3341
3342    // Calculate scale offset in memory
3343    const size_t weight_size     = src0->ne[0] * src0->ne[1] * src0->ne[2] * weight_elem_size;
3344    const size_t scale_elem_size = sizeof(uint16_t);
3345    char *       scale_data      = (char *) src0->data + weight_size;
3346
3347    // Allocate buffers for selected expert weights and scales
3348    const size_t         selected_weight_size = src0->ne[0] * src0->ne[1] * n_select_experts * weight_elem_size;
3349    ggml_cann_pool_alloc selected_weight_alloc(ctx.pool(), selected_weight_size);
3350    void *               selected_weight_buffer = selected_weight_alloc.get();
3351
3352    const size_t selected_scale_size = (src0->ne[0] / group_size) * src0->ne[1] * n_select_experts * scale_elem_size;
3353    ggml_cann_pool_alloc selected_scale_alloc(ctx.pool(), selected_scale_size);
3354    void *               selected_scale_buffer = selected_scale_alloc.get();
3355
3356    // Helper lambda to allocate and cast tensor to F16 if needed
3357    constexpr size_t f16_elem_size      = sizeof(uint16_t);
3358    auto             prepare_f16_buffer = [&](ggml_tensor * tensor, ggml_cann_pool_alloc & allocator,
3359                                  bool need_cast = false) -> void * {
3360        if (tensor->type == GGML_TYPE_F16) {
3361            return tensor->data;
3362        }
3363
3364        size_t total_size = f16_elem_size;
3365        for (int i = 0; i < GGML_MAX_DIMS; i++) {
3366            total_size *= tensor->ne[i];
3367        }
3368        void * buffer = allocator.alloc(total_size);
3369
3370        if (need_cast == false) {
3371            return buffer;
3372        }
3373
3374        int64_t ne[GGML_MAX_DIMS];
3375        size_t  nb[GGML_MAX_DIMS] = { f16_elem_size };
3376        for (int i = 0; i < GGML_MAX_DIMS; i++) {
3377            ne[i] = tensor->ne[i];
3378            if (i > 0) {
3379                nb[i] = nb[i - 1] * ne[i - 1];
3380            }
3381        }
3382
3383        acl_tensor_ptr src_tensor = ggml_cann_create_tensor(tensor);
3384        acl_tensor_ptr f16_tensor = ggml_cann_create_tensor(buffer, ACL_FLOAT16, f16_elem_size, ne, nb, GGML_MAX_DIMS);
3385        aclnn_cast(ctx, src_tensor.get(), f16_tensor.get(), ACL_FLOAT16);
3386
3387        return buffer;
3388    };
3389
3390    // Prepare input and output buffers
3391    ggml_cann_pool_alloc input_alloc(ctx.pool());
3392    void *               input_buffer = prepare_f16_buffer(src1, input_alloc, true);
3393
3394    ggml_cann_pool_alloc output_alloc(ctx.pool());
3395    void *               output_buffer = prepare_f16_buffer(dst, output_alloc, false);
3396
3397    // Process each batch
3398    for (int64_t batch_idx = 0; batch_idx < n_batches; batch_idx++) {
3399        // Create index tensor for current batch
3400        const size_t   index_offset  = batch_idx * ids->nb[1];
3401        acl_tensor_ptr batch_indices = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, index_offset);
3402
3403        // Select quantized weights using expert indices
3404        // Q4_0 stores 2 values per byte, Q8_0 stores 1 value per byte
3405        const int64_t weight_d         = (type == GGML_TYPE_Q4_0) ? src0->ne[0] / 2 : src0->ne[0];
3406        const int64_t weight_m         = src0->ne[1];
3407        const int64_t weight_n_experts = src0->ne[2];
3408
3409        int64_t weight_ne[3] = { weight_d, weight_m, weight_n_experts };
3410        size_t  weight_nb[3] = { sizeof(int8_t), weight_d * sizeof(int8_t), weight_d * weight_m * sizeof(int8_t) };
3411
3412        acl_tensor_ptr all_weights =
3413            ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb, 3);
3414
3415        int64_t selected_weight_ne[3] = { weight_d, weight_m, n_select_experts };
3416        size_t  selected_weight_nb[3] = { sizeof(int8_t), weight_d * sizeof(int8_t),
3417                                          weight_d * weight_m * sizeof(int8_t) };
3418
3419        acl_tensor_ptr selected_weights = ggml_cann_create_tensor(selected_weight_buffer, ACL_INT8, sizeof(int8_t),
3420                                                                  selected_weight_ne, selected_weight_nb, 3);
3421
3422        GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, all_weights.get(), 0, batch_indices.get(), selected_weights.get());
3423
3424        // Select scales using the same expert indices
3425        const int64_t scale_d     = src0->ne[0] / group_size;
3426        int64_t       scale_ne[3] = { scale_d, weight_m, weight_n_experts };
3427        size_t scale_nb[3] = { scale_elem_size, scale_d * scale_elem_size, scale_d * weight_m * scale_elem_size };
3428
3429        acl_tensor_ptr all_scales =
3430            ggml_cann_create_tensor(scale_data, ACL_FLOAT16, scale_elem_size, scale_ne, scale_nb, 3);
3431
3432        int64_t selected_scale_ne[3] = { scale_d, weight_m, n_select_experts };
3433        size_t  selected_scale_nb[3] = { scale_elem_size, scale_d * scale_elem_size,
3434                                         scale_d * weight_m * scale_elem_size };
3435
3436        acl_tensor_ptr selected_scales = ggml_cann_create_tensor(selected_scale_buffer, ACL_FLOAT16, scale_elem_size,
3437                                                                 selected_scale_ne, selected_scale_nb, 3);
3438
3439        GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, all_scales.get(), 0, batch_indices.get(), selected_scales.get());
3440
3441        // Process each expert for current batch
3442        // IndexSelect output layout: [D, M, K] in contiguous format
3443        // WeightQuantBatchMatmulV2 expects: [M, D] with row-major stride
3444        for (int64_t expert_idx = 0; expert_idx < n_select_experts; expert_idx++) {
3445            // Determine input offset: broadcast if src1->ne[1]==1, otherwise use per-expert input
3446            const size_t input_offset =
3447                (batch_idx * src1->ne[1] + (src1->ne[1] == 1 ? 0 : expert_idx)) * src1->ne[0] * f16_elem_size;
3448            const size_t output_offset = (batch_idx * dst->ne[1] + expert_idx) * dst->ne[0] * f16_elem_size;
3449
3450            // Create weight view for current expert: [D, M, K] -> [M, D]
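                // Note: weight_elem_size is fractional for Q4_0 (0.5 bytes per element), which is
                // why the view strides below are kept as floats rather than size_t.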
3451            int64_t      weight_view_ne[2]  = { weight_m, src0->ne[0] };
3452            float        weight_view_nb[2]  = { src0->ne[0] * weight_elem_size, weight_elem_size };
3453            const size_t weight_view_offset = expert_idx * selected_weight_nb[2];
3454
3455            acl_tensor_ptr weight_view =
3456                ggml_cann_create_tensor(selected_weight_buffer, ggml_cann_type_mapping(type), weight_elem_size,
3457                                        weight_view_ne, weight_view_nb, 2, ACL_FORMAT_ND, weight_view_offset);
3458
3459            // Create scale view for current expert: [D, M, K] -> [M, D]
3460            int64_t      scale_view_ne[2]  = { weight_m, scale_d };
3461            size_t       scale_view_nb[2]  = { selected_scale_nb[1], selected_scale_nb[0] };
3462            const size_t scale_view_offset = expert_idx * selected_scale_nb[2];
3463
3464            acl_tensor_ptr scale_view =
3465                ggml_cann_create_tensor(selected_scale_buffer, ACL_FLOAT16, scale_elem_size, scale_view_ne,
3466                                        scale_view_nb, 2, ACL_FORMAT_ND, scale_view_offset);
3467
3468            // Create input activation tensor [D, 1]
3469            int64_t input_ne[2] = { src1->ne[0], 1 };
3470            size_t  input_nb[2] = { f16_elem_size, src1->ne[0] * f16_elem_size };
3471
3472            acl_tensor_ptr input_tensor = ggml_cann_create_tensor(input_buffer, ACL_FLOAT16, f16_elem_size, input_ne,
3473                                                                  input_nb, 2, ACL_FORMAT_ND, input_offset);
3474
3475            // Create output tensor [M, 1]
3476            int64_t output_ne[2] = { dst->ne[0], 1 };
3477            size_t  output_nb[2] = { f16_elem_size, dst->ne[0] * f16_elem_size };
3478
3479            acl_tensor_ptr output_tensor = ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, f16_elem_size, output_ne,
3480                                                                   output_nb, 2, ACL_FORMAT_ND, output_offset);
3481
3482            // Perform quantized matrix multiplication
3483            GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, input_tensor.get(), weight_view.get(),
3484                                    scale_view.get(), nullptr, nullptr, nullptr, nullptr, group_size,
3485                                    output_tensor.get());
3486        }
3487    }
3488
3489    // Cast output back to original type if we used a temporary F16 buffer
3490    if (dst->type != GGML_TYPE_F16) {
3491        int64_t ne[GGML_MAX_DIMS];
3492        size_t  nb[GGML_MAX_DIMS] = { f16_elem_size };
3493        for (int i = 0; i < GGML_MAX_DIMS; i++) {
3494            ne[i] = dst->ne[i];
3495            if (i > 0) {
3496                nb[i] = nb[i - 1] * ne[i - 1];
3497            }
3498        }
3499
3500        acl_tensor_ptr f16_output =
3501            ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, f16_elem_size, ne, nb, GGML_MAX_DIMS);
3502        acl_tensor_ptr dst_tensor = ggml_cann_create_tensor(dst);
3503
3504        aclnn_cast(ctx, f16_output.get(), dst_tensor.get(), ggml_cann_type_mapping(dst->type));
3505    }
3506}
3507
3508void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3509    const enum ggml_type type = dst->src[0]->type;
3510    switch (type) {
3511        case GGML_TYPE_F32:
3512        case GGML_TYPE_F16:
3513            ggml_cann_mul_mat_id_fp(ctx, dst);
3514            break;
3515        case GGML_TYPE_Q4_0:
3516        case GGML_TYPE_Q8_0:
3517            ggml_cann_mul_mat_id_quant(ctx, dst);
3518            break;
3519        default:
3520            GGML_ABORT("Unsupported type for mul_mat_id");
3521            break;
3522    }
3523}
3524
3525void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3526    ggml_tensor * src0 = dst->src[0];  // q, fp32 | B, N, S, D (uncont) -> B, S, N, D (cont)
3527    ggml_tensor * src1 = dst->src[1];  // k, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
3528    ggml_tensor * src2 = dst->src[2];  // v, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
3529    ggml_tensor * src3 = dst->src[3];  // mask, fp16
3530
3531    // B, N, S, D (uncont) -> B, S, N, D (cont)
3532    int64_t src0_bsnd_ne[GGML_MAX_DIMS];
3533    memcpy(src0_bsnd_ne, src0->ne, GGML_MAX_DIMS * sizeof(int64_t));
3534    size_t src0_bsnd_nb[GGML_MAX_DIMS];
3535    memcpy(src0_bsnd_nb, src0->nb, GGML_MAX_DIMS * sizeof(size_t));
3536    int64_t src1_bsnd_ne[GGML_MAX_DIMS];
3537    memcpy(src1_bsnd_ne, src1->ne, GGML_MAX_DIMS * sizeof(int64_t));
3538    size_t src1_bsnd_nb[GGML_MAX_DIMS];
3539    memcpy(src1_bsnd_nb, src1->nb, GGML_MAX_DIMS * sizeof(size_t));
3540    int64_t src2_bsnd_ne[GGML_MAX_DIMS];
3541    memcpy(src2_bsnd_ne, src2->ne, GGML_MAX_DIMS * sizeof(int64_t));
3542    size_t src2_bsnd_nb[GGML_MAX_DIMS];
3543    memcpy(src2_bsnd_nb, src2->nb, GGML_MAX_DIMS * sizeof(size_t));
3544
3545    auto transpose12 = [](int64_t * ne, size_t * nb) {
3546        int64_t ne_tmp = ne[1];
3547        size_t  nb_tmp = nb[1];
3548        ne[1]          = ne[2];
3549        nb[1]          = nb[2];
3550        ne[2]          = ne_tmp;
3551        nb[2]          = nb_tmp;
3552    };
3553
3554    transpose12(src0_bsnd_ne, src0_bsnd_nb);
3555    transpose12(src1_bsnd_ne, src1_bsnd_nb);
3556    transpose12(src2_bsnd_ne, src2_bsnd_nb);
3557
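        // op_params layout for FLASH_ATTN_EXT: [0] = scale, [1] = max_bias (ALiBi), [2] = logit_softcap.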
3558    float maxBias      = 0.0f;
3559    float scaleValue   = 1.0f;
3560    float logitSoftcap = 0.0f;
3561    memcpy(&scaleValue, (float *) dst->op_params + 0, sizeof(float));
3562    memcpy(&maxBias, (float *) dst->op_params + 1, sizeof(float));
3563    memcpy(&logitSoftcap, (float *) dst->op_params + 2, sizeof(float));
3564
3565    if (logitSoftcap == 0.0f) {
3566        size_t faElemSize = sizeof(uint16_t);
3567        auto   faDataType = ACL_FLOAT16;  //ACL_BF16;
3568
3569        acl_tensor_ptr acl_q_tensor = nullptr;
3570        acl_tensor_ptr acl_k_tensor = nullptr;
3571        acl_tensor_ptr acl_v_tensor = nullptr;
3572
3573        // Step 1: cast the src0 (Query) to fp16 if needed
3574        ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
3575        void *               src0_f16_buffer = nullptr;
3576
3577        if (ggml_cann_type_mapping(src0->type) != faDataType) {
3578            acl_tensor_ptr acl_src0_f32_tensor =
3579                ggml_cann_create_tensor(src0, src0_bsnd_ne, src0_bsnd_nb, GGML_MAX_DIMS);
3580            src0_f16_buffer = src0_f16_allocator.alloc(ggml_nelements(src0) * faElemSize);
3581
3582            int64_t * src0_f16_ne = src0_bsnd_ne;
3583            size_t    src0_f16_nb[GGML_MAX_DIMS];
3584            src0_f16_nb[0] = sizeof(uint16_t);
3585            for (int i = 1; i < GGML_MAX_DIMS; ++i) {
3586                src0_f16_nb[i] = src0_f16_nb[i - 1] * src0_f16_ne[i - 1];
3587            }
3588
3589            acl_q_tensor = ggml_cann_create_tensor(src0_f16_buffer, faDataType, faElemSize, src0_f16_ne, src0_f16_nb,
3590                                                   GGML_MAX_DIMS);
3591            aclnn_cast(ctx, acl_src0_f32_tensor.get(), acl_q_tensor.get(), faDataType);
3592        } else {
3593            acl_q_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne, src0_bsnd_nb, GGML_MAX_DIMS);
3594        }
3595
3596        // Step 2: create the acl tensors for src1 (Key), src2 (Value),
3597        //         and the direct output from FusedInferAttention
3598
3599        acl_k_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne, src1_bsnd_nb, GGML_MAX_DIMS);
3600        acl_v_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne, src2_bsnd_nb, GGML_MAX_DIMS);
3601
3602        // Step 3: create the PSEShift tensor if needed
3603        //         this tensor corresponds to the (f16) attention mask in llama.cpp
3604        acl_tensor_ptr       bcast_pse_tensor;
3605        ggml_cann_pool_alloc bcast_pse_allocator(ctx.pool());
3606        if (src3 != nullptr) {
3607            // Construct the truncated pse tensor (common for prefill/decode)
3608            int64_t trunc_pse_ne[GGML_MAX_DIMS] = {
3609                src3->ne[0],  // D
3610                src0->ne[1],  // S (number of Q tokens)
3611                src3->ne[2],  // mask N
3612                src3->ne[3]   // B
3613            };
3614            size_t * trunc_pse_nb = src3->nb;
3615
3616            acl_tensor_ptr acl_mask_f16_trunc_tensor = ggml_cann_create_tensor(
3617                src3->data, ACL_FLOAT16, sizeof(uint16_t), trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS);
3618
3619            int64_t bcast_pse_ne[GGML_MAX_DIMS];
3620            size_t  bcast_pse_nb[GGML_MAX_DIMS];
3621            bcast_pse_ne[0] = src3->ne[0];  // D
3622            bcast_pse_ne[1] = src0->ne[1];  // S
3623            bcast_pse_ne[2] = src0->ne[2];  // N (num_heads)
3624            bcast_pse_ne[3] = src3->ne[3];  // B
3625            if (maxBias == 0.0f) {
3626                // When maxBias == 0.0f, use a zero stride (nb = 0) on the head dimension to avoid an explicit repeat (e.g. Qwen2)
3627                // Construct the bcast tensor (simulate repeat on the head dimension using stride=0)
3628                bcast_pse_nb[0] = sizeof(uint16_t);
3629                bcast_pse_nb[1] = bcast_pse_nb[0] * bcast_pse_ne[0];
3630                bcast_pse_nb[2] = 0;  // <---- the head dimension shares the same data
3631                bcast_pse_nb[3] = src3->nb[3];
3632
3633                bcast_pse_tensor = ggml_cann_create_tensor(src3->data, ACL_FLOAT16, sizeof(uint16_t), bcast_pse_ne,
3634                                                           bcast_pse_nb, GGML_MAX_DIMS);
3635
3636            } else {
3637                bcast_pse_nb[0] = sizeof(uint16_t);
3638                for (int i = 1; i < GGML_MAX_DIMS; i++) {
3639                    bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1];
3640                }
3641
3642                void * bcast_pse_buffer =
3643                    bcast_pse_allocator.alloc(ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t));
3644
3645                bcast_pse_tensor = ggml_cann_create_tensor(bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
3646                                                           bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);
3647
3648                int64_t repeats[] = { 1, src0->ne[2], 1, 1 };
3649                aclnn_repeat(ctx, acl_mask_f16_trunc_tensor.get(), bcast_pse_tensor.get(), repeats);
3650
3651                // alibi
3652                // Compute the slope if needed. Derived from ggml_cann_softmax().
3653                const int64_t        n_heads = src0->ne[2];
3654                ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(uint16_t));
3655                void *               slope_buffer = slope_allocator.get();
3656                aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias, GGML_TYPE_F16);
3657
3658                int64_t slope_ne[] = { 1, 1, n_heads, 1 };
3659                size_t  slope_nb[GGML_MAX_DIMS];
3660                slope_nb[0] = sizeof(uint16_t);
3661                for (int i = 1; i < GGML_MAX_DIMS; i++) {
3662                    slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
3663                }
3664
3665                acl_tensor_ptr slope_tensor = ggml_cann_create_tensor(slope_buffer, ACL_FLOAT16, sizeof(uint16_t),
3666                                                                      slope_ne, slope_nb, GGML_MAX_DIMS);
3667                GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor.get(), slope_tensor.get());
3668            }
3669        }
3670
3671        // Step 4: set the inputs for FusedInferAttention.
3672        acl_tensor_list_ptr acl_k_tensor_list = ggml_cann_create_tensor_list(acl_k_tensor);
3673        acl_tensor_list_ptr acl_v_tensor_list = ggml_cann_create_tensor_list(acl_v_tensor);
3674
3675        int64_t numHeads           = src0->ne[2];  // N
3676        int64_t numKeyValueHeads   = src1->ne[2];
3677        // double  scaleValue = 1 / sqrt(src0->ne[0]); // 1/sqrt(d)
3678        int64_t preTokens          = 65535;
3679        int64_t nextTokens         = 65535;
3680        char    layout[5]          = { 'B', 'S', 'N', 'D', 0 };
3681        int64_t sparseMode         = 0;
3682        int64_t innerPrecise       = (src0->ne[1] == 1) ? 0 : 2;
3683        int64_t blockSize          = 0;
3684        int64_t antiquantMode      = 0;
3685        bool    softmaxLseFlag     = false;
3686        int64_t keyAntiquantMode   = 0;
3687        int64_t valueAntiquantMode = 0;
3688
3689        GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
3690        acl_tensor_ptr       fa_dst_tensor;
3692        ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
3693        if (dst->type == GGML_TYPE_F32) {
3694            void * out_f16_buffer = out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
3695
3696            int64_t * out_f16_ne = src0_bsnd_ne;
3697            size_t    out_f16_nb[GGML_MAX_DIMS];
3698            out_f16_nb[0] = faElemSize;
3699            for (int i = 1; i < GGML_MAX_DIMS; ++i) {
3700                out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
3701            }
3702
3703            fa_dst_tensor =
3704                ggml_cann_create_tensor(out_f16_buffer, faDataType, faElemSize, out_f16_ne, out_f16_nb, GGML_MAX_DIMS);
3705        } else {
3706            fa_dst_tensor = ggml_cann_create_tensor(dst);
3707        }
3708
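            // Step 5: launch FusedInferAttentionScoreV2; unused optional inputs are passed as nullptr.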
3709        GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2, acl_q_tensor.get(), acl_k_tensor_list.get(),
3710                                acl_v_tensor_list.get(),               // q, k, v
3711                                bcast_pse_tensor.get(), nullptr,       // pse, mask
3712                                nullptr, nullptr,                      // actSeqLen, actSeqLenkv
3713                                nullptr, nullptr,                      // deqScale1, quantScale1
3714                                nullptr, nullptr, nullptr,             // deqScale2, quantScale2, quantOffset2
3715                                nullptr, nullptr,                      // antiquantScale, antiquantOffset
3716                                nullptr,                               // blockTable
3717                                nullptr, nullptr,                      // qPadSize, kvPadSize
3718                                nullptr, nullptr,                      // kAntiquantScale, kAntiQuantOffset
3719                                nullptr, nullptr,                      // vAntiquantScale, vAntiQuantOffset
3720                                nullptr, nullptr, nullptr,             // kSharedPrefix, vSharedPrefix, actSharedLen
3721                                numHeads, scaleValue,                  // heads, scaleValue
3722                                preTokens, nextTokens,                 // preTokens, nextTokens
3723                                layout,                                // inputLayout
3724                                numKeyValueHeads,                      // numKVHeads
3725                                sparseMode, innerPrecise,              // sparseMode, innerPrecise
3726                                blockSize, antiquantMode,              // blockSize, antiquantMode
3727                                softmaxLseFlag,                        // softmaxLseFlag
3728                                keyAntiquantMode, valueAntiquantMode,  // keyAntiqMode, valueAntiqMode
3729                                fa_dst_tensor.get(),                   // attentionOut
3730                                nullptr                                // softmaxLse
3731        );
3732
3733        if (dst->type == GGML_TYPE_F32) {
3734            // Step 6: post-processing, cast the fp16 attention output back to f32
3735            acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
3736            aclnn_cast(ctx, fa_dst_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
3737        }
3738    } else {
3739        GGML_ABORT("flash_attn_ext with logit_softcap != 0.0f is not implemented");
3740    }
3741}
3742
3743static void ggml_cann_out_prod_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3744    ggml_tensor * src0 = dst->src[0];  // weight
3745    ggml_tensor * src1 = dst->src[1];  // input
3746    GGML_TENSOR_BINARY_OP_LOCALS
3747
3748    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
3749    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
3750
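    // dst is computed as a sum of rank-1 updates: for every column i1 the outer product (Ger)
    // of src1 and src0 is formed in a scratch buffer and accumulated into the matching dst
    // slice; dps2/dps3 implement the broadcast of src0 over the batch dimensions of src1.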
3751    const int64_t dps2 = ne2 / ne02;
3752    const int64_t dps3 = ne3 / ne03;
3753    for (int64_t i3 = 0; i3 < ne3; i3++) {
3754        for (int64_t i2 = 0; i2 < ne2; i2++) {
3755            const int64_t i02 = i2 / dps2;
3756            const int64_t i03 = i3 / dps3;
3757
3758            const int64_t  i12 = i2;
3759            const int64_t  i13 = i3;
3760            acl_tensor_ptr accumulator =
3761                ggml_cann_create_tensor((char *) dst->data + i2 * nb2 + i3 * nb3, ggml_cann_type_mapping(dst->type),
3762                                        ggml_type_size(dst->type), dst->ne, dst->nb, 2);
3763
3764            // The outer product needs to be accumulated in this dimension.
3765            for (int64_t i1 = 0; i1 < ne11; i1++) {
3766                acl_tensor_ptr acl_input = ggml_cann_create_tensor(
3767                    (char *) src1->data + i1 * nb11 + i12 * nb12 + i13 * nb13, ggml_cann_type_mapping(src0->type),
3768                    ggml_type_size(src0->type), src1->ne, src1->nb, 1);
3769
3770                acl_tensor_ptr acl_weight = ggml_cann_create_tensor(
3771                    (char *) src0->data + i1 * nb01 + i02 * nb02 + i03 * nb03, ggml_cann_type_mapping(src0->type),
3772                    ggml_type_size(src0->type), src0->ne, src0->nb, 1);
3773
3774                ggml_cann_pool_alloc output_allocator(ctx.pool());
3775                void *               output_buffer = output_allocator.alloc(ggml_nbytes(dst));
3776                acl_tensor_ptr       acl_out = ggml_cann_create_tensor(output_buffer, ggml_cann_type_mapping(dst->type),
3777                                                                       ggml_type_size(dst->type), dst->ne, dst->nb, 2);
3778
3779                GGML_CANN_CALL_ACLNN_OP(ctx, Ger, acl_input.get(), acl_weight.get(), acl_out.get());
3780                float          alpha_value = 1.0f;
3781                acl_scalar_ptr alpha       = ggml_cann_create_scalar(&alpha_value, ACL_FLOAT);
3782                GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, accumulator.get(), acl_out.get(), alpha.get());
3783            }
3784        }
3785    }
3786}
3787
3788void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3789    ggml_tensor * src0 = dst->src[0];
3790
3791    const enum ggml_type type = src0->type;
3792
3793    switch (type) {
3794        case GGML_TYPE_F32:
3795        case GGML_TYPE_F16:
3796            ggml_cann_out_prod_fp(ctx, dst);
3797            break;
3798        default:
3799            GGML_ABORT("Unsupported type for GGML_OP_OUT_PROD");
3800            break;
3801    }
3802}
3803
3804void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3805    ggml_tensor * src0 = dst->src[0];  // conv_x
3806    ggml_tensor * src1 = dst->src[1];  // conv1d.weight
3807
3808    // This op is currently defined only for F32 in ggml_cpu
3809    GGML_ASSERT(src0->type == GGML_TYPE_F32);
3810    GGML_ASSERT(src1->type == GGML_TYPE_F32);
3811    GGML_ASSERT(dst->type == GGML_TYPE_F32);
3812
3813    // Shapes follow ggml_compute_forward_ssm_conv_f32
3814    const int64_t nc  = src1->ne[0];   // d_conv
3815    const int64_t ncs = src0->ne[0];   // d_conv - 1 + n_t
3816    const int64_t nr  = src0->ne[1];   // d_inner
3817    const int64_t n_s = src0->ne[2];   // n_seqs
3818
3819    const int64_t n_t = dst->ne[1];    // tokens per sequence
3820
3821    GGML_ASSERT(dst->ne[0] == nr);     // dst: {d_inner, n_t, n_s}
3822    GGML_ASSERT(src1->ne[1] == nr);    // weight: {d_conv, d_inner}
3823    GGML_ASSERT(ncs == nc - 1 + n_t);  // conv_x: {d_conv - 1 + n_t, d_inner, n_s}
3824    GGML_ASSERT(src0->nb[0] == sizeof(float));
3825    GGML_ASSERT(src1->nb[0] == sizeof(float));
3826
3827    // --- Build CANN tensors ---
3828
3829    // 1) Input: conv_x as NCL
3830    //
3831    // src0->ne = { ncs, nr, n_s, 1 }  // {L_in, C, N}
3832    // Passing ACL_FORMAT_NCL here means:
3833    //   reversed dims -> [N, C, L_in] = [n_s, nr, ncs]
3834    acl_tensor_ptr acl_x = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
3835
3836    // 2) Weights: depthwise conv kernel, view src1 as {K, 1, C}
3837    //
3838    // src1 original:   ne = { nc, nr, 1, 1 }  // [K, C, 1, 1]
3839    // we want a view:  ne_w = { nc, 1, nr }   // [K, 1, C]
3840    // so that reversed dims -> [C, 1, K] which matches
3841    //   [out_channels, in_channels/groups, kernel_size]
3842    int64_t w_ne[GGML_MAX_DIMS] = { nc, 1, nr, 1 };  // [K, 1 input ch. per group, C groups]
3843    // Layout: src1 data is [K, C] with
3844    //   offset(k, c) = k*nb0 + c*nb1
3845    // We want offset_w(k, 0, c) = k*nb0 + c*nb1,
3846    // so we can reuse nb0 and nb1, and set nb2 = nb1.
3847    size_t  w_nb[GGML_MAX_DIMS] = { src1->nb[0], src1->nb[1], src1->nb[1], src1->nb[3] };  // reuse nb0/nb1, with nb2 = nb1
3848
3849    acl_tensor_ptr acl_w = ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type),
3850                                                   ggml_type_size(src1->type), w_ne, w_nb, 3, ACL_FORMAT_NCL);
3851
3852    // 3) Output: dst is { d_inner, n_t, n_s } (CLN)
3853    //
3854    // We need an NCL view of the same buffer:
3855    //   desired NCL logical shape: { L_out = n_t, C = nr, N = n_s }
3856    //
3857    // Original CLN layout:
3858    //   dst->ne = { nr, n_t, n_s }
3859    //   dst->nb[0] = sizeof(float)
3860    //   dst->nb[1] = nr * sizeof(float)
3861    //   dst->nb[2] = nr * n_t * sizeof(float)
3862    //
3863    // We want offset_new(L, C, N) = offset_orig(C, L, N).
3864    // Choose:
3865    //   nb_y[0] = nr * sizeof(float);           // step in L
3866    //   nb_y[1] = sizeof(float);                // step in C
3867    //   nb_y[2] = nr * n_t * sizeof(float);     // step in N
3868    int64_t y_ne[GGML_MAX_DIMS] = { n_t, nr, n_s, 1 };  // [L_out, C, N]
3869    size_t  y_nb[GGML_MAX_DIMS] = { dst->ne[0] * sizeof(float), sizeof(float), dst->ne[0] * dst->ne[1] * sizeof(float),
3870                                    dst->nb[3] };       // [nr, 1, nr * n_t]
3871
3872    acl_tensor_ptr acl_y = ggml_cann_create_tensor(dst->data, ggml_cann_type_mapping(dst->type),
3873                                                   ggml_type_size(dst->type), y_ne, y_nb, 3, ACL_FORMAT_NCL);
3874
3875    // --- Conv1d parameters: depthwise, stride 1, no padding ("valid") ---
3876    int64_t strideVal[1]   = { 1 };
3877    int64_t paddingVal[1]  = { 0 };
3878    int64_t dilationVal[1] = { 1 };
3879
3880    acl_int_array_ptr stride   = ggml_cann_create_int_array(strideVal, 1);
3881    acl_int_array_ptr padding  = ggml_cann_create_int_array(paddingVal, 1);
3882    acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
3883
3884    const bool    transposed   = false;
3885    const int64_t groups       = nr;  // depthwise: one group per inner dim
3886    int8_t        cubeMathType = 0;
3887
3888#ifdef ASCEND_310P
3889    cubeMathType = 1;
3890#endif
3891
3892    GGML_CANN_CALL_ACLNN_OP(ctx, Convolution,
3893                            acl_x.get(),    // input:  N, C, L_in = ncs
3894                            acl_w.get(),    // weight: [C, 1, K] with groups=nr
3895                            nullptr,        // bias
3896                            stride.get(), padding.get(), dilation.get(), transposed,
3897                            padding.get(),  // output padding (unused for non-transposed)
3898                            groups, acl_y.get(), cubeMathType);
3899}
3900
3901void ggml_cann_op_add_rms_norm_fused(ggml_backend_cann_context & ctx,
3902                                     ggml_tensor *               add_node,
3903                                     ggml_tensor *               rms_norm_node) {
3904    // Get the two input tensors for ADD operation
3905    ggml_tensor * x1 = add_node->src[0];
3906    ggml_tensor * x2 = add_node->src[1];
3907
3908    // Create ACL tensors for the two ADD inputs
3909    acl_tensor_ptr acl_x1 = ggml_cann_create_tensor(x1);
3910    acl_tensor_ptr acl_x2 = ggml_cann_create_tensor(x2);
3911
3912    // Get epsilon parameter from rms_norm_tensor
3913    float eps;
3914    memcpy(&eps, rms_norm_node->op_params, sizeof(float));
3915
3916    // Build gamma tensor (RMS normalization scaling factor)
3917    // Gamma should match the normalized dimensions (last dimension of x1)
3918    size_t acl_gamma_nb[GGML_MAX_DIMS];
3919    acl_gamma_nb[0] = ggml_type_size(rms_norm_node->type);
3920    for (int i = 1; i < GGML_MAX_DIMS; i++) {
3921        acl_gamma_nb[i] = acl_gamma_nb[i - 1] * x1->ne[i - 1];
3922    }
3923    acl_tensor_ptr acl_gamma =
3924        get_cache_acl_tensor(ctx, &ctx.rms_norm_one_tensor_cache.cache, ctx.rms_norm_one_tensor_cache.size, x1->ne,
3925                             acl_gamma_nb, rms_norm_node->type,
3926                             1,    // dims - only the last dimension
3927                             1.0f  // value
3928        );
3929
3930    // Build rstdOut tensor (output for normalized standard deviation)
3931    // Shape should be the dimensions that are NOT normalized
3932    int64_t acl_rstd_ne[] = { 1, x1->ne[1], x1->ne[2], x1->ne[3] };
3933    size_t  acl_rstd_nb[GGML_MAX_DIMS];
3934    acl_rstd_nb[0] = sizeof(float);
3935    for (int i = 1; i < GGML_MAX_DIMS; i++) {
3936        acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
3937    }
3938    acl_tensor_ptr acl_rstd =
3939        get_cache_acl_tensor(ctx, &ctx.rms_norm_zero_tensor_cache.cache, ctx.rms_norm_zero_tensor_cache.size,
3940                             acl_rstd_ne, acl_rstd_nb, GGML_TYPE_F32, GGML_MAX_DIMS,
3941                             0.0f  // value
3942        );
3943
3944    acl_tensor_ptr acl_xout = ggml_cann_create_tensor(add_node);
3945
3946    // Create yOut tensor (final output after RMS normalization)
3947    acl_tensor_ptr acl_yout = ggml_cann_create_tensor(rms_norm_node);
3948
3949    // Call fused ADD + RMS_NORM operator
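    // AddRmsNorm produces both results in one pass: xOut (the ADD result, written to add_node)
    // and yOut (the RMS-normalized xOut scaled by gamma, written to rms_norm_node); rstd is a
    // by-product required by the operator signature.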
3950    GGML_CANN_CALL_ACLNN_OP(ctx, AddRmsNorm, acl_x1.get(), acl_x2.get(), acl_gamma.get(),
3951                            eps,  // double type
3952                            acl_yout.get(), acl_rstd.get(), acl_xout.get());
3953}
3954
3955void ggml_cann_gated_linear_attn(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3956    ggml_tensor * k = dst->src[0];
3957    ggml_tensor * v = dst->src[1];
3958    ggml_tensor * q = dst->src[2];
3959    ggml_tensor * g = dst->src[3];
3960    ggml_tensor * s = dst->src[4];
3961
3962    int64_t B = s->ne[1];    // number of sequences in the batch
3963    int64_t T = k->ne[2];    // total tokens across the batch
3964    int64_t H = k->ne[1];    // number of heads
3965    int64_t C = dst->ne[0];  // hidden size (H * D)
3966    int64_t D = C / H;       // head dimension
3967    int64_t L = T / B;       // tokens per sequence
3968
3969    int64_t ne_qkg[2] = { 1, D };
3970    int64_t ne_s[2]   = { D, D };
3971    int64_t ne_st[2]  = { ne_s[1], ne_s[0] };
3972    int64_t ne_vo[2]  = { D, 1 };
3973    int64_t ne_q[1]   = { D };
3974    size_t  nb_base   = ggml_type_size(k->type);
3975    size_t  nb_qkg[2] = { nb_base, nb_base };
3976    size_t  nb_s[2]   = { nb_base, D * nb_base };
3977    size_t  nb_st[2]  = { nb_s[1], nb_s[0] };
3978    size_t  nb_vo[2]  = { nb_base, D * nb_base };
3979    size_t  nb_q[1]   = { nb_base };
3980
3981    const float scale = ggml_get_op_params_f32(dst, 0);
3982
3983    acl_tensor_ptr acl_s     = ggml_cann_create_tensor(s, s->ne, s->nb, 2, ACL_FORMAT_ND);
3984    acl_tensor_ptr new_state = ggml_cann_create_tensor(dst, s->ne, s->nb, 2, ACL_FORMAT_ND, (B * L * H * D) * nb_base);
3985    cann_copy(ctx, acl_s.get(), new_state.get());
3986
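    // The running state lives in the tail of dst (after the B*L*H*D output elements) and is
    // updated in place; for every (batch, head) the recurrence below is applied token by token.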
3987    for (int64_t b = 0; b < B; b++) {
3988        for (int64_t h = 0; h < H; h++) {
3989            size_t         s_offset = (b * (H * D * D) + h * (D * D)) * nb_base;
3990            // D * D
3991            acl_tensor_ptr acl_s_new =
3992                ggml_cann_create_tensor(dst, ne_s, nb_s, 2, ACL_FORMAT_ND, (B * L * H * D) * nb_base + s_offset);
3993            acl_tensor_ptr acl_s_new_t =
3994                ggml_cann_create_tensor(dst, ne_st, nb_st, 2, ACL_FORMAT_ND, (B * L * H * D) * nb_base + s_offset);
3995            for (int64_t l = 0; l < L; l++) {
3996                size_t               qkvgo_offset = (b * (L * H * D) + l * (H * D) + h * (D)) * nb_base;
3997                // D * 1
3998                acl_tensor_ptr       acl_k = ggml_cann_create_tensor(k, ne_qkg, nb_qkg, 2, ACL_FORMAT_ND, qkvgo_offset);
3999                acl_tensor_ptr       acl_g = ggml_cann_create_tensor(g, ne_qkg, nb_qkg, 2, ACL_FORMAT_ND, qkvgo_offset);
4000                // D
4001                acl_tensor_ptr       acl_q = ggml_cann_create_tensor(q, ne_q, nb_q, 1, ACL_FORMAT_ND, qkvgo_offset);
4002                // 1 * D
4003                acl_tensor_ptr       acl_v = ggml_cann_create_tensor(v, ne_vo, nb_vo, 2, ACL_FORMAT_ND, qkvgo_offset);
4004                // D
4005                acl_tensor_ptr       acl_o = ggml_cann_create_tensor(dst, ne_q, nb_q, 1, ACL_FORMAT_ND, qkvgo_offset);
4006                // k ⊗ v
4007                size_t               buf_size = D * D * nb_base;
4008                ggml_cann_pool_alloc buffer_allocator(ctx.pool(), buf_size);
4009                acl_tensor_ptr       tmp_tensor = ggml_cann_create_tensor(
4010                    buffer_allocator.get(), ggml_cann_type_mapping(k->type), nb_base, ne_s, nb_s, 2);
4011                aclnn_mul(ctx, acl_k.get(), acl_v.get(), tmp_tensor.get());
4012                // s_new = g ⊗ s_old + k ⊗ v
4013                aclnn_mul(ctx, acl_s_new.get(), acl_g.get(), nullptr);
4014                aclnn_add(ctx, acl_s_new.get(), tmp_tensor.get(), nullptr);
4015                // compute output
4016                GGML_CANN_CALL_ACLNN_OP(ctx, Mv, acl_s_new_t.get(), acl_q.get(), acl_o.get(), 1);
4017                aclnn_muls(ctx, acl_o.get(), scale, nullptr, true);
4018            }
4019        }
4020    }
4021}