1/**
   2 * Copyright (c) 2023-2026 The ggml authors
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a copy
   5 * of this software and associated documentation files (the "Software"), to
   6 * deal in the Software without restriction, including without limitation the
   7 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
   8 * sell copies of the Software, and to permit persons to whom the Software is
   9 * furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice shall be included in
  12 * all copies or substantial portions of the Software.
  13 *
  14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  20 * IN THE SOFTWARE.
  21 */
  22
  23#ifndef CANN_ACLNN_OPS
  24#define CANN_ACLNN_OPS
  25
  26#include "acl_tensor.h"
  27#include "common.h"
  28
  29#include <aclnnop/aclnn_abs.h>
  30#include <aclnnop/aclnn_arange.h>
  31#include <aclnnop/aclnn_argsort.h>
  32#include <aclnnop/aclnn_cat.h>
  33#include <aclnnop/aclnn_clamp.h>
  34#include <aclnnop/aclnn_cos.h>
  35#include <aclnnop/aclnn_exp.h>
  36#include <aclnnop/aclnn_gelu.h>
  37#include <aclnnop/aclnn_gelu_v2.h>
  38#include <aclnnop/aclnn_hardsigmoid.h>
  39#include <aclnnop/aclnn_hardswish.h>
  40#include <aclnnop/aclnn_leaky_relu.h>
  41#include <aclnnop/aclnn_log.h>
  42#include <aclnnop/aclnn_logsoftmax.h>
  43#include <aclnnop/aclnn_neg.h>
  44#include <aclnnop/aclnn_norm.h>
  45#include <aclnnop/aclnn_relu.h>
  46#include <aclnnop/aclnn_sigmoid.h>
  47#include <aclnnop/aclnn_sign.h>
  48#include <aclnnop/aclnn_silu.h>
  49#include <aclnnop/aclnn_sin.h>
  50#include <aclnnop/aclnn_slice.h>
  51#include <aclnnop/aclnn_sqrt.h>
  52#include <aclnnop/aclnn_tanh.h>
  53
  54#include <functional>
  55#include <unordered_set>
  56
  57/**
  58 * @brief   Repeats a ggml tensor along each dimension to match the dimensions
  59 *          of another tensor.
  60 *
  61 * @details This function repeats the elements of a source ggml tensor along
  62 *          each dimension to create a destination tensor with the specified
  63 *          dimensions. The operation is performed using the ACL backend and
  64 *          executed asynchronously on the device.
  65 *
  66 * @param   ctx The CANN context used for operations.
  67 * @param   dst The ggml tensor representing the destination, which op is
  68 *              GGML_OP_REPEAT and specifies the desired dimensions.
  69 */
  70void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
  71
  72/**
  73 * @brief   Applies the Leaky ReLU activation function to a tensor using the CANN
  74 *          backend.
  75 *
  76 * @details This function computes the Leaky ReLU activation for each element of
  77 *          the input tensor. The Leaky ReLU function allows a small gradient
  78 *          when the unit is not active (i.e., when the input is negative). The
  79 *          Leaky ReLU function is defined as:
  80 *          \f[
  81 *              \text{dst} = \max(0, src) + \text{negativeSlope} \cdot \min(0,
  82 *               src)
  83 *          \f]
  84 *          `negativeSlope` is in dst->params.
  85 *
  86 * @param ctx The CANN context used for operations.
  87 * @param dst The destination tensor where the result of the Leaky ReLU
  88 *            activation is stored, which op is `GGML_OP_LEAKY_RELU`
  89 */
  90void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
  91
  92/**
  93 * @brief    Concatenates multiple tensors along a specified dimension using the
  94 *           CANN backend.
  95 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the result of the concatenation is
 *            stored. dst->op is `GGML_OP_CONCAT`; the tensors to concatenate
 *            are taken from dst->src, and the dimension used for concatenation
 *            is taken from dst->op_params.
 *
 * @attention Two source tensors are concatenated, and the dimension used for
 *            concatenation defaults to 1.
 104 */
 105void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 106
 107/**
 108 * @brief   Generates a sequence of evenly spaced values within a specified
 109 *          interval for a ggml tensor using the CANN backend.
 110 *
 * @details This function creates a sequence of numbers over a specified
 *          interval, starting from `start`, ending before `stop`, and
 113 *          incrementing by `step`. The sequence is stored in the destination
 114 *          tensor `dst`.
 115 *
 116 * @param ctx The CANN context used for operations.
 117 * @param dst The destination tensor where the generated sequence will be stored.
 *            `start`, `stop` and `step` are in dst->op_params and dst->op is
 119 *            `GGML_OP_ARANGE`.
 120 */
 121void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 122
 123/**
 124 * @brief   Applies a clamp operation to the elements of a ggml tensor using the
 125 *          CANN backend.
 126 *
 127 * @details This function clamps the elements of the input tensor `src` to a
 128 *          specified range defined by `min` and `max` values. The result is
 129 *          stored in the destination tensor `dst`. The operation is defined as:
 130 *          \f[
 131 *              y = \max(\min(x, max\_value), min\_value)
 132 *           \f]
 133 *          where `x` is an element of the input tensor, and `y` is the
 134 *          corresponding element in the output tensor.
 135 * @param ctx The CANN context used for operations.
 136 * @param dst The destination tensor where the clamped values will be stored.
 137 *            dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params.
 138 */
 139void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 140
 141/**
 142 * @brief   Scales the elements of a ggml tensor by a constant factor using the
 143 *          CANN backend.
 144 *
 145 * @details This function multiplies each element of the input tensor `src` by
 146 *          a scaling factor `scale`, storing the result in the destination
 147 *          tensor `dst`. The operation is defined as:
 148 *          \f[
 149 *             dst = src \times scale
 150 *          \f]
 151 *
 152 * @param ctx The CANN context used for operations.
 153 * @param dst The destination tensor where the scaled values will be stored.
 154 *            dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params.
 155 */
 156void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 157
 158/**
 159 * @brief   Sorts the elements of a ggml tensor and returns the indices that
 160 *          would sort the tensor using the CANN backend.
 161 *
 162 * @details This function performs an argsort operation on the input tensor
 163 *          `src`. It sorts the elements of `src` in either ascending or
 164 *          descending order, depending on the `GGML_SORT_ORDER_DESC`,
 165 *          and returns the indices that would sort the original tensor.
 166 *
 167 * @param ctx The CANN context used for operations.
 168 * @param dst The destination tensor where the sorted indices will be stored.
 169 *            dst->op is `GGML_OP_ARGSORT`.
 170 */
 171void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 172
 173/**
 174 * @brief   Computes the Layer Normalization for a ggml tensor using the CANN
 175 *          backend.
 176 *
 177 * @details This function applies the Layer Normalization operation on the
 178 *          input tensor `src` and stores the result in the destination tensor
 179 *          `dst`. Layer Normalization normalizes the features at each sample in
 180 *          a mini-batch independently. It is commonly used in neural networks
 181 *          to normalize the activations of a layer by adjusting and scaling
 182 *          the outputs.
 183 *          The operation is defined as:
 184 *          \f[
 185 *              \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
 186 *          \f]
 *          `Var` defaults to dst->ne[0]. `eps` is in dst->params.
 188 *
 189 * @param ctx The CANN context used for operations.
 190 * @param dst The destination tensor where the normalized values will be stored.
 191 * @attention `Var` defaults to dst->ne[0].
 192 */
 193void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 194
 195/**
 196 * @brief   Computes the L2 Normalization for a ggml tensor using the CANN
 197 *          backend.
 198 *
 199 * @details This function applies the L2 Normalization operation on the
 200 *          input tensor `src` and stores the result in the destination tensor
 201 *          `dst`. L2 Normalization scales the input tensor such that the
 202 *          L2 norm along the specified dimension equals 1. This operation
 203 *          is commonly used in neural networks for feature normalization
 204 *          and vector scaling.
 205 *          The operation is defined as:
 206 *          \f[
 207 *              \text{out} = \frac{x}{\sqrt{\sum{x^2}}}
 208 *          \f]
 209 *          The normalization is performed along the last dimension by default.
 210 *
 211 * @param ctx The CANN context used for operations.
 212 * @param dst The destination tensor where the normalized values will be stored.
 213 * @attention The normalization is performed along the last dimension of the
 214 *            input tensor by default.
 215 */
 216void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 217
 218/**
 219 * @brief   Computes the Cross Entropy Loss for a ggml tensor using the CANN
 220 *          backend.
 221 *
 222 * @details This function computes the cross entropy loss between the predicted
 223 *          logits and target probability distributions. The operation follows
 224 *          the same computation pattern as the CPU implementation:
 225 *          1. Applies log_softmax to the logits along the class dimension
 226 *          2. Element-wise multiplication with target distributions
 227 *          3. Summation along the class dimension to get per-sample losses
 228 *          4. Global summation and scaling by -1/nr to get final loss
 229 *
 230 *          The computation can be expressed as:
 231 *          \f[
 232 *              \text{loss} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{C} y_{ij} \cdot \log(\text{softmax}(x_{ij}))
 233 *          \f]
 234 *          where \f$N\f$ is the total number of samples, \f$C\f$ is the number
 235 *          of classes, \f$x\f$ are the logits, and \f$y\f$ are the target
 236 *          probability distributions.
 237 *
 238 * @param ctx The CANN context used for operations.
 239 * @param dst The destination tensor where the computed loss will be stored.
 240 *            This should be a scalar tensor containing the final loss value.
 241 *
 242 * @note This implementation computes cross entropy between probability
 243 *       distributions, not the typical classification cross entropy that
 244 *       expects class indices as targets. Both input tensors (src0 and src1)
 245 *       should have the same shape and represent probability distributions
 246 *       over the class dimension.
 247 * @note The function expects two source tensors:
 248 *       - dst->src[0]: Logits tensor (before softmax)
 249 *       - dst->src[1]: Target probability distributions tensor
 250 * @note The computation is performed using CANN backend operators including
 251 *       LogSoftmax, Mul, ReduceSum, and Muls for the final scaling.
 252 */
 253void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 254
 255/**
 256 * @brief  Computes the Group Normalization for a ggml tensor using the CANN
 257 *         backend.
 258 *
 * @details This function applies the Group Normalization operation on the input
 260 *         tensor `src` and stores the result in the destination tensor `dst`.
 261 *         Group Normalization divides the channels into groups and normalizes
 262 *         the features within each group across spatial locations.
 263 *         It is commonly used in convolutional neural networks to improve
 264 *         training stability and performance.
 265 *         The operation is defined as:
 266 *         \f[
 267 *             \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
 268 *         \f]
 269 *
 270 * @param ctx The CANN context used for operations.
 271 * @param dst The destination tensor where the normalized values will be stored.
 272 *            `n_groups` is in dst->params, which split C channel to `n_groups`.
 273 *            dst->op is `GGML_OP_GROUP_NORM`.
 274 *
 275 * @attention eps defaults to 1e-6f.
 276 */
 277void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 278
 279/**
 280 * @brief   Computes the accumulation of tensors using the CANN backend.
 281 *
 282 * @details This function performs an accumulation operation on two tensors.
 283 *          Depending on the `inplace` flag, it either updates the destination
 284 *          tensor `dst` in place by adding `alpha * src1` to it, or it creates
 285 *          a new tensor as the result of `src0 + alpha * src1` and stores it in
 286 *          `dst`.
 287 *          The operation is defined as:
 288 *          \f[
 289 *               dst = src0 + alpha \times src1
 290 *          \f]
 291 *          if `inplace` is `true`, `src0` is equal to 'dst'.
 292 * @param ctx The CANN context used for operations.
 293 * @param dst The destination tensor where the accumulated values will be stored.
 294 *            `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`.
 295 */
 296void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 297
 298/**
 299 * @brief   Computes the sum of elements along the last dimension of a ggml tensor
 300 *          using the CANN backend.
 301 *
 302 * @details This function performs a reduction sum operation along the last
 303 *          dimension of the input tensor `src`. The result of the sum is stored
 304 *          in the destination tensor `dst`.
 305 *
 306 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the reduced values will be stored.
 308 *            dst->op is `GGML_OP_SUM_ROWS`.
 309 *
 310 * @attention `reduce_dims` defaults to 3, which means the last dimension.
 311 */
 312void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 313
 314/**
 315 * @brief   Computes the sum of elements in a ggml tensor.
 316 *
 317 * @details This function performs a reduction sum operation along the last
 318 *          dimension of the input tensor `src`. The result of the sum is stored
 319 *          in the destination tensor `dst`.
 320 *
 321 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the reduced values will be stored.
 323 *
 324 */
 325
 326void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 327
 328/**
 329 * @brief   Upsamples a ggml tensor using nearest neighbor interpolation using
 330 *          the CANN backend.
 331 *
 332 * @details This function performs upsampling of the input tensor `src` using
 333 *          nearest neighbor interpolation. The upsampling is applied to the
 334 *          height and width dimensions (last two dimensions) of the tensor. The
 335 *          result is stored in the destination tensor `dst`, which must have
 336 *          the appropriate dimensions for the upsampled output.
 337 *
 338 * @param ctx The CANN context used for operations.
 339 * @param dst The destination tensor where the upsampled values will be stored.
 340 *            dst->op is `GGML_OP_UPSCALE`.
 341 */
 342void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 343
 344/**
 345 * @brief   Pads a ggml tensor to match the dimensions of the destination tensor
 346 *          using the CANN backend.
 347 *
 348 * @details This function pads the input tensor `src` so that it matches the
 349 *          dimensions of the destination tensor `dst`. The amount of padding
 350 *          is calculated based on the difference in sizes between `src` and
 351 *          `dst` along each dimension. The padded tensor is stored in `dst`.
 352 *
 353 * @param ctx The CANN context used for operations.
 354 * @param dst The destination tensor, which specifies the target dimensions for
 355 *            padding. dst->op is `GGML_OP_PAD`.
 356 */
 357void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 358
 359/**
 360 * @brief   Executes a 2D pooling operation on a ggml tensor using the CANN
 361 *          backend.
 362 *
 363 * @details This function dispatches the execution of a 2D pooling operation on
 364 *          the input tensor `dst`. The type of pooling (average or max) is
 365 *          determined by the `op` parameter, which is read from the operation
 366 *          parameters of `dst`. The function supports average pooling
 367 *          (`GGML_OP_POOL_AVG`) and max pooling (`GGML_OP_POOL_MAX`). If an
 368 *          invalid operation is encountered, the function asserts a failure.
 369 *
 370 * @param ctx The CANN context used for operations.
 371 * @param dst The destination tensor on which the pooling operation is to be
 372 *            performed. dst->op is `GGML_OP_POOL_2D`.
 373 */
 374void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 375
 376/**
 377 * @brief   Duplicates a ggml tensor using the CANN backend.
 378 *
 379 * @details This function duplicates the contents of the source tensor `src` to
 380 *          the destination tensor `dst`. The function supports various tensor
 381 *          types and configurations, including handling of extra data, type
 382 *          conversions, and special cases for contiguous and non-contiguous
 383 *          tensors.
 384 *
 385 * @param ctx The CANN context used for operations.
 386 * @param dst The destination tensor where the duplicated data will be stored.
 387 *            dst->op is `GGML_OP_DUP`
 388 *
 * @attention Only supports FP16/FP32. Not supported when src and dst have
 *            different shapes and dst is non-contiguous.
 * @note      This function needs to be simplified.
 392 */
 393void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 394
 395/**
 396 * @brief   Computes the Root Mean Square (RMS) normalization of a ggml tensor
 397 *          using the CANN backend.
 398 *
 399 * @details This function applies RMS normalization to the input tensor `src`
 400 *          and stores the result in the destination tensor `dst`. RMS
 401 *          normalization involves computing the root mean square of the input
 402 *          tensor along a specified dimension and then dividing each element of
 403 *          the tensor by this value, adjusted by a small epsilon value to
 404 *          prevent division by zero.
 405 *          The operation is defined as:
 406 *          \f[
 407 *               \text{RmsNorm}\left(x_i\right)=\frac{x_i}{\text{Rms}(\mathbf{x})} g_i,
 408 *               \quad \text { where } \text{Rms}(\mathbf{x})=\sqrt{\frac{1}{n} \sum_{i=1}^n x_i^2+e p s}
 409 *          \f]
 410 *          `eps` is in dst->op_params.
 411 * @param ctx The CANN context used for operations.
 412 * @param dst The destination tensor where the normalized values will be stored.
 413 *            dst->op is `GGML_OP_RMS_NORM`.
 414 */
 415void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 416
 417/**
 418 * @brief   Applies a diagonal mask to the tensor with a specified value.
 419 *
 420 * @details This function creates a mask tensor filled with ones, then applies
 421 *          an upper triangular and lower triangular operation to it based on
 422 *          the number of past elements specified. Afterward, it adds the masked
 423 *          tensor to the destination tensor in-place.
 424 *
 425 * @param ctx The backend CANN context used for operations.
 * @param dst The destination tensor where the result will be stored. dst->op is
 *            `GGML_OP_DIAG_MASK_INF` or `GGML_OP_DIAG_MASK_ZERO`.
 428 * @param value The value to use for masking.
 429 */
 430void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value);
 431
 432/**
 433 * @brief   Performs an image-to-column transformation on the input tensor.
 434 *
 435 * @details This function takes an input tensor and applies an image-to-column
 436 *          operation, converting spatial dimensions into column-like
 437 *          structures suitable for convolutional operations. It supports both
 438 *          half-precision (F16) and single-precision (F32) floating-point data
 439 *          types.
 440 *
 441 * @param ctx The backend CANN context for executing operations.
 442 * @param dst The destination tensor that stores the result of the operation.
 443 *            dst->op is `GGML_OP_IM2COL`.
 444 */
 445void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 446
 447/**
 448 * @brief   Computes time step embeddings using sine and cosine functions.
 449 *
 450 * @details This function calculates time step embeddings by applying sine and
 451 *          cosine transformations to a given input tensor, which is typically
 452 *          used in temporal models like diffusion models or transformers to
 453 *          encode time information effectively.
 454 *
 455 * @param ctx The backend CANN context for executing operations.
 456 * @param dst The destination tensor where the result of the embedding operation
 457 *            will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
 458 */
 459void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 460
 461// @see ggml_cann_dup.
 462void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 463
 464/**
 465 * @brief   Computes the softmax activation with optional masking.
 466 *
 467 * @details This function computes the softmax activation over the input tensor,
 468 *          optionally applying a mask and scaling factor. It supports both FP16
 469 *          and FP32 data types and can handle masking by broadcasting the mask
 470 *          across rows if necessary.
 471 *          The function performs the following steps:
 472 *          1. Multiplies the input tensor by a scale factor.
 473 *          2. Optionally casts the mask tensor to FP32 if it is in FP16 format.
 474 *          3. Broadcasts the mask tensor if its dimensions do not match the
 475 *             input tensor's dimensions.
 476 *          4. Adds the mask to the scaled input tensor.
 477 *          5. Applies the softmax activation function along the specified
 478 *             dimension.
 479 *
 480 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the result will be stored. dst->op is
 *            `GGML_OP_SOFT_MAX`.
 483 */
 484void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 485
 486/**
 487 * @brief   Extracts specific rows from a tensor based on indices.
 488 *
 489 * @details This function retrieves rows from a source tensor src0 according to
 490 *          the indices provided in another tensor src1 and stores the result in
 491 *          a destination tensor (\p dst).
 492 *
 493 * @param ctx The backend CANN context for executing operations.
 494 * @param dst The destination tensor where the extracted rows will be stored.
 495 */
 496void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 497
 498/**
 499 * @brief   Writes specific rows into a tensor at positions specified by indices.
 500 *
 501 * @details This function copies rows from a source tensor into a destination
 502 *          tensor (\p dst) at the positions indicated by the indices in another
 503 *          tensor.
 504 *
 505 * @param ctx The backend CANN context for executing operations.
 506 * @param dst The destination tensor where the specified rows will be updated.
 507 */
 508void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 509
 510/**
 511 * @brief   Executes matrix multiplication for the given tensor.
 512 *
 513 * @details This function performs matrix multiplication on the source tensors
 514 *          associated with the destination tensor. It supports matrix
 515 *          multiplication F32, F16, and Q8_0.
 516 *
 517 * @param ctx The backend CANN context for executing operations.
 518 * @param dst The destination tensor for storing the result of the matrix
 519 *            multiplication. dst->op is `GGML_OP_MUL_MAT`.
 520 */
 521void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 522
 523/**
 524 * @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
 525 *
 526 * @details This function implements the RoPE mechanism, which is a method to
 527 *          encode positional information into sequence data, particularly
 528 *          useful in transformer models. It supports both F32 and F16 data
 529 *          types.
 530 *
 531 * @param ctx The backend CANN context for executing operations.
 532 * @param dst The destination tensor where the RoPE-transformed data will be
 533 *            stored. dst->op is `GGML_OP_ROPE`.
 534 *
 535 * @note The function currently does not support cases where the n_dims is less
 536 *       than the input tensor's first dimension.
 537 * @note The function currently does not support cases where the freq_factors is
 538 *       not NULL.
 539 * @note The function currently does not support cases where the ext_factor is
 540 *       not equal 0.
 541 * @note The function currently does not support cases where the freq_scale is
 542 *       not equal 1.
 543 */
 544void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 545
 546/**
 547 * @brief   Computes the index of the maximum value along the specified dimension
 548 *          of a ggml tensor using the CANN backend.
 549 *
 550 * @details This function performs an argmax operation on the input tensor.
 551 *          It finds the index of the maximum value along the specified axis
 552 *          and stores these indices in the destination tensor `dst`. The
 553 *          operation is executed using the CANN backend for optimized performance.
 554 *
 555 * @param ctx The CANN context used for operations.
 556 * @param dst The destination tensor where the indices of the maximum values will
 557 *            be stored. dst->op is `GGML_OP_ARGMAX`.
 558 */
 559void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 560
 561/**
 562 * @brief Adds two tensors element-wise and stores the result in a destination
 563 * tensor.
 564 *
 565 * This function performs the operation:
 566 * \f[
 567 *    dst = acl\_src0 + alpha \times acl\_src1
 568 * \f]
 569 * where alpha is a scalar value and defaults to 1.0f.
 570 *
 571 * @param ctx The context for the CANN backend operations.
 572 * @param acl_src0 The first source tensor.
 573 * @param acl_src1 The second source tensor.
 574 * @param acl_dst The destination tensor where the result will be stored.
 575 */
 576void aclnn_add(ggml_backend_cann_context & ctx,
 577               aclTensor *                 acl_src0,
 578               aclTensor *                 acl_src1,
 579               aclTensor *                 acl_dst = nullptr);
 580
 581/**
 582 * @brief Sub two tensors element-wise and stores the result in a destination
 583 * tensor.
 584 *
 585 * This function performs the operation:
 586 * \f[
 587 *    dst = acl\_src0 - alpha \times acl\_src1
 588 * \f]
 589 * where alpha is a scalar value and defaults to 1.0f.
 590 *
 591 * @param ctx The context for the CANN backend operations.
 592 * @param acl_src0 The first source tensor.
 593 * @param acl_src1 The second source tensor.
 594 * @param acl_dst The destination tensor where the result will be stored.
 595 */
 596void aclnn_sub(ggml_backend_cann_context & ctx,
 597               aclTensor *                 acl_src0,
 598               aclTensor *                 acl_src1,
 599               aclTensor *                 acl_dst = nullptr);
 600
 601/**
 602 * @brief Performs element-wise multiplication of two tensors and stores the
 603 * result in a destination tensor.
 604 *
 605 * This function performs element-wise multiplication of the tensors `acl_src`
 606 * and `acl_other` and stores the result in the destination tensor `acl_dst`.
 607 * The operation is defined as:
 608 * \f[
 609 *     \text {acl_dst }_i=\text {acl_src }_i \times \text {acl_other }_i
 610 * \f]
 611 *
 612 * @param ctx The context for the CANN backend operations.
 613 * @param acl_src The first tensor for element-wise multiplication.
 614 * @param acl_other The second tensor for element-wise multiplication.
 615 * @param acl_dst The destination tensor where the result will be stored.
 616 */
 617void aclnn_mul(ggml_backend_cann_context & ctx,
 618               aclTensor *                 acl_src,
 619               aclTensor *                 acl_other,
 620               aclTensor *                 acl_dst = nullptr);
 621
 622/**
 623 * @brief Matrix division, optionally in-place.
 624 *
 625 * This function division each element of the source tensor `acl_src` by the
 626 * tensor `acl_other` and stores the result in the destination tensor `acl_dst`.
 627 * If `inplace` is true, `acl_dst` will not be used and the operation is
 628 * performed in-place on `acl_src`. The operation is defined as: \f[
 629 *     \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
 630 * \f]
 631 *
 632 * @param ctx The context for the CANN backend operations.
 633 * @param acl_src Numerator tensor..
 634 * @param acl_other Denominator tensor.
 635 * @param acl_dst The destination tensor where the result will be stored if
 636 * `inplace` is false.
 637 * @param inplace Flag indicating whether to perform the operation in-place on
 638 * `acl_src`.
 639 */
 640void aclnn_div(ggml_backend_cann_context & ctx,
 641               aclTensor *                 acl_src,
 642               aclTensor *                 acl_other,
 643               aclTensor *                 acl_dst = nullptr);
 644
 645/**
 646 * @brief Applies element-wise cosine function to the elements of a tensor.
 647 *
 648 * This function computes the cosine of each element in the source tensor
 649 * `acl_src` and stores the result in the destination tensor `acl_dst`. The
 650 * operation is defined as: \f[ \text {acl_dst }_i=\cos \left(\text {acl_src
 651 * }_i\right) \f]
 652 *
 653 * @param ctx The context for the CANN backend operations.
 654 * @param acl_src The source tensor on which the cosine function will be
 655 * applied.
 656 * @param acl_dst The destination tensor where the cosine results will be
 657 * stored.
 658 */
 659void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);
 660
 661/**
 662 * @brief Applies element-wise sine function to the elements of a tensor.
 663 *
 * This function computes the sine of each element in the source tensor
 * `acl_src` and stores the result in the destination tensor `acl_dst`.
 * The operation is defined as:
 * \f[
 *     \text {acl_dst }_i=\sin \left(\text {acl_src }_i\right)
 * \f]
 *
 672 * @param ctx The context for the CANN backend operations.
 673 * @param acl_src The source tensor on which the sine function will be applied.
 674 * @param acl_dst The destination tensor where the sine results will be stored.
 675 */
 676void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);
 677
 678/**
 679 * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
 680 * output tensor.
 681 *
 682 * This function checks whether broadcasting is needed between `src0` and `src1`.
 683 * If broadcasting is required, it calculates the proper shapes and creates
 684 * ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors
 685 * based on the original tensor shapes.
 686 *
 687 * @param src0     The first input tensor (reference shape).
 688 * @param src1     The second input tensor (possibly broadcasted).
 689 * @param dst      The destination/output tensor.
 690 * @param acl_src0 Output pointer to the created ACL tensor corresponding to src0.
 691 * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
 692 * @param acl_dst  Output pointer to the created ACL tensor corresponding to dst.
 693 */
 694void bcast_shape(ggml_tensor *    src0,
 695                 ggml_tensor *    src1,
 696                 ggml_tensor *    dst,
 697                 acl_tensor_ptr & acl_src0,
 698                 acl_tensor_ptr & acl_src1,
 699                 acl_tensor_ptr & acl_dst);
 700
 701/**
 702 * @brief   Computes the 1D transposed convolution (deconvolution) of a ggml
 703 * tensor using the CANN backend.
 704 *
 705 * @details This function performs a 1D transposed convolution (also known as
 706 * deconvolution) operation on the input tensor. The computed result is stored
 707 * in the destination tensor `dst`. The operation is optimized using the CANN
 708 * backend for improved performance.
 709 *
 710 * @param ctx The CANN context used for operations.
 711 * @param dst The destination tensor where the transposed convolution result
 712 * will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
 713 */
 714void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 715
 716/**
 717 * @brief   Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
 718 * using the CANN backend.
 719 *
 720 * @details This function performs an element-wise ELU activation on the input
 721 *          tensor.
 722 *          The result is written to the destination tensor `dst` in-place.
 723 *          The ELU function is defined as:
 724 *
 725 *          \text{ELU}(x) =
 726 *          \begin{cases}
 727 *          x, & \text{if } x > 0 \\
 728 *          \alpha \left( \exp(x) - 1 \right), & \text{if } x \leq 0
 729 *          \end{cases}
 730 *
 *          where α (alpha) is a hyperparameter, typically set to 1.0.
 732 *          This operation is optimized using the CANN backend for high-performance
 733 *          inference or training.
 734 *
 735 * @param ctx The CANN context used for operations.
 736 * @param dst The destination tensor where the ELU-activated result will be stored.
 737 *            dst->op is expected to be `GGML_OP_ELU`.
 738 */
 739void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 740
 741/**
 742 * @brief   Computes the mean of a ggml tensor element-wise using the CANN backend.
 743 *
 744 * @details This function calculates the element-wise mean of the input tensor.
 745 *          The result is written to the destination tensor `dst`.
 746 *          The mean is computed by averaging the values across the entire tensor.
 747 *
 748 *          This operation is optimized using the CANN backend for high-performance inference or training.
 749 *
 750 * @param ctx The CANN context used for operations.
 751 * @param dst The destination tensor where the mean result will be stored.
 752 *            dst->op is expected to be `GGML_OP_MEAN`.
 753 */
 754void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 755
 756/**
 757 * @brief   Applies 1D reflect padding to a ggml tensor using the CANN backend.
 758 *
 759 * @details This function performs 1D reflect padding on the input tensor.
 760 *          The amount of padding on each side is specified by parameters stored in `dst->op_params`.
 761 *          The operation reflects the values at the borders of the tensor to generate the padded output.
 762 *
 763 *          This operation is optimized using the CANN backend for high-performance inference or training.
 764 *
 765 * @param ctx The CANN context used for operations.
 766 * @param dst The destination tensor where the padded result will be stored.
 767 *            dst->op is expected to be `GGML_OP_PAD_REFLECT_1D`.
 768 */
 769void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 770
 771/**
 772 * @brief   Counts the number of equal elements in two ggml tensors using the CANN backend.
 773 *
 774 * @details This function performs an element-wise comparison between two input tensors,
 775 *          and counts the number of positions where the elements are equal. The result is
 776 *          stored in the destination tensor `dst` as a scalar.
 777 *
 778 *          The operation is optimized using the CANN backend, making it suitable for
 779 *          high-performance inference or training scenarios.
 780 *
 781 * @param ctx The CANN context used for operations.
 782 * @param dst The destination tensor where the result will be stored.
 783 *            dst->op is expected to be `GGML_OP_COUNT_EQUAL`.
 784 */
 785void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 786
 787/**
 788 * @brief   Applies the Step activation function to a ggml tensor using the CANN backend.
 789 *
 790 * @details This function applies a step function element-wise to the input tensor, where
 791 *          each element is transformed to 1.0 if it is greater than 0, and 0.0 otherwise.
 792 *          The result is stored in the destination tensor `dst`.
 793 *
 794 *          This operation is accelerated using the CANN backend to improve runtime performance.
 795 *
 796 * @param ctx The CANN context used for operations.
 797 * @param dst The destination tensor where the result will be stored.
 798 *            dst->op is expected to be `GGML_OP_STEP`.
 799 */
 800void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 801
 802/**
 803 * @brief   Performs the Flash Attention extended operator using the CANN backend.
 804 *
 805 * @details This function implements the memory-efficient Flash Attention algorithm
 806 *          for computing scaled dot-product attention with hardware acceleration.
 807 *          The result is stored in the destination tensor `dst`.
 808 *
 809 *          This operation is accelerated using the CANN backend to improve runtime performance.
 810 *
 811 * @param ctx The CANN context used for operations.
 812 * @param dst The destination tensor where the result will be stored.
 813 *            dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`.
 814 */
 815void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 816
 817/**
 818 * @brief Forward Gated Linear Attention on the CANN backend.
 819 *
 820 * Expects dst->src[0..4] = {k, v, q, g, s} with shape conventions:
 821 *   k, v, q, g: [D] with outer dims T x H batched as ne[2]=T, ne[1]=H
 822 *   s: initial state [B, H, D, D], where B is batch and D=C/H
 823 * dst holds both outputs (o) and updated state; a scale factor is read from op params.
 824 *
 * The kernel updates per time step l: S_new = g ⊗ S_old + k ⊗ v, then computes o = (S_new^T q) * scale.
 826 *
 827 * @param ctx Backend context providing stream/allocator utilities.
 828 * @param dst Output tensor; src deps are k, v, q, g, s as above.
 829 */
 830void ggml_cann_gated_linear_attn(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 831
 832/**
 833 * @brief Launches an asynchronous task using the memory allocator.
 834 *
 * This macro submits an asynchronous task on the specified stream.
 * The task uses memory allocated by the allocator. It is guaranteed
 * that the memory will not be accessed by other tasks until this task
 * completes, due to the sequential execution order within the same stream.
 *
 * @param CTX The CANN backend context providing the memory pool and stream.
 * @param OP_NAME aclnn operator name.
 * @param ... Additional arguments required by the operator.
 842 *
 843 * @note
 844 * Memory from the allocator will be "freed" immediately and can be
 845 * reallocated to other pointers. However, it won't be accessed by any
 846 * other task before this asynchronous task ends, because all tasks in the
 847 * same stream are executed in queue order.
 848 */
 849
#    define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...)                                           \
        do {                                                                                     \
            uint64_t        workspaceSize = 0;                                                   \
            aclOpExecutor * executor;                                                            \
            void *          workspaceAddr = nullptr;                                             \
            /* phase 1: query the workspace size and build the executor for this op. */          \
            ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
            /* workspace should alloced in main thread to keep malloc order when using vmm. */   \
            if (workspaceSize > 0) {                                                             \
                ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize);             \
                workspaceAddr = workspace_allocator.get();                                       \
            }                                                                                    \
            /* phase 2: launch the op on the context stream. The pool buffer is released at */   \
            /* scope exit, but in-stream ordering keeps it safe until the task completes */      \
            /* (see the note above this macro).                                           */     \
            ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream()));     \
        } while (0)
 863
 864/**
 865 * @brief   Performs sparse expert-based matrix multiplication using the CANN backend.
 866 *
 867 * @details This function implements a MoE-style batched matrix multiplication, where each input token
 868 *          is routed to one or more experts, and each expert corresponds to a specific [D, M] weight matrix
 869 *          in the source tensor `src0`. The routing indices are provided via the `ids` tensor.
 870 *
 871 *          For each token (from `src1`), the function selects the corresponding expert(s) as specified by `ids`,
 872 *          performs the matrix multiplication with the selected expert's weight submatrix (from `src0`),
 873 *          and stores the results in `dst`. This operation is optimized and executed on the CANN backend.
 874 *
 875 *          Dimensions:
 876 *              - src0: [D, M, A, 1], where A is the number of experts
 877 *              - src1: [D, B, N, 1], where N is batch size and B is the slot count per sample
 878 *              - ids : [K, N],       where K is the number of experts each token is routed to
 *              - dst : [M, K, N, 1], output tensor storing the result of expert × token multiplication
 880 *
 881 *          The function handles two main modes:
 882 *              - If `ne12 == 1`, a simpler per-token loop is used.
 883 *              - TODO: If `ne12 > 1`, grouped multiplication and memory copying is used for efficiency.
 884 *
 885 * @param ctx The CANN context used for operations.
 886 * @param dst The destination tensor where the expert-weighted token outputs are stored.
 887 *            Expected to be of shape [M, K, N, 1].
 888 */
 889void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 890
 891/**
 892 * @brief Performs fused ADD + RMS_NORM operation using the CANN backend.
 893 *
 894 * This function fuses the ADD and RMS_NORM operations into a single kernel call
 895 * for better performance. It first adds two input tensors (x1 + x2), then applies
 896 * RMS normalization to the result.
 897 *
 * @param ctx The context for the CANN backend operations.
 * @param add_node The ADD operation node, contains the two input tensors to be added.
 * @param rms_norm_node The RMS_NORM operation node, contains the gamma weights
 *                      and epsilon parameter.
 902 */
 903void ggml_cann_op_add_rms_norm_fused(ggml_backend_cann_context & ctx,
 904                                     ggml_tensor *               add_node,
 905                                     ggml_tensor *               rms_norm_node);
 906
 907/**
 908 * @brief   Check whether a tensor is a weight tensor for matrix multiplication.
 909 *
 910 * @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations,
 911 *          typically within neural network layers. The function maintains a static set of canonical weight
 912 *          naming suffixes from Transformer-based architectures. Uses substring matching to identify weight
 913 *          tensors even with hierarchical naming patterns.
 914 *
 915 * @param tensor Pointer to the target ggml_tensor object (const-qualified).
 916 */
 917static bool is_matmul_weight(const ggml_tensor * tensor) {
 918    std::string                                  name = ggml_get_name(tensor);
 919    static const std::unordered_set<std::string> weight_suffixes{ "output.weight",      "attn_q.weight",
 920                                                                  "attn_k.weight",      "attn_v.weight",
 921                                                                  "attn_output.weight", "ffn_gate.weight",
 922                                                                  "ffn_up.weight",      "ffn_down.weight" };
 923
 924    for (const auto & suffix : weight_suffixes) {
 925        if (name.find(suffix) != std::string::npos) {
 926            return true;
 927        }
 928    }
 929    return false;
 930}
 931
 932/**
 933 * @brief Applies a element-wise operation to two input tensors using the CANN
 934 * backend.
 935 *
 936 * This templated function takes a binary operator and applies it to two source
 937 * tensors
 938 * associated with the destination tensor. The function handles broadcasting as
 939 * needed.
 940 *
 941 * @tparam binary_op A callable object (e.g., lambda or function pointer) representing
 942 *         the binary operation to be performed. It must take three arguments:
 943 *         (ggml_backend_cann_context&, aclTensor*, aclTensor*, aclTensor*).
 944 *
 945 * @param ctx The CANN backend context used to manage execution and resources.
 946 * @param dst The destination tensor.
 947 */
 948template <auto binary_op> void ggml_cann_binary_op(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
 949    ggml_tensor * src0 = dst->src[0];
 950    ggml_tensor * src1 = dst->src[1];
 951
 952    acl_tensor_ptr acl_src0, acl_src1, acl_dst;
 953
 954    // Need bcast
 955    bcast_shape(src0, src1, dst, acl_src0, acl_src1, acl_dst);
 956    binary_op(ctx, acl_src0.get(), acl_src1.get(), acl_dst.get());
 957}
 958
 959/**
 960 * @brief Applies a unary operation to an input tensor using the CANN backend.
 961 *
 962 * This templated function applies a unary operator to the source tensor of `dst`
 963 * and stores the result in the destination tensor.
 964 *
 965 * @tparam unary_op A callable with the signature:
 966 *         void(ggml_backend_cann_context&, aclTensor *, aclTensor *)
 967 *         where the first aclTensor is the source and the second is the destination.
 968 * @param ctx The CANN backend context for managing resources and execution.
 969 * @param dst The destination tensor. Its src[0] is treated as the input tensor.
 970 */
 971template <void unary_op(ggml_backend_cann_context &, aclTensor *, aclTensor *)>
 972void ggml_cann_op_unary(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
 973    ggml_tensor * src = dst->src[0];
 974
 975    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
 976    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
 977
 978    unary_op(ctx, acl_src.get(), acl_dst.get());
 979}
 980
 981/**
 982 * @brief Applies a unary operation to a ggml tensor using the CANN backend.
 983 *
 984 * @details This function applies a unary operation to the input tensor using
 985 * a user-provided lambda or callable `unary_op`. The lambda receives the
 986 * CANN backend context and two ACL tensors: the source and the destination.
 987 *
 988 * Internally, this function handles the conversion from GGML tensors to ACL tensors,
 989 * calls the provided unary op, and manages resource cleanup. The input is assumed
 990 * to be `dst->src[0]`, and the result is written to `dst`.
 991 *
 992 * This utility simplifies writing unary op wrappers by abstracting tensor preparation.
 993 *
 994 * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
 995 * @param ctx The CANN context for operation execution.
 996 * @param dst The destination ggml_tensor where the result will be stored.
 997 *            The input tensor is assumed to be `dst->src[0]`.
 998 *
 999 * @see GGML_CANN_CALL_OP_UNARY
1000 */
1001void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
1002                        ggml_backend_cann_context &                                                ctx,
1003                        ggml_tensor *                                                              dst);
1004
1005void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst);
1006
1007/**
1008 * @brief Applies a gated (GLU-style) unary operation using the CANN backend.
1009 *
1010 * @details This function performs a gated activation such as GEGLU or ReGLU.
1011 * It supports two input modes:
1012 *
1013 * 1. **Dual input mode**: `dst->src[0]` and `dst->src[1]` are both valid tensors.
1014 *    These are used directly as the value and gate tensors.
1015 *
1016 * 2. **Packed input mode**: Only `dst->src[0]` is valid, and it is assumed to
1017 *    contain a concatenation of value and gate along the first dimension. This tensor
1018 *    will be split into two equal halves to form the value and gate inputs.
1019 *
1020 * The function applies a user-provided unary operation (e.g., GELU) to the value tensor,
1021 * then multiplies the result in-place with the gate tensor:
1022 *
1023 * @code
1024 * dst = unary_op(value) * gate;
1025 * @endcode
1026 *
1027 * The `swapped` parameter (from `dst->op_params[1]`) allows flipping the
1028 * order of value/gate in the packed input case.
1029 *
1030 * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
1031 *                 It receives (ctx, acl_value_tensor, acl_output_tensor).
1032 * @param ctx      The CANN context used for execution.
1033 * @param dst      The destination ggml_tensor. Source tensors are in `dst->src[0]` and optionally `src[1]`.
1034 *
1035 * @see GGML_CANN_CALL_OP_UNARY_GATED
1036 */
1037void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
1038                              ggml_backend_cann_context &                                                ctx,
1039                              ggml_tensor *                                                              dst);
1040
1041/**
1042 * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
1043 *
1044 * This macro wraps the specified ACLNN unary operator name into a lambda expression,
1045 * and passes it to `ggml_cann_op_unary`, which handles the common logic for executing
1046 * unary ops in the CANN backend.
1047 *
1048 * Internally, this macro expands to a lambda like:
1049 * @code
1050 * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
1051 *     GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
1052 * };
1053 * @endcode
1054 *
1055 * This lambda is then passed to `ggml_cann_op_unary`, which applies the operation.
1056 *
1057 * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
1058 *
1059 * @see ggml_cann_op_unary
1060 * @see GGML_CANN_CALL_ACLNN_OP
1061 */
#    define GGML_CANN_CALL_OP_UNARY(OP_NAME)                                                              \
        do {                                                                                              \
            /* wrap the aclnn op in a lambda so ggml_cann_op_unary handles tensor prep and cleanup */     \
            auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
                GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);                                  \
            };                                                                                            \
            ggml_cann_op_unary(lambda, ctx, dst);                                                         \
        } while (0)
1069
1070/**
1071 * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
1072 *
1073 * This macro wraps the specified ACLNN unary operator name into a lambda expression,
1074 * and passes it to `ggml_cann_op_unary_gated`, which handles the common logic for
1075 * executing gated unary ops in the CANN backend.
1076 *
1077 * Internally, this macro expands to a lambda like:
1078 * @code
1079 * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
1080 *     GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
1081 * };
1082 * @endcode
1083 *
1084 * This lambda is then passed to `ggml_cann_op_unary_gated`, which applies the operation.
1085 *
1086 * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
1087 *
1088 * @see ggml_cann_op_unary_gated
1089 * @see GGML_CANN_CALL_ACLNN_OP
1090 */
#    define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME)                                                        \
        do {                                                                                              \
            /* wrap the aclnn op in a lambda; ggml_cann_op_unary_gated applies it to the value tensor */  \
            auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
                GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);                                  \
            };                                                                                            \
            ggml_cann_op_unary_gated(lambda, ctx, dst);                                                   \
        } while (0)
1098
1099#endif  // CANN_ACLNN_OPS
1100
1101/**
1102 * @brief Performs outer product operation on two ggml tensors using the CANN backend.
1103 *
1104 * @details This function computes the outer product of two input tensors (src0 and src1)
1105 * and stores the result in the destination tensor. The outer product operation is defined as:
1106 * dst[i,j,k,l] = sum_m (src0[i,m,k,l] * src1[j,m,k,l])
1107 *
1108 * The function supports multiple data types including F32, F16. For floating-point
1109 * types, it uses batch matrix multiplication for efficient computation.
1110 *
1111 * The implementation handles 4D tensor broadcasting and batch processing automatically.
1112 *
1113 * @param ctx The CANN backend context for operation execution and memory management.
1114 * @param dst The destination ggml_tensor where the outer product result will be stored.
1115 *            The input tensors are assumed to be `dst->src[0]` and `dst->src[1]`.
1116 *
1117 * @see GGML_CANN_CALL_ACLNN_OP for CANN operator invocation
1118 */
1119void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst);