1/**
2 * Copyright (c) 2023-2026 The ggml authors
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to
6 * deal in the Software without restriction, including without limitation the
7 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8 * sell copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20 * IN THE SOFTWARE.
21 */
22
23#ifndef CANN_ACLNN_OPS
24#define CANN_ACLNN_OPS
25
26#include "acl_tensor.h"
27#include "common.h"
28
29#include <aclnnop/aclnn_abs.h>
30#include <aclnnop/aclnn_arange.h>
31#include <aclnnop/aclnn_argsort.h>
32#include <aclnnop/aclnn_cat.h>
33#include <aclnnop/aclnn_clamp.h>
34#include <aclnnop/aclnn_cos.h>
35#include <aclnnop/aclnn_exp.h>
36#include <aclnnop/aclnn_gelu.h>
37#include <aclnnop/aclnn_gelu_v2.h>
38#include <aclnnop/aclnn_hardsigmoid.h>
39#include <aclnnop/aclnn_hardswish.h>
40#include <aclnnop/aclnn_leaky_relu.h>
41#include <aclnnop/aclnn_log.h>
42#include <aclnnop/aclnn_logsoftmax.h>
43#include <aclnnop/aclnn_neg.h>
44#include <aclnnop/aclnn_norm.h>
45#include <aclnnop/aclnn_relu.h>
46#include <aclnnop/aclnn_sigmoid.h>
47#include <aclnnop/aclnn_sign.h>
48#include <aclnnop/aclnn_silu.h>
49#include <aclnnop/aclnn_sin.h>
50#include <aclnnop/aclnn_slice.h>
51#include <aclnnop/aclnn_sqrt.h>
52#include <aclnnop/aclnn_tanh.h>
53
54#include <functional>
55#include <unordered_set>
56
57/**
58 * @brief Repeats a ggml tensor along each dimension to match the dimensions
59 * of another tensor.
60 *
61 * @details This function repeats the elements of a source ggml tensor along
62 * each dimension to create a destination tensor with the specified
63 * dimensions. The operation is performed using the ACL backend and
64 * executed asynchronously on the device.
65 *
66 * @param ctx The CANN context used for operations.
67 * @param dst The ggml tensor representing the destination, which op is
68 * GGML_OP_REPEAT and specifies the desired dimensions.
69 */
70void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
71
72/**
73 * @brief Applies the Leaky ReLU activation function to a tensor using the CANN
74 * backend.
75 *
76 * @details This function computes the Leaky ReLU activation for each element of
77 * the input tensor. The Leaky ReLU function allows a small gradient
78 * when the unit is not active (i.e., when the input is negative). The
79 * Leaky ReLU function is defined as:
80 * \f[
81 * \text{dst} = \max(0, src) + \text{negativeSlope} \cdot \min(0,
82 * src)
83 * \f]
84 * `negativeSlope` is in dst->params.
85 *
86 * @param ctx The CANN context used for operations.
87 * @param dst The destination tensor where the result of the Leaky ReLU
88 * activation is stored, which op is `GGML_OP_LEAKY_RELU`
89 */
90void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
91
92/**
93 * @brief Concatenates multiple tensors along a specified dimension using the
94 * CANN backend.
95 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the result of the
 * concatenation is stored. dst->op is `GGML_OP_CONCAT`; the tensors
 * to be concatenated are taken from dst's source tensors.
 *
 * @attention The tensor list length should be 2, and the dimension used for
 * concatenation defaults to 1.
104 */
105void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
106
107/**
108 * @brief Generates a sequence of evenly spaced values within a specified
109 * interval for a ggml tensor using the CANN backend.
110 *
 * @details This function creates a sequence of numbers over a specified
 * interval, starting from `start`, ending before `stop`, and
113 * incrementing by `step`. The sequence is stored in the destination
114 * tensor `dst`.
115 *
116 * @param ctx The CANN context used for operations.
117 * @param dst The destination tensor where the generated sequence will be stored.
 * `start`, `stop` and `step` are in dst->op_params and dst->op is
119 * `GGML_OP_ARANGE`.
120 */
121void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst);
122
123/**
124 * @brief Applies a clamp operation to the elements of a ggml tensor using the
125 * CANN backend.
126 *
127 * @details This function clamps the elements of the input tensor `src` to a
128 * specified range defined by `min` and `max` values. The result is
129 * stored in the destination tensor `dst`. The operation is defined as:
130 * \f[
131 * y = \max(\min(x, max\_value), min\_value)
132 * \f]
133 * where `x` is an element of the input tensor, and `y` is the
134 * corresponding element in the output tensor.
135 * @param ctx The CANN context used for operations.
136 * @param dst The destination tensor where the clamped values will be stored.
137 * dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params.
138 */
139void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst);
140
141/**
142 * @brief Scales the elements of a ggml tensor by a constant factor using the
143 * CANN backend.
144 *
145 * @details This function multiplies each element of the input tensor `src` by
146 * a scaling factor `scale`, storing the result in the destination
147 * tensor `dst`. The operation is defined as:
148 * \f[
149 * dst = src \times scale
150 * \f]
151 *
152 * @param ctx The CANN context used for operations.
153 * @param dst The destination tensor where the scaled values will be stored.
154 * dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params.
155 */
156void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst);
157
158/**
159 * @brief Sorts the elements of a ggml tensor and returns the indices that
160 * would sort the tensor using the CANN backend.
161 *
162 * @details This function performs an argsort operation on the input tensor
 * `src`. It sorts the elements of `src` in either ascending or
 * descending order, depending on the requested sort order (e.g.
 * `GGML_SORT_ORDER_DESC`), and returns the indices that would sort
 * the original tensor.
166 *
167 * @param ctx The CANN context used for operations.
168 * @param dst The destination tensor where the sorted indices will be stored.
169 * dst->op is `GGML_OP_ARGSORT`.
170 */
171void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst);
172
173/**
174 * @brief Computes the Layer Normalization for a ggml tensor using the CANN
175 * backend.
176 *
177 * @details This function applies the Layer Normalization operation on the
178 * input tensor `src` and stores the result in the destination tensor
179 * `dst`. Layer Normalization normalizes the features at each sample in
180 * a mini-batch independently. It is commonly used in neural networks
181 * to normalize the activations of a layer by adjusting and scaling
182 * the outputs.
183 * The operation is defined as:
184 * \f[
185 * \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
186 * \f]
 * `Var` defaults to dst->ne[0]. `eps` is in dst->params.
188 *
189 * @param ctx The CANN context used for operations.
190 * @param dst The destination tensor where the normalized values will be stored.
191 * @attention `Var` defaults to dst->ne[0].
192 */
193void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
194
195/**
196 * @brief Computes the L2 Normalization for a ggml tensor using the CANN
197 * backend.
198 *
199 * @details This function applies the L2 Normalization operation on the
200 * input tensor `src` and stores the result in the destination tensor
201 * `dst`. L2 Normalization scales the input tensor such that the
202 * L2 norm along the specified dimension equals 1. This operation
203 * is commonly used in neural networks for feature normalization
204 * and vector scaling.
205 * The operation is defined as:
206 * \f[
207 * \text{out} = \frac{x}{\sqrt{\sum{x^2}}}
208 * \f]
209 * The normalization is performed along the last dimension by default.
210 *
211 * @param ctx The CANN context used for operations.
212 * @param dst The destination tensor where the normalized values will be stored.
213 * @attention The normalization is performed along the last dimension of the
214 * input tensor by default.
215 */
216void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
217
218/**
219 * @brief Computes the Cross Entropy Loss for a ggml tensor using the CANN
220 * backend.
221 *
222 * @details This function computes the cross entropy loss between the predicted
223 * logits and target probability distributions. The operation follows
224 * the same computation pattern as the CPU implementation:
225 * 1. Applies log_softmax to the logits along the class dimension
226 * 2. Element-wise multiplication with target distributions
227 * 3. Summation along the class dimension to get per-sample losses
228 * 4. Global summation and scaling by -1/nr to get final loss
229 *
230 * The computation can be expressed as:
231 * \f[
232 * \text{loss} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{C} y_{ij} \cdot \log(\text{softmax}(x_{ij}))
233 * \f]
234 * where \f$N\f$ is the total number of samples, \f$C\f$ is the number
235 * of classes, \f$x\f$ are the logits, and \f$y\f$ are the target
236 * probability distributions.
237 *
238 * @param ctx The CANN context used for operations.
239 * @param dst The destination tensor where the computed loss will be stored.
240 * This should be a scalar tensor containing the final loss value.
241 *
242 * @note This implementation computes cross entropy between probability
243 * distributions, not the typical classification cross entropy that
244 * expects class indices as targets. Both input tensors (src0 and src1)
245 * should have the same shape and represent probability distributions
246 * over the class dimension.
247 * @note The function expects two source tensors:
248 * - dst->src[0]: Logits tensor (before softmax)
249 * - dst->src[1]: Target probability distributions tensor
250 * @note The computation is performed using CANN backend operators including
251 * LogSoftmax, Mul, ReduceSum, and Muls for the final scaling.
252 */
253void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor * dst);
254
255/**
256 * @brief Computes the Group Normalization for a ggml tensor using the CANN
257 * backend.
258 *
 * @details This function applies the Group Normalization operation on the input
260 * tensor `src` and stores the result in the destination tensor `dst`.
261 * Group Normalization divides the channels into groups and normalizes
262 * the features within each group across spatial locations.
263 * It is commonly used in convolutional neural networks to improve
264 * training stability and performance.
265 * The operation is defined as:
266 * \f[
267 * \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
268 * \f]
269 *
270 * @param ctx The CANN context used for operations.
271 * @param dst The destination tensor where the normalized values will be stored.
272 * `n_groups` is in dst->params, which split C channel to `n_groups`.
273 * dst->op is `GGML_OP_GROUP_NORM`.
274 *
275 * @attention eps defaults to 1e-6f.
276 */
277void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
278
279/**
280 * @brief Computes the accumulation of tensors using the CANN backend.
281 *
282 * @details This function performs an accumulation operation on two tensors.
283 * Depending on the `inplace` flag, it either updates the destination
284 * tensor `dst` in place by adding `alpha * src1` to it, or it creates
285 * a new tensor as the result of `src0 + alpha * src1` and stores it in
286 * `dst`.
287 * The operation is defined as:
288 * \f[
289 * dst = src0 + alpha \times src1
290 * \f]
291 * if `inplace` is `true`, `src0` is equal to 'dst'.
292 * @param ctx The CANN context used for operations.
293 * @param dst The destination tensor where the accumulated values will be stored.
294 * `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`.
295 */
296void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst);
297
298/**
299 * @brief Computes the sum of elements along the last dimension of a ggml tensor
300 * using the CANN backend.
301 *
302 * @details This function performs a reduction sum operation along the last
303 * dimension of the input tensor `src`. The result of the sum is stored
304 * in the destination tensor `dst`.
305 *
306 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the reduced values will be stored.
308 * dst->op is `GGML_OP_SUM_ROWS`.
309 *
310 * @attention `reduce_dims` defaults to 3, which means the last dimension.
311 */
312void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
313
314/**
315 * @brief Computes the sum of elements in a ggml tensor.
316 *
 * @details This function performs a reduction sum over all elements of the
 *          input tensor `src`. The result of the sum is stored in the
 *          destination tensor `dst`.
320 *
321 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the reduced values will be stored.
323 *
324 */
325
326void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst);
327
328/**
329 * @brief Upsamples a ggml tensor using nearest neighbor interpolation using
330 * the CANN backend.
331 *
332 * @details This function performs upsampling of the input tensor `src` using
333 * nearest neighbor interpolation. The upsampling is applied to the
334 * height and width dimensions (last two dimensions) of the tensor. The
335 * result is stored in the destination tensor `dst`, which must have
336 * the appropriate dimensions for the upsampled output.
337 *
338 * @param ctx The CANN context used for operations.
339 * @param dst The destination tensor where the upsampled values will be stored.
340 * dst->op is `GGML_OP_UPSCALE`.
341 */
342void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
343
344/**
345 * @brief Pads a ggml tensor to match the dimensions of the destination tensor
346 * using the CANN backend.
347 *
348 * @details This function pads the input tensor `src` so that it matches the
349 * dimensions of the destination tensor `dst`. The amount of padding
350 * is calculated based on the difference in sizes between `src` and
351 * `dst` along each dimension. The padded tensor is stored in `dst`.
352 *
353 * @param ctx The CANN context used for operations.
354 * @param dst The destination tensor, which specifies the target dimensions for
355 * padding. dst->op is `GGML_OP_PAD`.
356 */
357void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst);
358
359/**
360 * @brief Executes a 2D pooling operation on a ggml tensor using the CANN
361 * backend.
362 *
363 * @details This function dispatches the execution of a 2D pooling operation on
364 * the input tensor `dst`. The type of pooling (average or max) is
365 * determined by the `op` parameter, which is read from the operation
366 * parameters of `dst`. The function supports average pooling
367 * (`GGML_OP_POOL_AVG`) and max pooling (`GGML_OP_POOL_MAX`). If an
368 * invalid operation is encountered, the function asserts a failure.
369 *
370 * @param ctx The CANN context used for operations.
371 * @param dst The destination tensor on which the pooling operation is to be
372 * performed. dst->op is `GGML_OP_POOL_2D`.
373 */
374void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
375
376/**
377 * @brief Duplicates a ggml tensor using the CANN backend.
378 *
379 * @details This function duplicates the contents of the source tensor `src` to
380 * the destination tensor `dst`. The function supports various tensor
381 * types and configurations, including handling of extra data, type
382 * conversions, and special cases for contiguous and non-contiguous
383 * tensors.
384 *
385 * @param ctx The CANN context used for operations.
386 * @param dst The destination tensor where the duplicated data will be stored.
387 * dst->op is `GGML_OP_DUP`
388 *
 * @attention Only supports FP16/FP32. Not supported when src and dst have
 * different shapes and dst is non-contiguous.
 * @note This function needs to be simplified.
392 */
393void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst);
394
395/**
396 * @brief Computes the Root Mean Square (RMS) normalization of a ggml tensor
397 * using the CANN backend.
398 *
399 * @details This function applies RMS normalization to the input tensor `src`
400 * and stores the result in the destination tensor `dst`. RMS
401 * normalization involves computing the root mean square of the input
402 * tensor along a specified dimension and then dividing each element of
403 * the tensor by this value, adjusted by a small epsilon value to
404 * prevent division by zero.
405 * The operation is defined as:
406 * \f[
407 * \text{RmsNorm}\left(x_i\right)=\frac{x_i}{\text{Rms}(\mathbf{x})} g_i,
408 * \quad \text { where } \text{Rms}(\mathbf{x})=\sqrt{\frac{1}{n} \sum_{i=1}^n x_i^2+e p s}
409 * \f]
410 * `eps` is in dst->op_params.
411 * @param ctx The CANN context used for operations.
412 * @param dst The destination tensor where the normalized values will be stored.
413 * dst->op is `GGML_OP_RMS_NORM`.
414 */
415void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst);
416
417/**
418 * @brief Applies a diagonal mask to the tensor with a specified value.
419 *
420 * @details This function creates a mask tensor filled with ones, then applies
421 * an upper triangular and lower triangular operation to it based on
422 * the number of past elements specified. Afterward, it adds the masked
423 * tensor to the destination tensor in-place.
424 *
425 * @param ctx The backend CANN context used for operations.
426 * @param dst The destination tensor where the result will be stored. dst->op is
427 * `GGML_OP_DIAG_MASK`
428 * @param value The value to use for masking.
429 */
430void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value);
431
432/**
433 * @brief Performs an image-to-column transformation on the input tensor.
434 *
435 * @details This function takes an input tensor and applies an image-to-column
436 * operation, converting spatial dimensions into column-like
437 * structures suitable for convolutional operations. It supports both
438 * half-precision (F16) and single-precision (F32) floating-point data
439 * types.
440 *
441 * @param ctx The backend CANN context for executing operations.
442 * @param dst The destination tensor that stores the result of the operation.
443 * dst->op is `GGML_OP_IM2COL`.
444 */
445void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst);
446
447/**
448 * @brief Computes time step embeddings using sine and cosine functions.
449 *
450 * @details This function calculates time step embeddings by applying sine and
451 * cosine transformations to a given input tensor, which is typically
452 * used in temporal models like diffusion models or transformers to
453 * encode time information effectively.
454 *
455 * @param ctx The backend CANN context for executing operations.
456 * @param dst The destination tensor where the result of the embedding operation
457 * will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
458 */
459void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst);
460
461// @see ggml_cann_dup.
462void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst);
463
464/**
465 * @brief Computes the softmax activation with optional masking.
466 *
467 * @details This function computes the softmax activation over the input tensor,
468 * optionally applying a mask and scaling factor. It supports both FP16
469 * and FP32 data types and can handle masking by broadcasting the mask
470 * across rows if necessary.
471 * The function performs the following steps:
472 * 1. Multiplies the input tensor by a scale factor.
473 * 2. Optionally casts the mask tensor to FP32 if it is in FP16 format.
474 * 3. Broadcasts the mask tensor if its dimensions do not match the
475 * input tensor's dimensions.
476 * 4. Adds the mask to the scaled input tensor.
477 * 5. Applies the softmax activation function along the specified
478 * dimension.
479 *
480 * @param ctx The backend CANN context for executing operations.
481 * @param dst The destination tensor where the result will be stored. dst->op is
482 * `GGML_OP_SOFTMAX`.
483 */
484void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);
485
486/**
487 * @brief Extracts specific rows from a tensor based on indices.
488 *
489 * @details This function retrieves rows from a source tensor src0 according to
490 * the indices provided in another tensor src1 and stores the result in
491 * a destination tensor (\p dst).
492 *
493 * @param ctx The backend CANN context for executing operations.
494 * @param dst The destination tensor where the extracted rows will be stored.
495 */
496void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
497
498/**
499 * @brief Writes specific rows into a tensor at positions specified by indices.
500 *
501 * @details This function copies rows from a source tensor into a destination
502 * tensor (\p dst) at the positions indicated by the indices in another
503 * tensor.
504 *
505 * @param ctx The backend CANN context for executing operations.
506 * @param dst The destination tensor where the specified rows will be updated.
507 */
508void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
509
510/**
511 * @brief Executes matrix multiplication for the given tensor.
512 *
513 * @details This function performs matrix multiplication on the source tensors
514 * associated with the destination tensor. It supports matrix
515 * multiplication F32, F16, and Q8_0.
516 *
517 * @param ctx The backend CANN context for executing operations.
518 * @param dst The destination tensor for storing the result of the matrix
519 * multiplication. dst->op is `GGML_OP_MUL_MAT`.
520 */
521void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
522
523/**
524 * @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
525 *
526 * @details This function implements the RoPE mechanism, which is a method to
527 * encode positional information into sequence data, particularly
528 * useful in transformer models. It supports both F32 and F16 data
529 * types.
530 *
531 * @param ctx The backend CANN context for executing operations.
532 * @param dst The destination tensor where the RoPE-transformed data will be
533 * stored. dst->op is `GGML_OP_ROPE`.
534 *
535 * @note The function currently does not support cases where the n_dims is less
536 * than the input tensor's first dimension.
537 * @note The function currently does not support cases where the freq_factors is
538 * not NULL.
539 * @note The function currently does not support cases where the ext_factor is
540 * not equal 0.
541 * @note The function currently does not support cases where the freq_scale is
542 * not equal 1.
543 */
544void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst);
545
546/**
547 * @brief Computes the index of the maximum value along the specified dimension
548 * of a ggml tensor using the CANN backend.
549 *
550 * @details This function performs an argmax operation on the input tensor.
551 * It finds the index of the maximum value along the specified axis
552 * and stores these indices in the destination tensor `dst`. The
553 * operation is executed using the CANN backend for optimized performance.
554 *
555 * @param ctx The CANN context used for operations.
556 * @param dst The destination tensor where the indices of the maximum values will
557 * be stored. dst->op is `GGML_OP_ARGMAX`.
558 */
559void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst);
560
561/**
562 * @brief Adds two tensors element-wise and stores the result in a destination
563 * tensor.
564 *
565 * This function performs the operation:
566 * \f[
567 * dst = acl\_src0 + alpha \times acl\_src1
568 * \f]
569 * where alpha is a scalar value and defaults to 1.0f.
570 *
571 * @param ctx The context for the CANN backend operations.
572 * @param acl_src0 The first source tensor.
573 * @param acl_src1 The second source tensor.
574 * @param acl_dst The destination tensor where the result will be stored.
575 */
576void aclnn_add(ggml_backend_cann_context & ctx,
577 aclTensor * acl_src0,
578 aclTensor * acl_src1,
579 aclTensor * acl_dst = nullptr);
580
581/**
 * @brief Subtracts two tensors element-wise and stores the result in a
 * destination tensor.
584 *
585 * This function performs the operation:
586 * \f[
587 * dst = acl\_src0 - alpha \times acl\_src1
588 * \f]
589 * where alpha is a scalar value and defaults to 1.0f.
590 *
591 * @param ctx The context for the CANN backend operations.
592 * @param acl_src0 The first source tensor.
593 * @param acl_src1 The second source tensor.
594 * @param acl_dst The destination tensor where the result will be stored.
595 */
596void aclnn_sub(ggml_backend_cann_context & ctx,
597 aclTensor * acl_src0,
598 aclTensor * acl_src1,
599 aclTensor * acl_dst = nullptr);
600
601/**
602 * @brief Performs element-wise multiplication of two tensors and stores the
603 * result in a destination tensor.
604 *
605 * This function performs element-wise multiplication of the tensors `acl_src`
606 * and `acl_other` and stores the result in the destination tensor `acl_dst`.
607 * The operation is defined as:
608 * \f[
609 * \text {acl_dst }_i=\text {acl_src }_i \times \text {acl_other }_i
610 * \f]
611 *
612 * @param ctx The context for the CANN backend operations.
613 * @param acl_src The first tensor for element-wise multiplication.
614 * @param acl_other The second tensor for element-wise multiplication.
615 * @param acl_dst The destination tensor where the result will be stored.
616 */
617void aclnn_mul(ggml_backend_cann_context & ctx,
618 aclTensor * acl_src,
619 aclTensor * acl_other,
620 aclTensor * acl_dst = nullptr);
621
622/**
 * @brief Element-wise tensor division, optionally in-place.
 *
 * This function divides each element of the source tensor `acl_src` by the
 * corresponding element of `acl_other` and stores the result in the
 * destination tensor `acl_dst`. The operation is defined as:
 * \f[
 * \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
 * \f]
 *
 * @param ctx The context for the CANN backend operations.
 * @param acl_src Numerator tensor.
 * @param acl_other Denominator tensor.
 * @param acl_dst The destination tensor where the result will be stored.
 *                If null (the default), the operation is presumably performed
 *                in-place on `acl_src` — verify against the implementation.
639 */
640void aclnn_div(ggml_backend_cann_context & ctx,
641 aclTensor * acl_src,
642 aclTensor * acl_other,
643 aclTensor * acl_dst = nullptr);
644
645/**
646 * @brief Applies element-wise cosine function to the elements of a tensor.
647 *
648 * This function computes the cosine of each element in the source tensor
649 * `acl_src` and stores the result in the destination tensor `acl_dst`. The
650 * operation is defined as: \f[ \text {acl_dst }_i=\cos \left(\text {acl_src
651 * }_i\right) \f]
652 *
653 * @param ctx The context for the CANN backend operations.
654 * @param acl_src The source tensor on which the cosine function will be
655 * applied.
656 * @param acl_dst The destination tensor where the cosine results will be
657 * stored.
658 */
659void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);
660
661/**
662 * @brief Applies element-wise sine function to the elements of a tensor.
663 *
 * This function computes the sine of each element in the source tensor
 * `acl_src` and stores the result in the destination tensor `acl_dst`.
 * The operation is defined as:
 * \f[
 * \text {acl_dst }_i=\sin \left(\text {acl_src }_i\right)
 * \f]
 *
672 * @param ctx The context for the CANN backend operations.
673 * @param acl_src The source tensor on which the sine function will be applied.
674 * @param acl_dst The destination tensor where the sine results will be stored.
675 */
676void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst);
677
678/**
679 * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
680 * output tensor.
681 *
682 * This function checks whether broadcasting is needed between `src0` and `src1`.
683 * If broadcasting is required, it calculates the proper shapes and creates
684 * ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors
685 * based on the original tensor shapes.
686 *
687 * @param src0 The first input tensor (reference shape).
688 * @param src1 The second input tensor (possibly broadcasted).
689 * @param dst The destination/output tensor.
690 * @param acl_src0 Output pointer to the created ACL tensor corresponding to src0.
691 * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
692 * @param acl_dst Output pointer to the created ACL tensor corresponding to dst.
693 */
694void bcast_shape(ggml_tensor * src0,
695 ggml_tensor * src1,
696 ggml_tensor * dst,
697 acl_tensor_ptr & acl_src0,
698 acl_tensor_ptr & acl_src1,
699 acl_tensor_ptr & acl_dst);
700
701/**
702 * @brief Computes the 1D transposed convolution (deconvolution) of a ggml
703 * tensor using the CANN backend.
704 *
705 * @details This function performs a 1D transposed convolution (also known as
706 * deconvolution) operation on the input tensor. The computed result is stored
707 * in the destination tensor `dst`. The operation is optimized using the CANN
708 * backend for improved performance.
709 *
710 * @param ctx The CANN context used for operations.
711 * @param dst The destination tensor where the transposed convolution result
712 * will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
713 */
714void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
715
716/**
717 * @brief Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
718 * using the CANN backend.
719 *
720 * @details This function performs an element-wise ELU activation on the input
721 * tensor.
722 * The result is written to the destination tensor `dst` in-place.
723 * The ELU function is defined as:
724 *
725 * \text{ELU}(x) =
726 * \begin{cases}
727 * x, & \text{if } x > 0 \\
728 * \alpha \left( \exp(x) - 1 \right), & \text{if } x \leq 0
729 * \end{cases}
730 *
 * where α (alpha) is a hyperparameter, typically set to 1.0.
732 * This operation is optimized using the CANN backend for high-performance
733 * inference or training.
734 *
735 * @param ctx The CANN context used for operations.
736 * @param dst The destination tensor where the ELU-activated result will be stored.
737 * dst->op is expected to be `GGML_OP_ELU`.
738 */
739void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
740
741/**
 * @brief Computes the mean of a ggml tensor using the CANN backend.
 *
 * @details This function calculates the mean of the input tensor.
 * The result is written to the destination tensor `dst`.
 * The mean is computed by averaging the values across the tensor.
747 *
748 * This operation is optimized using the CANN backend for high-performance inference or training.
749 *
750 * @param ctx The CANN context used for operations.
751 * @param dst The destination tensor where the mean result will be stored.
752 * dst->op is expected to be `GGML_OP_MEAN`.
753 */
754void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst);
755
756/**
757 * @brief Applies 1D reflect padding to a ggml tensor using the CANN backend.
758 *
759 * @details This function performs 1D reflect padding on the input tensor.
760 * The amount of padding on each side is specified by parameters stored in `dst->op_params`.
761 * The operation reflects the values at the borders of the tensor to generate the padded output.
762 *
763 * This operation is optimized using the CANN backend for high-performance inference or training.
764 *
765 * @param ctx The CANN context used for operations.
766 * @param dst The destination tensor where the padded result will be stored.
767 * dst->op is expected to be `GGML_OP_PAD_REFLECT_1D`.
768 */
769void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst);
770
771/**
772 * @brief Counts the number of equal elements in two ggml tensors using the CANN backend.
773 *
774 * @details This function performs an element-wise comparison between two input tensors,
775 * and counts the number of positions where the elements are equal. The result is
776 * stored in the destination tensor `dst` as a scalar.
777 *
778 * The operation is optimized using the CANN backend, making it suitable for
779 * high-performance inference or training scenarios.
780 *
781 * @param ctx The CANN context used for operations.
782 * @param dst The destination tensor where the result will be stored.
783 * dst->op is expected to be `GGML_OP_COUNT_EQUAL`.
784 */
785void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst);
786
787/**
788 * @brief Applies the Step activation function to a ggml tensor using the CANN backend.
789 *
790 * @details This function applies a step function element-wise to the input tensor, where
791 * each element is transformed to 1.0 if it is greater than 0, and 0.0 otherwise.
792 * The result is stored in the destination tensor `dst`.
793 *
794 * This operation is accelerated using the CANN backend to improve runtime performance.
795 *
796 * @param ctx The CANN context used for operations.
797 * @param dst The destination tensor where the result will be stored.
798 * dst->op is expected to be `GGML_OP_STEP`.
799 */
800void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst);
801
802/**
803 * @brief Performs the Flash Attention extended operator using the CANN backend.
804 *
805 * @details This function implements the memory-efficient Flash Attention algorithm
806 * for computing scaled dot-product attention with hardware acceleration.
807 * The result is stored in the destination tensor `dst`.
808 *
809 * This operation is accelerated using the CANN backend to improve runtime performance.
810 *
811 * @param ctx The CANN context used for operations.
812 * @param dst The destination tensor where the result will be stored.
813 * dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`.
814 */
815void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst);
816
817/**
818 * @brief Forward Gated Linear Attention on the CANN backend.
819 *
820 * Expects dst->src[0..4] = {k, v, q, g, s} with shape conventions:
821 * k, v, q, g: [D] with outer dims T x H batched as ne[2]=T, ne[1]=H
822 * s: initial state [B, H, D, D], where B is batch and D=C/H
823 * dst holds both outputs (o) and updated state; a scale factor is read from op params.
824 *
 * The kernel updates per time step l: S_new = g ⊙ S_old + k ⊗ v (element-wise gate on the old state plus an outer-product update), then computes o = (S_new^T q) * scale.
826 *
827 * @param ctx Backend context providing stream/allocator utilities.
828 * @param dst Output tensor; src deps are k, v, q, g, s as above.
829 */
830void ggml_cann_gated_linear_attn(ggml_backend_cann_context & ctx, ggml_tensor * dst);
831
832/**
833 * @brief Launches an asynchronous task using the memory allocator.
834 *
 * This macro submits an asynchronous task on the specified stream.
836 * The task uses memory allocated by the allocator. It is guaranteed
837 * that the memory will not be accessed by other tasks until this task
838 * completes, due to the sequential execution order within the same stream.
839 *
840 * @param OP_NAME aclnn operator name.
841 * @param args Additional arguments required by the task.
842 *
843 * @note
844 * Memory from the allocator will be "freed" immediately and can be
845 * reallocated to other pointers. However, it won't be accessed by any
846 * other task before this asynchronous task ends, because all tasks in the
847 * same stream are executed in queue order.
848 */
849
# define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...) \
    do { \
        uint64_t workspaceSize = 0; \
        aclOpExecutor * executor; /* set by the GetWorkspaceSize call below */ \
        void * workspaceAddr = nullptr; \
        /* Phase 1: query the required workspace size and build the operator executor. */ \
        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
        /* Workspace should be allocated in the main thread to keep malloc order when using vmm. */ \
        if (workspaceSize > 0) { \
            /* The allocator returns the block to the pool on scope exit; per the note */ \
            /* above, in-stream execution order keeps it safe until this task finishes. */ \
            ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize); \
            workspaceAddr = workspace_allocator.get(); \
        } \
        /* Phase 2: launch the operator asynchronously on the context stream. */ \
        ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream())); \
    } while (0)
863
864/**
865 * @brief Performs sparse expert-based matrix multiplication using the CANN backend.
866 *
867 * @details This function implements a MoE-style batched matrix multiplication, where each input token
868 * is routed to one or more experts, and each expert corresponds to a specific [D, M] weight matrix
869 * in the source tensor `src0`. The routing indices are provided via the `ids` tensor.
870 *
871 * For each token (from `src1`), the function selects the corresponding expert(s) as specified by `ids`,
872 * performs the matrix multiplication with the selected expert's weight submatrix (from `src0`),
873 * and stores the results in `dst`. This operation is optimized and executed on the CANN backend.
874 *
875 * Dimensions:
876 * - src0: [D, M, A, 1], where A is the number of experts
877 * - src1: [D, B, N, 1], where N is batch size and B is the slot count per sample
878 * - ids : [K, N], where K is the number of experts each token is routed to
879 * - dst : [M, K, N, 1], output tensor storing the result of expert ร token multiplication
880 *
881 * The function handles two main modes:
882 * - If `ne12 == 1`, a simpler per-token loop is used.
883 * - TODO: If `ne12 > 1`, grouped multiplication and memory copying is used for efficiency.
884 *
885 * @param ctx The CANN context used for operations.
886 * @param dst The destination tensor where the expert-weighted token outputs are stored.
887 * Expected to be of shape [M, K, N, 1].
888 */
889void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst);
890
891/**
892 * @brief Performs fused ADD + RMS_NORM operation using the CANN backend.
893 *
894 * This function fuses the ADD and RMS_NORM operations into a single kernel call
895 * for better performance. It first adds two input tensors (x1 + x2), then applies
896 * RMS normalization to the result.
897 *
898 * @param ctx The context for the CANN backend operations.
 * @param add_node The ADD operation node, contains the two input tensors to be added.
 * @param rms_norm_node The RMS_NORM operation node, contains the gamma weights
 *        and epsilon parameter.
902 */
903void ggml_cann_op_add_rms_norm_fused(ggml_backend_cann_context & ctx,
904 ggml_tensor * add_node,
905 ggml_tensor * rms_norm_node);
906
907/**
908 * @brief Check whether a tensor is a weight tensor for matrix multiplication.
909 *
910 * @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations,
911 * typically within neural network layers. The function maintains a static set of canonical weight
912 * naming suffixes from Transformer-based architectures. Uses substring matching to identify weight
913 * tensors even with hierarchical naming patterns.
914 *
915 * @param tensor Pointer to the target ggml_tensor object (const-qualified).
916 */
917static bool is_matmul_weight(const ggml_tensor * tensor) {
918 std::string name = ggml_get_name(tensor);
919 static const std::unordered_set<std::string> weight_suffixes{ "output.weight", "attn_q.weight",
920 "attn_k.weight", "attn_v.weight",
921 "attn_output.weight", "ffn_gate.weight",
922 "ffn_up.weight", "ffn_down.weight" };
923
924 for (const auto & suffix : weight_suffixes) {
925 if (name.find(suffix) != std::string::npos) {
926 return true;
927 }
928 }
929 return false;
930}
931
932/**
 * @brief Applies an element-wise operation to two input tensors using the CANN
 * backend.
935 *
936 * This templated function takes a binary operator and applies it to two source
937 * tensors
938 * associated with the destination tensor. The function handles broadcasting as
939 * needed.
940 *
 * @tparam binary_op A callable object (e.g., lambda or function pointer) representing
 * the binary operation to be performed. It must take four arguments:
 * (ggml_backend_cann_context&, aclTensor*, aclTensor*, aclTensor*).
944 *
945 * @param ctx The CANN backend context used to manage execution and resources.
946 * @param dst The destination tensor.
947 */
948template <auto binary_op> void ggml_cann_binary_op(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
949 ggml_tensor * src0 = dst->src[0];
950 ggml_tensor * src1 = dst->src[1];
951
952 acl_tensor_ptr acl_src0, acl_src1, acl_dst;
953
954 // Need bcast
955 bcast_shape(src0, src1, dst, acl_src0, acl_src1, acl_dst);
956 binary_op(ctx, acl_src0.get(), acl_src1.get(), acl_dst.get());
957}
958
959/**
960 * @brief Applies a unary operation to an input tensor using the CANN backend.
961 *
962 * This templated function applies a unary operator to the source tensor of `dst`
963 * and stores the result in the destination tensor.
964 *
965 * @tparam unary_op A callable with the signature:
966 * void(ggml_backend_cann_context&, aclTensor *, aclTensor *)
967 * where the first aclTensor is the source and the second is the destination.
968 * @param ctx The CANN backend context for managing resources and execution.
969 * @param dst The destination tensor. Its src[0] is treated as the input tensor.
970 */
971template <void unary_op(ggml_backend_cann_context &, aclTensor *, aclTensor *)>
972void ggml_cann_op_unary(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
973 ggml_tensor * src = dst->src[0];
974
975 acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
976 acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
977
978 unary_op(ctx, acl_src.get(), acl_dst.get());
979}
980
981/**
982 * @brief Applies a unary operation to a ggml tensor using the CANN backend.
983 *
984 * @details This function applies a unary operation to the input tensor using
985 * a user-provided lambda or callable `unary_op`. The lambda receives the
986 * CANN backend context and two ACL tensors: the source and the destination.
987 *
988 * Internally, this function handles the conversion from GGML tensors to ACL tensors,
989 * calls the provided unary op, and manages resource cleanup. The input is assumed
990 * to be `dst->src[0]`, and the result is written to `dst`.
991 *
992 * This utility simplifies writing unary op wrappers by abstracting tensor preparation.
993 *
994 * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
995 * @param ctx The CANN context for operation execution.
996 * @param dst The destination ggml_tensor where the result will be stored.
997 * The input tensor is assumed to be `dst->src[0]`.
998 *
999 * @see GGML_CANN_CALL_OP_UNARY
1000 */
1001void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
1002 ggml_backend_cann_context & ctx,
1003 ggml_tensor * dst);
1004
1005void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst);
1006
1007/**
1008 * @brief Applies a gated (GLU-style) unary operation using the CANN backend.
1009 *
1010 * @details This function performs a gated activation such as GEGLU or ReGLU.
1011 * It supports two input modes:
1012 *
1013 * 1. **Dual input mode**: `dst->src[0]` and `dst->src[1]` are both valid tensors.
1014 * These are used directly as the value and gate tensors.
1015 *
1016 * 2. **Packed input mode**: Only `dst->src[0]` is valid, and it is assumed to
1017 * contain a concatenation of value and gate along the first dimension. This tensor
1018 * will be split into two equal halves to form the value and gate inputs.
1019 *
1020 * The function applies a user-provided unary operation (e.g., GELU) to the value tensor,
1021 * then multiplies the result in-place with the gate tensor:
1022 *
1023 * @code
1024 * dst = unary_op(value) * gate;
1025 * @endcode
1026 *
1027 * The `swapped` parameter (from `dst->op_params[1]`) allows flipping the
1028 * order of value/gate in the packed input case.
1029 *
1030 * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
1031 * It receives (ctx, acl_value_tensor, acl_output_tensor).
1032 * @param ctx The CANN context used for execution.
1033 * @param dst The destination ggml_tensor. Source tensors are in `dst->src[0]` and optionally `src[1]`.
1034 *
1035 * @see GGML_CANN_CALL_OP_UNARY_GATED
1036 */
1037void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
1038 ggml_backend_cann_context & ctx,
1039 ggml_tensor * dst);
1040
1041/**
1042 * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
1043 *
1044 * This macro wraps the specified ACLNN unary operator name into a lambda expression,
1045 * and passes it to `ggml_cann_op_unary`, which handles the common logic for executing
1046 * unary ops in the CANN backend.
1047 *
1048 * Internally, this macro expands to a lambda like:
1049 * @code
1050 * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
1051 * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
1052 * };
1053 * @endcode
1054 *
1055 * This lambda is then passed to `ggml_cann_op_unary`, which applies the operation.
1056 *
1057 * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
1058 *
1059 * @see ggml_cann_op_unary
1060 * @see GGML_CANN_CALL_ACLNN_OP
1061 */
// NOTE: this macro expands unhygienically — it expects `ctx`
// (ggml_backend_cann_context &) and `dst` (ggml_tensor *) to be
// visible by those exact names at the expansion site.
# define GGML_CANN_CALL_OP_UNARY(OP_NAME) \
    do { \
        auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
        }; \
        ggml_cann_op_unary(lambda, ctx, dst); \
    } while (0)
1069
1070/**
1071 * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
1072 *
1073 * This macro wraps the specified ACLNN unary operator name into a lambda expression,
1074 * and passes it to `ggml_cann_op_unary_gated`, which handles the common logic for
1075 * executing gated unary ops in the CANN backend.
1076 *
1077 * Internally, this macro expands to a lambda like:
1078 * @code
1079 * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
1080 * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
1081 * };
1082 * @endcode
1083 *
1084 * This lambda is then passed to `ggml_cann_op_unary_gated`, which applies the operation.
1085 *
1086 * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
1087 *
1088 * @see ggml_cann_op_unary_gated
1089 * @see GGML_CANN_CALL_ACLNN_OP
1090 */
// NOTE: this macro expands unhygienically — it expects `ctx`
// (ggml_backend_cann_context &) and `dst` (ggml_tensor *) to be
// visible by those exact names at the expansion site.
# define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME) \
    do { \
        auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \
            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
        }; \
        ggml_cann_op_unary_gated(lambda, ctx, dst); \
    } while (0)
1098
1099#endif // CANN_ACLNN_OPS
1100
1101/**
1102 * @brief Performs outer product operation on two ggml tensors using the CANN backend.
1103 *
1104 * @details This function computes the outer product of two input tensors (src0 and src1)
1105 * and stores the result in the destination tensor. The outer product operation is defined as:
1106 * dst[i,j,k,l] = sum_m (src0[i,m,k,l] * src1[j,m,k,l])
1107 *
1108 * The function supports multiple data types including F32, F16. For floating-point
1109 * types, it uses batch matrix multiplication for efficient computation.
1110 *
1111 * The implementation handles 4D tensor broadcasting and batch processing automatically.
1112 *
1113 * @param ctx The CANN backend context for operation execution and memory management.
1114 * @param dst The destination ggml_tensor where the outer product result will be stored.
1115 * The input tensors are assumed to be `dst->src[0]` and `dst->src[1]`.
1116 *
1117 * @see GGML_CANN_CALL_ACLNN_OP for CANN operator invocation
1118 */
1119void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst);