1#pragma once
2
3//
4// GGML Tensor Library
5//
6// This documentation is still a work in progress.
7// If you wish some specific topics to be covered, feel free to drop a comment:
8//
9// https://github.com/ggml-org/whisper.cpp/issues/40
10//
11// ## Overview
12//
13// This library implements:
14//
15// - a set of tensor operations
16// - automatic differentiation
17// - basic optimization algorithms
18//
19// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
20// but is not limited to, the following:
21//
22// - linear regression
23// - support vector machines
24// - neural networks
25//
26// The library allows the user to define a certain function using the available tensor operations. This function
27// definition is represented internally via a computation graph. Each tensor operation in the function definition
28// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
29// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
30// using one of the available optimization algorithms.
31//
32// For example, here we define the function: f(x) = a*x^2 + b
33//
34// {
35// struct ggml_init_params params = {
36// .mem_size = 16*1024*1024,
37// .mem_buffer = NULL,
38// };
39//
40// // memory allocation happens here
41// struct ggml_context * ctx = ggml_init(params);
42//
43// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
44//
45// ggml_set_param(ctx, x); // x is an input variable
46//
47// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
48// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
49// struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
50// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
51//
52// ...
53// }
54//
55// Notice that the function definition above does not involve any actual computation. The computation is performed only
56// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
57//
58// {
59// ...
60//
61// struct ggml_cgraph * gf = ggml_new_graph(ctx);
62// ggml_build_forward_expand(gf, f);
63//
64// // set the input variable and parameter values
65// ggml_set_f32(x, 2.0f);
66// ggml_set_f32(a, 3.0f);
67// ggml_set_f32(b, 4.0f);
68//
69// ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
70//
71// printf("f = %f\n", ggml_get_f32_1d(f, 0));
72//
73// ...
74// }
75//
76// The actual computation is performed in the ggml_graph_compute() function.
77//
78// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
79// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
80// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
81// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
82// actually needed.
83//
84// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
85// differentiation and optimization algorithms.
86//
87// The described approach allows to define the function graph once and then compute its forward or backward graphs
88// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
89// the user can avoid the memory allocation overhead at runtime.
90//
91// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
92// citizens, but in theory the library can be extended to support FP8 and integer data types.
93//
94// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
95// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
96// clear that the library needs to support more complex operations. The way to support these operations is not clear
97// yet, but a few examples are demonstrated in the following operations:
98//
99// - ggml_permute()
100// - ggml_conv_1d_1s()
101// - ggml_conv_1d_2s()
102//
103// For each tensor operator, the library implements a forward and backward computation function. The forward function
104// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
105// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
106// calculus class, or watch the following video:
107//
108// What is Automatic Differentiation?
109// https://www.youtube.com/watch?v=wG_nF1awSSY
110//
111//
112// ## Tensor data (struct ggml_tensor)
113//
114// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
115// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
116// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
117//
118// {
119// struct ggml_tensor * c = ggml_add(ctx, a, b);
120//
121// assert(c->src[0] == a);
122// assert(c->src[1] == b);
123// }
124//
125// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
126// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
127// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
128// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
129// contiguous in memory.
130//
131// The data of the tensor is accessed via the "data" pointer. For example:
132//
133// {
134// const int nx = 2;
135// const int ny = 3;
136//
137// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
138//
139// for (int y = 0; y < ny; y++) {
140// for (int x = 0; x < nx; x++) {
141// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
142// }
143// }
144//
145// ...
146// }
147//
148// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
149//
150// ## The matrix multiplication operator (ggml_mul_mat)
151//
152// TODO
153//
154//
155// ## Multi-threading
156//
157// TODO
158//
159//
160// ## Overview of ggml.c
161//
162// TODO
163//
164//
165// ## SIMD optimizations
166//
167// TODO
168//
169//
170// ## Debugging ggml
171//
172// TODO
173//
174//
175
176#ifdef GGML_SHARED
177# if defined(_WIN32) && !defined(__MINGW32__)
178# ifdef GGML_BUILD
179# define GGML_API __declspec(dllexport) extern
180# else
181# define GGML_API __declspec(dllimport) extern
182# endif
183# else
184# define GGML_API __attribute__ ((visibility ("default"))) extern
185# endif
186#else
187# define GGML_API extern
188#endif
189
190// TODO: support for clang
191#ifdef __GNUC__
192# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
193#elif defined(_MSC_VER)
194# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
195#else
196# define GGML_DEPRECATED(func, hint) func
197#endif
198
199#ifndef __GNUC__
200# define GGML_ATTRIBUTE_FORMAT(...)
201#elif defined(__MINGW32__) && !defined(__clang__)
202# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
203#else
204# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
205#endif
206
207#if defined(_WIN32) && !defined(_WIN32_WINNT)
208# define _WIN32_WINNT 0x0A00
209#endif
210
211#include <stdbool.h>
212#include <stddef.h>
213#include <stdint.h>
214#include <stdio.h>
215
216#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
217#define GGML_FILE_VERSION 2
218
219#define GGML_QNT_VERSION 2 // bump this on quantization format changes
220#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
221
222#define GGML_MAX_DIMS 4
223#define GGML_MAX_PARAMS 2048
224#define GGML_MAX_SRC 10
225#define GGML_MAX_N_THREADS 512
226#define GGML_MAX_OP_PARAMS 64
227
228#ifndef GGML_MAX_NAME
229# define GGML_MAX_NAME 64
230#endif
231
232#define GGML_DEFAULT_N_THREADS 4
233#define GGML_DEFAULT_GRAPH_SIZE 2048
234
235#if UINTPTR_MAX == 0xFFFFFFFF
236 #define GGML_MEM_ALIGN 4
237#elif defined(__EMSCRIPTEN__)
238// emscripten uses max_align_t == 8, so we need GGML_MEM_ALIGN == 8 for 64-bit wasm.
239// (for 32-bit wasm, the first conditional is true and GGML_MEM_ALIGN stays 4.)
240// ref: https://github.com/ggml-org/llama.cpp/pull/18628
241 #define GGML_MEM_ALIGN 8
242#else
243 #define GGML_MEM_ALIGN 16
244#endif
245
246#define GGML_EXIT_SUCCESS 0
247#define GGML_EXIT_ABORTED 1
248
249// TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726
250#define GGML_ROPE_TYPE_NORMAL 0
251#define GGML_ROPE_TYPE_NEOX 2
252#define GGML_ROPE_TYPE_MROPE 8
253#define GGML_ROPE_TYPE_VISION 24
254#define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000
255
256#define GGML_MROPE_SECTIONS 4
257
258#define GGML_UNUSED(x) (void)(x)
259#ifdef __CUDACC__
260template<typename... Args>
261__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
262#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
263#else
264#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
265#endif // __CUDACC__
266
267#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
268
269#ifndef NDEBUG
270# define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
271#elif defined(__GNUC__)
272# define GGML_UNREACHABLE() __builtin_unreachable()
273#elif defined(_MSC_VER)
274# define GGML_UNREACHABLE() __assume(0)
275#else
276# define GGML_UNREACHABLE() ((void) 0)
277#endif
278
279#ifdef __cplusplus
280# define GGML_NORETURN [[noreturn]]
281#elif defined(_MSC_VER)
282# define GGML_NORETURN __declspec(noreturn)
283#else
284# define GGML_NORETURN _Noreturn
285#endif
286
287#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
288#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
289
290// used to copy the number of elements and stride in bytes of tensors into local variables.
291// main purpose is to reduce code duplication and improve readability.
292//
293// example:
294//
295// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
296// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
297//
298#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
299 const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
300 GGML_UNUSED(prefix##0);
301#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
302 GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
303 const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
304 GGML_UNUSED(prefix##1);
305#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
306 GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
307 const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
308 GGML_UNUSED(prefix##2);
309#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
310 GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
311 const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
312 GGML_UNUSED(prefix##3);
313
314#define GGML_TENSOR_UNARY_OP_LOCALS \
315 GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
316 GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
317 GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
318 GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
319
320#define GGML_TENSOR_BINARY_OP_LOCALS \
321 GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
322 GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
323 GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
324 GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
325 GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
326 GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
327
328#define GGML_TENSOR_TERNARY_OP_LOCALS \
329 GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
330 GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
331 GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
332 GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
333 GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
334 GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
335 GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
336 GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
337
338#define GGML_TENSOR_BINARY_OP_LOCALS01 \
339 GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
340 GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
341 GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
342 GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
343
344#ifdef __cplusplus
345extern "C" {
346#endif
347
348 // Function type used in fatal error callbacks
349 typedef void (*ggml_abort_callback_t)(const char * error_message);
350
351 // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
352 // Returns the old callback for chaining
353 GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
354
355 GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
356 GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
357
358 enum ggml_status {
359 GGML_STATUS_ALLOC_FAILED = -2,
360 GGML_STATUS_FAILED = -1,
361 GGML_STATUS_SUCCESS = 0,
362 GGML_STATUS_ABORTED = 1,
363 };
364
365 // get ggml_status name string
366 GGML_API const char * ggml_status_to_string(enum ggml_status status);
367
368 // ieee 754-2008 half-precision float16
369 // todo: make this not an integral type
370 typedef uint16_t ggml_fp16_t;
371 GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
372 GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
373 GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
374 GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
375
376 // google brain half-precision bfloat16
377 typedef struct { uint16_t bits; } ggml_bf16_t;
378 GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
379 GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
380 GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
381 GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
382 GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
383
384 struct ggml_object;
385 struct ggml_context;
386 struct ggml_cgraph;
387
388 // NOTE: always add types at the end of the enum to keep backward compatibility
389 enum ggml_type {
390 GGML_TYPE_F32 = 0,
391 GGML_TYPE_F16 = 1,
392 GGML_TYPE_Q4_0 = 2,
393 GGML_TYPE_Q4_1 = 3,
394 // GGML_TYPE_Q4_2 = 4, support has been removed
395 // GGML_TYPE_Q4_3 = 5, support has been removed
396 GGML_TYPE_Q5_0 = 6,
397 GGML_TYPE_Q5_1 = 7,
398 GGML_TYPE_Q8_0 = 8,
399 GGML_TYPE_Q8_1 = 9,
400 GGML_TYPE_Q2_K = 10,
401 GGML_TYPE_Q3_K = 11,
402 GGML_TYPE_Q4_K = 12,
403 GGML_TYPE_Q5_K = 13,
404 GGML_TYPE_Q6_K = 14,
405 GGML_TYPE_Q8_K = 15,
406 GGML_TYPE_IQ2_XXS = 16,
407 GGML_TYPE_IQ2_XS = 17,
408 GGML_TYPE_IQ3_XXS = 18,
409 GGML_TYPE_IQ1_S = 19,
410 GGML_TYPE_IQ4_NL = 20,
411 GGML_TYPE_IQ3_S = 21,
412 GGML_TYPE_IQ2_S = 22,
413 GGML_TYPE_IQ4_XS = 23,
414 GGML_TYPE_I8 = 24,
415 GGML_TYPE_I16 = 25,
416 GGML_TYPE_I32 = 26,
417 GGML_TYPE_I64 = 27,
418 GGML_TYPE_F64 = 28,
419 GGML_TYPE_IQ1_M = 29,
420 GGML_TYPE_BF16 = 30,
421 // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
422 // GGML_TYPE_Q4_0_4_8 = 32,
423 // GGML_TYPE_Q4_0_8_8 = 33,
424 GGML_TYPE_TQ1_0 = 34,
425 GGML_TYPE_TQ2_0 = 35,
426 // GGML_TYPE_IQ4_NL_4_4 = 36,
427 // GGML_TYPE_IQ4_NL_4_8 = 37,
428 // GGML_TYPE_IQ4_NL_8_8 = 38,
429 GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
430 GGML_TYPE_COUNT = 40,
431 };
432
433 // precision
434 enum ggml_prec {
435 GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
436 GGML_PREC_F32 = 10,
437 };
438
439 // model file types
440 enum ggml_ftype {
441 GGML_FTYPE_UNKNOWN = -1,
442 GGML_FTYPE_ALL_F32 = 0,
443 GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
444 GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
445 GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
446 GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
447 GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
448 GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
449 GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
450 GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
451 GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
452 GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
453 GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
454 GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
455 GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
456 GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
457 GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
458 GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
459 GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
460 GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
461 GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
462 GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
463 GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
464 GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
465 GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
466 };
467
468 // available tensor operations:
469 enum ggml_op {
470 GGML_OP_NONE = 0,
471
472 GGML_OP_DUP,
473 GGML_OP_ADD,
474 GGML_OP_ADD_ID,
475 GGML_OP_ADD1,
476 GGML_OP_ACC,
477 GGML_OP_SUB,
478 GGML_OP_MUL,
479 GGML_OP_DIV,
480 GGML_OP_SQR,
481 GGML_OP_SQRT,
482 GGML_OP_LOG,
483 GGML_OP_SIN,
484 GGML_OP_COS,
485 GGML_OP_SUM,
486 GGML_OP_SUM_ROWS,
487 GGML_OP_CUMSUM,
488 GGML_OP_MEAN,
489 GGML_OP_ARGMAX,
490 GGML_OP_COUNT_EQUAL,
491 GGML_OP_REPEAT,
492 GGML_OP_REPEAT_BACK,
493 GGML_OP_CONCAT,
494 GGML_OP_SILU_BACK,
495 GGML_OP_NORM, // normalize
496 GGML_OP_RMS_NORM,
497 GGML_OP_RMS_NORM_BACK,
498 GGML_OP_GROUP_NORM,
499 GGML_OP_L2_NORM,
500
501 GGML_OP_MUL_MAT,
502 GGML_OP_MUL_MAT_ID,
503 GGML_OP_OUT_PROD,
504
505 GGML_OP_SCALE,
506 GGML_OP_SET,
507 GGML_OP_CPY,
508 GGML_OP_CONT,
509 GGML_OP_RESHAPE,
510 GGML_OP_VIEW,
511 GGML_OP_PERMUTE,
512 GGML_OP_TRANSPOSE,
513 GGML_OP_GET_ROWS,
514 GGML_OP_GET_ROWS_BACK,
515 GGML_OP_SET_ROWS,
516 GGML_OP_DIAG,
517 GGML_OP_DIAG_MASK_INF,
518 GGML_OP_DIAG_MASK_ZERO,
519 GGML_OP_SOFT_MAX,
520 GGML_OP_SOFT_MAX_BACK,
521 GGML_OP_ROPE,
522 GGML_OP_ROPE_BACK,
523 GGML_OP_CLAMP,
524 GGML_OP_CONV_TRANSPOSE_1D,
525 GGML_OP_IM2COL,
526 GGML_OP_IM2COL_BACK,
527 GGML_OP_IM2COL_3D,
528 GGML_OP_CONV_2D,
529 GGML_OP_CONV_3D,
530 GGML_OP_CONV_2D_DW,
531 GGML_OP_CONV_TRANSPOSE_2D,
532 GGML_OP_POOL_1D,
533 GGML_OP_POOL_2D,
534 GGML_OP_POOL_2D_BACK,
535 GGML_OP_UPSCALE,
536 GGML_OP_PAD,
537 GGML_OP_PAD_REFLECT_1D,
538 GGML_OP_ROLL,
539 GGML_OP_ARANGE,
540 GGML_OP_TIMESTEP_EMBEDDING,
541 GGML_OP_ARGSORT,
542 GGML_OP_TOP_K,
543 GGML_OP_LEAKY_RELU,
544 GGML_OP_TRI,
545 GGML_OP_FILL,
546
547 GGML_OP_FLASH_ATTN_EXT,
548 GGML_OP_FLASH_ATTN_BACK,
549 GGML_OP_SSM_CONV,
550 GGML_OP_SSM_SCAN,
551 GGML_OP_WIN_PART,
552 GGML_OP_WIN_UNPART,
553 GGML_OP_GET_REL_POS,
554 GGML_OP_ADD_REL_POS,
555 GGML_OP_RWKV_WKV6,
556 GGML_OP_GATED_LINEAR_ATTN,
557 GGML_OP_RWKV_WKV7,
558 GGML_OP_SOLVE_TRI,
559
560 GGML_OP_UNARY,
561
562 GGML_OP_MAP_CUSTOM1,
563 GGML_OP_MAP_CUSTOM2,
564 GGML_OP_MAP_CUSTOM3,
565
566 GGML_OP_CUSTOM,
567
568 GGML_OP_CROSS_ENTROPY_LOSS,
569 GGML_OP_CROSS_ENTROPY_LOSS_BACK,
570 GGML_OP_OPT_STEP_ADAMW,
571 GGML_OP_OPT_STEP_SGD,
572
573 GGML_OP_GLU,
574
575 GGML_OP_COUNT,
576 };
577
578 enum ggml_unary_op {
579 GGML_UNARY_OP_ABS,
580 GGML_UNARY_OP_SGN,
581 GGML_UNARY_OP_NEG,
582 GGML_UNARY_OP_STEP,
583 GGML_UNARY_OP_TANH,
584 GGML_UNARY_OP_ELU,
585 GGML_UNARY_OP_RELU,
586 GGML_UNARY_OP_SIGMOID,
587 GGML_UNARY_OP_GELU,
588 GGML_UNARY_OP_GELU_QUICK,
589 GGML_UNARY_OP_SILU,
590 GGML_UNARY_OP_HARDSWISH,
591 GGML_UNARY_OP_HARDSIGMOID,
592 GGML_UNARY_OP_EXP,
593 GGML_UNARY_OP_EXPM1,
594 GGML_UNARY_OP_SOFTPLUS,
595 GGML_UNARY_OP_GELU_ERF,
596 GGML_UNARY_OP_XIELU,
597 GGML_UNARY_OP_FLOOR,
598 GGML_UNARY_OP_CEIL,
599 GGML_UNARY_OP_ROUND,
600 GGML_UNARY_OP_TRUNC,
601
602 GGML_UNARY_OP_COUNT,
603 };
604
605 enum ggml_glu_op {
606 GGML_GLU_OP_REGLU,
607 GGML_GLU_OP_GEGLU,
608 GGML_GLU_OP_SWIGLU,
609 GGML_GLU_OP_SWIGLU_OAI,
610 GGML_GLU_OP_GEGLU_ERF,
611 GGML_GLU_OP_GEGLU_QUICK,
612
613 GGML_GLU_OP_COUNT,
614 };
615
616 enum ggml_object_type {
617 GGML_OBJECT_TYPE_TENSOR,
618 GGML_OBJECT_TYPE_GRAPH,
619 GGML_OBJECT_TYPE_WORK_BUFFER
620 };
621
622 enum ggml_log_level {
623 GGML_LOG_LEVEL_NONE = 0,
624 GGML_LOG_LEVEL_DEBUG = 1,
625 GGML_LOG_LEVEL_INFO = 2,
626 GGML_LOG_LEVEL_WARN = 3,
627 GGML_LOG_LEVEL_ERROR = 4,
628 GGML_LOG_LEVEL_CONT = 5, // continue previous log
629 };
630
631 // this tensor...
632 enum ggml_tensor_flag {
633 GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
634 GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
635 GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
636 GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
637 GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed
638 };
639
640 enum ggml_tri_type {
641 GGML_TRI_TYPE_UPPER_DIAG = 0,
642 GGML_TRI_TYPE_UPPER = 1,
643 GGML_TRI_TYPE_LOWER_DIAG = 2,
644 GGML_TRI_TYPE_LOWER = 3
645 };
646
647 struct ggml_init_params {
648 // memory pool
649 size_t mem_size; // bytes
650 void * mem_buffer; // if NULL, memory will be allocated internally
651 bool no_alloc; // don't allocate memory for the tensor data
652 };
653
654 // n-dimensional tensor
655 struct ggml_tensor {
656 enum ggml_type type;
657
658 struct ggml_backend_buffer * buffer;
659
660 int64_t ne[GGML_MAX_DIMS]; // number of elements
661 size_t nb[GGML_MAX_DIMS]; // stride in bytes:
662 // nb[0] = ggml_type_size(type)
663 // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
664 // nb[i] = nb[i-1] * ne[i-1]
665
666 // compute data
667 enum ggml_op op;
668
669 // op params - allocated as int32_t for alignment
670 int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
671
672 int32_t flags;
673
674 struct ggml_tensor * src[GGML_MAX_SRC];
675
676 // source tensor and offset for views
677 struct ggml_tensor * view_src;
678 size_t view_offs;
679
680 void * data;
681
682 char name[GGML_MAX_NAME];
683
684 void * extra; // extra things e.g. for ggml-cuda.cu
685
686 char padding[8];
687 };
688
689 static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
690
691 // Abort callback
692 // If not NULL, called before ggml computation
693 // If it returns true, the computation is aborted
694 typedef bool (*ggml_abort_callback)(void * data);
695
696
697 //
698 // GUID
699 //
700
701 // GUID types
702 typedef uint8_t ggml_guid[16];
703 typedef ggml_guid * ggml_guid_t;
704
705 GGML_API bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b);
706
707 // misc
708
709 GGML_API const char * ggml_version(void);
710 GGML_API const char * ggml_commit(void);
711
712 GGML_API void ggml_time_init(void); // call this once at the beginning of the program
713 GGML_API int64_t ggml_time_ms(void);
714 GGML_API int64_t ggml_time_us(void);
715 GGML_API int64_t ggml_cycles(void);
716 GGML_API int64_t ggml_cycles_per_ms(void);
717
718 // accepts a UTF-8 path, even on Windows
719 GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
720
721 GGML_API void ggml_print_object (const struct ggml_object * obj);
722 GGML_API void ggml_print_objects(const struct ggml_context * ctx);
723
724 GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
725 GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
726 GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
727 GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
728
729 GGML_API int64_t ggml_blck_size(enum ggml_type type);
730 GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
731 GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
732
733 GGML_DEPRECATED(
734 GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
735 "use ggml_row_size() instead");
736
737 GGML_API const char * ggml_type_name(enum ggml_type type);
738 GGML_API const char * ggml_op_name (enum ggml_op op);
739 GGML_API const char * ggml_op_symbol(enum ggml_op op);
740
741 GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
742 GGML_API const char * ggml_glu_op_name(enum ggml_glu_op op);
743 GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
744
745 GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
746
747 GGML_API bool ggml_is_quantized(enum ggml_type type);
748
749 // TODO: temporary until model loading of ggml examples is refactored
750 GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
751
752 GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
753 GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
754 GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
755 GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
756 GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
757 GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
758 GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
759 GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
760
761 // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
762 GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
763 GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
764 GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
765 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
766
767 // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
768 GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
769
770 // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
771 GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
772
773 // true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
774 GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor);
775
776 GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
777 GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
778
779 GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
780
781 // use this to compute the memory overhead of a tensor
782 GGML_API size_t ggml_tensor_overhead(void);
783
784 GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
785
786 // main
787
788 GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
789 GGML_API void ggml_reset(struct ggml_context * ctx);
790 GGML_API void ggml_free (struct ggml_context * ctx);
791
792 GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
793
794 GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
795 GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
796
797 GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
798 GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
799 GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
800
801 GGML_API struct ggml_tensor * ggml_new_tensor(
802 struct ggml_context * ctx,
803 enum ggml_type type,
804 int n_dims,
805 const int64_t *ne);
806
807 GGML_API struct ggml_tensor * ggml_new_tensor_1d(
808 struct ggml_context * ctx,
809 enum ggml_type type,
810 int64_t ne0);
811
812 GGML_API struct ggml_tensor * ggml_new_tensor_2d(
813 struct ggml_context * ctx,
814 enum ggml_type type,
815 int64_t ne0,
816 int64_t ne1);
817
818 GGML_API struct ggml_tensor * ggml_new_tensor_3d(
819 struct ggml_context * ctx,
820 enum ggml_type type,
821 int64_t ne0,
822 int64_t ne1,
823 int64_t ne2);
824
825 GGML_API struct ggml_tensor * ggml_new_tensor_4d(
826 struct ggml_context * ctx,
827 enum ggml_type type,
828 int64_t ne0,
829 int64_t ne1,
830 int64_t ne2,
831 int64_t ne3);
832
833 GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
834
835 GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
836 GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
837
838 // Context tensor enumeration and lookup
839 GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
840 GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
841 GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
842
843 // Converts a flat index into coordinates
844 GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
845
846 GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
847 GGML_API enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor);
848
849 GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
850 GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
851
852 GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
853 GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
854 GGML_ATTRIBUTE_FORMAT(2, 3)
855 GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
856
857 // Tensor flags
858 GGML_API void ggml_set_input(struct ggml_tensor * tensor);
859 GGML_API void ggml_set_output(struct ggml_tensor * tensor);
860 GGML_API void ggml_set_param(struct ggml_tensor * tensor);
861 GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
862
863 //
864 // operations on tensors with backpropagation
865 //
866
867 GGML_API struct ggml_tensor * ggml_dup(
868 struct ggml_context * ctx,
869 struct ggml_tensor * a);
870
871 // in-place, returns view(a)
872 GGML_API struct ggml_tensor * ggml_dup_inplace(
873 struct ggml_context * ctx,
874 struct ggml_tensor * a);
875
876 GGML_API struct ggml_tensor * ggml_add(
877 struct ggml_context * ctx,
878 struct ggml_tensor * a,
879 struct ggml_tensor * b);
880
881 GGML_API struct ggml_tensor * ggml_add_inplace(
882 struct ggml_context * ctx,
883 struct ggml_tensor * a,
884 struct ggml_tensor * b);
885
886 GGML_API struct ggml_tensor * ggml_add_cast(
887 struct ggml_context * ctx,
888 struct ggml_tensor * a,
889 struct ggml_tensor * b,
890 enum ggml_type type);
891
892 // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
893 GGML_API struct ggml_tensor * ggml_add_id(
894 struct ggml_context * ctx,
895 struct ggml_tensor * a,
896 struct ggml_tensor * b,
897 struct ggml_tensor * ids);
898
899 GGML_API struct ggml_tensor * ggml_add1(
900 struct ggml_context * ctx,
901 struct ggml_tensor * a,
902 struct ggml_tensor * b);
903
904 GGML_API struct ggml_tensor * ggml_add1_inplace(
905 struct ggml_context * ctx,
906 struct ggml_tensor * a,
907 struct ggml_tensor * b);
908
909 // dst = a
910 // view(dst, nb1, nb2, nb3, offset) += b
911 // return dst
912 GGML_API struct ggml_tensor * ggml_acc(
913 struct ggml_context * ctx,
914 struct ggml_tensor * a,
915 struct ggml_tensor * b,
916 size_t nb1,
917 size_t nb2,
918 size_t nb3,
919 size_t offset);
920
921 GGML_API struct ggml_tensor * ggml_acc_inplace(
922 struct ggml_context * ctx,
923 struct ggml_tensor * a,
924 struct ggml_tensor * b,
925 size_t nb1,
926 size_t nb2,
927 size_t nb3,
928 size_t offset);
929
930 GGML_API struct ggml_tensor * ggml_sub(
931 struct ggml_context * ctx,
932 struct ggml_tensor * a,
933 struct ggml_tensor * b);
934
935 GGML_API struct ggml_tensor * ggml_sub_inplace(
936 struct ggml_context * ctx,
937 struct ggml_tensor * a,
938 struct ggml_tensor * b);
939
940 GGML_API struct ggml_tensor * ggml_mul(
941 struct ggml_context * ctx,
942 struct ggml_tensor * a,
943 struct ggml_tensor * b);
944
945 GGML_API struct ggml_tensor * ggml_mul_inplace(
946 struct ggml_context * ctx,
947 struct ggml_tensor * a,
948 struct ggml_tensor * b);
949
950 GGML_API struct ggml_tensor * ggml_div(
951 struct ggml_context * ctx,
952 struct ggml_tensor * a,
953 struct ggml_tensor * b);
954
955 GGML_API struct ggml_tensor * ggml_div_inplace(
956 struct ggml_context * ctx,
957 struct ggml_tensor * a,
958 struct ggml_tensor * b);
959
960 GGML_API struct ggml_tensor * ggml_sqr(
961 struct ggml_context * ctx,
962 struct ggml_tensor * a);
963
964 GGML_API struct ggml_tensor * ggml_sqr_inplace(
965 struct ggml_context * ctx,
966 struct ggml_tensor * a);
967
968 GGML_API struct ggml_tensor * ggml_sqrt(
969 struct ggml_context * ctx,
970 struct ggml_tensor * a);
971
972 GGML_API struct ggml_tensor * ggml_sqrt_inplace(
973 struct ggml_context * ctx,
974 struct ggml_tensor * a);
975
976 GGML_API struct ggml_tensor * ggml_log(
977 struct ggml_context * ctx,
978 struct ggml_tensor * a);
979
980 GGML_API struct ggml_tensor * ggml_log_inplace(
981 struct ggml_context * ctx,
982 struct ggml_tensor * a);
983
984 GGML_API struct ggml_tensor * ggml_expm1(
985 struct ggml_context * ctx,
986 struct ggml_tensor * a);
987
988 GGML_API struct ggml_tensor * ggml_expm1_inplace(
989 struct ggml_context * ctx,
990 struct ggml_tensor * a);
991
992 GGML_API struct ggml_tensor * ggml_softplus(
993 struct ggml_context * ctx,
994 struct ggml_tensor * a);
995
996 GGML_API struct ggml_tensor * ggml_softplus_inplace(
997 struct ggml_context * ctx,
998 struct ggml_tensor * a);
999
1000 GGML_API struct ggml_tensor * ggml_sin(
1001 struct ggml_context * ctx,
1002 struct ggml_tensor * a);
1003
1004 GGML_API struct ggml_tensor * ggml_sin_inplace(
1005 struct ggml_context * ctx,
1006 struct ggml_tensor * a);
1007
1008 GGML_API struct ggml_tensor * ggml_cos(
1009 struct ggml_context * ctx,
1010 struct ggml_tensor * a);
1011
1012 GGML_API struct ggml_tensor * ggml_cos_inplace(
1013 struct ggml_context * ctx,
1014 struct ggml_tensor * a);
1015
1016 // return scalar
1017 GGML_API struct ggml_tensor * ggml_sum(
1018 struct ggml_context * ctx,
1019 struct ggml_tensor * a);
1020
1021 // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
1022 GGML_API struct ggml_tensor * ggml_sum_rows(
1023 struct ggml_context * ctx,
1024 struct ggml_tensor * a);
1025
1026 GGML_API struct ggml_tensor * ggml_cumsum(
1027 struct ggml_context * ctx,
1028 struct ggml_tensor * a);
1029
1030 // mean along rows
1031 GGML_API struct ggml_tensor * ggml_mean(
1032 struct ggml_context * ctx,
1033 struct ggml_tensor * a);
1034
1035 // argmax along rows
1036 GGML_API struct ggml_tensor * ggml_argmax(
1037 struct ggml_context * ctx,
1038 struct ggml_tensor * a);
1039
1040 // count number of equal elements in a and b
1041 GGML_API struct ggml_tensor * ggml_count_equal(
1042 struct ggml_context * ctx,
1043 struct ggml_tensor * a,
1044 struct ggml_tensor * b);
1045
1046 // if a is the same shape as b, and a is not parameter, return a
1047 // otherwise, return a new tensor: repeat(a) to fit in b
1048 GGML_API struct ggml_tensor * ggml_repeat(
1049 struct ggml_context * ctx,
1050 struct ggml_tensor * a,
1051 struct ggml_tensor * b);
1052
1053 // repeat a to the specified shape
1054 GGML_API struct ggml_tensor * ggml_repeat_4d(
1055 struct ggml_context * ctx,
1056 struct ggml_tensor * a,
1057 int64_t ne0,
1058 int64_t ne1,
1059 int64_t ne2,
1060 int64_t ne3);
1061
1062 // sums repetitions in a into shape of b
1063 GGML_API struct ggml_tensor * ggml_repeat_back(
1064 struct ggml_context * ctx,
1065 struct ggml_tensor * a,
1066 struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
1067
1068 // concat a and b along dim
1069 // used in stable-diffusion
1070 GGML_API struct ggml_tensor * ggml_concat(
1071 struct ggml_context * ctx,
1072 struct ggml_tensor * a,
1073 struct ggml_tensor * b,
1074 int dim);
1075
1076 GGML_API struct ggml_tensor * ggml_abs(
1077 struct ggml_context * ctx,
1078 struct ggml_tensor * a);
1079
1080 GGML_API struct ggml_tensor * ggml_abs_inplace(
1081 struct ggml_context * ctx,
1082 struct ggml_tensor * a);
1083
1084 GGML_API struct ggml_tensor * ggml_sgn(
1085 struct ggml_context * ctx,
1086 struct ggml_tensor * a);
1087
1088 GGML_API struct ggml_tensor * ggml_sgn_inplace(
1089 struct ggml_context * ctx,
1090 struct ggml_tensor * a);
1091
1092 GGML_API struct ggml_tensor * ggml_neg(
1093 struct ggml_context * ctx,
1094 struct ggml_tensor * a);
1095
1096 GGML_API struct ggml_tensor * ggml_neg_inplace(
1097 struct ggml_context * ctx,
1098 struct ggml_tensor * a);
1099
1100 GGML_API struct ggml_tensor * ggml_step(
1101 struct ggml_context * ctx,
1102 struct ggml_tensor * a);
1103
1104 GGML_API struct ggml_tensor * ggml_step_inplace(
1105 struct ggml_context * ctx,
1106 struct ggml_tensor * a);
1107
1108 GGML_API struct ggml_tensor * ggml_tanh(
1109 struct ggml_context * ctx,
1110 struct ggml_tensor * a);
1111
1112 GGML_API struct ggml_tensor * ggml_tanh_inplace(
1113 struct ggml_context * ctx,
1114 struct ggml_tensor * a);
1115
1116 GGML_API struct ggml_tensor * ggml_elu(
1117 struct ggml_context * ctx,
1118 struct ggml_tensor * a);
1119
1120 GGML_API struct ggml_tensor * ggml_elu_inplace(
1121 struct ggml_context * ctx,
1122 struct ggml_tensor * a);
1123
1124 GGML_API struct ggml_tensor * ggml_relu(
1125 struct ggml_context * ctx,
1126 struct ggml_tensor * a);
1127
1128 GGML_API struct ggml_tensor * ggml_leaky_relu(
1129 struct ggml_context * ctx,
1130 struct ggml_tensor * a, float negative_slope, bool inplace);
1131
1132 GGML_API struct ggml_tensor * ggml_relu_inplace(
1133 struct ggml_context * ctx,
1134 struct ggml_tensor * a);
1135
1136 GGML_API struct ggml_tensor * ggml_sigmoid(
1137 struct ggml_context * ctx,
1138 struct ggml_tensor * a);
1139
1140 GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
1141 struct ggml_context * ctx,
1142 struct ggml_tensor * a);
1143
1144 GGML_API struct ggml_tensor * ggml_gelu(
1145 struct ggml_context * ctx,
1146 struct ggml_tensor * a);
1147
1148 GGML_API struct ggml_tensor * ggml_gelu_inplace(
1149 struct ggml_context * ctx,
1150 struct ggml_tensor * a);
1151
1152 // GELU using erf (error function) when possible
1153 // some backends may fallback to approximation based on Abramowitz and Stegun formula
1154 GGML_API struct ggml_tensor * ggml_gelu_erf(
1155 struct ggml_context * ctx,
1156 struct ggml_tensor * a);
1157
1158 GGML_API struct ggml_tensor * ggml_gelu_erf_inplace(
1159 struct ggml_context * ctx,
1160 struct ggml_tensor * a);
1161
1162 GGML_API struct ggml_tensor * ggml_gelu_quick(
1163 struct ggml_context * ctx,
1164 struct ggml_tensor * a);
1165
1166 GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
1167 struct ggml_context * ctx,
1168 struct ggml_tensor * a);
1169
1170 GGML_API struct ggml_tensor * ggml_silu(
1171 struct ggml_context * ctx,
1172 struct ggml_tensor * a);
1173
1174 GGML_API struct ggml_tensor * ggml_silu_inplace(
1175 struct ggml_context * ctx,
1176 struct ggml_tensor * a);
1177
1178 // a - x
1179 // b - dy
1180 GGML_API struct ggml_tensor * ggml_silu_back(
1181 struct ggml_context * ctx,
1182 struct ggml_tensor * a,
1183 struct ggml_tensor * b);
1184
1185 // hardswish(x) = x * relu6(x + 3) / 6
1186 GGML_API struct ggml_tensor * ggml_hardswish(
1187 struct ggml_context * ctx,
1188 struct ggml_tensor * a);
1189
1190 // hardsigmoid(x) = relu6(x + 3) / 6
1191 GGML_API struct ggml_tensor * ggml_hardsigmoid(
1192 struct ggml_context * ctx,
1193 struct ggml_tensor * a);
1194
1195 GGML_API struct ggml_tensor * ggml_exp(
1196 struct ggml_context * ctx,
1197 struct ggml_tensor * a);
1198
1199 GGML_API struct ggml_tensor * ggml_exp_inplace(
1200 struct ggml_context * ctx,
1201 struct ggml_tensor * a);
1202
1203 GGML_API struct ggml_tensor * ggml_floor(
1204 struct ggml_context * ctx,
1205 struct ggml_tensor * a);
1206
1207 GGML_API struct ggml_tensor * ggml_floor_inplace(
1208 struct ggml_context * ctx,
1209 struct ggml_tensor * a);
1210
1211 GGML_API struct ggml_tensor * ggml_ceil(
1212 struct ggml_context * ctx,
1213 struct ggml_tensor * a);
1214
1215 GGML_API struct ggml_tensor * ggml_ceil_inplace(
1216 struct ggml_context * ctx,
1217 struct ggml_tensor * a);
1218
1219 GGML_API struct ggml_tensor * ggml_round(
1220 struct ggml_context * ctx,
1221 struct ggml_tensor * a);
1222
1223 GGML_API struct ggml_tensor * ggml_round_inplace(
1224 struct ggml_context * ctx,
1225 struct ggml_tensor * a);
1226
1227 /**
1228 * Truncates the fractional part of each element in the tensor (towards zero).
1229 * For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0
1230 * Similar to std::trunc in C/C++.
1231 */
1232
1233 GGML_API struct ggml_tensor * ggml_trunc(
1234 struct ggml_context * ctx,
1235 struct ggml_tensor * a);
1236
1237 GGML_API struct ggml_tensor * ggml_trunc_inplace(
1238 struct ggml_context * ctx,
1239 struct ggml_tensor * a);
1240
1241
1242
1243 // xIELU activation function
1244 // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
1245 // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
1246 // that constrain the positive and negative source alpha values respectively
1247 GGML_API struct ggml_tensor * ggml_xielu(
1248 struct ggml_context * ctx,
1249 struct ggml_tensor * a,
1250 float alpha_n,
1251 float alpha_p,
1252 float beta,
1253 float eps);
1254
1255 // gated linear unit ops
1256 // A: n columns, r rows,
1257 // result is n / 2 columns, r rows,
1258 // expects gate in second half of row, unless swapped is true
1259 GGML_API struct ggml_tensor * ggml_glu(
1260 struct ggml_context * ctx,
1261 struct ggml_tensor * a,
1262 enum ggml_glu_op op,
1263 bool swapped);
1264
1265 GGML_API struct ggml_tensor * ggml_reglu(
1266 struct ggml_context * ctx,
1267 struct ggml_tensor * a);
1268
1269 GGML_API struct ggml_tensor * ggml_reglu_swapped(
1270 struct ggml_context * ctx,
1271 struct ggml_tensor * a);
1272
1273 GGML_API struct ggml_tensor * ggml_geglu(
1274 struct ggml_context * ctx,
1275 struct ggml_tensor * a);
1276
1277 GGML_API struct ggml_tensor * ggml_geglu_swapped(
1278 struct ggml_context * ctx,
1279 struct ggml_tensor * a);
1280
1281 GGML_API struct ggml_tensor * ggml_swiglu(
1282 struct ggml_context * ctx,
1283 struct ggml_tensor * a);
1284
1285 GGML_API struct ggml_tensor * ggml_swiglu_swapped(
1286 struct ggml_context * ctx,
1287 struct ggml_tensor * a);
1288
1289 GGML_API struct ggml_tensor * ggml_geglu_erf(
1290 struct ggml_context * ctx,
1291 struct ggml_tensor * a);
1292
1293 GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
1294 struct ggml_context * ctx,
1295 struct ggml_tensor * a);
1296
1297 GGML_API struct ggml_tensor * ggml_geglu_quick(
1298 struct ggml_context * ctx,
1299 struct ggml_tensor * a);
1300
1301 GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
1302 struct ggml_context * ctx,
1303 struct ggml_tensor * a);
1304
1305 // A: n columns, r rows,
1306 // B: n columns, r rows,
1307 GGML_API struct ggml_tensor * ggml_glu_split(
1308 struct ggml_context * ctx,
1309 struct ggml_tensor * a,
1310 struct ggml_tensor * b,
1311 enum ggml_glu_op op);
1312
1313 GGML_API struct ggml_tensor * ggml_reglu_split(
1314 struct ggml_context * ctx,
1315 struct ggml_tensor * a,
1316 struct ggml_tensor * b);
1317
1318 GGML_API struct ggml_tensor * ggml_geglu_split(
1319 struct ggml_context * ctx,
1320 struct ggml_tensor * a,
1321 struct ggml_tensor * b);
1322
1323 GGML_API struct ggml_tensor * ggml_swiglu_split(
1324 struct ggml_context * ctx,
1325 struct ggml_tensor * a,
1326 struct ggml_tensor * b);
1327
1328 GGML_API struct ggml_tensor * ggml_geglu_erf_split(
1329 struct ggml_context * ctx,
1330 struct ggml_tensor * a,
1331 struct ggml_tensor * b);
1332
1333 GGML_API struct ggml_tensor * ggml_geglu_quick_split(
1334 struct ggml_context * ctx,
1335 struct ggml_tensor * a,
1336 struct ggml_tensor * b);
1337
1338 GGML_API struct ggml_tensor * ggml_swiglu_oai(
1339 struct ggml_context * ctx,
1340 struct ggml_tensor * a,
1341 struct ggml_tensor * b,
1342 float alpha,
1343 float limit);
1344
1345 // normalize along rows
1346 GGML_API struct ggml_tensor * ggml_norm(
1347 struct ggml_context * ctx,
1348 struct ggml_tensor * a,
1349 float eps);
1350
1351 GGML_API struct ggml_tensor * ggml_norm_inplace(
1352 struct ggml_context * ctx,
1353 struct ggml_tensor * a,
1354 float eps);
1355
1356 GGML_API struct ggml_tensor * ggml_rms_norm(
1357 struct ggml_context * ctx,
1358 struct ggml_tensor * a,
1359 float eps);
1360
1361 GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
1362 struct ggml_context * ctx,
1363 struct ggml_tensor * a,
1364 float eps);
1365
1366 // group normalize along ne0*ne1*n_groups
1367 // used in stable-diffusion
1368 GGML_API struct ggml_tensor * ggml_group_norm(
1369 struct ggml_context * ctx,
1370 struct ggml_tensor * a,
1371 int n_groups,
1372 float eps);
1373
1374 GGML_API struct ggml_tensor * ggml_group_norm_inplace(
1375 struct ggml_context * ctx,
1376 struct ggml_tensor * a,
1377 int n_groups,
1378 float eps);
1379
1380 // l2 normalize along rows
1381 // used in rwkv v7
1382 GGML_API struct ggml_tensor * ggml_l2_norm(
1383 struct ggml_context * ctx,
1384 struct ggml_tensor * a,
1385 float eps);
1386
1387 GGML_API struct ggml_tensor * ggml_l2_norm_inplace(
1388 struct ggml_context * ctx,
1389 struct ggml_tensor * a,
1390 float eps);
1391
1392 // a - x
1393 // b - dy
1394 GGML_API struct ggml_tensor * ggml_rms_norm_back(
1395 struct ggml_context * ctx,
1396 struct ggml_tensor * a,
1397 struct ggml_tensor * b,
1398 float eps);
1399
1400 // A: k columns, n rows => [ne03, ne02, n, k]
1401 // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
1402 // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
1403 GGML_API struct ggml_tensor * ggml_mul_mat(
1404 struct ggml_context * ctx,
1405 struct ggml_tensor * a,
1406 struct ggml_tensor * b);
1407
1408 // change the precision of a matrix multiplication
1409 // set to GGML_PREC_F32 for higher precision (useful for phi-2)
1410 GGML_API void ggml_mul_mat_set_prec(
1411 struct ggml_tensor * a,
1412 enum ggml_prec prec);
1413
1414 // indirect matrix multiplication
1415 GGML_API struct ggml_tensor * ggml_mul_mat_id(
1416 struct ggml_context * ctx,
1417 struct ggml_tensor * as,
1418 struct ggml_tensor * b,
1419 struct ggml_tensor * ids);
1420
1421 // A: m columns, n rows,
1422 // B: p columns, n rows,
1423 // result is m columns, p rows
1424 GGML_API struct ggml_tensor * ggml_out_prod(
1425 struct ggml_context * ctx,
1426 struct ggml_tensor * a,
1427 struct ggml_tensor * b);
1428
1429 //
1430 // operations on tensors without backpropagation
1431 //
1432
1433 GGML_API struct ggml_tensor * ggml_scale(
1434 struct ggml_context * ctx,
1435 struct ggml_tensor * a,
1436 float s);
1437
1438 // in-place, returns view(a)
1439 GGML_API struct ggml_tensor * ggml_scale_inplace(
1440 struct ggml_context * ctx,
1441 struct ggml_tensor * a,
1442 float s);
1443
1444 // x = s * a + b
1445 GGML_API struct ggml_tensor * ggml_scale_bias(
1446 struct ggml_context * ctx,
1447 struct ggml_tensor * a,
1448 float s,
1449 float b);
1450
1451 GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
1452 struct ggml_context * ctx,
1453 struct ggml_tensor * a,
1454 float s,
1455 float b);
1456
1457 // b -> view(a,offset,nb1,nb2,3), return modified a
1458 GGML_API struct ggml_tensor * ggml_set(
1459 struct ggml_context * ctx,
1460 struct ggml_tensor * a,
1461 struct ggml_tensor * b,
1462 size_t nb1,
1463 size_t nb2,
1464 size_t nb3,
1465 size_t offset); // in bytes
1466
1467 // b -> view(a,offset,nb1,nb2,3), return view(a)
1468 GGML_API struct ggml_tensor * ggml_set_inplace(
1469 struct ggml_context * ctx,
1470 struct ggml_tensor * a,
1471 struct ggml_tensor * b,
1472 size_t nb1,
1473 size_t nb2,
1474 size_t nb3,
1475 size_t offset); // in bytes
1476
1477 GGML_API struct ggml_tensor * ggml_set_1d(
1478 struct ggml_context * ctx,
1479 struct ggml_tensor * a,
1480 struct ggml_tensor * b,
1481 size_t offset); // in bytes
1482
1483 GGML_API struct ggml_tensor * ggml_set_1d_inplace(
1484 struct ggml_context * ctx,
1485 struct ggml_tensor * a,
1486 struct ggml_tensor * b,
1487 size_t offset); // in bytes
1488
1489 // b -> view(a,offset,nb1,nb2,3), return modified a
1490 GGML_API struct ggml_tensor * ggml_set_2d(
1491 struct ggml_context * ctx,
1492 struct ggml_tensor * a,
1493 struct ggml_tensor * b,
1494 size_t nb1,
1495 size_t offset); // in bytes
1496
1497 // b -> view(a,offset,nb1,nb2,3), return view(a)
1498 GGML_API struct ggml_tensor * ggml_set_2d_inplace(
1499 struct ggml_context * ctx,
1500 struct ggml_tensor * a,
1501 struct ggml_tensor * b,
1502 size_t nb1,
1503 size_t offset); // in bytes
1504
1505 // a -> b, return view(b)
1506 GGML_API struct ggml_tensor * ggml_cpy(
1507 struct ggml_context * ctx,
1508 struct ggml_tensor * a,
1509 struct ggml_tensor * b);
1510
1511 // note: casting from f32 to i32 will discard the fractional part
1512 GGML_API struct ggml_tensor * ggml_cast(
1513 struct ggml_context * ctx,
1514 struct ggml_tensor * a,
1515 enum ggml_type type);
1516
1517 // make contiguous
1518 GGML_API struct ggml_tensor * ggml_cont(
1519 struct ggml_context * ctx,
1520 struct ggml_tensor * a);
1521
1522 // make contiguous, with new shape
1523 GGML_API struct ggml_tensor * ggml_cont_1d(
1524 struct ggml_context * ctx,
1525 struct ggml_tensor * a,
1526 int64_t ne0);
1527
1528 GGML_API struct ggml_tensor * ggml_cont_2d(
1529 struct ggml_context * ctx,
1530 struct ggml_tensor * a,
1531 int64_t ne0,
1532 int64_t ne1);
1533
1534 GGML_API struct ggml_tensor * ggml_cont_3d(
1535 struct ggml_context * ctx,
1536 struct ggml_tensor * a,
1537 int64_t ne0,
1538 int64_t ne1,
1539 int64_t ne2);
1540
1541 GGML_API struct ggml_tensor * ggml_cont_4d(
1542 struct ggml_context * ctx,
1543 struct ggml_tensor * a,
1544 int64_t ne0,
1545 int64_t ne1,
1546 int64_t ne2,
1547 int64_t ne3);
1548
1549 // return view(a), b specifies the new shape
1550 // TODO: when we start computing gradient, make a copy instead of view
1551 GGML_API struct ggml_tensor * ggml_reshape(
1552 struct ggml_context * ctx,
1553 struct ggml_tensor * a,
1554 struct ggml_tensor * b);
1555
1556 // return view(a)
1557 // TODO: when we start computing gradient, make a copy instead of view
1558 GGML_API struct ggml_tensor * ggml_reshape_1d(
1559 struct ggml_context * ctx,
1560 struct ggml_tensor * a,
1561 int64_t ne0);
1562
1563 GGML_API struct ggml_tensor * ggml_reshape_2d(
1564 struct ggml_context * ctx,
1565 struct ggml_tensor * a,
1566 int64_t ne0,
1567 int64_t ne1);
1568
1569 // return view(a)
1570 // TODO: when we start computing gradient, make a copy instead of view
1571 GGML_API struct ggml_tensor * ggml_reshape_3d(
1572 struct ggml_context * ctx,
1573 struct ggml_tensor * a,
1574 int64_t ne0,
1575 int64_t ne1,
1576 int64_t ne2);
1577
1578 GGML_API struct ggml_tensor * ggml_reshape_4d(
1579 struct ggml_context * ctx,
1580 struct ggml_tensor * a,
1581 int64_t ne0,
1582 int64_t ne1,
1583 int64_t ne2,
1584 int64_t ne3);
1585
1586 // offset in bytes
1587 GGML_API struct ggml_tensor * ggml_view_1d(
1588 struct ggml_context * ctx,
1589 struct ggml_tensor * a,
1590 int64_t ne0,
1591 size_t offset);
1592
1593 GGML_API struct ggml_tensor * ggml_view_2d(
1594 struct ggml_context * ctx,
1595 struct ggml_tensor * a,
1596 int64_t ne0,
1597 int64_t ne1,
1598 size_t nb1, // row stride in bytes
1599 size_t offset);
1600
1601 GGML_API struct ggml_tensor * ggml_view_3d(
1602 struct ggml_context * ctx,
1603 struct ggml_tensor * a,
1604 int64_t ne0,
1605 int64_t ne1,
1606 int64_t ne2,
1607 size_t nb1, // row stride in bytes
1608 size_t nb2, // slice stride in bytes
1609 size_t offset);
1610
1611 GGML_API struct ggml_tensor * ggml_view_4d(
1612 struct ggml_context * ctx,
1613 struct ggml_tensor * a,
1614 int64_t ne0,
1615 int64_t ne1,
1616 int64_t ne2,
1617 int64_t ne3,
1618 size_t nb1, // row stride in bytes
1619 size_t nb2, // slice stride in bytes
1620 size_t nb3,
1621 size_t offset);
1622
1623 GGML_API struct ggml_tensor * ggml_permute(
1624 struct ggml_context * ctx,
1625 struct ggml_tensor * a,
1626 int axis0,
1627 int axis1,
1628 int axis2,
1629 int axis3);
1630
1631 // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
1632 GGML_API struct ggml_tensor * ggml_transpose(
1633 struct ggml_context * ctx,
1634 struct ggml_tensor * a);
1635
1636 // supports 4D a:
1637 // a [n_embd, ne1, ne2, ne3]
1638 // b I32 [n_rows, ne2, ne3, 1]
1639 //
1640 // return [n_embd, n_rows, ne2, ne3]
1641 GGML_API struct ggml_tensor * ggml_get_rows(
1642 struct ggml_context * ctx,
1643 struct ggml_tensor * a, // data
1644 struct ggml_tensor * b); // row indices
1645
1646 GGML_API struct ggml_tensor * ggml_get_rows_back(
1647 struct ggml_context * ctx,
1648 struct ggml_tensor * a, // gradients of ggml_get_rows result
1649 struct ggml_tensor * b, // row indices
1650 struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
1651
1652 // a TD [n_embd, ne1, ne2, ne3]
1653 // b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
1654 // c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1)
1655 //
1656 // undefined behavior if destination rows overlap
1657 //
1658 // broadcast:
1659 // ne2 % ne11 == 0
1660 // ne3 % ne12 == 0
1661 //
1662 // return view(a)
1663 GGML_API struct ggml_tensor * ggml_set_rows(
1664 struct ggml_context * ctx,
1665 struct ggml_tensor * a, // destination
1666 struct ggml_tensor * b, // source
1667 struct ggml_tensor * c); // row indices
1668
1669 GGML_API struct ggml_tensor * ggml_diag(
1670 struct ggml_context * ctx,
1671 struct ggml_tensor * a);
1672
1673 // set elements above the diagonal to -INF
1674 GGML_API struct ggml_tensor * ggml_diag_mask_inf(
1675 struct ggml_context * ctx,
1676 struct ggml_tensor * a,
1677 int n_past);
1678
1679 // in-place, returns view(a)
1680 GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
1681 struct ggml_context * ctx,
1682 struct ggml_tensor * a,
1683 int n_past);
1684
1685 // set elements above the diagonal to 0
1686 GGML_API struct ggml_tensor * ggml_diag_mask_zero(
1687 struct ggml_context * ctx,
1688 struct ggml_tensor * a,
1689 int n_past);
1690
1691 // in-place, returns view(a)
1692 GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
1693 struct ggml_context * ctx,
1694 struct ggml_tensor * a,
1695 int n_past);
1696
1697 GGML_API struct ggml_tensor * ggml_soft_max(
1698 struct ggml_context * ctx,
1699 struct ggml_tensor * a);
1700
1701 // in-place, returns view(a)
1702 GGML_API struct ggml_tensor * ggml_soft_max_inplace(
1703 struct ggml_context * ctx,
1704 struct ggml_tensor * a);
1705
1706 // a [ne0, ne01, ne02, ne03]
1707 // mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
1708 //
1709 // broadcast:
1710 // ne02 % ne12 == 0
1711 // ne03 % ne13 == 0
1712 //
1713 // fused soft_max(a*scale + mask*(ALiBi slope))
1714 // max_bias = 0.0f for no ALiBi
1715 GGML_API struct ggml_tensor * ggml_soft_max_ext(
1716 struct ggml_context * ctx,
1717 struct ggml_tensor * a,
1718 struct ggml_tensor * mask,
1719 float scale,
1720 float max_bias);
1721
1722 GGML_API struct ggml_tensor * ggml_soft_max_ext_inplace(
1723 struct ggml_context * ctx,
1724 struct ggml_tensor * a,
1725 struct ggml_tensor * mask,
1726 float scale,
1727 float max_bias);
1728
1729 GGML_API void ggml_soft_max_add_sinks(
1730 struct ggml_tensor * a,
1731 struct ggml_tensor * sinks);
1732
1733 GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
1734 struct ggml_context * ctx,
1735 struct ggml_tensor * a,
1736 struct ggml_tensor * b,
1737 float scale,
1738 float max_bias);
1739
1740 // in-place, returns view(a)
1741 GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace(
1742 struct ggml_context * ctx,
1743 struct ggml_tensor * a,
1744 struct ggml_tensor * b,
1745 float scale,
1746 float max_bias);
1747
1748 // rotary position embedding
1749 // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
1750 // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
1751 //
1752 // b is an int32 vector with size a->ne[2], it contains the positions
1753 GGML_API struct ggml_tensor * ggml_rope(
1754 struct ggml_context * ctx,
1755 struct ggml_tensor * a,
1756 struct ggml_tensor * b,
1757 int n_dims,
1758 int mode);
1759
1760 // in-place, returns view(a)
1761 GGML_API struct ggml_tensor * ggml_rope_inplace(
1762 struct ggml_context * ctx,
1763 struct ggml_tensor * a,
1764 struct ggml_tensor * b,
1765 int n_dims,
1766 int mode);
1767
1768 // custom RoPE
1769 // c is freq factors (e.g. phi3-128k), (optional)
1770 GGML_API struct ggml_tensor * ggml_rope_ext(
1771 struct ggml_context * ctx,
1772 struct ggml_tensor * a,
1773 struct ggml_tensor * b,
1774 struct ggml_tensor * c,
1775 int n_dims,
1776 int mode,
1777 int n_ctx_orig,
1778 float freq_base,
1779 float freq_scale,
1780 float ext_factor,
1781 float attn_factor,
1782 float beta_fast,
1783 float beta_slow);
1784
1785 GGML_API struct ggml_tensor * ggml_rope_multi(
1786 struct ggml_context * ctx,
1787 struct ggml_tensor * a,
1788 struct ggml_tensor * b,
1789 struct ggml_tensor * c,
1790 int n_dims,
1791 int sections[GGML_MROPE_SECTIONS],
1792 int mode,
1793 int n_ctx_orig,
1794 float freq_base,
1795 float freq_scale,
1796 float ext_factor,
1797 float attn_factor,
1798 float beta_fast,
1799 float beta_slow);
1800
1801 // in-place, returns view(a)
1802 GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
1803 struct ggml_context * ctx,
1804 struct ggml_tensor * a,
1805 struct ggml_tensor * b,
1806 struct ggml_tensor * c,
1807 int n_dims,
1808 int mode,
1809 int n_ctx_orig,
1810 float freq_base,
1811 float freq_scale,
1812 float ext_factor,
1813 float attn_factor,
1814 float beta_fast,
1815 float beta_slow);
1816
1817 GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
1818 struct ggml_context * ctx,
1819 struct ggml_tensor * a,
1820 struct ggml_tensor * b,
1821 struct ggml_tensor * c,
1822 int n_dims,
1823 int sections[GGML_MROPE_SECTIONS],
1824 int mode,
1825 int n_ctx_orig,
1826 float freq_base,
1827 float freq_scale,
1828 float ext_factor,
1829 float attn_factor,
1830 float beta_fast,
1831 float beta_slow);
1832
1833 GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
1834 struct ggml_context * ctx,
1835 struct ggml_tensor * a,
1836 struct ggml_tensor * b,
1837 int n_dims,
1838 int mode,
1839 int n_ctx_orig,
1840 float freq_base,
1841 float freq_scale,
1842 float ext_factor,
1843 float attn_factor,
1844 float beta_fast,
1845 float beta_slow),
1846 "use ggml_rope_ext instead");
1847
1848 GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1849 struct ggml_context * ctx,
1850 struct ggml_tensor * a,
1851 struct ggml_tensor * b,
1852 int n_dims,
1853 int mode,
1854 int n_ctx_orig,
1855 float freq_base,
1856 float freq_scale,
1857 float ext_factor,
1858 float attn_factor,
1859 float beta_fast,
1860 float beta_slow),
1861 "use ggml_rope_ext_inplace instead");
1862
1863 // compute correction dims for YaRN RoPE scaling
1864 GGML_API void ggml_rope_yarn_corr_dims(
1865 int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1866
1867 // rotary position embedding backward, i.e compute dx from dy
1868 // a - dy
1869 GGML_API struct ggml_tensor * ggml_rope_ext_back(
1870 struct ggml_context * ctx,
1871 struct ggml_tensor * a, // gradients of ggml_rope result
1872 struct ggml_tensor * b, // positions
1873 struct ggml_tensor * c, // freq factors
1874 int n_dims,
1875 int mode,
1876 int n_ctx_orig,
1877 float freq_base,
1878 float freq_scale,
1879 float ext_factor,
1880 float attn_factor,
1881 float beta_fast,
1882 float beta_slow);
1883
1884 GGML_API struct ggml_tensor * ggml_rope_multi_back(
1885 struct ggml_context * ctx,
1886 struct ggml_tensor * a,
1887 struct ggml_tensor * b,
1888 struct ggml_tensor * c,
1889 int n_dims,
1890 int sections[4],
1891 int mode,
1892 int n_ctx_orig,
1893 float freq_base,
1894 float freq_scale,
1895 float ext_factor,
1896 float attn_factor,
1897 float beta_fast,
1898 float beta_slow);
1899
1900
1901 // clamp
1902 // in-place, returns view(a)
1903 GGML_API struct ggml_tensor * ggml_clamp(
1904 struct ggml_context * ctx,
1905 struct ggml_tensor * a,
1906 float min,
1907 float max);
1908
1909 // im2col
1910 // converts data into a format that effectively results in a convolution when combined with matrix multiplication
1911 GGML_API struct ggml_tensor * ggml_im2col(
1912 struct ggml_context * ctx,
1913 struct ggml_tensor * a, // convolution kernel
1914 struct ggml_tensor * b, // data
1915 int s0, // stride dimension 0
1916 int s1, // stride dimension 1
1917 int p0, // padding dimension 0
1918 int p1, // padding dimension 1
1919 int d0, // dilation dimension 0
1920 int d1, // dilation dimension 1
1921 bool is_2D,
1922 enum ggml_type dst_type);
1923
1924 GGML_API struct ggml_tensor * ggml_im2col_back(
1925 struct ggml_context * ctx,
1926 struct ggml_tensor * a, // convolution kernel
1927 struct ggml_tensor * b, // gradient of im2col output
1928 int64_t * ne, // shape of im2col input
1929 int s0, // stride dimension 0
1930 int s1, // stride dimension 1
1931 int p0, // padding dimension 0
1932 int p1, // padding dimension 1
1933 int d0, // dilation dimension 0
1934 int d1, // dilation dimension 1
1935 bool is_2D);
1936
1937 GGML_API struct ggml_tensor * ggml_conv_1d(
1938 struct ggml_context * ctx,
1939 struct ggml_tensor * a, // convolution kernel
1940 struct ggml_tensor * b, // data
1941 int s0, // stride
1942 int p0, // padding
1943 int d0); // dilation
1944
1945 // conv_1d with padding = half
1946 // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
1947 GGML_API struct ggml_tensor* ggml_conv_1d_ph(
1948 struct ggml_context * ctx,
1949 struct ggml_tensor * a, // convolution kernel
1950 struct ggml_tensor * b, // data
1951 int s, // stride
1952 int d); // dilation
1953
1954 // depthwise
1955 // TODO: this is very likely wrong for some cases! - needs more testing
1956 GGML_API struct ggml_tensor * ggml_conv_1d_dw(
1957 struct ggml_context * ctx,
1958 struct ggml_tensor * a, // convolution kernel
1959 struct ggml_tensor * b, // data
1960 int s0, // stride
1961 int p0, // padding
1962 int d0); // dilation
1963
1964 GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph(
1965 struct ggml_context * ctx,
1966 struct ggml_tensor * a, // convolution kernel
1967 struct ggml_tensor * b, // data
1968 int s0, // stride
1969 int d0); // dilation
1970
1971 GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
1972 struct ggml_context * ctx,
1973 struct ggml_tensor * a, // convolution kernel
1974 struct ggml_tensor * b, // data
1975 int s0, // stride
1976 int p0, // padding
1977 int d0); // dilation
1978
1979 GGML_API struct ggml_tensor * ggml_conv_2d(
1980 struct ggml_context * ctx,
1981 struct ggml_tensor * a, // convolution kernel
1982 struct ggml_tensor * b, // data
1983 int s0, // stride dimension 0
1984 int s1, // stride dimension 1
1985 int p0, // padding dimension 0
1986 int p1, // padding dimension 1
1987 int d0, // dilation dimension 0
1988 int d1); // dilation dimension 1
1989
1990 GGML_API struct ggml_tensor * ggml_im2col_3d(
1991 struct ggml_context * ctx,
1992 struct ggml_tensor * a,
1993 struct ggml_tensor * b,
1994 int64_t IC,
1995 int s0, // stride width
1996 int s1, // stride height
1997 int s2, // stride depth
1998 int p0, // padding width
1999 int p1, // padding height
2000 int p2, // padding depth
2001 int d0, // dilation width
2002 int d1, // dilation height
2003 int d2, // dilation depth
2004 enum ggml_type dst_type);
2005
2006 // a: [OC*IC, KD, KH, KW]
2007 // b: [N*IC, ID, IH, IW]
2008 // result: [N*OC, OD, OH, OW]
2009 GGML_API struct ggml_tensor * ggml_conv_3d(
2010 struct ggml_context * ctx,
2011 struct ggml_tensor * a,
2012 struct ggml_tensor * b,
2013 int64_t IC,
2014 int s0, // stride width
2015 int s1, // stride height
2016 int s2, // stride depth
2017 int p0, // padding width
2018 int p1, // padding height
2019 int p2, // padding depth
2020 int d0, // dilation width
2021 int d1, // dilation height
2022 int d2 // dilation depth
2023 );
2024
2025 // kernel size is a->ne[0] x a->ne[1]
2026 // stride is equal to kernel size
2027 // padding is zero
2028 // example:
2029 // a: 16 16 3 768
2030 // b: 1024 1024 3 1
2031 // res: 64 64 768 1
2032 // used in sam
2033 GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
2034 struct ggml_context * ctx,
2035 struct ggml_tensor * a,
2036 struct ggml_tensor * b);
2037
2038 // kernel size is a->ne[0] x a->ne[1]
2039 // stride is 1
2040 // padding is half
2041 // example:
2042 // a: 3 3 256 256
2043 // b: 64 64 256 1
2044 // res: 64 64 256 1
2045 // used in sam
2046 GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
2047 struct ggml_context * ctx,
2048 struct ggml_tensor * a,
2049 struct ggml_tensor * b);
2050
2051 // depthwise (via im2col and mul_mat)
2052 GGML_API struct ggml_tensor * ggml_conv_2d_dw(
2053 struct ggml_context * ctx,
2054 struct ggml_tensor * a, // convolution kernel
2055 struct ggml_tensor * b, // data
2056 int s0, // stride dimension 0
2057 int s1, // stride dimension 1
2058 int p0, // padding dimension 0
2059 int p1, // padding dimension 1
2060 int d0, // dilation dimension 0
2061 int d1); // dilation dimension 1
2062
2063 // Depthwise 2D convolution
2064 // may be faster than ggml_conv_2d_dw, but not available in all backends
2065 // a: KW KH 1 C convolution kernel
2066 // b: W H C N input data
2067 // res: W_out H_out C N
2068 GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
2069 struct ggml_context * ctx,
2070 struct ggml_tensor * a,
2071 struct ggml_tensor * b,
2072 int stride0,
2073 int stride1,
2074 int pad0,
2075 int pad1,
2076 int dilation0,
2077 int dilation1);
2078
2079 GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
2080 struct ggml_context * ctx,
2081 struct ggml_tensor * a,
2082 struct ggml_tensor * b,
2083 int stride);
2084
2085 GGML_API struct ggml_tensor * ggml_conv_2d_direct(
2086 struct ggml_context * ctx,
2087 struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
2088 struct ggml_tensor * b, // input data [W, H, C, N]
2089 int s0, // stride dimension 0
2090 int s1, // stride dimension 1
2091 int p0, // padding dimension 0
2092 int p1, // padding dimension 1
2093 int d0, // dilation dimension 0
2094 int d1); // dilation dimension 1
2095
2096 GGML_API struct ggml_tensor * ggml_conv_3d_direct(
2097 struct ggml_context * ctx,
2098 struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
2099 struct ggml_tensor * b, // input [W, H, D, C * N]
2100 int s0, // stride
2101 int s1,
2102 int s2,
2103 int p0, // padding
2104 int p1,
2105 int p2,
2106 int d0, // dilation
2107 int d1,
2108 int d2,
2109 int n_channels,
2110 int n_batch,
2111 int n_channels_out);
2112
2113 enum ggml_op_pool {
2114 GGML_OP_POOL_MAX,
2115 GGML_OP_POOL_AVG,
2116 GGML_OP_POOL_COUNT,
2117 };
2118
2119 GGML_API struct ggml_tensor * ggml_pool_1d(
2120 struct ggml_context * ctx,
2121 struct ggml_tensor * a,
2122 enum ggml_op_pool op,
2123 int k0, // kernel size
2124 int s0, // stride
2125 int p0); // padding
2126
2127 // the result will have 2*p0 padding for the first dimension
2128 // and 2*p1 padding for the second dimension
2129 GGML_API struct ggml_tensor * ggml_pool_2d(
2130 struct ggml_context * ctx,
2131 struct ggml_tensor * a,
2132 enum ggml_op_pool op,
2133 int k0,
2134 int k1,
2135 int s0,
2136 int s1,
2137 float p0,
2138 float p1);
2139
2140 GGML_API struct ggml_tensor * ggml_pool_2d_back(
2141 struct ggml_context * ctx,
2142 struct ggml_tensor * a,
2143 struct ggml_tensor * af, // "a"/input used in forward pass
2144 enum ggml_op_pool op,
2145 int k0,
2146 int k1,
2147 int s0,
2148 int s1,
2149 float p0,
2150 float p1);
2151
2152 enum ggml_scale_mode {
2153 GGML_SCALE_MODE_NEAREST = 0,
2154 GGML_SCALE_MODE_BILINEAR = 1,
2155 GGML_SCALE_MODE_BICUBIC = 2,
2156
2157 GGML_SCALE_MODE_COUNT
2158 };
2159
2160 enum ggml_scale_flag {
2161 GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8),
2162 GGML_SCALE_FLAG_ANTIALIAS = (1 << 9),
2163 };
2164
2165 // interpolate
2166 // multiplies ne0 and ne1 by scale factor
2167 GGML_API struct ggml_tensor * ggml_upscale(
2168 struct ggml_context * ctx,
2169 struct ggml_tensor * a,
2170 int scale_factor,
2171 enum ggml_scale_mode mode);
2172
2173 // interpolate
2174 // interpolate scale to specified dimensions
2175 GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext(
2176 struct ggml_context * ctx,
2177 struct ggml_tensor * a,
2178 int ne0,
2179 int ne1,
2180 int ne2,
2181 int ne3,
2182 enum ggml_scale_mode mode),
2183 "use ggml_interpolate instead");
2184
2185 // Up- or downsamples the input to the specified size.
2186 // 2D scale modes (eg. bilinear) are applied to the first two dimensions.
2187 GGML_API struct ggml_tensor * ggml_interpolate(
2188 struct ggml_context * ctx,
2189 struct ggml_tensor * a,
2190 int64_t ne0,
2191 int64_t ne1,
2192 int64_t ne2,
2193 int64_t ne3,
2194 uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...]
2195
2196 // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
2197 GGML_API struct ggml_tensor * ggml_pad(
2198 struct ggml_context * ctx,
2199 struct ggml_tensor * a,
2200 int p0,
2201 int p1,
2202 int p2,
2203 int p3);
2204
2205 // pad each dimension with values on the other side of the torus (looping around)
2206 GGML_API struct ggml_tensor * ggml_pad_circular(
2207 struct ggml_context * ctx,
2208 struct ggml_tensor * a,
2209 int p0,
2210 int p1,
2211 int p2,
2212 int p3);
2213
2214 GGML_API struct ggml_tensor * ggml_pad_ext(
2215 struct ggml_context * ctx,
2216 struct ggml_tensor * a,
2217 int lp0,
2218 int rp0,
2219 int lp1,
2220 int rp1,
2221 int lp2,
2222 int rp2,
2223 int lp3,
2224 int rp3
2225 );
2226
2227 // pad each dimension with values on the other side of the torus (looping around)
2228 GGML_API struct ggml_tensor * ggml_pad_ext_circular(
2229 struct ggml_context * ctx,
2230 struct ggml_tensor * a,
2231 int lp0,
2232 int rp0,
2233 int lp1,
2234 int rp1,
2235 int lp2,
2236 int rp2,
2237 int lp3,
2238 int rp3);
2239
2240 // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
2241 GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
2242 struct ggml_context * ctx,
2243 struct ggml_tensor * a,
2244 int p0,
2245 int p1);
2246
2247 // Move tensor elements by an offset given for each dimension. Elements that
2248 // are shifted beyond the last position are wrapped around to the beginning.
2249 GGML_API struct ggml_tensor * ggml_roll(
2250 struct ggml_context * ctx,
2251 struct ggml_tensor * a,
2252 int shift0,
2253 int shift1,
2254 int shift2,
2255 int shift3);
2256
2257 // Convert matrix into a triangular one (upper, strict upper, lower or strict lower) by writing
2258 // zeroes everywhere outside the masked area
2259 GGML_API struct ggml_tensor * ggml_tri(
2260 struct ggml_context * ctx,
2261 struct ggml_tensor * a,
2262 enum ggml_tri_type type);
2263
2264 // Fill tensor a with constant c
2265 GGML_API struct ggml_tensor * ggml_fill(
2266 struct ggml_context * ctx,
2267 struct ggml_tensor * a,
2268 float c);
2269
2270 GGML_API struct ggml_tensor * ggml_fill_inplace(
2271 struct ggml_context * ctx,
2272 struct ggml_tensor * a,
2273 float c);
2274
2275 // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
2276 // timesteps: [N,]
2277 // return: [N, dim]
2278 GGML_API struct ggml_tensor * ggml_timestep_embedding(
2279 struct ggml_context * ctx,
2280 struct ggml_tensor * timesteps,
2281 int dim,
2282 int max_period);
2283
2284 // sort rows
2285 enum ggml_sort_order {
2286 GGML_SORT_ORDER_ASC,
2287 GGML_SORT_ORDER_DESC,
2288 };
2289
2290 GGML_API struct ggml_tensor * ggml_argsort(
2291 struct ggml_context * ctx,
2292 struct ggml_tensor * a,
2293 enum ggml_sort_order order);
2294
2295 // similar to ggml_top_k but implemented as `argsort` + `view`
2296 GGML_API struct ggml_tensor * ggml_argsort_top_k(
2297 struct ggml_context * ctx,
2298 struct ggml_tensor * a,
2299 int k);
2300
2301 // top k elements per row
2302 // note: the resulting top k indices are in no particular order
2303 GGML_API struct ggml_tensor * ggml_top_k(
2304 struct ggml_context * ctx,
2305 struct ggml_tensor * a,
2306 int k);
2307
2308 GGML_API struct ggml_tensor * ggml_arange(
2309 struct ggml_context * ctx,
2310 float start,
2311 float stop,
2312 float step);
2313
2314 // q: [n_embd_k, n_batch, n_head, ne3 ]
2315 // k: [n_embd_k, n_kv, n_head_kv, ne3 ]
2316 // v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
2317 // mask: [n_kv, n_batch, ne32, ne33]
2318 // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
2319 //
2320 // broadcast:
2321 // n_head % n_head_kv == 0
2322 // n_head % ne32 == 0
2323 // ne3 % ne33 == 0
2324 //
2325 GGML_API struct ggml_tensor * ggml_flash_attn_ext(
2326 struct ggml_context * ctx,
2327 struct ggml_tensor * q,
2328 struct ggml_tensor * k,
2329 struct ggml_tensor * v,
2330 struct ggml_tensor * mask,
2331 float scale,
2332 float max_bias,
2333 float logit_softcap);
2334
2335 GGML_API void ggml_flash_attn_ext_set_prec(
2336 struct ggml_tensor * a,
2337 enum ggml_prec prec);
2338
2339 GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
2340 const struct ggml_tensor * a);
2341
2342 GGML_API void ggml_flash_attn_ext_add_sinks(
2343 struct ggml_tensor * a,
2344 struct ggml_tensor * sinks);
2345
2346 // TODO: needs to be adapted to ggml_flash_attn_ext
2347 GGML_API struct ggml_tensor * ggml_flash_attn_back(
2348 struct ggml_context * ctx,
2349 struct ggml_tensor * q,
2350 struct ggml_tensor * k,
2351 struct ggml_tensor * v,
2352 struct ggml_tensor * d,
2353 bool masked);
2354
2355 GGML_API struct ggml_tensor * ggml_ssm_conv(
2356 struct ggml_context * ctx,
2357 struct ggml_tensor * sx,
2358 struct ggml_tensor * c);
2359
2360 GGML_API struct ggml_tensor * ggml_ssm_scan(
2361 struct ggml_context * ctx,
2362 struct ggml_tensor * s,
2363 struct ggml_tensor * x,
2364 struct ggml_tensor * dt,
2365 struct ggml_tensor * A,
2366 struct ggml_tensor * B,
2367 struct ggml_tensor * C,
2368 struct ggml_tensor * ids);
2369
2370 // partition into non-overlapping windows with padding if needed
2371 // example:
2372 // a: 768 64 64 1
2373 // w: 14
2374 // res: 768 14 14 25
2375 // used in sam
2376 GGML_API struct ggml_tensor * ggml_win_part(
2377 struct ggml_context * ctx,
2378 struct ggml_tensor * a,
2379 int w);
2380
2381 // reverse of ggml_win_part
2382 // used in sam
2383 GGML_API struct ggml_tensor * ggml_win_unpart(
2384 struct ggml_context * ctx,
2385 struct ggml_tensor * a,
2386 int w0,
2387 int h0,
2388 int w);
2389
2390 GGML_API struct ggml_tensor * ggml_unary(
2391 struct ggml_context * ctx,
2392 struct ggml_tensor * a,
2393 enum ggml_unary_op op);
2394
2395 GGML_API struct ggml_tensor * ggml_unary_inplace(
2396 struct ggml_context * ctx,
2397 struct ggml_tensor * a,
2398 enum ggml_unary_op op);
2399
2400 // used in sam
2401 GGML_API struct ggml_tensor * ggml_get_rel_pos(
2402 struct ggml_context * ctx,
2403 struct ggml_tensor * a,
2404 int qh,
2405 int kh);
2406
2407 // used in sam
2408 GGML_API struct ggml_tensor * ggml_add_rel_pos(
2409 struct ggml_context * ctx,
2410 struct ggml_tensor * a,
2411 struct ggml_tensor * pw,
2412 struct ggml_tensor * ph);
2413
2414 GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
2415 struct ggml_context * ctx,
2416 struct ggml_tensor * a,
2417 struct ggml_tensor * pw,
2418 struct ggml_tensor * ph);
2419
2420 GGML_API struct ggml_tensor * ggml_rwkv_wkv6(
2421 struct ggml_context * ctx,
2422 struct ggml_tensor * k,
2423 struct ggml_tensor * v,
2424 struct ggml_tensor * r,
2425 struct ggml_tensor * tf,
2426 struct ggml_tensor * td,
2427 struct ggml_tensor * state);
2428
2429 GGML_API struct ggml_tensor * ggml_gated_linear_attn(
2430 struct ggml_context * ctx,
2431 struct ggml_tensor * k,
2432 struct ggml_tensor * v,
2433 struct ggml_tensor * q,
2434 struct ggml_tensor * g,
2435 struct ggml_tensor * state,
2436 float scale);
2437
2438 GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
2439 struct ggml_context * ctx,
2440 struct ggml_tensor * r,
2441 struct ggml_tensor * w,
2442 struct ggml_tensor * k,
2443 struct ggml_tensor * v,
2444 struct ggml_tensor * a,
2445 struct ggml_tensor * b,
2446 struct ggml_tensor * state);
2447
2448 /* Solves a specific equation of the form Ax=B, where A is a triangular matrix
2449 * without zeroes on the diagonal (i.e. invertible).
2450 * B can have any number of columns, but must have the same number of rows as A
2451 * If A is [n, n] and B is [n, m], then the result will be [n, m] as well
2452 * Has O(n^3) complexity (unlike most matrix ops out there), so use on cases
2453 * where n > 100 sparingly, pre-chunk if necessary.
2454 *
2455 * If left = false, solves xA=B instead
2456 * If lower = false, assumes upper triangular instead
2457 * If uni = true, assumes diagonal of A to be all ones (will override actual values)
2458 *
2459 * TODO: currently only lower, right, non-unitriangular variant is implemented
2460 */
2461 GGML_API struct ggml_tensor * ggml_solve_tri(
2462 struct ggml_context * ctx,
2463 struct ggml_tensor * a,
2464 struct ggml_tensor * b,
2465 bool left,
2466 bool lower,
2467 bool uni);
2468
2469 // custom operators
2470
2471 typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
2472 typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
2473 typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
2474
2475#define GGML_N_TASKS_MAX (-1)
2476 // n_tasks == GGML_N_TASKS_MAX means to use max number of tasks
2477
2478 GGML_API struct ggml_tensor * ggml_map_custom1(
2479 struct ggml_context * ctx,
2480 struct ggml_tensor * a,
2481 ggml_custom1_op_t fun,
2482 int n_tasks,
2483 void * userdata);
2484
2485 GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
2486 struct ggml_context * ctx,
2487 struct ggml_tensor * a,
2488 ggml_custom1_op_t fun,
2489 int n_tasks,
2490 void * userdata);
2491
2492 GGML_API struct ggml_tensor * ggml_map_custom2(
2493 struct ggml_context * ctx,
2494 struct ggml_tensor * a,
2495 struct ggml_tensor * b,
2496 ggml_custom2_op_t fun,
2497 int n_tasks,
2498 void * userdata);
2499
2500 GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
2501 struct ggml_context * ctx,
2502 struct ggml_tensor * a,
2503 struct ggml_tensor * b,
2504 ggml_custom2_op_t fun,
2505 int n_tasks,
2506 void * userdata);
2507
2508 GGML_API struct ggml_tensor * ggml_map_custom3(
2509 struct ggml_context * ctx,
2510 struct ggml_tensor * a,
2511 struct ggml_tensor * b,
2512 struct ggml_tensor * c,
2513 ggml_custom3_op_t fun,
2514 int n_tasks,
2515 void * userdata);
2516
2517 GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
2518 struct ggml_context * ctx,
2519 struct ggml_tensor * a,
2520 struct ggml_tensor * b,
2521 struct ggml_tensor * c,
2522 ggml_custom3_op_t fun,
2523 int n_tasks,
2524 void * userdata);
2525
2526 typedef void (*ggml_custom_op_t)(struct ggml_tensor * dst , int ith, int nth, void * userdata);
2527
2528 GGML_API struct ggml_tensor * ggml_custom_4d(
2529 struct ggml_context * ctx,
2530 enum ggml_type type,
2531 int64_t ne0,
2532 int64_t ne1,
2533 int64_t ne2,
2534 int64_t ne3,
2535 struct ggml_tensor ** args,
2536 int n_args,
2537 ggml_custom_op_t fun,
2538 int n_tasks,
2539 void * userdata);
2540
2541 GGML_API struct ggml_tensor * ggml_custom_inplace(
2542 struct ggml_context * ctx,
2543 struct ggml_tensor * a,
2544 struct ggml_tensor ** args,
2545 int n_args,
2546 ggml_custom_op_t fun,
2547 int n_tasks,
2548 void * userdata);
2549
2550 // loss function
2551
2552 GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
2553 struct ggml_context * ctx,
2554 struct ggml_tensor * a, // logits
2555 struct ggml_tensor * b); // labels
2556
2557 GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
2558 struct ggml_context * ctx,
2559 struct ggml_tensor * a, // logits
2560 struct ggml_tensor * b, // labels
2561 struct ggml_tensor * c); // gradients of cross_entropy_loss result
2562
2563 // AdamW optimizer step
2564 // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
2565 // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
2566 GGML_API struct ggml_tensor * ggml_opt_step_adamw(
2567 struct ggml_context * ctx,
2568 struct ggml_tensor * a,
2569 struct ggml_tensor * grad,
2570 struct ggml_tensor * m,
2571 struct ggml_tensor * v,
2572 struct ggml_tensor * adamw_params); // parameters such as the learning rate
2573
2574 // stochastic gradient descent step (with weight decay)
2575 GGML_API struct ggml_tensor * ggml_opt_step_sgd(
2576 struct ggml_context * ctx,
2577 struct ggml_tensor * a,
2578 struct ggml_tensor * grad,
2579 struct ggml_tensor * sgd_params); // alpha, weight decay
2580
2581 // build forward mutiple tensors and select one of them for computing
2582 // this is useful for creating graphs that have constant topology but compute different things based on the input
2583 // ref: https://github.com/ggml-org/llama.cpp/pull/18550
2584 //
2585 // nodes:
2586 // | - build forward into the graph but do not compute
2587 // c - build forward into the graph and compute
2588 //
2589 // | | ... c ... |
2590 // | | ... c ... |
2591 // | | ... c ... |
2592 // [0 1 ... idx ... n-1] <-- ggml_build_forward_select(..., n, idx)
2593 // c
2594 // c
2595 //
2596 // example:
2597 // struct ggml_tensor * curs[3];
2598 //
2599 // curs[0] = compute0(...);
2600 // curs[1] = compute1(...);
2601 // curs[2] = compute2(...);
2602 //
2603 // int idx = select_branch(some_input);
2604 //
2605 // struct ggml_tensor * out = ggml_build_forward_select(cgraph, curs, 3, idx);
2606 //
2607 GGML_API struct ggml_tensor * ggml_build_forward_select(
2608 struct ggml_cgraph * cgraph,
2609 struct ggml_tensor ** tensors,
2610 int n_tensors,
2611 int idx);
2612
2613 GGML_API void ggml_build_forward_expand(
2614 struct ggml_cgraph * cgraph,
2615 struct ggml_tensor * tensor);
2616
2617 GGML_API void ggml_build_backward_expand(
2618 struct ggml_context * ctx, // context for gradient computation
2619 struct ggml_cgraph * cgraph,
2620 struct ggml_tensor ** grad_accs);
2621
2622 // graph allocation in a context
2623 GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
2624 GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
2625 GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
2626 GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
2627 GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
2628 GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
2629
2630 GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
2631 GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
2632 GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
2633 GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
2634
2635 GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
2636
2637 GGML_API size_t ggml_graph_overhead(void);
2638 GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
2639
2640 GGML_API struct ggml_tensor * ggml_graph_get_tensor (const struct ggml_cgraph * cgraph, const char * name);
2641 GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
2642 GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
2643
2644 // print info and performance information for the graph
2645 GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
2646
2647 // dump the graph into a file using the dot format
2648 GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * cgraph, const char * filename);
2649
2650 // TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
2651 typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
2652
2653 // Set callback for all future logging events.
2654 // If this is not called, or NULL is supplied, everything is output on stderr.
2655 GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
2656 GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
2657
2658 GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
2659
2660 //
2661 // quantization
2662 //
2663
2664 // - ggml_quantize_init can be called multiple times with the same type
2665 // it will only initialize the quantization tables for the first call or after ggml_quantize_free
2666 // automatically called by ggml_quantize_chunk for convenience
2667 //
2668 // - ggml_quantize_free will free any memory allocated by ggml_quantize_init
2669 // call this at the end of the program to avoid memory leaks
2670 //
2671 // note: these are thread-safe
2672 //
2673 GGML_API void ggml_quantize_init(enum ggml_type type);
2674 GGML_API void ggml_quantize_free(void);
2675
2676 // some quantization type cannot be used without an importance matrix
2677 GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
2678
2679 // calls ggml_quantize_init internally (i.e. can allocate memory)
2680 GGML_API size_t ggml_quantize_chunk(
2681 enum ggml_type type,
2682 const float * src,
2683 void * dst,
2684 int64_t start,
2685 int64_t nrows,
2686 int64_t n_per_row,
2687 const float * imatrix);
2688
2689#ifdef __cplusplus
2690 // restrict not standard in C++
2691# if defined(__GNUC__)
2692# define GGML_RESTRICT __restrict__
2693# elif defined(__clang__)
2694# define GGML_RESTRICT __restrict
2695# elif defined(_MSC_VER)
2696# define GGML_RESTRICT __restrict
2697# else
2698# define GGML_RESTRICT
2699# endif
2700#else
2701# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L)
2702# define GGML_RESTRICT __restrict
2703# else
2704# define GGML_RESTRICT restrict
2705# endif
2706#endif
2707 typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
2708 typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
2709
2710 struct ggml_type_traits {
2711 const char * type_name;
2712 int64_t blck_size;
2713 int64_t blck_size_interleave; // interleave elements in blocks
2714 size_t type_size;
2715 bool is_quantized;
2716 ggml_to_float_t to_float;
2717 ggml_from_float_t from_float_ref;
2718 };
2719
2720 GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
2721
2722 // ggml threadpool
2723 // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
2724 // the goal should be to create an API that other backends can use move everything to the ggml base
2725
2726 // scheduling priorities
2727 enum ggml_sched_priority {
2728 GGML_SCHED_PRIO_LOW = -1,
2729 GGML_SCHED_PRIO_NORMAL,
2730 GGML_SCHED_PRIO_MEDIUM,
2731 GGML_SCHED_PRIO_HIGH,
2732 GGML_SCHED_PRIO_REALTIME
2733 };
2734
2735 // threadpool params
2736 // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
2737 struct ggml_threadpool_params {
2738 bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
2739 int n_threads; // number of threads
2740 enum ggml_sched_priority prio; // thread priority
2741 uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
2742 bool strict_cpu; // strict cpu placement
2743 bool paused; // start in paused state
2744 };
2745
2746 struct ggml_threadpool; // forward declaration, see ggml.c
2747
2748 typedef struct ggml_threadpool * ggml_threadpool_t;
2749
2750 GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
2751 GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
2752 GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
2753
2754#ifdef __cplusplus
2755}
2756#endif