diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp')
| -rw-r--r-- | llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp | 224 |
1 files changed, 224 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp b/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp new file mode 100644 index 0000000..895a571 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp @@ -0,0 +1,224 @@ +#include "amx.h" +#include "common.h" +#include "mmq.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-impl.h" +#include "ggml-cpu.h" +#include "traits.h" + +#if defined(__linux__) +#include <sys/syscall.h> +#include <unistd.h> +#endif + +#include <cstdlib> +#include <cstring> +#include <memory> + +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + +// AMX type_trais +namespace ggml::cpu::amx { +class tensor_traits : public ggml::cpu::tensor_traits { + bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override { + size = ggml_backend_amx_desired_wsize(op); + return true; + } + + bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override { + if (op->op == GGML_OP_MUL_MAT) { + ggml_backend_amx_mul_mat(params, op); + return true; + } + return false; + } +}; + +static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) { + static tensor_traits traits; + return &traits; +} +} // namespace ggml::cpu::amx + +// AMX buffer interface +static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); +} + +static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *) (buffer->context); +} + +static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor); + + GGML_UNUSED(buffer); + return GGML_STATUS_SUCCESS; +} + +static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, + uint8_t value, size_t offset, size_t size) { + memset((char *) tensor->data + offset, value, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, + const void * data, size_t offset, size_t size) { + if (qtype_has_amx_kernels(tensor->type)) { + GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type)); + ggml_backend_amx_convert_weight(tensor, data, offset, size); + } else { + memcpy((char *) tensor->data + offset, data, size); + } + + GGML_UNUSED(buffer); +} + +/* +// need to figure what we need to do with buffer->extra. +static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); + memcpy(data, (const char *)tensor->data + offset, size); + + GGML_UNUSED(buffer); +} + +static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + if (ggml_backend_buffer_is_host(src->buffer)) { + if (qtype_has_amx_kernels(src->type)) { + ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst)); + } else { + memcpy(dst->data, src->data, ggml_nbytes(src)); + } + return true; + } + return false; + + GGML_UNUSED(buffer); +} +*/ + +static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + memset(buffer->context, value, buffer->size); +} + +static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { + /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, + /* .get_base = */ ggml_backend_amx_buffer_get_base, + /* .init_tensor = */ ggml_backend_amx_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor, + /* .get_tensor = */ nullptr, + /* .cpy_tensor = */ nullptr, + /* .clear = */ ggml_backend_amx_buffer_clear, + /* .reset = */ nullptr, +}; + +static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "AMX"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * data = ggml_aligned_malloc(size); + if (data == NULL) { + fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); + return NULL; + } + + return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size); +} + +static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return TENSOR_ALIGNMENT; + + GGML_UNUSED(buft); +} + +namespace ggml::cpu::amx { +class extra_buffer_type : ggml::cpu::extra_buffer_type { + bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { + // handle only 2d gemm for now + auto is_contiguous_2d = [](const struct ggml_tensor * t) { + return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; + }; + + if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous + is_contiguous_2d(op->src[1]) && // src1 must be contiguous + op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() && + op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315) + op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x + (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) { + // src1 must be host buffer + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + // src1 must be float32 + if (op->src[1]->type == GGML_TYPE_F32) { + return true; + } + } + return false; + } + + ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { + if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer && + op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) { + return (ggml::cpu::tensor_traits *) op->src[0]->extra; + } + + return nullptr; + } +}; +} // namespace ggml::cpu::amx + +static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + return ggml_backend_amx_get_alloc_size(tensor); + + GGML_UNUSED(buft); +} + +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 +#define XFEATURE_XTILECFG 17 +#define XFEATURE_XTILEDATA 18 + +static bool ggml_amx_init() { +#if defined(__linux__) + if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) { + fprintf(stderr, "AMX is not ready to be used!\n"); + return false; + } + return true; +#elif defined(_WIN32) + return true; +#else + return false; +#endif +} + +ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { + /* .iface = */ { + /* .get_name = */ ggml_backend_amx_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, + /* .get_max_size = */ nullptr, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, + /* .is_host = */ nullptr, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ new ggml::cpu::amx::extra_buffer_type(), + }; + + if (!ggml_amx_init()) { + return nullptr; + } + + return &ggml_backend_buffer_type_amx; +} + +#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__) |
