1 files changed, 224 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp b/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp
new file mode 100644
index 0000000..895a571
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp
@@ -0,0 +1,224 @@
+#include "amx.h"
+#include "common.h"
+#include "mmq.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "traits.h"
+#if defined(__linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+// AMX tensor traits
+namespace ggml::cpu::amx {
+// Traits object stored in tensor->extra for tensors initialized in an AMX
+// buffer; it forwards MUL_MAT to the AMX matmul kernel.
+class tensor_traits : public ggml::cpu::tensor_traits {
+    // Writes the work-buffer size reported by ggml_backend_amx_desired_wsize()
+    // for `op` into `size`. Always returns true.
+    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
+        const size_t wanted = ggml_backend_amx_desired_wsize(op);
+        size                = wanted;
+        return true;
+    }
+    // Executes `op` with ggml_backend_amx_mul_mat() when it is a MUL_MAT and
+    // returns true; returns false without doing any work for every other op.
+    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
+        if (op->op != GGML_OP_MUL_MAT) {
+            return false;
+        }
+        ggml_backend_amx_mul_mat(params, op);
+        return true;
+    }
+};
+// Returns the single tensor_traits instance; both arguments are ignored.
+static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
+    static tensor_traits shared_traits;
+    return &shared_traits;
+}
+}  // namespace ggml::cpu::amx
+// AMX buffer interface
+// Releases the backing storage of an AMX buffer.
+//
+// buffer->context is the pointer that ggml_backend_amx_buffer_type_alloc_buffer()
+// obtained from ggml_aligned_malloc() and handed to ggml_backend_buffer_init().
+// It is therefore returned through ggml_aligned_free() rather than free():
+// aligned allocators (for example _aligned_malloc on Windows) are not
+// guaranteed to be compatible with free().
+static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_aligned_free(buffer->context, buffer->size);
+}
+// Returns the base address of the buffer, which is the pointer kept in buffer->context.
+static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
+    void * base = buffer->context;
+    return base;
+}
+// Stores the AMX tensor traits in tensor->extra.
+// Always returns GGML_STATUS_SUCCESS.
+static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    ggml::cpu::tensor_traits * amx_traits = ggml::cpu::amx::get_tensor_traits(buffer, tensor);
+    tensor->extra                         = (void *) amx_traits;
+    return GGML_STATUS_SUCCESS;
+}
+// Fills `size` bytes of the tensor's data, starting `offset` bytes in, with `value`.
+// `buffer` is not consulted.
+static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                                  uint8_t value, size_t offset, size_t size) {
+    GGML_UNUSED(buffer);
+    char * dst = (char *) tensor->data + offset;
+    memset(dst, value, size);
+}
+// Writes `size` bytes from `data` into `tensor` at byte `offset`.
+// Types for which qtype_has_amx_kernels() is true are passed to
+// ggml_backend_amx_convert_weight(); every other type is copied verbatim.
+// `buffer` is not consulted.
+static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                               const void * data, size_t offset, size_t size) {
+    GGML_UNUSED(buffer);
+    if (!qtype_has_amx_kernels(tensor->type)) {
+        // no AMX kernel for this type: plain byte copy
+        memcpy((char *) tensor->data + offset, data, size);
+        return;
+    }
+    GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
+    ggml_backend_amx_convert_weight(tensor, data, offset, size);
+}
+/*
+// TODO: need to figure out what to do with buffer->extra.
+static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
+    memcpy(data, (const char *)tensor->data + offset, size);
+    GGML_UNUSED(buffer);
+}
+static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        if (qtype_has_amx_kernels(src->type)) {
+            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
+        } else {
+            memcpy(dst->data, src->data, ggml_nbytes(src));
+        }
+        return true;
+    }
+    return false;
+    GGML_UNUSED(buffer);
+}
+*/
+// Sets all buffer->size bytes starting at buffer->context to `value`.
+static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    void *       base    = buffer->context;
+    const size_t n_bytes = buffer->size;
+    memset(base, value, n_bytes);
+}
+// Callback table for AMX buffers; it is passed to ggml_backend_buffer_init()
+// by ggml_backend_amx_buffer_type_alloc_buffer().
+// get_tensor and cpy_tensor are left unset: their candidate implementations
+// are currently commented out in this file.
+// NOTE(review): reset is also unset - presumably there is no per-buffer state
+// to reset; confirm against the ggml_backend_buffer_i contract.
+static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_amx_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_amx_buffer_init_tensor,
+    /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
+    /* .get_tensor      = */ nullptr,
+    /* .cpy_tensor      = */ nullptr,
+    /* .clear           = */ ggml_backend_amx_buffer_clear,
+    /* .reset           = */ nullptr,
+};
+// Returns the name of this buffer type, "AMX"; `buft` is not consulted.
+static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    return "AMX";
+}
+// Allocates `size` bytes with ggml_aligned_malloc() and wraps them in a
+// backend buffer that uses ggml_backend_amx_buffer_interface.
+// Returns NULL, after printing a message to stderr, when the allocation fails.
+static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * storage = ggml_aligned_malloc(size);
+    if (storage != NULL) {
+        return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, storage, size);
+    }
+    fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+    return NULL;
+}
+// Returns the alignment of this buffer type, TENSOR_ALIGNMENT; `buft` is not consulted.
+static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    const size_t alignment = TENSOR_ALIGNMENT;
+    return alignment;
+}
+namespace ggml::cpu::amx {
+// Decides which ops take the AMX path and hands out their tensor traits.
+class extra_buffer_type : ggml::cpu::extra_buffer_type {
+    // Returns true only for a MUL_MAT where:
+    //   - src0 and src1 are contiguous with ne[2] == ne[3] == 1,
+    //   - src0 lives in a buffer of the AMX buffer type,
+    //   - src0->ne[0] and op->ne[0] satisfy the tile-size divisibility checks below,
+    //   - src0's type has AMX kernels or is F16,
+    //   - src1 has no buffer or a host buffer, and is F32.
+    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
+        if (op->op != GGML_OP_MUL_MAT) {
+            return false;
+        }
+        const struct ggml_tensor * weights     = op->src[0];
+        const struct ggml_tensor * activations = op->src[1];
+        // handle only 2d gemm for now
+        const auto is_2d_contiguous = [](const struct ggml_tensor * t) {
+            return ggml_is_contiguous(t) && t->ne[2] == 1 && t->ne[3] == 1;
+        };
+        // src0 and src1 must be contiguous
+        if (!is_2d_contiguous(weights) || !is_2d_contiguous(activations)) {
+            return false;
+        }
+        // src0 must be stored in an AMX buffer
+        if (!weights->buffer || weights->buffer->buft != ggml_backend_amx_buffer_type()) {
+            return false;
+        }
+        // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
+        if (weights->ne[0] % (TILE_K * 2 * 32) != 0) {
+            return false;
+        }
+        // out_features is 32x
+        if (op->ne[0] % (TILE_N * 2) != 0) {
+            return false;
+        }
+        if (!qtype_has_amx_kernels(weights->type) && weights->type != GGML_TYPE_F16) {
+            return false;
+        }
+        // src1 must be host buffer
+        if (activations->buffer && !ggml_backend_buft_is_host(activations->buffer->buft)) {
+            return false;
+        }
+        // src1 must be float32
+        return activations->type == GGML_TYPE_F32;
+    }
+    // Returns the traits stored in src0->extra when `op` is a MUL_MAT whose
+    // src0 lives in a buffer of the AMX buffer type; nullptr otherwise.
+    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
+        if (op->op != GGML_OP_MUL_MAT) {
+            return nullptr;
+        }
+        const struct ggml_tensor * weights = op->src[0];
+        if (!weights->buffer || weights->buffer->buft != ggml_backend_amx_buffer_type()) {
+            return nullptr;
+        }
+        return (ggml::cpu::tensor_traits *) weights->extra;
+    }
+};
+}  // namespace ggml::cpu::amx
+// Returns the allocation size for `tensor` as computed by
+// ggml_backend_amx_get_alloc_size(); `buft` is not consulted.
+static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    GGML_UNUSED(buft);
+    const size_t n_bytes = ggml_backend_amx_get_alloc_size(tensor);
+    return n_bytes;
+}
+// arch_prctl request codes and feature numbers; only ARCH_REQ_XCOMP_PERM and
+// XFEATURE_XTILEDATA are used below.
+#define ARCH_GET_XCOMP_PERM     0x1022
+#define ARCH_REQ_XCOMP_PERM     0x1023
+#define XFEATURE_XTILECFG       17
+#define XFEATURE_XTILEDATA      18
+// Prepares the process for AMX use.
+//   Linux:   issues arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); a
+//            non-zero result prints a message to stderr and yields false.
+//   Windows: returns true without doing anything.
+//   Other:   returns false.
+static bool ggml_amx_init() {
+#if defined(__linux__)
+    const long rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
+    if (rc == 0) {
+        return true;
+    }
+    fprintf(stderr, "AMX is not ready to be used!\n");
+    return false;
+#elif defined(_WIN32)
+    return true;
+#else
+    return false;
+#endif
+}
+// Returns the AMX buffer type singleton, or nullptr when ggml_amx_init() reports
+// that AMX cannot be used.
+//
+// The descriptor is built once on first call. ggml_amx_init() is also run only
+// once and its result cached: this function is called for every op examined by
+// extra_buffer_type::supports_op() and extra_buffer_type::get_tensor_traits(),
+// so repeating the permission request (and, on failure, the stderr message) on
+// every call is wasted work.
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
+        /* .iface = */ {
+                        /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
+                        /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
+                        /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
+                        /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
+                        /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
+                        /* .is_host          = */ nullptr,
+                        },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ new ggml::cpu::amx::extra_buffer_type(),
+    };
+    // Function-local static: initialized exactly once, after the descriptor above.
+    static const bool amx_ready = ggml_amx_init();
+    if (!amx_ready) {
+        return nullptr;
+    }
+    return &ggml_backend_buffer_type_amx;
+}
+#endif  // defined(__AMX_INT8__) && defined(__AVX512VNNI__)

diff --git a/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp b/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp new file mode 100644 index 0000000..895a571 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp
@@ -0,0 +1,224 @@
	1	#include "amx.h"
	2	#include "common.h"
	3	#include "mmq.h"
	4	#include "ggml-backend-impl.h"
	5	#include "ggml-backend.h"
	6	#include "ggml-impl.h"
	7	#include "ggml-cpu.h"
	8	#include "traits.h"
	9
	10	#if defined(__linux__)
	11	#include <sys/syscall.h>
	12	#include <unistd.h>
	13	#endif
	14
	15	#include <cstdlib>
	16	#include <cstring>
	17	#include <memory>
	18
	19	#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
	20
	21	// AMX type_trais
	22	namespace ggml::cpu::amx {
	23	class tensor_traits : public ggml::cpu::tensor_traits {
	24	bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
	25	size = ggml_backend_amx_desired_wsize(op);
	26	return true;
	27	}
	28
	29	bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
	30	if (op->op == GGML_OP_MUL_MAT) {
	31	ggml_backend_amx_mul_mat(params, op);
	32	return true;
	33	}
	34	return false;
	35	}
	36	};
	37
	38	static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
	39	static tensor_traits traits;
	40	return &traits;
	41	}
	42	} // namespace ggml::cpu::amx
	43
	44	// AMX buffer interface
	45	static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
	46	free(buffer->context);
	47	}
	48
	49	static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
	50	return (void *) (buffer->context);
	51	}
	52
	53	static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
	54	tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
	55
	56	GGML_UNUSED(buffer);
	57	return GGML_STATUS_SUCCESS;
	58	}
	59
	60	static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
	61	uint8_t value, size_t offset, size_t size) {
	62	memset((char *) tensor->data + offset, value, size);
	63
	64	GGML_UNUSED(buffer);
	65	}
	66
	67	static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
	68	const void * data, size_t offset, size_t size) {
	69	if (qtype_has_amx_kernels(tensor->type)) {
	70	GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
	71	ggml_backend_amx_convert_weight(tensor, data, offset, size);
	72	} else {
	73	memcpy((char *) tensor->data + offset, data, size);
	74	}
	75
	76	GGML_UNUSED(buffer);
	77	}
	78
	79	/*
	80	// need to figure what we need to do with buffer->extra.
	81	static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
	82	GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
	83	memcpy(data, (const char *)tensor->data + offset, size);
	84
	85	GGML_UNUSED(buffer);
	86	}
	87
	88	static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
	89	if (ggml_backend_buffer_is_host(src->buffer)) {
	90	if (qtype_has_amx_kernels(src->type)) {
	91	ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
	92	} else {
	93	memcpy(dst->data, src->data, ggml_nbytes(src));
	94	}
	95	return true;
	96	}
	97	return false;
	98
	99	GGML_UNUSED(buffer);
	100	}
	101	*/
	102
	103	static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
	104	memset(buffer->context, value, buffer->size);
	105	}
	106
	107	static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
	108	/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
	109	/* .get_base = */ ggml_backend_amx_buffer_get_base,
	110	/* .init_tensor = */ ggml_backend_amx_buffer_init_tensor,
	111	/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
	112	/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
	113	/* .get_tensor = */ nullptr,
	114	/* .cpy_tensor = */ nullptr,
	115	/* .clear = */ ggml_backend_amx_buffer_clear,
	116	/* .reset = */ nullptr,
	117	};
	118
	119	static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
	120	return "AMX";
	121
	122	GGML_UNUSED(buft);
	123	}
	124
	125	static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
	126	void * data = ggml_aligned_malloc(size);
	127	if (data == NULL) {
	128	fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
	129	return NULL;
	130	}
	131
	132	return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
	133	}
	134
	135	static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
	136	return TENSOR_ALIGNMENT;
	137
	138	GGML_UNUSED(buft);
	139	}
	140
	141	namespace ggml::cpu::amx {
	142	class extra_buffer_type : ggml::cpu::extra_buffer_type {
	143	bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
	144	// handle only 2d gemm for now
	145	auto is_contiguous_2d = [](const struct ggml_tensor * t) {
	146	return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
	147	};
	148
	149	if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
	150	is_contiguous_2d(op->src[1]) && // src1 must be contiguous
	151	op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
	152	op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
	153	op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
	154	(qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
	155	// src1 must be host buffer
	156	if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
	157	return false;
	158	}
	159	// src1 must be float32
	160	if (op->src[1]->type == GGML_TYPE_F32) {
	161	return true;
	162	}
	163	}
	164	return false;
	165	}
	166
	167	ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
	168	if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
	169	op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
	170	return (ggml::cpu::tensor_traits *) op->src[0]->extra;
	171	}
	172
	173	return nullptr;
	174	}
	175	};
	176	} // namespace ggml::cpu::amx
	177
	178	static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
	179	return ggml_backend_amx_get_alloc_size(tensor);
	180
	181	GGML_UNUSED(buft);
	182	}
	183
	184	#define ARCH_GET_XCOMP_PERM 0x1022
	185	#define ARCH_REQ_XCOMP_PERM 0x1023
	186	#define XFEATURE_XTILECFG 17
	187	#define XFEATURE_XTILEDATA 18
	188
	189	static bool ggml_amx_init() {
	190	#if defined(__linux__)
	191	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
	192	fprintf(stderr, "AMX is not ready to be used!\n");
	193	return false;
	194	}
	195	return true;
	196	#elif defined(_WIN32)
	197	return true;
	198	#else
	199	return false;
	200	#endif
	201	}
	202
	203	ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
	204	static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
	205	/* .iface = */ {
	206	/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
	207	/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
	208	/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
	209	/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
	210	/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
	211	/* .is_host = */ nullptr,
	212	},
	213	/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
	214	/* .context = */ new ggml::cpu::amx::extra_buffer_type(),
	215	};
	216
	217	if (!ggml_amx_init()) {
	218	return nullptr;
	219	}
	220
	221	return &ggml_backend_buffer_type_amx;
	222	}
	223
	224	#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)