diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h')
| -rw-r--r-- | llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h | 221 |
1 files changed, 221 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h new file mode 100644 index 0000000..289f4b7 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h | |||
| @@ -0,0 +1,221 @@ | |||
| 1 | #include "ggml-impl.h" | ||
| 2 | #include "apir_cs.h" | ||
| 3 | #include "apir_cs_rpc.h" | ||
| 4 | |||
// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer);

// NOTE(review): this declaration is never defined in this file; the definition
// further down is named apir_encode_ggml_buffer_handle (without "_host_") —
// confirm which spelling is intended.
static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc,
                                                       const apir_buffer_host_handle_t * handle);

// forward declaration: defined below, needed by the tensor decode helpers
static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec);
| 11 | |||
| 12 | /* apir_rpc_tensor */ | ||
| 13 | |||
| 14 | static inline void apir_encode_rcp_tensor(apir_encoder * enc, const apir_rpc_tensor * apir_rpc_tensor) { | ||
| 15 | size_t apir_rpc_tensor_size = sizeof(*apir_rpc_tensor); | ||
| 16 | apir_encode(enc, apir_rpc_tensor_size, apir_rpc_tensor, apir_rpc_tensor_size); | ||
| 17 | } | ||
| 18 | |||
| 19 | static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_inplace(apir_decoder * dec) { | ||
| 20 | size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor); | ||
| 21 | |||
| 22 | return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size); | ||
| 23 | } | ||
| 24 | |||
| 25 | static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec, | ||
| 26 | uint32_t n_tensors) { | ||
| 27 | size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor) * n_tensors; | ||
| 28 | |||
| 29 | return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size); | ||
| 30 | } | ||
| 31 | |||
| 32 | /* ggml_tensor */ | ||
| 33 | |||
| 34 | static inline void apir_encode_ggml_tensor(apir_encoder * enc, const ggml_tensor * tensor) { | ||
| 35 | apir_rpc_tensor serialized = apir_serialize_tensor(tensor); | ||
| 36 | |||
| 37 | apir_encode_rcp_tensor(enc, &serialized); | ||
| 38 | } | ||
| 39 | |||
| 40 | static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) { | ||
| 41 | const apir_rpc_tensor * apir_rpc_tensor = apir_decode_apir_rpc_tensor_inplace(dec); | ||
| 42 | |||
| 43 | if (!apir_rpc_tensor) { | ||
| 44 | return NULL; | ||
| 45 | } | ||
| 46 | |||
| 47 | ggml_init_params params{ | ||
| 48 | /*.mem_size =*/ ggml_tensor_overhead(), | ||
| 49 | /*.mem_buffer =*/ NULL, | ||
| 50 | /*.no_alloc =*/ true, | ||
| 51 | }; | ||
| 52 | |||
| 53 | ggml_context * ctx = ggml_init(params); | ||
| 54 | |||
| 55 | const ggml_tensor * tensor = apir_deserialize_tensor(ctx, apir_rpc_tensor); | ||
| 56 | |||
| 57 | return tensor; | ||
| 58 | } | ||
| 59 | |||
| 60 | /* *** ggml_backend_buffer_type_t *** */ | ||
| 61 | |||
| 62 | // ggml_backend_buffer_type_t is a POINTER (to a struct). | ||
| 63 | // Only the host pointer is shared between the host and guest. | ||
| 64 | // The guest stores it in `buft->context`. | ||
| 65 | // The host simply writes the pointer address in the buffer variable. | ||
| 66 | |||
| 67 | static inline void apir_encode_ggml_buffer_type(apir_encoder * enc, ggml_backend_buffer_type_t buft) { | ||
| 68 | apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft); | ||
| 69 | apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); | ||
| 70 | } | ||
| 71 | |||
| 72 | static inline ggml_backend_buffer_type_t apir_decode_ggml_buffer_type(apir_decoder * dec) { | ||
| 73 | apir_buffer_type_host_handle_t handle; | ||
| 74 | |||
| 75 | apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); | ||
| 76 | |||
| 77 | return (ggml_backend_buffer_type_t) handle; | ||
| 78 | } | ||
| 79 | |||
| 80 | static inline void apir_encode_apir_buffer_type_host_handle(apir_encoder * enc, apir_buffer_type_host_handle_t handle) { | ||
| 81 | apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); | ||
| 82 | } | ||
| 83 | |||
| 84 | static inline apir_buffer_type_host_handle_t apir_decode_apir_buffer_type_host_handle(apir_decoder * dec) { | ||
| 85 | apir_buffer_type_host_handle_t handle; | ||
| 86 | |||
| 87 | apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); | ||
| 88 | |||
| 89 | return handle; | ||
| 90 | } | ||
| 91 | |||
| 92 | /* *** ggml_backend_type_t *** */ | ||
| 93 | |||
| 94 | // ggml_backend_buffer_t is a POINTER. | ||
| 95 | // same logic as for ggml_backend_buffer_type_t | ||
| 96 | |||
| 97 | static inline void apir_encode_ggml_buffer(apir_encoder * enc, const ggml_backend_buffer_t buffer) { | ||
| 98 | apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer); | ||
| 99 | apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); | ||
| 100 | } | ||
| 101 | |||
| 102 | static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec) { | ||
| 103 | ggml_backend_buffer_t buffer; | ||
| 104 | size_t buffer_ptr_size = sizeof(buffer); | ||
| 105 | |||
| 106 | apir_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size); | ||
| 107 | |||
| 108 | return buffer; | ||
| 109 | } | ||
| 110 | |||
| 111 | /* enum ggml_status */ | ||
| 112 | |||
| 113 | static inline void apir_encode_ggml_status(apir_encoder * enc, const ggml_status * status) { | ||
| 114 | apir_encoder_write(enc, sizeof(*status), status, sizeof(*status)); | ||
| 115 | } | ||
| 116 | |||
| 117 | static inline void apir_decode_ggml_status(apir_decoder * dec, ggml_status * status) { | ||
| 118 | apir_decoder_read(dec, sizeof(*status), status, sizeof(*status)); | ||
| 119 | } | ||
| 120 | |||
| 121 | /* virtgpu_shmem */ | ||
| 122 | |||
| 123 | static inline void apir_encode_virtgpu_shmem_res_id(apir_encoder * enc, uint32_t shmem_res_id) { | ||
| 124 | apir_encode_uint32_t(enc, &shmem_res_id); | ||
| 125 | } | ||
| 126 | |||
| 127 | static inline void apir_decode_virtgpu_shmem_res_id(apir_decoder * dec, uint32_t * shmem_res_id) { | ||
| 128 | apir_decode_uint32_t(dec, shmem_res_id); | ||
| 129 | } | ||
| 130 | |||
| 131 | /* ggml_cgraph */ | ||
| 132 | |||
| 133 | static inline size_t apir_serialize_ggml_cgraph(ggml_cgraph * cgraph, std::vector<uint8_t> & cgraph_data) { | ||
| 134 | apir_serialize_graph(cgraph, cgraph_data); | ||
| 135 | |||
| 136 | return cgraph_data.size(); | ||
| 137 | } | ||
| 138 | |||
| 139 | static inline void apir_encode_cgraph_data(apir_encoder * enc, std::vector<uint8_t> & cgraph_data) { | ||
| 140 | size_t cgraph_size = cgraph_data.size(); | ||
| 141 | |||
| 142 | apir_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size); | ||
| 143 | } | ||
| 144 | |||
| 145 | static inline ggml_cgraph * apir_decode_ggml_cgraph(apir_decoder * dec, size_t cgraph_size) { | ||
| 146 | GGML_UNUSED(cgraph_size); | ||
| 147 | |||
| 148 | uint32_t n_nodes; | ||
| 149 | apir_decode_uint32_t(dec, &n_nodes); | ||
| 150 | const uint64_t * nodes = apir_decode_uint64_t_array_inplace(dec, n_nodes); | ||
| 151 | |||
| 152 | uint32_t n_tensors; | ||
| 153 | apir_decode_uint32_t(dec, &n_tensors); | ||
| 154 | const apir_rpc_tensor * tensors = apir_decode_apir_rpc_tensor_array_inplace(dec, n_tensors); | ||
| 155 | |||
| 156 | return apir_deserialize_graph(n_nodes, n_tensors, tensors, nodes); | ||
| 157 | } | ||
| 158 | |||
| 159 | static inline void apir_encode_ggml_buffer_handle(apir_encoder * enc, const apir_buffer_host_handle_t * handle) { | ||
| 160 | apir_encoder_write(enc, sizeof(*handle), &handle, sizeof(*handle)); | ||
| 161 | } | ||
| 162 | |||
| 163 | static inline void apir_encode_ggml_tensor_inline(apir_encoder * enc, const ggml_tensor * tensor) { | ||
| 164 | size_t tensor_size = sizeof(*tensor); | ||
| 165 | |||
| 166 | if (tensor->extra) { | ||
| 167 | GGML_ABORT("%s: Cannot pass tensors with extra", __func__); | ||
| 168 | } | ||
| 169 | |||
| 170 | if (tensor->src[0] && tensor->buffer) { | ||
| 171 | static int first = 1; | ||
| 172 | if (first) { | ||
| 173 | GGML_LOG_WARN("%s: Cannot pass tensors with src and buffer\n", __func__); | ||
| 174 | first = 0; | ||
| 175 | } | ||
| 176 | } | ||
| 177 | |||
| 178 | apir_encoder_write(enc, tensor_size, tensor, tensor_size); | ||
| 179 | |||
| 180 | // tensor->data is a pointer inside the device buffer. No need to touch it | ||
| 181 | // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence. | ||
| 182 | // (could also make a copy of the tensor, and update locally.) | ||
| 183 | |||
| 184 | if (tensor->buffer) { | ||
| 185 | apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer); | ||
| 186 | apir_encode_ggml_buffer_handle(enc, &buffer_handle); | ||
| 187 | } | ||
| 188 | |||
| 189 | if (tensor->view_src) { | ||
| 190 | apir_encoder_write(enc, tensor_size, tensor->view_src, tensor_size); | ||
| 191 | } | ||
| 192 | |||
| 193 | for (int i = 0; tensor->src[i]; i++) { | ||
| 194 | const ggml_tensor * tensor_src = tensor->src[i]; | ||
| 195 | apir_encoder_write(enc, tensor_size, tensor_src, tensor_size); | ||
| 196 | } | ||
| 197 | } | ||
| 198 | |||
| 199 | static inline const ggml_tensor * apir_decode_ggml_tensor_inplace(apir_decoder * dec) { | ||
| 200 | // it safe to remove the `const` qualifier here, we *do* want to | ||
| 201 | // modify the shared memory data to fix the `src` pointers. | ||
| 202 | ggml_tensor * tensor = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor)); | ||
| 203 | |||
| 204 | // tensor->data is a pointer inside the device buffer. No need to touch it | ||
| 205 | // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence. | ||
| 206 | if (tensor->buffer) { | ||
| 207 | tensor->buffer = apir_decode_ggml_buffer(dec); | ||
| 208 | } | ||
| 209 | |||
| 210 | if (tensor->view_src) { | ||
| 211 | ggml_tensor * tensor_view_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor)); | ||
| 212 | tensor->view_src = tensor_view_src; | ||
| 213 | } | ||
| 214 | |||
| 215 | for (int i = 0; tensor->src[i]; i++) { | ||
| 216 | ggml_tensor * tensor_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor)); | ||
| 217 | tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor | ||
| 218 | } | ||
| 219 | |||
| 220 | return tensor; | ||
| 221 | } | ||
