summaryrefslogtreecommitdiff
path: root/llama.cpp/ggml/src/ggml-virtgpu/backend/shared
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp/ggml/src/ggml-virtgpu/backend/shared')
-rw-r--r--llama.cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h90
-rw-r--r--llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h36
-rw-r--r--llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h46
-rw-r--r--llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h384
-rw-r--r--llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h221
-rw-r--r--llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h54
6 files changed, 831 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h
new file mode 100644
index 0000000..f19a5d1
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h
@@ -0,0 +1,90 @@
+#pragma once
+
+/* the rest of this file must match virglrenderer/src/apir-protocol.h */
+
+#include <unistd.h>
+
+#include <cstdint>
+
+#define APIR_PROTOCOL_MAJOR 0
+#define APIR_PROTOCOL_MINOR 1
+
+#define APIR_HANDSHAKE_MAGIC 0xab1e
+
// Top-level commands of the APIR virtgpu protocol.
enum ApirCommandType {
    APIR_COMMAND_TYPE_HANDSHAKE = 0,
    APIR_COMMAND_TYPE_LOADLIBRARY = 1,
    APIR_COMMAND_TYPE_FORWARD = 2,

    APIR_COMMAND_TYPE_LENGTH = 3,
};

// Per-command flag bits carried alongside the command type.
typedef uint64_t ApirCommandFlags;

// Return codes of the LOADLIBRARY command.
enum ApirLoadLibraryReturnCode {
    APIR_LOAD_LIBRARY_SUCCESS = 0,
    APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR = 1,
    APIR_LOAD_LIBRARY_ALREADY_LOADED = 2,
    APIR_LOAD_LIBRARY_ENV_VAR_MISSING = 3,
    APIR_LOAD_LIBRARY_CANNOT_OPEN = 4,
    APIR_LOAD_LIBRARY_SYMBOL_MISSING = 5,
    APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, // anything above this is a APIR backend library initialization return code
};

// Return codes of the FORWARD command.
enum ApirForwardReturnCode {
    APIR_FORWARD_SUCCESS = 0,
    APIR_FORWARD_NO_DISPATCH_FCT = 1,
    APIR_FORWARD_TIMEOUT = 2,

    APIR_FORWARD_BASE_INDEX = 3, // anything above this is a APIR backend library forward return code
} ;

// Human-readable name of a protocol command, for logging.
// Any value outside the enum range maps to "unknown".
__attribute__((unused)) static inline const char * apir_command_name(ApirCommandType type) {
    // indexed by ApirCommandType; keep in sync with the enum above
    static const char * const command_names[APIR_COMMAND_TYPE_LENGTH] = {
        "HandShake",
        "LoadLibrary",
        "Forward",
    };

    if (type < 0 || type >= APIR_COMMAND_TYPE_LENGTH) {
        return "unknown";
    }

    return command_names[type];
}
+
+__attribute__((unused)) static const char * apir_load_library_error(ApirLoadLibraryReturnCode code) {
+#define APIR_LOAD_LIBRARY_ERROR(code_name) \
+ do { \
+ if (code == code_name) \
+ return #code_name; \
+ } while (0)
+
+ APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_SUCCESS);
+ APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR);
+ APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_ALREADY_LOADED);
+ APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_ENV_VAR_MISSING);
+ APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_CANNOT_OPEN);
+ APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_SYMBOL_MISSING);
+ APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_INIT_BASE_INDEX);
+
+ return "Unknown APIR_COMMAND_TYPE_LoadLibrary error";
+
+#undef APIR_LOAD_LIBRARY_ERROR
+}
+
+__attribute__((unused)) static const char * apir_forward_error(ApirForwardReturnCode code) {
+#define APIR_FORWARD_ERROR(code_name) \
+ do { \
+ if (code == code_name) \
+ return #code_name; \
+ } while (0)
+
+ APIR_FORWARD_ERROR(APIR_FORWARD_SUCCESS);
+ APIR_FORWARD_ERROR(APIR_FORWARD_NO_DISPATCH_FCT);
+ APIR_FORWARD_ERROR(APIR_FORWARD_TIMEOUT);
+ APIR_FORWARD_ERROR(APIR_FORWARD_BASE_INDEX);
+
+ return "Unknown APIR_COMMAND_TYPE_FORWARD error";
+
+#undef APIR_FORWARD_ERROR
+}
diff --git a/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h
new file mode 100644
index 0000000..d214b6f
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h
@@ -0,0 +1,36 @@
// Dispatch-table indices for the backend-side APIR commands.
// NOTE(review): the ".gen.h" suffix suggests this file is generated — if so,
// edit the generator, not this file; confirm. The values are part of the
// guest/host wire protocol: never renumber existing entries, only append and
// bump APIR_BACKEND_DISPATCH_TABLE_COUNT.
typedef enum ApirBackendCommandType {

    /* device */
    APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT = 0,
    APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 1,
    APIR_COMMAND_TYPE_DEVICE_GET_NAME = 2,
    APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 3,
    APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 4,
    APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 5,
    APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 6,
    APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 7,
    APIR_COMMAND_TYPE_DEVICE_GET_PROPS = 8,
    APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = 9,

    /* buffer-type */
    APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 10,
    APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 11,
    APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 12,
    APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 13,
    APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 14,
    APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE = 15,

    /* buffer */
    APIR_COMMAND_TYPE_BUFFER_GET_BASE = 16,
    APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 17,
    APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 18,
    APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR = 19,
    APIR_COMMAND_TYPE_BUFFER_CLEAR = 20,
    APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 21,

    /* backend */
    APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 22,

    // last command_type index + 1
    APIR_BACKEND_DISPATCH_TABLE_COUNT = 23,
} ApirBackendCommandType;
diff --git a/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h
new file mode 100644
index 0000000..f3efa52
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "apir_backend.gen.h"
+
+#include <stdint.h> // for uintptr_t
+#include <time.h> // for timespec, clock_gettime
+
// Return codes of the backend library initialization.
#define APIR_BACKEND_INITIALIZE_SUCCESS 0
#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY 1
#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2
#define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3
#define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4
#define APIR_BACKEND_INITIALIZE_BACKEND_FAILED 5
#define APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED 6
#define APIR_BACKEND_INITIALIZE_ALREADY_INITED 7
#define APIR_BACKEND_INITIALIZE_NO_DEVICE 8


// new entries here need to be added to the apir_backend_initialize_error function below

#define APIR_BACKEND_FORWARD_INDEX_INVALID 6

// 0 is fast, 1 avoids the backend to crash if an unsupported tensor is received
#define APIR_BACKEND_CHECK_SUPPORTS_OP 0

// Host-side objects are shared with the guest as plain pointer-sized integers.
typedef uintptr_t apir_buffer_type_host_handle_t;
typedef uintptr_t apir_buffer_host_handle_t;

// Symbolic name of an APIR_BACKEND_INITIALIZE_* code, for logging.
__attribute__((unused)) static const char * apir_backend_initialize_error(int code) {
#define APIR_BACKEND_INITIALIZE_ERROR(code_name) \
    do {                                         \
        if (code == code_name)                   \
            return #code_name;                   \
    } while (0)

    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_SUCCESS);
    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY);
    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY);
    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS);
    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS);
    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_FAILED);
    // fix: the three codes below existed in the #define list but were never
    // registered here, so they always printed as "Unknown"
    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED);
    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_ALREADY_INITED);
    APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_NO_DEVICE);

    return "Unknown APIR_BACKEND_INITIALIZE error:/";

#undef APIR_BACKEND_INITIALIZE_ERROR
}
diff --git a/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h
new file mode 100644
index 0000000..1bc3a5f
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h
@@ -0,0 +1,384 @@
+#pragma once
+
+#include "ggml-impl.h"
+
+#include <cassert>
+#include <cstring>
+
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

// Serialization cursor writing into a caller-provided buffer.
struct apir_encoder {
    char * cur;           // next write position
    const char * start;   // beginning of the buffer
    const char * end;     // one past the last writable byte
    bool fatal;           // sticky error flag

};

// Deserialization cursor reading from a caller-provided buffer.
struct apir_decoder {
    const char * cur;     // next read position
    const char * end;     // one past the last readable byte
    bool fatal;           // sticky error flag
};

/*
 * new encoder and decoder
 */

// Build a decoder over the [ptr, ptr + size) byte range.
static apir_decoder apir_new_decoder(const char * ptr, size_t size) {
    apir_decoder dec;
    dec.cur   = ptr;
    dec.end   = ptr + size;
    dec.fatal = false;
    return dec;
}

// Build an encoder over the [ptr, ptr + size) byte range.
static apir_encoder apir_new_encoder(char * ptr, size_t size) {
    apir_encoder enc;
    enc.cur   = ptr;
    enc.start = ptr;
    enc.end   = ptr + size;
    enc.fatal = false;
    return enc;
}
+
/*
 * fatal flag handling
 *
 * The fatal flag is sticky: once an encode/decode operation fails (e.g.
 * runs past the end of the buffer), the flag stays set so callers can
 * perform a batch of operations and check for failure once at the end.
 */

static inline void apir_encoder_reset_fatal(apir_encoder * enc) {
    enc->fatal = false;
}

static inline void apir_encoder_set_fatal(apir_encoder * enc) {
    enc->fatal = true;
}

static inline bool apir_encoder_get_fatal(const apir_encoder * enc) {
    return enc->fatal;
}

static inline void apir_decoder_reset_fatal(apir_decoder * dec) {
    dec->fatal = false;
}

static inline void apir_decoder_set_fatal(apir_decoder * dec) {
    dec->fatal = true;
}

static inline bool apir_decoder_get_fatal(const apir_decoder * dec) {
    return dec->fatal;
}
+
/*
 * decoder peek / in-place access
 */

// Copy `val_size` bytes from the decoder into `val` WITHOUT advancing the
// cursor. `size` is the padded wire size of the field (>= val_size).
// On under-run: logs, sets the fatal flag, zero-fills `val`, returns false.
static inline bool apir_decoder_peek_internal(apir_decoder * dec,
                                              size_t size,
                                              void * val,
                                              size_t val_size) {
    assert(val_size <= size);

    if (unlikely(size > (size_t) (dec->end - dec->cur))) {
        GGML_LOG_ERROR("%s: reading too much from the decoder ...\n", __func__);
        apir_decoder_set_fatal(dec);
        memset(val, 0, val_size);
        return false;
    }

    /* we should not rely on the compiler to optimize away memcpy... */
    memcpy(val, dec->cur, val_size);
    return true;
}

// Peek wrapper that discards the success/failure result (the fatal flag
// still records failures).
static inline void apir_decoder_peek(apir_decoder * dec, size_t size, void * val, size_t val_size) {
    apir_decoder_peek_internal(dec, size, val, val_size);
}

// Return a pointer directly into the decoder's buffer for `size` bytes and
// advance the cursor past them (zero-copy read). Returns NULL and sets the
// fatal flag if fewer than `size` bytes remain.
static inline const void * apir_decoder_use_inplace(apir_decoder * dec, size_t size) {
    if (unlikely(size > (size_t) (dec->end - dec->cur))) {
        GGML_LOG_ERROR("%s: reading too much from the decoder ...\n", __func__);
        apir_decoder_set_fatal(dec);
        return NULL;
    }
    const void * addr = dec->cur;
    dec->cur += size;

    return addr;
}
+
+/*
+ * read/write
+ */
+
+static inline void apir_decoder_read(apir_decoder * dec, size_t size, void * val, size_t val_size) {
+ if (apir_decoder_peek_internal(dec, size, val, val_size)) {
+ dec->cur += size;
+ }
+}
+
+static inline char * apir_encoder_write(apir_encoder * enc, size_t size, const void * val, size_t val_size) {
+ assert(val_size <= size);
+ assert(size <= ((size_t) (enc->end - enc->cur)));
+
+ char * write_addr = enc->cur;
+ /* we should not rely on the compiler to optimize away memcpy... */
+ memcpy(write_addr, val, val_size);
+ enc->cur += size;
+
+ return write_addr;
+}
+
/*
 * encode/decode
 *
 * Top-level primitives: the wire size `size` must be a multiple of 4
 * (fields are padded to 32-bit alignment); `data_size` is the number of
 * payload bytes actually copied.
 */

static inline void apir_decode(apir_decoder * dec, size_t size, void * data, size_t data_size) {
    assert(size % 4 == 0);
    apir_decoder_read(dec, size, data, data_size);
}

static inline void apir_encode(apir_encoder * enc, size_t size, const void * data, size_t data_size) {
    assert(size % 4 == 0);
    apir_encoder_write(enc, size, data, data_size);
}
+
/*
 * typed encode/decode
 */

/* uint8_t */

// uint8_t travels padded to sizeof(int) on the wire (apir_encode requires
// 4-byte-aligned sizes); only 1 payload byte is copied.
static inline void apir_encode_uint8_t(apir_encoder * enc, const uint8_t * val) {
    apir_encode(enc, sizeof(int), val, sizeof(*val));
}

static inline void apir_decode_uint8_t(apir_decoder * dec, uint8_t * val) {
    apir_decode(dec, sizeof(int), val, sizeof(*val));
}

/* uint64_t */

static inline void apir_encode_uint64_t(apir_encoder * enc, const uint64_t * val) {
    apir_encode(enc, 8, val, sizeof(*val));
}

static inline void apir_decode_uint64_t(apir_decoder * dec, uint64_t * val) {
    apir_decode(dec, 8, val, sizeof(*val));
}

// the `size >= count` assert is a cheap guard against sizeof(*val) * count
// wrapping around
static inline void apir_encode_uint64_t_array(apir_encoder * enc, const uint64_t * val, uint32_t count) {
    const size_t size = sizeof(*val) * count;
    assert(size >= count);
    apir_encode(enc, size, val, size);
}

static inline void apir_decode_uint64_t_array(apir_decoder * dec, uint64_t * val, uint32_t count) {
    const size_t size = sizeof(*val) * count;
    assert(size >= count);
    apir_decode(dec, size, val, size);
}

// Zero-copy variant: returns a pointer into the decoder's buffer, or NULL
// (with the fatal flag set) on under-run.
static inline const uint64_t * apir_decode_uint64_t_array_inplace(apir_decoder * dec, uint32_t count) {
    return (uint64_t *) (uintptr_t) apir_decoder_use_inplace(dec, count * sizeof(uint64_t));
}
+
/* int32_t */

static inline void apir_encode_int32_t(apir_encoder * enc, const int32_t * val) {
    apir_encode(enc, 4, val, sizeof(*val));
}

static inline void apir_decode_int32_t(apir_decoder * dec, int32_t * val) {
    apir_decode(dec, 4, val, sizeof(*val));
}

// the `size >= count` assert guards against sizeof(*val) * count wrapping
static inline void apir_encode_int32_t_array(apir_encoder * enc, const int32_t * val, uint32_t count) {
    const size_t size = sizeof(*val) * count;
    assert(size >= count);
    apir_encode(enc, size, val, size);
}

static inline void apir_decode_int32_t_array(apir_decoder * dec, int32_t * val, uint32_t count) {
    const size_t size = sizeof(*val) * count;
    assert(size >= count);
    apir_decode(dec, size, val, size);
}
+
/* array size (uint64_t) */

// Array lengths travel as uint64_t on the wire.
static inline void apir_encode_array_size(apir_encoder * enc, uint64_t size) {
    apir_encode_uint64_t(enc, &size);
}

// Decode an array length and validate it against the expected value; on
// mismatch the fatal flag is set and 0 is returned.
static inline uint64_t apir_decode_array_size(apir_decoder * dec, uint64_t expected_size) {
    uint64_t size;
    apir_decode_uint64_t(dec, &size);
    if (size != expected_size) {
        GGML_LOG_ERROR("%s: Couldn't decode array from the decoder\n", __func__);
        apir_decoder_set_fatal(dec);
        size = 0;
    }
    return size;
}

// Decode an array length without validating it.
static inline uint64_t apir_decode_array_size_unchecked(apir_decoder * dec) {
    uint64_t size;
    apir_decode_uint64_t(dec, &size);
    return size;
}

/* non-array pointer */

// Presence flag for an optional value: encodes 1 if `val` is non-NULL,
// 0 otherwise, and returns whether the value follows on the wire.
static inline bool apir_encode_simple_pointer(apir_encoder * enc, const void * val) {
    apir_encode_array_size(enc, val ? 1 : 0);
    return val;
}

// Returns true if the optional value is present on the wire.
static inline bool apir_decode_simple_pointer(apir_decoder * dec) {
    return apir_decode_array_size_unchecked(dec);
}
+
/* uint32_t */

static inline void apir_encode_uint32_t(apir_encoder * enc, const uint32_t * val) {
    apir_encode(enc, 4, val, sizeof(*val));
}

static inline void apir_decode_uint32_t(apir_decoder * dec, uint32_t * val) {
    apir_decode(dec, 4, val, sizeof(*val));
}

// the `size >= count` assert guards against sizeof(*val) * count wrapping
static inline void apir_encode_uint32_t_array(apir_encoder * enc, const uint32_t * val, uint32_t count) {
    const size_t size = sizeof(*val) * count;
    assert(size >= count);
    apir_encode(enc, size, val, size);
}

static inline void apir_decode_uint32_t_array(apir_decoder * dec, uint32_t * val, uint32_t count) {
    const size_t size = sizeof(*val) * count;
    assert(size >= count);
    apir_decode(dec, size, val, size);
}
+
/* size_t */

// size_t always travels as uint64_t on the wire, so 32-bit and 64-bit
// peers agree on the layout.
static inline void apir_encode_size_t(apir_encoder * enc, const size_t * val) {
    const uint64_t tmp = *val;
    apir_encode_uint64_t(enc, &tmp);
}

// NOTE(review): on a 32-bit guest this silently truncates values above
// 2^32 coming from a 64-bit peer — presumably sizes never get that large.
static inline void apir_decode_size_t(apir_decoder * dec, size_t * val) {
    uint64_t tmp;
    apir_decode_uint64_t(dec, &tmp);
    *val = tmp;
}

// Fast path when size_t is 64-bit (bulk copy); element-wise otherwise.
static inline void apir_encode_size_t_array(apir_encoder * enc, const size_t * val, uint32_t count) {
    if (sizeof(size_t) == sizeof(uint64_t)) {
        apir_encode_uint64_t_array(enc, (const uint64_t *) val, count);
    } else {
        for (uint32_t i = 0; i < count; i++) {
            apir_encode_size_t(enc, &val[i]);
        }
    }
}

static inline void apir_decode_size_t_array(apir_decoder * dec, size_t * val, uint32_t count) {
    if (sizeof(size_t) == sizeof(uint64_t)) {
        apir_decode_uint64_t_array(dec, (uint64_t *) val, count);
    } else {
        for (uint32_t i = 0; i < count; i++) {
            apir_decode_size_t(dec, &val[i]);
        }
    }
}
+
+/* opaque blob */
+
+static inline void apir_encode_blob_array(apir_encoder * enc, const void * val, size_t size) {
+ apir_encode(enc, (size + 3) & ~3, val, size);
+}
+
+static inline void apir_decode_blob_array(apir_decoder * dec, void * val, size_t size) {
+ apir_decode(dec, (size + 3) & ~3, val, size);
+}
+
+/* string */
+
+static inline void apir_encode_char_array(apir_encoder * enc, const char * val, size_t size) {
+ assert(size && strlen(val) < size);
+ apir_encode_blob_array(enc, val, size);
+}
+
+static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t size) {
+ apir_decode_blob_array(dec, val, size);
+ if (size) {
+ val[size - 1] = '\0';
+ } else {
+ GGML_LOG_ERROR("%s: Couldn't decode the blog array\n", __func__);
+ apir_decoder_set_fatal(dec);
+ }
+}
+
/* (temp) buffer allocation */

// Overflow-checked malloc(size * count). Returns NULL on multiplication
// overflow — and also when malloc itself fails, so callers must check the
// result either way.
static inline void * apir_decoder_alloc_array(size_t size, size_t count) {
    size_t alloc_size;
    if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) {
        GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n",
                       __func__, size, count);
        return NULL;
    }

    return malloc(alloc_size);
}
+
/* bool */

// bool travels padded to sizeof(int) on the wire; 1 payload byte is copied.
static inline void apir_encode_bool_t(apir_encoder * enc, const bool * val) {
    apir_encode(enc, sizeof(int), val, sizeof(bool));
}

static inline void apir_decode_bool_t(apir_decoder * dec, bool * val) {
    apir_decode(dec, sizeof(int), val, sizeof(bool));
}

/* apir_buffer_type_host_handle_t */

// NOTE(review): the wire size below is the native uintptr_t width, so both
// sides of the channel must have the same pointer size — confirm this is a
// protocol invariant.
static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc,
                                                              const apir_buffer_type_host_handle_t * val) {
    apir_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
}

static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec,
                                                              apir_buffer_type_host_handle_t * val) {
    apir_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
}

/* apir_buffer_host_handle_t */

static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc,
                                                         const apir_buffer_host_handle_t * val) {
    apir_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
}

static inline void apir_decode_apir_buffer_host_handle_t(apir_decoder * dec, apir_buffer_host_handle_t * val) {
    apir_decode(dec, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
}

/* uintptr_t */

static inline void apir_encode_uintptr_t(apir_encoder * enc, const uintptr_t * val) {
    apir_encode(enc, sizeof(*val), val, sizeof(*val));
}

static inline void apir_decode_uintptr_t(apir_decoder * dec, uintptr_t * val) {
    apir_decode(dec, sizeof(*val), val, sizeof(*val));
}
diff --git a/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h
new file mode 100644
index 0000000..289f4b7
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h
@@ -0,0 +1,221 @@
+#include "ggml-impl.h"
+#include "apir_cs.h"
+#include "apir_cs_rpc.h"
+
+// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer);
+
+static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc,
+ const apir_buffer_host_handle_t * handle);
+
+static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec);
+
/* apir_rpc_tensor */

// Encode a serialized tensor by copying the whole fixed-layout struct.
// NOTE(review): "rcp" looks like a typo for "rpc"; kept as-is because
// renaming would break callers of this header.
static inline void apir_encode_rcp_tensor(apir_encoder * enc, const apir_rpc_tensor * apir_rpc_tensor) {
    size_t apir_rpc_tensor_size = sizeof(*apir_rpc_tensor);
    apir_encode(enc, apir_rpc_tensor_size, apir_rpc_tensor, apir_rpc_tensor_size);
}

// Zero-copy decode of one apir_rpc_tensor; returns NULL (fatal flag set)
// on a truncated stream.
static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_inplace(apir_decoder * dec) {
    size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor);

    return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
}

// Zero-copy decode of `n_tensors` consecutive apir_rpc_tensor entries;
// returns NULL (fatal flag set) on a truncated stream.
static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec,
                                                                          uint32_t n_tensors) {
    size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor) * n_tensors;

    return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
}
+
+/* ggml_tensor */
+
+static inline void apir_encode_ggml_tensor(apir_encoder * enc, const ggml_tensor * tensor) {
+ apir_rpc_tensor serialized = apir_serialize_tensor(tensor);
+
+ apir_encode_rcp_tensor(enc, &serialized);
+}
+
+static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) {
+ const apir_rpc_tensor * apir_rpc_tensor = apir_decode_apir_rpc_tensor_inplace(dec);
+
+ if (!apir_rpc_tensor) {
+ return NULL;
+ }
+
+ ggml_init_params params{
+ /*.mem_size =*/ ggml_tensor_overhead(),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context * ctx = ggml_init(params);
+
+ const ggml_tensor * tensor = apir_deserialize_tensor(ctx, apir_rpc_tensor);
+
+ return tensor;
+}
+
/* *** ggml_backend_buffer_type_t *** */

// ggml_backend_buffer_type_t is a POINTER (to a struct).
// Only the host pointer is shared between the host and guest.
// The guest stores it in `buft->context`.
// The host simply writes the pointer address in the buffer variable.

// Encode a buffer type as its opaque host-side handle.
static inline void apir_encode_ggml_buffer_type(apir_encoder * enc, ggml_backend_buffer_type_t buft) {
    apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft);
    apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
}

// Decode a host-side handle and reinterpret it as the buffer type pointer
// (valid on the host, where the handle IS the pointer).
static inline ggml_backend_buffer_type_t apir_decode_ggml_buffer_type(apir_decoder * dec) {
    apir_buffer_type_host_handle_t handle;

    apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));

    return (ggml_backend_buffer_type_t) handle;
}

// Same as above, but keeping the handle opaque (guest side).
static inline void apir_encode_apir_buffer_type_host_handle(apir_encoder * enc, apir_buffer_type_host_handle_t handle) {
    apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
}

static inline apir_buffer_type_host_handle_t apir_decode_apir_buffer_type_host_handle(apir_decoder * dec) {
    apir_buffer_type_host_handle_t handle;

    apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));

    return handle;
}
+
/* *** ggml_backend_type_t *** */

// ggml_backend_buffer_t is a POINTER.
// same logic as for ggml_backend_buffer_type_t

// Encode a buffer as its opaque host-side handle.
static inline void apir_encode_ggml_buffer(apir_encoder * enc, const ggml_backend_buffer_t buffer) {
    apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer);
    apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
}

// Decode the handle written above as the buffer pointer (valid on the host,
// where the handle is the pointer; the sizes match because both are
// pointer-sized).
static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec) {
    ggml_backend_buffer_t buffer;
    size_t buffer_ptr_size = sizeof(buffer);

    apir_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size);

    return buffer;
}
+
/* enum ggml_status */

// ggml_status is encoded with its native enum size.
static inline void apir_encode_ggml_status(apir_encoder * enc, const ggml_status * status) {
    apir_encoder_write(enc, sizeof(*status), status, sizeof(*status));
}

static inline void apir_decode_ggml_status(apir_decoder * dec, ggml_status * status) {
    apir_decoder_read(dec, sizeof(*status), status, sizeof(*status));
}

/* virtgpu_shmem */

// Shared-memory regions are referenced by their virtgpu resource id.
static inline void apir_encode_virtgpu_shmem_res_id(apir_encoder * enc, uint32_t shmem_res_id) {
    apir_encode_uint32_t(enc, &shmem_res_id);
}

static inline void apir_decode_virtgpu_shmem_res_id(apir_decoder * dec, uint32_t * shmem_res_id) {
    apir_decode_uint32_t(dec, shmem_res_id);
}
+
/* ggml_cgraph */

// Serialize the compute graph into cgraph_data and return the byte count.
static inline size_t apir_serialize_ggml_cgraph(ggml_cgraph * cgraph, std::vector<uint8_t> & cgraph_data) {
    apir_serialize_graph(cgraph, cgraph_data);

    return cgraph_data.size();
}

// Copy an already-serialized graph blob into the encoder.
// NOTE(review): apir_encode asserts size % 4 == 0, so the blob produced by
// apir_serialize_graph is presumably always 4-byte aligned — confirm.
static inline void apir_encode_cgraph_data(apir_encoder * enc, std::vector<uint8_t> & cgraph_data) {
    size_t cgraph_size = cgraph_data.size();

    apir_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size);
}

// Decode a serialized graph. Wire layout read here:
//   u32 n_nodes, n_nodes x u64 node ids, u32 n_tensors, n_tensors x apir_rpc_tensor.
// NOTE(review): `nodes`/`tensors` are NULL on a truncated stream (the fatal
// flag is set); apir_deserialize_graph is assumed to cope with that — confirm.
static inline ggml_cgraph * apir_decode_ggml_cgraph(apir_decoder * dec, size_t cgraph_size) {
    GGML_UNUSED(cgraph_size);

    uint32_t n_nodes;
    apir_decode_uint32_t(dec, &n_nodes);
    const uint64_t * nodes = apir_decode_uint64_t_array_inplace(dec, n_nodes);

    uint32_t n_tensors;
    apir_decode_uint32_t(dec, &n_tensors);
    const apir_rpc_tensor * tensors = apir_decode_apir_rpc_tensor_array_inplace(dec, n_tensors);

    return apir_deserialize_graph(n_nodes, n_tensors, tensors, nodes);
}
+
+static inline void apir_encode_ggml_buffer_handle(apir_encoder * enc, const apir_buffer_host_handle_t * handle) {
+ apir_encoder_write(enc, sizeof(*handle), &handle, sizeof(*handle));
+}
+
// Encode a ggml_tensor by copying the struct itself, followed in sequence by:
// the buffer handle (if tensor->buffer), the view_src tensor struct (if any),
// and each non-NULL src tensor struct. Decoded by
// apir_decode_ggml_tensor_inplace, which must mirror this layout exactly.
static inline void apir_encode_ggml_tensor_inline(apir_encoder * enc, const ggml_tensor * tensor) {
    size_t tensor_size = sizeof(*tensor);

    // `extra` is a backend-private pointer that cannot cross the channel
    if (tensor->extra) {
        GGML_ABORT("%s: Cannot pass tensors with extra", __func__);
    }

    if (tensor->src[0] && tensor->buffer) {
        // warn only once per process
        static int first = 1;
        if (first) {
            GGML_LOG_WARN("%s: Cannot pass tensors with src and buffer\n", __func__);
            first = 0;
        }
    }

    apir_encoder_write(enc, tensor_size, tensor, tensor_size);

    // tensor->data is a pointer inside the device buffer. No need to touch it
    // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence.
    // (could also make a copy of the tensor, and update locally.)

    if (tensor->buffer) {
        apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer);
        apir_encode_ggml_buffer_handle(enc, &buffer_handle);
    }

    if (tensor->view_src) {
        apir_encoder_write(enc, tensor_size, tensor->view_src, tensor_size);
    }

    // NOTE(review): the loop stops at the first NULL src and has no
    // i < GGML_MAX_SRC bound — presumably ggml guarantees a NULL sentinel
    // when not all src slots are used; confirm.
    for (int i = 0; tensor->src[i]; i++) {
        const ggml_tensor * tensor_src = tensor->src[i];
        apir_encoder_write(enc, tensor_size, tensor_src, tensor_size);
    }
}
+
+static inline const ggml_tensor * apir_decode_ggml_tensor_inplace(apir_decoder * dec) {
+ // it safe to remove the `const` qualifier here, we *do* want to
+ // modify the shared memory data to fix the `src` pointers.
+ ggml_tensor * tensor = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
+
+ // tensor->data is a pointer inside the device buffer. No need to touch it
+ // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence.
+ if (tensor->buffer) {
+ tensor->buffer = apir_decode_ggml_buffer(dec);
+ }
+
+ if (tensor->view_src) {
+ ggml_tensor * tensor_view_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
+ tensor->view_src = tensor_view_src;
+ }
+
+ for (int i = 0; tensor->src[i]; i++) {
+ ggml_tensor * tensor_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
+ tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor
+ }
+
+ return tensor;
+}
diff --git a/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h
new file mode 100644
index 0000000..f681798
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h
@@ -0,0 +1,54 @@
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <cstdint>
+
+// ggml_tensor is serialized into apir_rpc_tensor
+struct apir_rpc_tensor {
+ uint64_t id;
+ uint32_t type;
+ uint64_t buffer;
+ uint32_t ne[GGML_MAX_DIMS];
+ uint32_t nb[GGML_MAX_DIMS];
+ uint32_t op;
+ int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+ int32_t flags;
+ uint64_t src[GGML_MAX_SRC];
+ uint64_t view_src;
+ uint64_t view_offs;
+ uint64_t data;
+ char name[GGML_MAX_NAME];
+
+ char padding[4];
+};
+
/* frontend */

// Convert a ggml_tensor to its fixed-layout wire form.
apir_rpc_tensor apir_serialize_tensor(const ggml_tensor * tensor);

// Serialize a whole compute graph into `output`.
void apir_serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output);

/* backend */

// Registry of buffers created on the host side, used to validate handles
// received from the guest.
void apir_track_backend_buffer(ggml_backend_buffer_t buffer);
bool apir_untrack_backend_buffer(ggml_backend_buffer_t buffer);
std::unordered_set<ggml_backend_buffer_t> apir_get_track_backend_buffers();

// Append `tensor` (and, via `visited`, each tensor only once) to `tensors`.
void apir_add_tensor(ggml_tensor * tensor,
                     std::vector<apir_rpc_tensor> & tensors,
                     std::unordered_set<ggml_tensor *> & visited);

// Rebuild a ggml_tensor in `ctx` from its wire form.
ggml_tensor * apir_deserialize_tensor(ggml_context * ctx, const apir_rpc_tensor * tensor);

// Recreate graph node `id`, reusing already-built tensors from `tensor_map`.
ggml_tensor * apir_create_node(uint64_t id,
                               ggml_context * ctx,
                               const std::unordered_map<uint64_t, const apir_rpc_tensor *> & tensor_ptrs,
                               std::unordered_map<uint64_t, ggml_tensor *> & tensor_map);

// Rebuild a full compute graph from the serialized node ids and tensors.
ggml_cgraph * apir_deserialize_graph(uint32_t n_nodes,
                                     uint32_t n_tensors,
                                     const apir_rpc_tensor * tensors,
                                     const uint64_t * nodes);