diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h')
| -rw-r--r-- | llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h | 221 |
1 files changed, 221 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h new file mode 100644 index 0000000..289f4b7 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h | |||
| @@ -0,0 +1,221 @@ | |||
| 1 | #include "ggml-impl.h" | ||
| 2 | #include "apir_cs.h" | ||
| 3 | #include "apir_cs_rpc.h" | ||
| 4 | |||
// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer);

// NOTE(review): this declaration is never defined in this file; the definition
// further down is named apir_encode_ggml_buffer_handle (without "_host_") —
// confirm which spelling is intended.
static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc,
                                                       const apir_buffer_host_handle_t * handle);

// forward declaration: defined below, needed by the tensor decode helpers
static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec);
| 11 | |||
| 12 | /* apir_rpc_tensor */ | ||
| 13 | |||
| 14 | static inline void apir_encode_rcp_tensor(apir_encoder * enc, const apir_rpc_tensor * apir_rpc_tensor) { | ||
| 15 | size_t apir_rpc_tensor_size = sizeof(*apir_rpc_tensor); | ||
| 16 | apir_encode(enc, apir_rpc_tensor_size, apir_rpc_tensor, apir_rpc_tensor_size); | ||
| 17 | } | ||
| 18 | |||
| 19 | static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_inplace(apir_decoder * dec) { | ||
| 20 | size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor); | ||
| 21 | |||
| 22 | return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size); | ||
| 23 | } | ||
| 24 | |||
| 25 | static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec, | ||
| 26 | uint32_t n_tensors) { | ||
| 27 | size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor) * n_tensors; | ||
| 28 | |||
| 29 | return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size); | ||
| 30 | } | ||
| 31 | |||
| 32 | /* ggml_tensor */ | ||
| 33 | |||
| 34 | static inline void apir_encode_ggml_tensor(apir_encoder * enc, const ggml_tensor * tensor) { | ||
| 35 | apir_rpc_tensor serialized = apir_serialize_tensor(tensor); | ||
| 36 | |||
| 37 | apir_encode_rcp_tensor(enc, &serialized); | ||
| 38 | } | ||
| 39 | |||
| 40 | static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) { | ||
| 41 | const apir_rpc_tensor * apir_rpc_tensor = apir_decode_apir_rpc_tensor_inplace(dec); | ||
| 42 | |||
| 43 | if (!apir_rpc_tensor) { | ||
| 44 | return NULL; | ||
| 45 | } | ||
| 46 | |||
| 47 | ggml_init_params params{ | ||
| 48 | /*.mem_size =*/ ggml_tensor_overhead(), | ||
| 49 | /*.mem_buffer =*/ NULL, | ||
| 50 | /*.no_alloc =*/ true, | ||
| 51 | }; | ||
| 52 | |||
| 53 | ggml_context * ctx = ggml_init(params); | ||
| 54 | |||
| 55 | const ggml_tensor * tensor = apir_deserialize_tensor(ctx, apir_rpc_tensor); | ||
| 56 | |||
| 57 | return tensor; | ||
| 58 | } | ||
| 59 | |||
| 60 | /* *** ggml_backend_buffer_type_t *** */ | ||
| 61 | |||
| 62 | // ggml_backend_buffer_type_t is a POINTER (to a struct). | ||
| 63 | // Only the host pointer is shared between the host and guest. | ||
| 64 | // The guest stores it in `buft->context`. | ||
| 65 | // The host simply writes the pointer address in the buffer variable. | ||
| 66 | |||
| 67 | static inline void apir_encode_ggml_buffer_type(apir_encoder * enc, ggml_backend_buffer_type_t buft) { | ||
| 68 | apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft); | ||
| 69 | apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); | ||
| 70 | } | ||
| 71 | |||
| 72 | static inline ggml_backend_buffer_type_t apir_decode_ggml_buffer_type(apir_decoder * dec) { | ||
| 73 | apir_buffer_type_host_handle_t handle; | ||
| 74 | |||
| 75 | apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); | ||
| 76 | |||
| 77 | return (ggml_backend_buffer_type_t) handle; | ||
| 78 | } | ||
| 79 | |||
| 80 | static inline void apir_encode_apir_buffer_type_host_handle(apir_encoder * enc, apir_buffer_type_host_handle_t handle) { | ||
| 81 | apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); | ||
| 82 | } | ||
| 83 | |||
| 84 | static inline apir_buffer_type_host_handle_t apir_decode_apir_buffer_type_host_handle(apir_decoder * dec) { | ||
| 85 | apir_buffer_type_host_handle_t handle; | ||
| 86 | |||
| 87 | apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); | ||
| 88 | |||
| 89 | return handle; | ||
| 90 | } | ||
| 91 | |||
| 92 | /* *** ggml_backend_type_t *** */ | ||
| 93 | |||
| 94 | // ggml_backend_buffer_t is a POINTER. | ||
| 95 | // same logic as for ggml_backend_buffer_type_t | ||
| 96 | |||
| 97 | static inline void apir_encode_ggml_buffer(apir_encoder * enc, const ggml_backend_buffer_t buffer) { | ||
| 98 | apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer); | ||
| 99 | apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); | ||
| 100 | } | ||
| 101 | |||
| 102 | static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec) { | ||
| 103 | ggml_backend_buffer_t buffer; | ||
| 104 | size_t buffer_ptr_size = sizeof(buffer); | ||
| 105 | |||
| 106 | apir_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size); | ||
| 107 | |||
| 108 | return buffer; | ||
| 109 | } | ||
| 110 | |||
| 111 | /* enum ggml_status */ | ||
| 112 | |||
| 113 | static inline void apir_encode_ggml_status(apir_encoder * enc, const ggml_status * status) { | ||
| 114 | apir_encoder_write(enc, sizeof(*status), status, sizeof(*status)); | ||
| 115 | } | ||
| 116 | |||
| 117 | static inline void apir_decode_ggml_status(apir_decoder * dec, ggml_status * status) { | ||
| 118 | apir_decoder_read(dec, sizeof(*status), status, sizeof(*status)); | ||
| 119 | } | ||
| 120 | |||
| 121 | /* virtgpu_shmem */ | ||
| 122 | |||
| 123 | static inline void apir_encode_virtgpu_shmem_res_id(apir_encoder * enc, uint32_t shmem_res_id) { | ||
| 124 | apir_encode_uint32_t(enc, &shmem_res_id); | ||
| 125 | } | ||
| 126 | |||
| 127 | static inline void apir_decode_virtgpu_shmem_res_id(apir_decoder * dec, uint32_t * shmem_res_id) { | ||
| 128 | apir_decode_uint32_t(dec, shmem_res_id); | ||
| 129 | } | ||
| 130 | |||
| 131 | /* ggml_cgraph */ | ||
| 132 | |||
| 133 | static inline size_t apir_serialize_ggml_cgraph(ggml_cgraph * cgraph, std::vector<uint8_t> & cgraph_data) { | ||
| 134 | apir_serialize_graph(cgraph, cgraph_data); | ||
| 135 | |||
| 136 | return cgraph_data.size(); | ||
| 137 | } | ||
| 138 | |||
| 139 | static inline void apir_encode_cgraph_data(apir_encoder * enc, std::vector<uint8_t> & cgraph_data) { | ||
| 140 | size_t cgraph_size = cgraph_data.size(); | ||
| 141 | |||
| 142 | apir_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size); | ||
| 143 | } | ||
| 144 | |||
| 145 | static inline ggml_cgraph * apir_decode_ggml_cgraph(apir_decoder * dec, size_t cgraph_size) { | ||
| 146 | GGML_UNUSED(cgraph_size); | ||
| 147 | |||
| 148 | uint32_t n_nodes; | ||
| 149 | apir_decode_uint32_t(dec, &n_nodes); | ||
| 150 | const uint64_t * nodes = apir_decode_uint64_t_array_inplace(dec, n_nodes); | ||
| 151 | |||
| 152 | uint32_t n_tensors; | ||
| 153 | apir_decode_uint32_t(dec, &n_tensors); | ||
| 154 | const apir_rpc_tensor * tensors = apir_decode_apir_rpc_tensor_array_inplace(dec, n_tensors); | ||
| 155 | |||
| 156 | return apir_deserialize_graph(n_nodes, n_tensors, tensors, nodes); | ||
| 157 | } | ||
| 158 | |||
| 159 | static inline void apir_encode_ggml_buffer_handle(apir_encoder * enc, const apir_buffer_host_handle_t * handle) { | ||
| 160 | apir_encoder_write(enc, sizeof(*handle), &handle, sizeof(*handle)); | ||
| 161 | } | ||
| 162 | |||
| 163 | static inline void apir_encode_ggml_tensor_inline(apir_encoder * enc, const ggml_tensor * tensor) { | ||
| 164 | size_t tensor_size = sizeof(*tensor); | ||
| 165 | |||
| 166 | if (tensor->extra) { | ||
| 167 | GGML_ABORT("%s: Cannot pass tensors with extra", __func__); | ||
| 168 | } | ||
| 169 | |||
| 170 | if (tensor->src[0] && tensor->buffer) { | ||
| 171 | static int first = 1; | ||
| 172 | if (first) { | ||
| 173 | GGML_LOG_WARN("%s: Cannot pass tensors with src and buffer\n", __func__); | ||
| 174 | first = 0; | ||
| 175 | } | ||
| 176 | } | ||
| 177 | |||
| 178 | apir_encoder_write(enc, tensor_size, tensor, tensor_size); | ||
| 179 | |||
| 180 | // tensor->data is a pointer inside the device buffer. No need to touch it | ||
| 181 | // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence. | ||
| 182 | // (could also make a copy of the tensor, and update locally.) | ||
| 183 | |||
| 184 | if (tensor->buffer) { | ||
| 185 | apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer); | ||
| 186 | apir_encode_ggml_buffer_handle(enc, &buffer_handle); | ||
| 187 | } | ||
| 188 | |||
| 189 | if (tensor->view_src) { | ||
| 190 | apir_encoder_write(enc, tensor_size, tensor->view_src, tensor_size); | ||
| 191 | } | ||
| 192 | |||
| 193 | for (int i = 0; tensor->src[i]; i++) { | ||
| 194 | const ggml_tensor * tensor_src = tensor->src[i]; | ||
| 195 | apir_encoder_write(enc, tensor_size, tensor_src, tensor_size); | ||
| 196 | } | ||
| 197 | } | ||
| 198 | |||
| 199 | static inline const ggml_tensor * apir_decode_ggml_tensor_inplace(apir_decoder * dec) { | ||
| 200 | // it safe to remove the `const` qualifier here, we *do* want to | ||
| 201 | // modify the shared memory data to fix the `src` pointers. | ||
| 202 | ggml_tensor * tensor = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor)); | ||
| 203 | |||
| 204 | // tensor->data is a pointer inside the device buffer. No need to touch it | ||
| 205 | // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence. | ||
| 206 | if (tensor->buffer) { | ||
| 207 | tensor->buffer = apir_decode_ggml_buffer(dec); | ||
| 208 | } | ||
| 209 | |||
| 210 | if (tensor->view_src) { | ||
| 211 | ggml_tensor * tensor_view_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor)); | ||
| 212 | tensor->view_src = tensor_view_src; | ||
| 213 | } | ||
| 214 | |||
| 215 | for (int i = 0; tensor->src[i]; i++) { | ||
| 216 | ggml_tensor * tensor_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor)); | ||
| 217 | tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor | ||
| 218 | } | ||
| 219 | |||
| 220 | return tensor; | ||
| 221 | } | ||
