llmnpc - llama.cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs

Path: llmnpc / llama.cpp / ggml / src / ggml-virtgpu / backend / shared / apir_cs_ggml.h (raw)
  1#include "ggml-impl.h"
  2#include "apir_cs.h"
  3#include "apir_cs_rpc.h"
  4
  5// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer);
  6
  7static inline void apir_encode_ggml_buffer_host_handle(apir_encoder *                    enc,
  8                                                       const apir_buffer_host_handle_t * handle);
  9
 10static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec);
 11
 12/* apir_rpc_tensor */
 13
 14static inline void apir_encode_rcp_tensor(apir_encoder * enc, const apir_rpc_tensor * apir_rpc_tensor) {
 15    size_t apir_rpc_tensor_size = sizeof(*apir_rpc_tensor);
 16    apir_encode(enc, apir_rpc_tensor_size, apir_rpc_tensor, apir_rpc_tensor_size);
 17}
 18
 19static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_inplace(apir_decoder * dec) {
 20    size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor);
 21
 22    return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
 23}
 24
 25static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec,
 26                                                                          uint32_t       n_tensors) {
 27    size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor) * n_tensors;
 28
 29    return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
 30}
 31
 32/* ggml_tensor */
 33
 34static inline void apir_encode_ggml_tensor(apir_encoder * enc, const ggml_tensor * tensor) {
 35    apir_rpc_tensor serialized = apir_serialize_tensor(tensor);
 36
 37    apir_encode_rcp_tensor(enc, &serialized);
 38}
 39
 40static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) {
 41    const apir_rpc_tensor * apir_rpc_tensor = apir_decode_apir_rpc_tensor_inplace(dec);
 42
 43    if (!apir_rpc_tensor) {
 44        return NULL;
 45    }
 46
 47    ggml_init_params params{
 48        /*.mem_size   =*/ ggml_tensor_overhead(),
 49        /*.mem_buffer =*/ NULL,
 50        /*.no_alloc   =*/ true,
 51    };
 52
 53    ggml_context * ctx = ggml_init(params);
 54
 55    const ggml_tensor * tensor = apir_deserialize_tensor(ctx, apir_rpc_tensor);
 56
 57    return tensor;
 58}
 59
 60/* *** ggml_backend_buffer_type_t *** */
 61
 62// ggml_backend_buffer_type_t is a POINTER (to a struct).
 63// Only the host pointer is shared between the host and guest.
 64// The guest stores it in `buft->context`.
 65// The host simply writes the pointer address in the buffer variable.
 66
 67static inline void apir_encode_ggml_buffer_type(apir_encoder * enc, ggml_backend_buffer_type_t buft) {
 68    apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft);
 69    apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
 70}
 71
 72static inline ggml_backend_buffer_type_t apir_decode_ggml_buffer_type(apir_decoder * dec) {
 73    apir_buffer_type_host_handle_t handle;
 74
 75    apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));
 76
 77    return (ggml_backend_buffer_type_t) handle;
 78}
 79
 80static inline void apir_encode_apir_buffer_type_host_handle(apir_encoder * enc, apir_buffer_type_host_handle_t handle) {
 81    apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
 82}
 83
 84static inline apir_buffer_type_host_handle_t apir_decode_apir_buffer_type_host_handle(apir_decoder * dec) {
 85    apir_buffer_type_host_handle_t handle;
 86
 87    apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));
 88
 89    return handle;
 90}
 91
/* *** ggml_backend_buffer_t *** */
 93
 94// ggml_backend_buffer_t is a POINTER.
 95// same logic as for ggml_backend_buffer_type_t
 96
 97static inline void apir_encode_ggml_buffer(apir_encoder * enc, const ggml_backend_buffer_t buffer) {
 98    apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer);
 99    apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
100}
101
102static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec) {
103    ggml_backend_buffer_t buffer;
104    size_t                buffer_ptr_size = sizeof(buffer);
105
106    apir_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size);
107
108    return buffer;
109}
110
111/* enum ggml_status */
112
113static inline void apir_encode_ggml_status(apir_encoder * enc, const ggml_status * status) {
114    apir_encoder_write(enc, sizeof(*status), status, sizeof(*status));
115}
116
117static inline void apir_decode_ggml_status(apir_decoder * dec, ggml_status * status) {
118    apir_decoder_read(dec, sizeof(*status), status, sizeof(*status));
119}
120
121/* virtgpu_shmem */
122
123static inline void apir_encode_virtgpu_shmem_res_id(apir_encoder * enc, uint32_t shmem_res_id) {
124    apir_encode_uint32_t(enc, &shmem_res_id);
125}
126
127static inline void apir_decode_virtgpu_shmem_res_id(apir_decoder * dec, uint32_t * shmem_res_id) {
128    apir_decode_uint32_t(dec, shmem_res_id);
129}
130
131/* ggml_cgraph */
132
133static inline size_t apir_serialize_ggml_cgraph(ggml_cgraph * cgraph, std::vector<uint8_t> & cgraph_data) {
134    apir_serialize_graph(cgraph, cgraph_data);
135
136    return cgraph_data.size();
137}
138
139static inline void apir_encode_cgraph_data(apir_encoder * enc, std::vector<uint8_t> & cgraph_data) {
140    size_t cgraph_size = cgraph_data.size();
141
142    apir_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size);
143}
144
145static inline ggml_cgraph * apir_decode_ggml_cgraph(apir_decoder * dec, size_t cgraph_size) {
146    GGML_UNUSED(cgraph_size);
147
148    uint32_t n_nodes;
149    apir_decode_uint32_t(dec, &n_nodes);
150    const uint64_t * nodes = apir_decode_uint64_t_array_inplace(dec, n_nodes);
151
152    uint32_t n_tensors;
153    apir_decode_uint32_t(dec, &n_tensors);
154    const apir_rpc_tensor * tensors = apir_decode_apir_rpc_tensor_array_inplace(dec, n_tensors);
155
156    return apir_deserialize_graph(n_nodes, n_tensors, tensors, nodes);
157}
158
159static inline void apir_encode_ggml_buffer_handle(apir_encoder * enc, const apir_buffer_host_handle_t * handle) {
160    apir_encoder_write(enc, sizeof(*handle), &handle, sizeof(*handle));
161}
162
163static inline void apir_encode_ggml_tensor_inline(apir_encoder * enc, const ggml_tensor * tensor) {
164    size_t tensor_size = sizeof(*tensor);
165
166    if (tensor->extra) {
167        GGML_ABORT("%s: Cannot pass tensors with extra", __func__);
168    }
169
170    if (tensor->src[0] && tensor->buffer) {
171        static int first = 1;
172        if (first) {
173            GGML_LOG_WARN("%s: Cannot pass tensors with src and buffer\n", __func__);
174            first = 0;
175        }
176    }
177
178    apir_encoder_write(enc, tensor_size, tensor, tensor_size);
179
180    // tensor->data is a pointer inside the device buffer. No need to touch it
181    // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence.
182    // (could also make a copy of the tensor, and update locally.)
183
184    if (tensor->buffer) {
185        apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer);
186        apir_encode_ggml_buffer_handle(enc, &buffer_handle);
187    }
188
189    if (tensor->view_src) {
190        apir_encoder_write(enc, tensor_size, tensor->view_src, tensor_size);
191    }
192
193    for (int i = 0; tensor->src[i]; i++) {
194        const ggml_tensor * tensor_src = tensor->src[i];
195        apir_encoder_write(enc, tensor_size, tensor_src, tensor_size);
196    }
197}
198
199static inline const ggml_tensor * apir_decode_ggml_tensor_inplace(apir_decoder * dec) {
200    // it safe to remove the `const` qualifier here, we *do* want to
201    // modify the shared memory data to fix the `src` pointers.
202    ggml_tensor * tensor = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
203
204    // tensor->data is a pointer inside the device buffer. No need to touch it
205    // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence.
206    if (tensor->buffer) {
207        tensor->buffer = apir_decode_ggml_buffer(dec);
208    }
209
210    if (tensor->view_src) {
211        ggml_tensor * tensor_view_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
212        tensor->view_src              = tensor_view_src;
213    }
214
215    for (int i = 0; tensor->src[i]; i++) {
216        ggml_tensor * tensor_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
217        tensor->src[i] = tensor_src;  // overwrite op->src[i] pointer with the actual location of the src tensor
218    }
219
220    return tensor;
221}