Diffstat (limited to 'llama.cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp')
-rw-r--r--  llama.cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp  179
1 file changed, 179 insertions(+), 0 deletions(-)
diff --git a/llama.cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp b/llama.cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp
new file mode 100644
index 0000000..8a2805e
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp
@@ -0,0 +1,179 @@
+#include "virtgpu-utils.h"
+
+#include <stdlib.h>
+
+#include <cassert>
+#include <cstring>
+
+#define NODE_ALLOC_ALIGN 64
+#define NODE_PTR_MASK (~((uintptr_t) NODE_ALLOC_ALIGN - 1))
+#define NODE_LEVEL_MASK ((uintptr_t) NODE_ALLOC_ALIGN - 1)
+#define NULL_NODE 0
+
+/* Aligned allocation: _aligned_malloc()/_aligned_free() on Windows,
+ * posix_memalign()/free() elsewhere. The two pairs must not be mixed:
+ * memory from _aligned_malloc() cannot be released with plain free().
+ */
+#ifdef _WIN32
+#include <malloc.h>
+#define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align)
+#define os_free_aligned(_ptr) _aligned_free(_ptr)
+#else
+static inline void * os_malloc_aligned(size_t size, size_t alignment) {
+    void * ptr;
+    /* posix_memalign() requires the alignment to be a multiple of sizeof(void *). */
+    alignment = (alignment + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
+    if (posix_memalign(&ptr, alignment, size) != 0) {
+        return NULL;
+    }
+    return ptr;
+}
+#define os_free_aligned(_ptr) free(_ptr)
+#endif
+
+#define p_atomic_cmpxchg(v, old, _new) __sync_val_compare_and_swap((v), (old), (_new))
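+/* p_atomic_cmpxchg() returns the value that was observed in *v before the
+ * operation; the swap took place iff that value equals `old`. Illustrative
+ * pattern (the names `slot`, `mine`, and `use` are hypothetical):
+ *
+ *     uintptr_t seen = p_atomic_cmpxchg(&slot, NULL_NODE, mine);
+ *     if (seen != NULL_NODE)
+ *         use(seen);   // lost the race: adopt the winner, discard `mine`
+ */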
+
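+/* Floor-of-log2 helper: util_logbase2_64(1) == 0 and
+ * util_logbase2_64(64) == util_logbase2_64(65) == 6. The `n | 1` in the
+ * builtin path below keeps __builtin_clzll() away from its undefined
+ * n == 0 case.
+ */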
+static inline uint64_t util_logbase2_64(uint64_t n) {
+#if defined(HAVE___BUILTIN_CLZLL)
+ return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1));
+#else
+ uint64_t pos = 0ull;
+ if (n >= 1ull << 32) {
+ n >>= 32;
+ pos += 32;
+ }
+ if (n >= 1ull << 16) {
+ n >>= 16;
+ pos += 16;
+ }
+ if (n >= 1ull << 8) {
+ n >>= 8;
+ pos += 8;
+ }
+ if (n >= 1ull << 4) {
+ n >>= 4;
+ pos += 4;
+ }
+ if (n >= 1ull << 2) {
+ n >>= 2;
+ pos += 2;
+ }
+ if (n >= 1ull << 1) {
+ pos += 1;
+ }
+ return pos;
+#endif
+}
+
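+/* Usage sketch (illustrative only; `elem_t` is a hypothetical element
+ * type):
+ *
+ *     util_sparse_array arr;
+ *     util_sparse_array_init(&arr, sizeof(elem_t), 64);
+ *     elem_t * e = (elem_t *) util_sparse_array_get(&arr, 123456);
+ *
+ * node_size must be a power of two >= 2; each tree node then holds
+ * node_size children, or node_size elements at the leaves.
+ */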
+void util_sparse_array_init(util_sparse_array * arr, size_t elem_size, size_t node_size) {
+ memset(arr, 0, sizeof(*arr));
+ arr->elem_size = elem_size;
+ arr->node_size_log2 = util_logbase2_64(node_size);
+ assert(node_size >= 2 && node_size == (1ull << arr->node_size_log2));
+}
+
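+/* A node handle packs a NODE_ALLOC_ALIGN-aligned pointer and the node's
+ * tree level into a single uintptr_t: the low 6 bits carry the level, the
+ * remaining bits the pointer. For example, a level-2 node allocated at
+ * 0x7f00a1b2c0c0 is encoded as 0x7f00a1b2c0c2.
+ */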
+static inline void * _util_sparse_array_node_data(uintptr_t handle) {
+ return (void *) (handle & NODE_PTR_MASK);
+}
+
+static inline unsigned _util_sparse_array_node_level(uintptr_t handle) {
+ return handle & NODE_LEVEL_MASK;
+}
+
+static inline void _util_sparse_array_node_finish(util_sparse_array * arr, uintptr_t node) {
+ if (_util_sparse_array_node_level(node) > 0) {
+ uintptr_t * children = (uintptr_t *) _util_sparse_array_node_data(node);
+ size_t node_size = 1ull << arr->node_size_log2;
+ for (size_t i = 0; i < node_size; i++) {
+ if (children[i]) {
+ _util_sparse_array_node_finish(arr, children[i]);
+ }
+ }
+ }
+
+ os_free_aligned(_util_sparse_array_node_data(node));
+}
+
+static inline uintptr_t _util_sparse_array_node(void * data, unsigned level) {
+ assert(data != NULL);
+ assert(((uintptr_t) data & NODE_LEVEL_MASK) == 0);
+ assert((level & NODE_PTR_MASK) == 0);
+ return (uintptr_t) data | level;
+}
+
+static inline uintptr_t _util_sparse_array_node_alloc(util_sparse_array * arr, unsigned level) {
+ size_t size;
+ if (level == 0) {
+ size = arr->elem_size << arr->node_size_log2;
+ } else {
+ size = sizeof(uintptr_t) << arr->node_size_log2;
+ }
+
+    void * data = os_malloc_aligned(size, NODE_ALLOC_ALIGN);
+    assert(data != NULL);
+    /* Zero the node: empty child slots must read as NULL_NODE, and leaf
+     * elements start out zero-initialized. */
+    memset(data, 0, size);
+
+ return _util_sparse_array_node(data, level);
+}
+
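+/* Race illustration: two threads may both find a slot empty and each
+ * allocate a node. The cmpxchg below lets exactly one installation win;
+ * the loser sees prev_node != cmp_node, frees its own allocation, and
+ * continues with the node that actually made it into the tree.
+ */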
+static inline uintptr_t _util_sparse_array_set_or_free_node(uintptr_t * node_ptr, uintptr_t cmp_node, uintptr_t node) {
+ uintptr_t prev_node = p_atomic_cmpxchg(node_ptr, cmp_node, node);
+
+ if (prev_node != cmp_node) {
+ /* We lost the race. Free this one and return the one that was already
+ * allocated.
+ */
+ os_free_aligned(_util_sparse_array_node_data(node));
+ return prev_node;
+ } else {
+ return node;
+ }
+}
+
+void * util_sparse_array_get(util_sparse_array * arr, uint64_t idx) {
+ const unsigned node_size_log2 = arr->node_size_log2;
+ uintptr_t root = p_atomic_read(&arr->root);
+ if (unlikely(!root)) {
+ unsigned root_level = 0;
+ uint64_t idx_iter = idx >> node_size_log2;
+ while (idx_iter) {
+ idx_iter >>= node_size_log2;
+ root_level++;
+ }
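+        /* Example with node_size_log2 == 6 (node_size 64): for
+         * idx == 100000, idx >> 6 == 1562, 1562 >> 6 == 24 and
+         * 24 >> 6 == 0, so the loop above settles on root_level == 2,
+         * enough to address indices up to 64^3 - 1 == 262143.
+         */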
+ uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level);
+ root = _util_sparse_array_set_or_free_node(&arr->root, NULL_NODE, new_root);
+ }
+
+ while (1) {
+ unsigned root_level = _util_sparse_array_node_level(root);
+ uint64_t root_idx = idx >> (root_level * node_size_log2);
+ if (likely(root_idx < (1ull << node_size_log2))) {
+ break;
+ }
+
+ /* In this case, we have a root but its level is low enough that the
+ * requested index is out-of-bounds.
+ */
+ uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level + 1);
+
+ uintptr_t * new_root_children = (uintptr_t *) _util_sparse_array_node_data(new_root);
+ new_root_children[0] = root;
+
+ /* We only add one at a time instead of the whole tree because it's
+ * easier to ensure correctness of both the tree building and the
+ * clean-up path. Because we're only adding one node we never have to
+ * worry about trying to free multiple things without freeing the old
+ * things.
+ */
+ root = _util_sparse_array_set_or_free_node(&arr->root, root, new_root);
+ }
+
+ void * node_data = _util_sparse_array_node_data(root);
+ unsigned node_level = _util_sparse_array_node_level(root);
+ while (node_level > 0) {
+ uint64_t child_idx = (idx >> (node_level * node_size_log2)) & ((1ull << node_size_log2) - 1);
+
+ uintptr_t * children = (uintptr_t *) node_data;
+ uintptr_t child = p_atomic_read(&children[child_idx]);
+
+ if (unlikely(!child)) {
+ child = _util_sparse_array_node_alloc(arr, node_level - 1);
+ child = _util_sparse_array_set_or_free_node(&children[child_idx], NULL_NODE, child);
+ }
+
+ node_data = _util_sparse_array_node_data(child);
+ node_level = _util_sparse_array_node_level(child);
+ }
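+    /* The descent above reads idx as a base-(1 << node_size_log2) number,
+     * peeling off one digit per level. E.g. with node_size_log2 == 6 and
+     * idx == 100000 under a level-2 root, the digits are
+     * (100000 >> 12) & 63 == 24, then (100000 >> 6) & 63 == 26, and the
+     * leaf element index computed below is 100000 & 63 == 32.
+     */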
+
+ uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1);
+ return (void *) ((char *) node_data + (elem_idx * arr->elem_size));
+}