diff options
Diffstat (limited to 'examples/redis-unstable/modules/vector-sets/hnsw.c')
| -rw-r--r-- | examples/redis-unstable/modules/vector-sets/hnsw.c | 2999 |
1 files changed, 0 insertions, 2999 deletions
diff --git a/examples/redis-unstable/modules/vector-sets/hnsw.c b/examples/redis-unstable/modules/vector-sets/hnsw.c deleted file mode 100644 index 2b4ebc0..0000000 --- a/examples/redis-unstable/modules/vector-sets/hnsw.c +++ /dev/null @@ -1,2999 +0,0 @@ -/* HNSW (Hierarchical Navigable Small World) Implementation. - * - * Based on the paper by Yu. A. Malkov, D. A. Yashunin. - * - * Many details of this implementation, not covered in the paper, were - * obtained simulating different workloads and checking the connection - * quality of the graph. - * - * Notably, this implementation: - * - * 1. Only uses bi-directional links, implementing strategies in order to - * link new nodes even when candidates are full, and our new node would - * be not close enough to replace old links in candidate. - * - * 2. We normalize on-insert, making cosine similarity and dot product the - * same. This means we can't use euclidean distance or alike here. - * Together with quantization, this provides an important speedup that - * makes HNSW more practical. - * - * 3. The quantization used is int8. And it is performed per-vector, so the - * "range" (max abs value) is also stored alongside with the quantized data. - * - * 4. This library implements true elements deletion, not just marking the - * element as deleted, but removing it (we can do it since our links are - * bidirectional), and reliking the nodes orphaned of one link among - * them. - * - * Copyright (c) 2009-Present, Redis Ltd. - * All rights reserved. - * - * Licensed under your choice of (a) the Redis Source Available License 2.0 - * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the - * GNU Affero General Public License v3 (AGPLv3). - * Originally authored by: Salvatore Sanfilippo. - */ - -#define _DEFAULT_SOURCE -#define _POSIX_C_SOURCE 200809L - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <math.h> -#include <stdint.h> -#include <float.h> /* for INFINITY if not in math.h */ -#include <assert.h> -#include "hnsw.h" -#include "mixer.h" - -/* Check if we can compile SIMD code with function attributes */ -#if defined (__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4)) -#if defined(__has_attribute) && __has_attribute(target) -#define HAVE_AVX2 -#define HAVE_AVX512 -#endif -#endif - -#if defined (HAVE_AVX2) -#define ATTRIBUTE_TARGET_AVX2 __attribute__((target("avx2,fma"))) -#define VSET_USE_AVX2 (__builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma")) -#else -#define ATTRIBUTE_TARGET_AVX2 -#define VSET_USE_AVX2 0 -#endif - -#if defined (HAVE_AVX512) -#define ATTRIBUTE_TARGET_AVX512 __attribute__((target("avx512f,fma"))) -#define VSET_USE_AVX512 (__builtin_cpu_supports("avx512f")) -#else -#define ATTRIBUTE_TARGET_AVX512 -#define VSET_USE_AVX512 0 -#endif - -/* Include SIMD headers when supported */ -#if defined(HAVE_AVX2) || defined(HAVE_AVX512) -#include <immintrin.h> -#endif - -#if 0 -#define debugmsg printf -#else -#define debugmsg if(0) printf -#endif - -#ifndef INFINITY -#define INFINITY (1.0/0.0) -#endif - -#define MIN(a,b) ((a) < (b) ? (a) : (b)) - -/* Algorithm parameters. */ - -#define HNSW_P 0.25 /* Probability of level increase. */ -#define HNSW_MAX_LEVEL 16 /* Max level nodes can reach. */ -#define HNSW_EF_C 200 /* Default size of dynamic candidate list while - * inserting a new node, in case 0 is passed to - * the 'ef' argument while inserting. This is also - * used when deleting nodes for the search step - * needed sometimes to reconnect nodes that remain - * orphaned of one link. */ - -static void (*hfree)(void *p) = free; -static void *(*hmalloc)(size_t s) = malloc; -static void *(*hrealloc)(void *old, size_t s) = realloc; - -void hnsw_set_allocator(void (*free_ptr)(void*), void *(*malloc_ptr)(size_t), - void *(*realloc_ptr)(void*, size_t)) -{ - hfree = free_ptr; - hmalloc = malloc_ptr; - hrealloc = realloc_ptr; -} - -// Get a warning if you use the libc allocator functions for mistake. -#define malloc use_hmalloc_instead -#define realloc use_hrealloc_instead -#define free use_hfree_instead - -/* ============================== Prototypes ================================ */ -void hnsw_cursor_element_deleted(HNSW *index, hnswNode *deleted); - -/* ============================ Priority queue ================================ - * We need a priority queue to take an ordered list of candidates. Right now - * it is implemented as a linear array, since it is relatively small. - * - * You may find it to be odd that we take the best element (smaller distance) - * at the end of the array, but this way popping from the pqueue is O(1), as - * we need to just decrement the count, and this is a very used operation - * in a critical code path. This makes the priority queue implementation a - * bit more complex in the insertion, but for good reasons. */ - -/* Maximum number of candidates we'll ever need (cit. Bill Gates). */ -#define HNSW_MAX_CANDIDATES 256 - -typedef struct { - hnswNode *node; - float distance; -} pqitem; - -typedef struct { - pqitem *items; /* Array of items. */ - uint32_t count; /* Current number of items. */ - uint32_t cap; /* Maximum capacity. */ -} pqueue; - -/* The HNSW algorithms access the pqueue conceptually from nearest (index 0) - * to farthest (larger indexes) node, so the following macros are used to - * access the pqueue in this fashion, even if the internal order is - * actually reversed. */ -#define pq_get_node(q,i) ((q)->items[(q)->count-(i+1)].node) -#define pq_get_distance(q,i) ((q)->items[(q)->count-(i+1)].distance) - -/* Create a new priority queue with given capacity. Adding to the - * pqueue only retains 'capacity' elements with the shortest distance. */ -pqueue *pq_new(uint32_t capacity) { - pqueue *pq = hmalloc(sizeof(*pq)); - if (!pq) return NULL; - - pq->items = hmalloc(sizeof(pqitem) * capacity); - if (!pq->items) { - hfree(pq); - return NULL; - } - - pq->count = 0; - pq->cap = capacity; - return pq; -} - -/* Free a priority queue. */ -void pq_free(pqueue *pq) { - if (!pq) return; - hfree(pq->items); - hfree(pq); -} - -/* Insert maintaining distance order (higher distances first). */ -void pq_push(pqueue *pq, hnswNode *node, float distance) { - if (pq->count < pq->cap) { - /* Queue not full: shift right from high distances to make room. */ - uint32_t i = pq->count; - while (i > 0 && pq->items[i-1].distance < distance) { - pq->items[i] = pq->items[i-1]; - i--; - } - pq->items[i].node = node; - pq->items[i].distance = distance; - pq->count++; - } else { - /* Queue full: if new item is worse than worst, ignore it. */ - if (distance >= pq->items[0].distance) return; - - /* Otherwise shift left from low distances to drop worst. */ - uint32_t i = 0; - while (i < pq->cap-1 && pq->items[i+1].distance > distance) { - pq->items[i] = pq->items[i+1]; - i++; - } - pq->items[i].node = node; - pq->items[i].distance = distance; - } -} - -/* Remove and return the top (closest) element, which is at count-1 - * since we store elements with higher distances first. - * Runs in constant time. */ -hnswNode *pq_pop(pqueue *pq, float *distance) { - if (pq->count == 0) return NULL; - pq->count--; - *distance = pq->items[pq->count].distance; - return pq->items[pq->count].node; -} - -/* Get distance of the furthest element. - * An empty priority queue has infinite distance as its furthest element, - * note that this behavior is needed by the algorithms below. */ -float pq_max_distance(pqueue *pq) { - if (pq->count == 0) return INFINITY; - return pq->items[0].distance; -} - -/* ============================ HNSW algorithm ============================== */ - -#if defined(HAVE_AVX512) -/* AVX512 optimized dot product for float vectors */ -ATTRIBUTE_TARGET_AVX512 -float vectors_distance_float_avx512(const float *x, const float *y, uint32_t dim) { - __m512 sum = _mm512_setzero_ps(); - uint32_t i; - - /* Process 16 floats at a time with AVX512 */ - for (i = 0; i + 15 < dim; i += 16) { - __m512 vx = _mm512_loadu_ps(&x[i]); - __m512 vy = _mm512_loadu_ps(&y[i]); - sum = _mm512_fmadd_ps(vx, vy, sum); - } - - /* Horizontal sum of the 16 elements in sum */ - float dot = _mm512_reduce_add_ps(sum); - - /* Handle remaining elements */ - for (; i < dim; i++) { - dot += x[i] * y[i]; - } - - return 1.0f - dot; -} -#endif /* HAVE_AVX512 */ - -#if defined(HAVE_AVX2) -/* AVX2 optimized dot product for float vectors */ -ATTRIBUTE_TARGET_AVX2 -float vectors_distance_float_avx2(const float *x, const float *y, uint32_t dim) { - __m256 sum1 = _mm256_setzero_ps(); - __m256 sum2 = _mm256_setzero_ps(); - uint32_t i; - - /* Process 16 floats at a time with two AVX2 registers */ - for (i = 0; i + 15 < dim; i += 16) { - __m256 vx1 = _mm256_loadu_ps(&x[i]); - __m256 vy1 = _mm256_loadu_ps(&y[i]); - __m256 vx2 = _mm256_loadu_ps(&x[i + 8]); - __m256 vy2 = _mm256_loadu_ps(&y[i + 8]); - - sum1 = _mm256_fmadd_ps(vx1, vy1, sum1); - sum2 = _mm256_fmadd_ps(vx2, vy2, sum2); - } - - /* Combine the two sums */ - __m256 combined = _mm256_add_ps(sum1, sum2); - - /* Horizontal sum of the 8 elements */ - __m128 sum_high = _mm256_extractf128_ps(combined, 1); - __m128 sum_low = _mm256_castps256_ps128(combined); - __m128 sum_128 = _mm_add_ps(sum_high, sum_low); - - sum_128 = _mm_hadd_ps(sum_128, sum_128); - sum_128 = _mm_hadd_ps(sum_128, sum_128); - - float dot = _mm_cvtss_f32(sum_128); - - /* Handle remaining elements */ - for (; i < dim; i++) { - dot += x[i] * y[i]; - } - - return 1.0f - dot; -} -#endif /* HAVE_AVX2 */ - -/* Optimized dot product: automatically selects best available implementation - * Dot product: our vectors are already normalized. - * Version for not quantized vectors of floats. */ -float vectors_distance_float(const float *x, const float *y, uint32_t dim) { -#if defined(HAVE_AVX512) - if (dim >= 16 && VSET_USE_AVX512) { - return vectors_distance_float_avx512(x, y, dim); - } -#endif - -#if defined(HAVE_AVX2) - if (VSET_USE_AVX2 && dim >= 16) { - return vectors_distance_float_avx2(x, y, dim); - } -#endif - - /* Fallback to original scalar implementation */ - float dot0 = 0.0f, dot1 = 0.0f; - uint32_t i; - - /* Use two accumulators to reduce dependencies among multiplications. - * This provides a clear speed boost in Apple silicon, but should be - * help in general. */ - for (i = 0; i + 7 < dim; i += 8) { - dot0 += x[i] * y[i] + - x[i+1] * y[i+1] + - x[i+2] * y[i+2] + - x[i+3] * y[i+3]; - - dot1 += x[i+4] * y[i+4] + - x[i+5] * y[i+5] + - x[i+6] * y[i+6] + - x[i+7] * y[i+7]; - } - - /* Handle the remaining elements. These are a minority in the case - * of a small vector, don't optimize this part. */ - for (; i < dim; i++) dot0 += x[i] * y[i]; - - /* The following line may be counter intuitive. The dot product of - * normalized vectors is equivalent to their cosine similarity. The - * cosine will be from -1 (vectors facing opposite directions in the - * N-dim space) to 1 (vectors are facing in the same direction). - * - * We kinda want a "score" of distance from 0 to 2 (this is a distance - * function and we want minimize the distance for K-NN searches), so we - * can't just add 1: that would return a number in the 0-2 range, with - * 0 meaning opposite vectors and 2 identical vectors, so this is - * similarity, not distance. - * - * Returning instead (1 - dotprod) inverts the meaning: 0 is identical - * and 2 is opposite, hence it is their distance. - * - * Why don't normalize the similarity right now, and return from 0 to - * 1? Because division is costly. */ - return 1.0f - (dot0 + dot1); -} - -/* Q8 quants dotproduct. We do integer math and later fix it by range. */ -float vectors_distance_q8(const int8_t *x, const int8_t *y, uint32_t dim, - float range_a, float range_b) { - // Handle zero vectors special case. - if (range_a == 0 || range_b == 0) { - /* Zero vector distance from anything is 1.0 - * (since 1.0 - dot_product where dot_product = 0). */ - return 1.0f; - } - - /* Each vector is quantized from [-max_abs, +max_abs] to [-127, 127] - * where range = 2*max_abs. */ - const float scale_product = (range_a/127) * (range_b/127); - - int32_t dot0 = 0, dot1 = 0; - uint32_t i; - - // Process 8 elements at a time for better pipeline utilization. - for (i = 0; i + 7 < dim; i += 8) { - dot0 += ((int32_t)x[i]) * ((int32_t)y[i]) + - ((int32_t)x[i+1]) * ((int32_t)y[i+1]) + - ((int32_t)x[i+2]) * ((int32_t)y[i+2]) + - ((int32_t)x[i+3]) * ((int32_t)y[i+3]); - - dot1 += ((int32_t)x[i+4]) * ((int32_t)y[i+4]) + - ((int32_t)x[i+5]) * ((int32_t)y[i+5]) + - ((int32_t)x[i+6]) * ((int32_t)y[i+6]) + - ((int32_t)x[i+7]) * ((int32_t)y[i+7]); - } - - // Handle remaining elements. - for (; i < dim; i++) dot0 += ((int32_t)x[i]) * ((int32_t)y[i]); - - // Convert to original range. - float dotf = (dot0 + dot1) * scale_product; - float distance = 1.0f - dotf; - - // Clamp distance to [0, 2]. - if (distance < 0) distance = 0; - else if (distance > 2) distance = 2; - return distance; -} - -static inline int popcount64(uint64_t x) { - x = (x & 0x5555555555555555) + ((x >> 1) & 0x5555555555555555); - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); - x = (x & 0x0F0F0F0F0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F0F0F0F0F); - x = (x & 0x00FF00FF00FF00FF) + ((x >> 8) & 0x00FF00FF00FF00FF); - x = (x & 0x0000FFFF0000FFFF) + ((x >> 16) & 0x0000FFFF0000FFFF); - x = (x & 0x00000000FFFFFFFF) + ((x >> 32) & 0x00000000FFFFFFFF); - return x; -} - -/* Binary vectors distance. */ -float vectors_distance_bin(const uint64_t *x, const uint64_t *y, uint32_t dim) { - uint32_t len = (dim+63)/64; - uint32_t opposite = 0; - for (uint32_t j = 0; j < len; j++) { - uint64_t xor = x[j]^y[j]; - opposite += popcount64(xor); - } - return (float)opposite*2/dim; -} - -/* Dot product between nodes. Will call the right version depending on the - * quantization used. */ -float hnsw_distance(HNSW *index, hnswNode *a, hnswNode *b) { - switch(index->quant_type) { - case HNSW_QUANT_NONE: - return vectors_distance_float(a->vector,b->vector,index->vector_dim); - case HNSW_QUANT_Q8: - return vectors_distance_q8(a->vector,b->vector,index->vector_dim,a->quants_range,b->quants_range); - case HNSW_QUANT_BIN: - return vectors_distance_bin(a->vector,b->vector,index->vector_dim); - default: - assert(1 != 1); - return 0; - } -} - -/* This do Q8 'range' quantization. - * For people looking at this code thinking: Oh, I could use min/max - * quants instead! Well: I tried with min/max normalization but the dot - * product needs to accumulate the sum for later correction, and it's slower. */ -void quantize_to_q8(float *src, int8_t *dst, uint32_t dim, float *rangeptr) { - float max_abs = 0; - for (uint32_t j = 0; j < dim; j++) { - if (src[j] > max_abs) max_abs = src[j]; - if (-src[j] > max_abs) max_abs = -src[j]; - } - - if (max_abs == 0) { - if (rangeptr) *rangeptr = 0; - memset(dst, 0, dim); - return; - } - - const float scale = 127.0f / max_abs; // Scale to map to [-127, 127]. - - for (uint32_t j = 0; j < dim; j++) { - dst[j] = (int8_t)roundf(src[j] * scale); - } - if (rangeptr) *rangeptr = max_abs; // Return max_abs instead of 2*max_abs. -} - -/* Binary quantization of vector 'src' to 'dst'. We use full words of - * 64 bit as smallest unit, we will just set all the unused bits to 0 - * so that they'll be the same in all the vectors, and when xor+popcount - * is used to compute the distance, such bits are not considered. This - * allows to go faster. */ -void quantize_to_bin(float *src, uint64_t *dst, uint32_t dim) { - memset(dst,0,(dim+63)/64*sizeof(uint64_t)); - for (uint32_t j = 0; j < dim; j++) { - uint32_t word = j/64; - uint32_t bit = j&63; - /* Since cosine similarity checks the vector direction and - * not magnitudo, we do likewise in the binary quantization and - * just remember if the component is positive or negative. */ - if (src[j] > 0) dst[word] |= 1ULL<<bit; - } -} - -/* L2 normalization of the float vector. - * - * Store the L2 value on 'l2ptr' if not NULL. This way the process - * can be reversed even if some precision will be lost. */ -void hnsw_normalize_vector(float *x, float *l2ptr, uint32_t dim) { - float l2 = 0; - uint32_t i; - for (i = 0; i + 3 < dim; i += 4) { - l2 += x[i]*x[i] + - x[i+1]*x[i+1] + - x[i+2]*x[i+2] + - x[i+3]*x[i+3]; - } - for (; i < dim; i++) l2 += x[i]*x[i]; - if (l2 == 0) return; // All zero vector, can't normalize. - - l2 = sqrtf(l2); - if (l2ptr) *l2ptr = l2; - for (i = 0; i < dim; i++) x[i] /= l2; -} - -/* Helper function to generate random level. */ -uint32_t random_level(void) { - static const int threshold = HNSW_P * RAND_MAX; - uint32_t level = 0; - - while (rand() < threshold && level < HNSW_MAX_LEVEL) - level += 1; - return level; -} - -/* Create new HNSW index, quantized or not. */ -HNSW *hnsw_new(uint32_t vector_dim, uint32_t quant_type, uint32_t m) { - HNSW *index = hmalloc(sizeof(HNSW)); - if (!index) return NULL; - - /* M parameter sanity check. */ - if (m == 0) m = HNSW_DEFAULT_M; - else if (m > HNSW_MAX_M) m = HNSW_MAX_M; - - index->M = m; - index->quant_type = quant_type; - index->enter_point = NULL; - index->max_level = 0; - index->vector_dim = vector_dim; - index->node_count = 0; - index->last_id = 0; - index->head = NULL; - index->cursors = NULL; - - /* Initialize epochs array. */ - for (int i = 0; i < HNSW_MAX_THREADS; i++) - index->current_epoch[i] = 0; - - /* Initialize locks. */ - if (pthread_rwlock_init(&index->global_lock, NULL) != 0) { - hfree(index); - return NULL; - } - - for (int i = 0; i < HNSW_MAX_THREADS; i++) { - if (pthread_mutex_init(&index->slot_locks[i], NULL) != 0) { - /* Clean up previously initialized mutexes. */ - for (int j = 0; j < i; j++) - pthread_mutex_destroy(&index->slot_locks[j]); - pthread_rwlock_destroy(&index->global_lock); - hfree(index); - return NULL; - } - } - - /* Initialize atomic variables. */ - index->next_slot = 0; - index->version = 0; - return index; -} - -/* Fill 'vec' with the node vector, de-normalizing and de-quantizing it - * as needed. Note that this function will return an approximated version - * of the original vector. */ -void hnsw_get_node_vector(HNSW *index, hnswNode *node, float *vec) { - if (index->quant_type == HNSW_QUANT_NONE) { - memcpy(vec,node->vector,index->vector_dim*sizeof(float)); - } else if (index->quant_type == HNSW_QUANT_Q8) { - int8_t *quants = node->vector; - for (uint32_t j = 0; j < index->vector_dim; j++) - vec[j] = (quants[j]*node->quants_range)/127; - } else if (index->quant_type == HNSW_QUANT_BIN) { - uint64_t *bits = node->vector; - for (uint32_t j = 0; j < index->vector_dim; j++) { - uint32_t word = j/64; - uint32_t bit = j&63; - vec[j] = (bits[word] & (1ULL<<bit)) ? 1.0f : -1.0f; - } - } - - // De-normalize. - if (index->quant_type != HNSW_QUANT_BIN) { - for (uint32_t j = 0; j < index->vector_dim; j++) - vec[j] *= node->l2; - } -} - -/* Return the number of bytes needed to represent a vector in the index, - * that is function of the dimension of the vectors and the quantization - * type used. */ -uint32_t hnsw_quants_bytes(HNSW *index) { - switch(index->quant_type) { - case HNSW_QUANT_NONE: return index->vector_dim * sizeof(float); - case HNSW_QUANT_Q8: return index->vector_dim; - case HNSW_QUANT_BIN: return (index->vector_dim+63)/64*8; - default: assert(0 && "Quantization type not supported."); - } -} - -/* Create new node. Returns NULL on out of memory. - * It is possible to pass the vector as floats or, in case this index - * was already stored on disk and is being loaded, or serialized and - * transmitted in any form, the already quantized version in - * 'qvector'. - * - * Only vector or qvector should be non-NULL. The reason why passing - * a quantized vector is useful, is that because re-normalizing and - * re-quantizing several times the same vector may accumulate rounding - * errors. So if you work with quantized indexes, you should save - * the quantized indexes. - * - * Note that, together with qvector, the quantization range is needed, - * since this library uses per-vector quantization. In case of quantized - * vectors the l2 is considered to be '1', so if you want to restore - * the right l2 (to use the API that returns an approximation of the - * original vector) make sure to save the l2 on disk and set it back - * after the node creation (see later for the serialization API that - * handles this and more). */ -hnswNode *hnsw_node_new(HNSW *index, uint64_t id, const float *vector, const int8_t *qvector, float qrange, uint32_t level, int normalize) { - hnswNode *node = hmalloc(sizeof(hnswNode)+(sizeof(hnswNodeLayer)*(level+1))); - if (!node) return NULL; - - if (id == 0) id = ++index->last_id; - node->level = level; - node->id = id; - node->next = NULL; - node->vector = NULL; - node->l2 = 1; // Default in case of already quantized vectors. It is - // up to the caller to fill this later, if needed. - - /* Initialize visited epoch array. */ - for (int i = 0; i < HNSW_MAX_THREADS; i++) - node->visited_epoch[i] = 0; - - if (qvector == NULL) { - /* Copy input vector. */ - node->vector = hmalloc(sizeof(float) * index->vector_dim); - if (!node->vector) { - hfree(node); - return NULL; - } - memcpy(node->vector, vector, sizeof(float) * index->vector_dim); - if (normalize) - hnsw_normalize_vector(node->vector,&node->l2,index->vector_dim); - - /* Handle quantization. */ - if (index->quant_type != HNSW_QUANT_NONE) { - void *quants = hmalloc(hnsw_quants_bytes(index)); - if (quants == NULL) { - hfree(node->vector); - hfree(node); - return NULL; - } - - // Quantize. - switch(index->quant_type) { - case HNSW_QUANT_Q8: - quantize_to_q8(node->vector,quants,index->vector_dim,&node->quants_range); - break; - case HNSW_QUANT_BIN: - quantize_to_bin(node->vector,quants,index->vector_dim); - break; - default: - assert(0 && "Quantization type not handled."); - break; - } - - // Discard the full precision vector. - hfree(node->vector); - node->vector = quants; - } - } else { - // We got the already quantized vector. Just copy it. - assert(index->quant_type != HNSW_QUANT_NONE); - uint32_t vector_bytes = hnsw_quants_bytes(index); - node->vector = hmalloc(vector_bytes); - node->quants_range = qrange; - if (node->vector == NULL) { - hfree(node); - return NULL; - } - memcpy(node->vector,qvector,vector_bytes); - } - - /* Initialize each layer. */ - for (uint32_t i = 0; i <= level; i++) { - uint32_t max_links = (i == 0) ? index->M*2 : index->M; - node->layers[i].max_links = max_links; - node->layers[i].num_links = 0; - node->layers[i].worst_distance = 0; - node->layers[i].worst_idx = 0; - node->layers[i].links = hmalloc(sizeof(hnswNode*) * max_links); - if (!node->layers[i].links) { - for (uint32_t j = 0; j < i; j++) hfree(node->layers[j].links); - hfree(node->vector); - hfree(node); - return NULL; - } - } - - return node; -} - -/* Free a node. */ -void hnsw_node_free(hnswNode *node) { - if (!node) return; - - for (uint32_t i = 0; i <= node->level; i++) - hfree(node->layers[i].links); - - hfree(node->vector); - hfree(node); -} - -/* Free the entire index. */ -void hnsw_free(HNSW *index,void(*free_value)(void*value)) { - if (!index) return; - - hnswNode *current = index->head; - while (current) { - hnswNode *next = current->next; - if (free_value) free_value(current->value); - hnsw_node_free(current); - current = next; - } - - /* Destroy locks */ - pthread_rwlock_destroy(&index->global_lock); - for (int i = 0; i < HNSW_MAX_THREADS; i++) { - pthread_mutex_destroy(&index->slot_locks[i]); - } - - hfree(index); -} - -/* Add node to linked list of nodes. We may need to scan the whole - * HNSW graph for several reasons. The list is doubly linked since we - * also need the ability to remove a node without scanning the whole thing. */ -void hnsw_add_node(HNSW *index, hnswNode *node) { - node->next = index->head; - node->prev = NULL; - if (index->head) - index->head->prev = node; - index->head = node; - index->node_count++; -} - -/* Search the specified layer starting from the specified entry point - * to collect 'ef' nodes that are near to 'query'. - * - * This function implements optional hybrid search, so that each node - * can be accepted or not based on its associated value. In this case - * a callback 'filter_callback' should be passed, together with a maximum - * effort for the search (number of candidates to evaluate), since even - * with a a low "EF" value we risk that there are too few nodes that satisfy - * the provided filter, and we could trigger a full scan. */ -pqueue *search_layer_with_filter( - HNSW *index, hnswNode *query, hnswNode *entry_point, - uint32_t ef, uint32_t layer, uint32_t slot, - int (*filter_callback)(void *value, void *privdata), - void *filter_privdata, uint32_t max_candidates) -{ - // Mark visited nodes with a never seen epoch. - index->current_epoch[slot]++; - - pqueue *candidates = pq_new(HNSW_MAX_CANDIDATES); - pqueue *results = pq_new(ef); - if (!candidates || !results) { - if (candidates) pq_free(candidates); - if (results) pq_free(results); - return NULL; - } - - // Take track of the total effort: only used when filtering via - // a callback to have a bound effort. - uint32_t evaluated_candidates = 1; - - // Add entry point. - float dist = hnsw_distance(index, query, entry_point); - pq_push(candidates, entry_point, dist); - if (filter_callback == NULL || - filter_callback(entry_point->value, filter_privdata)) - { - pq_push(results, entry_point, dist); - } - entry_point->visited_epoch[slot] = index->current_epoch[slot]; - - // Process candidates. - while (candidates->count > 0) { - // Max effort. If zero, we keep scanning. - if (filter_callback && - max_candidates && - evaluated_candidates >= max_candidates) break; - - float cur_dist; - hnswNode *current = pq_pop(candidates, &cur_dist); - evaluated_candidates++; - - float furthest = pq_max_distance(results); - if (results->count >= ef && cur_dist > furthest) break; - - /* Check neighbors. */ - for (uint32_t i = 0; i < current->layers[layer].num_links; i++) { - hnswNode *neighbor = current->layers[layer].links[i]; - - if (neighbor->visited_epoch[slot] == index->current_epoch[slot]) - continue; // Already visited during this scan. - - neighbor->visited_epoch[slot] = index->current_epoch[slot]; - float neighbor_dist = hnsw_distance(index, query, neighbor); - - furthest = pq_max_distance(results); - if (filter_callback == NULL) { - /* Original HNSW logic when no filtering: - * Add to results if better than current max or - * results not full. */ - if (neighbor_dist < furthest || results->count < ef) { - pq_push(candidates, neighbor, neighbor_dist); - pq_push(results, neighbor, neighbor_dist); - } - } else { - /* With filtering: we add candidates even if doesn't match - * the filter, in order to continue to explore the graph. */ - if (neighbor_dist < furthest || candidates->count < ef) { - pq_push(candidates, neighbor, neighbor_dist); - } - - /* Add results only if passes filter. */ - if (filter_callback(neighbor->value, filter_privdata)) { - if (neighbor_dist < furthest || results->count < ef) { - pq_push(results, neighbor, neighbor_dist); - } - } - } - } - } - - pq_free(candidates); - return results; -} - -/* Just a wrapper without hybrid search callback. */ -pqueue *search_layer(HNSW *index, hnswNode *query, hnswNode *entry_point, - uint32_t ef, uint32_t layer, uint32_t slot) -{ - return search_layer_with_filter(index, query, entry_point, ef, layer, slot, - NULL, NULL, 0); -} - -/* This function is used in order to initialize a node allocated in the - * function stack with the specified vector. The idea is that we can - * easily use hnsw_distance() from a vector and the HNSW nodes this way: - * - * hnswNode myQuery; - * hnsw_init_tmp_node(myIndex,&myQuery,0,some_vector); - * hnsw_distance(&myQuery, some_hnsw_node); - * - * Make sure to later free the node with: - * - * hnsw_free_tmp_node(&myQuery,some_vector); - * You have to pass the vector to the free function, because sometimes - * hnsw_init_tmp_node() may just avoid allocating a vector at all, - * just reusing 'some_vector' pointer. - * - * Return 0 on out of memory, 1 on success. - */ -int hnsw_init_tmp_node(HNSW *index, hnswNode *node, int is_normalized, const float *vector) { - node->vector = NULL; - - /* Work on a normalized query vector if the input vector is - * not normalized. */ - if (!is_normalized) { - node->vector = hmalloc(sizeof(float)*index->vector_dim); - if (node->vector == NULL) return 0; - memcpy(node->vector,vector,sizeof(float)*index->vector_dim); - hnsw_normalize_vector(node->vector,NULL,index->vector_dim); - } else { - node->vector = (float*)vector; - } - - /* If quantization is enabled, our query fake node should be - * quantized as well. */ - if (index->quant_type != HNSW_QUANT_NONE) { - void *quants = hmalloc(hnsw_quants_bytes(index)); - if (quants == NULL) { - if (node->vector != vector) hfree(node->vector); - return 0; - } - switch(index->quant_type) { - case HNSW_QUANT_Q8: - quantize_to_q8(node->vector, quants, index->vector_dim, &node->quants_range); - break; - case HNSW_QUANT_BIN: - quantize_to_bin(node->vector, quants, index->vector_dim); - } - if (node->vector != vector) hfree(node->vector); - node->vector = quants; - } - return 1; -} - -/* Free the stack allocated node initialized by hnsw_init_tmp_node(). */ -void hnsw_free_tmp_node(hnswNode *node, const float *vector) { - if (node->vector != vector) hfree(node->vector); -} - -/* Return approximated K-NN items. Note that neighbors and distances - * arrays must have space for at least 'k' items. - * norm_query should be set to 1 if the query vector is already - * normalized, otherwise, if 0, the function will copy the vector, - * L2-normalize the copy and search using the normalized version. - * - * If the filter_privdata callback is passed, only elements passing the - * specified filter (invoked with privdata and the value associated - * to the node as arguments) are returned. In such case, if max_candidates - * is not NULL, it represents the maximum number of nodes to explore, since - * the search may be otherwise unbound if few or no elements pass the - * filter. */ -int hnsw_search_with_filter - (HNSW *index, const float *query_vector, uint32_t k, - hnswNode **neighbors, float *distances, uint32_t slot, - int query_vector_is_normalized, - int (*filter_callback)(void *value, void *privdata), - void *filter_privdata, uint32_t max_candidates) - -{ - if (!index || !query_vector || !neighbors || k == 0) return -1; - if (!index->enter_point) return 0; // Empty index. - - /* Use a fake node that holds the query vector, this way we can - * use our normal node to node distance functions when checking - * the distance between query and graph nodes. */ - hnswNode query; - if (hnsw_init_tmp_node(index,&query,query_vector_is_normalized,query_vector) == 0) return -1; - - // Start searching from the entry point. - hnswNode *curr_ep = index->enter_point; - - /* Start from higher layer to layer 1 (layer 0 is handled later) - * in the next section. Descend to the most similar node found - * so far. */ - for (int lc = index->max_level; lc > 0; lc--) { - pqueue *results = search_layer(index, &query, curr_ep, 1, lc, slot); - if (!results) continue; - - if (results->count > 0) { - curr_ep = pq_get_node(results,0); - } - pq_free(results); - } - - /* Search bottom layer (the most densely populated) with ef = k */ - pqueue *results = search_layer_with_filter( - index, &query, curr_ep, k, 0, slot, filter_callback, - filter_privdata, max_candidates); - if (!results) { - hnsw_free_tmp_node(&query, query_vector); - return -1; - } - - /* Copy results. */ - uint32_t found = MIN(k, results->count); - for (uint32_t i = 0; i < found; i++) { - neighbors[i] = pq_get_node(results,i); - if (distances) { - distances[i] = pq_get_distance(results,i); - } - } - - pq_free(results); - hnsw_free_tmp_node(&query, query_vector); - return found; -} - -/* Wrapper to hnsw_search_with_filter() when no filter is needed. */ -int hnsw_search(HNSW *index, const float *query_vector, uint32_t k, - hnswNode **neighbors, float *distances, uint32_t slot, - int query_vector_is_normalized) -{ - return hnsw_search_with_filter(index,query_vector,k,neighbors, - distances,slot,query_vector_is_normalized, - NULL,NULL,0); -} - -/* Rescan a node and update the wortst neighbor index. - * The followinng two functions are variants of this function to be used - * when links are added or removed: they may do less work than a full scan. */ -void hnsw_update_worst_neighbor(HNSW *index, hnswNode *node, uint32_t layer) { - float worst_dist = 0; - uint32_t worst_idx = 0; - for (uint32_t i = 0; i < node->layers[layer].num_links; i++) { - float dist = hnsw_distance(index, node, node->layers[layer].links[i]); - if (dist > worst_dist) { - worst_dist = dist; - worst_idx = i; - } - } - node->layers[layer].worst_distance = worst_dist; - node->layers[layer].worst_idx = worst_idx; -} - -/* Update node worst neighbor distance information when a new neighbor - * is added. */ -void hnsw_update_worst_neighbor_on_add(HNSW *index, hnswNode *node, uint32_t layer, uint32_t added_index, float distance) { - (void) index; // Unused but here for API symmetry. - if (node->layers[layer].num_links == 1 || // First neighbor? - distance > node->layers[layer].worst_distance) // New worst? - { - node->layers[layer].worst_distance = distance; - node->layers[layer].worst_idx = added_index; - } -} - -/* Update node worst neighbor distance information when a linked neighbor - * is removed. */ -void hnsw_update_worst_neighbor_on_remove(HNSW *index, hnswNode *node, uint32_t layer, uint32_t removed_idx) -{ - if (node->layers[layer].num_links == 0) { - node->layers[layer].worst_distance = 0; - node->layers[layer].worst_idx = 0; - } else if (removed_idx == node->layers[layer].worst_idx) { - hnsw_update_worst_neighbor(index,node,layer); - } else if (removed_idx < node->layers[layer].worst_idx) { - // Just update index if we removed element before worst. - node->layers[layer].worst_idx--; - } -} - -/* We have a list of candidate nodes to link to the new node, when inserting - * one. This function selects which nodes to link and performs the linking. - * - * Parameters: - * - * - 'candidates' is the priority queue of potential good nodes to link to the - * new node 'new_node'. - * - 'required_links' is as many links we would like our new_node to get - * at the specified layer. - * - 'aggressive' changes the strategy used to find good neighbors as follows: - * - * This function is called with aggressive=0 for all the layers, including - * layer 0. When called like that, it will use the diversity of links and - * quality of links checks before linking our new node with some candidate. - * - * However if the insert function finds that at layer 0, with aggressive=0, - * few connections were made, it calls this function again with aggressiveness - * levels greater up to 2. - * - * At aggressive=1, the diversity checks are disabled, and the candidate - * node for linking is accepted even if it is nearest to an already accepted - * neighbor than it is to the new node. - * - * When we link our new node by replacing the link of a candidate neighbor - * that already has the max number of links, inevitably some other node loses - * a connection (to make space for our new node link). In this case: - * - * 1. If such "dropped" node would remain with too little links, we try with - * some different neighbor instead, however as the 'aggressive' parameter - * has incremental values (0, 1, 2) we are more and more willing to leave - * the dropped node with fever connections. - * 2. If aggressive=2, we will scan the candidate neighbor node links to - * find a different linked-node to replace, one better connected even if - * its distance is not the worse. - * - * Note: this function is also called during deletion of nodes in order to - * provide certain nodes with additional links. - */ -void select_neighbors(HNSW *index, pqueue *candidates, hnswNode *new_node, - uint32_t layer, uint32_t required_links, int aggressive) -{ - for (uint32_t i = 0; i < candidates->count; i++) { - hnswNode *neighbor = pq_get_node(candidates,i); - if (neighbor == new_node) continue; // Don't link node with itself. - - /* Use our cached distance among the new node and the candidate. */ - float dist = pq_get_distance(candidates,i); - - /* First of all, since our links are all bidirectional, if the - * new node for any reason has no longer room, or if it accumulated - * the required number of links, return ASAP. */ - if (new_node->layers[layer].num_links >= new_node->layers[layer].max_links || - new_node->layers[layer].num_links >= required_links) return; - - /* If aggressive is true, it is possible that the new node - * already got some link among the candidates (see the top comment, - * this function gets re-called in case of too few links). - * So we need to check if this candidate is already linked to - * the new node. */ - if (aggressive) { - int duplicated = 0; - for (uint32_t j = 0; j < new_node->layers[layer].num_links; j++) { - if (new_node->layers[layer].links[j] == neighbor) { - duplicated = 1; - break; - } - } - if (duplicated) continue; - } - - /* Diversity check. We accept new candidates - * only if there is no element already accepted that is nearest - * to the candidate than the new element itself. - * However this check is disabled if we have pressure to find - * new links (aggressive != 0) */ - if (!aggressive) { - int diversity_failed = 0; - for (uint32_t j = 0; j < new_node->layers[layer].num_links; j++) { - float link_dist = hnsw_distance(index, neighbor, - new_node->layers[layer].links[j]); - if (link_dist < dist) { - diversity_failed = 1; - break; - } - } - if (diversity_failed) continue; - } - - /* If potential neighbor node has space, simply add the new link. - * We will have space as well. */ - uint32_t n = neighbor->layers[layer].num_links; - if (n < neighbor->layers[layer].max_links) { - /* Link candidate to new node. */ - neighbor->layers[layer].links[n] = new_node; - neighbor->layers[layer].num_links++; - - /* Update candidate worst link info. */ - hnsw_update_worst_neighbor_on_add(index,neighbor,layer,n,dist); - - /* Link new node to candidate. */ - uint32_t new_links = new_node->layers[layer].num_links; - new_node->layers[layer].links[new_links] = neighbor; - new_node->layers[layer].num_links++; - - /* Update new node worst link info. */ - hnsw_update_worst_neighbor_on_add(index,new_node,layer,new_links,dist); - continue; - } - - /* ==================================================================== - * Replacing existing candidate neighbor link step. - * ================================================================== */ - - /* If we are here, our accepted candidate for linking is full. - * - * If new node is more distant to candidate than its current worst link - * then we skip it: we would not be able to establish a bidirectional - * connection without compromising link quality of candidate. - * - * At aggressiveness > 0 we don't care about this check. */ - if (!aggressive && dist >= neighbor->layers[layer].worst_distance) - continue; - - /* We can add it: we are ready to replace the candidate neighbor worst - * link with the new node, assuming certain conditions are met. */ - hnswNode *worst_node = neighbor->layers[layer].links[neighbor->layers[layer].worst_idx]; - - /* The worst node linked to our candidate may remain too disconnected - * if we remove the candidate node as its link. Let's check if - * this is the case: */ - if (aggressive == 0 && - worst_node->layers[layer].num_links <= index->M/2) - continue; - - /* Aggressive level = 1. It's ok if the node remains with just - * HNSW_M/4 links. */ - else if (aggressive == 1 && - worst_node->layers[layer].num_links <= index->M/4) - continue; - - /* If aggressive is set to 2, then the new node we are adding failed - * to find enough neighbors. We can't insert an almost orphaned new - * node, so let's see if the target node has some other link - * that is well connected in the graph: we could drop it instead - * of the worst link. */ - if (aggressive == 2 && worst_node->layers[layer].num_links <= - index->M/4) - { - /* Let's see if we can find at least a candidate link that - * would remain with a few connections. Track the one - * that is the farthest away (worst distance) from our candidate - * neighbor (in order to remove the less interesting link). */ - worst_node = NULL; - uint32_t worst_idx = 0; - float max_dist = 0; - for (uint32_t j = 0; j < neighbor->layers[layer].num_links; j++) { - hnswNode *to_drop = neighbor->layers[layer].links[j]; - - /* Skip this if it would remain too disconnected as well. - * - * NOTE about index->M/4 min connections requirement: - * - * It is not too strict, since leaving a node with just a - * single link does not just leave it too weakly connected, but - * also sometimes creates cycles with few disconnected - * nodes linked among them. */ - if (to_drop->layers[layer].num_links <= index->M/4) continue; - - float link_dist = hnsw_distance(index, neighbor, to_drop); - if (worst_node == NULL || link_dist > max_dist) { - worst_node = to_drop; - max_dist = link_dist; - worst_idx = j; - } - } - - if (worst_node != NULL) { - /* We found a node that we can drop. Let's pretend this is - * the worst node of the candidate to unify the following - * code path. Later we will fix the worst node info anyway. */ - neighbor->layers[layer].worst_distance = max_dist; - neighbor->layers[layer].worst_idx = worst_idx; - } else { - /* Otherwise we have no other option than reallocating - * the max number of links for this target node, and - * ensure at least a few connections for our new node. */ - uint32_t reallocation_limit = layer == 0 ? - index->M * 3 : index->M *2; - if (neighbor->layers[layer].max_links >= reallocation_limit) - continue; - - uint32_t new_max_links = neighbor->layers[layer].max_links+1; - hnswNode **new_links = hrealloc(neighbor->layers[layer].links, - sizeof(hnswNode*) * new_max_links); - if (new_links == NULL) continue; // Non critical. - - /* Update neighbor's link capacity. */ - neighbor->layers[layer].links = new_links; - neighbor->layers[layer].max_links = new_max_links; - - /* Establish bidirectional link. */ - uint32_t n = neighbor->layers[layer].num_links; - neighbor->layers[layer].links[n] = new_node; - neighbor->layers[layer].num_links++; - hnsw_update_worst_neighbor_on_add(index, neighbor, layer, - n, dist); - - n = new_node->layers[layer].num_links; - new_node->layers[layer].links[n] = neighbor; - new_node->layers[layer].num_links++; - hnsw_update_worst_neighbor_on_add(index, new_node, layer, - n, dist); - continue; - } - } - - // Remove backlink from the worst node of our candidate. - for (uint64_t j = 0; j < worst_node->layers[layer].num_links; j++) { - if (worst_node->layers[layer].links[j] == neighbor) { - memmove(&worst_node->layers[layer].links[j], - &worst_node->layers[layer].links[j+1], - (worst_node->layers[layer].num_links - j - 1) * sizeof(hnswNode*)); - worst_node->layers[layer].num_links--; - hnsw_update_worst_neighbor_on_remove(index,worst_node,layer,j); - break; - } - } - - /* Replace worst link with the new node. */ - neighbor->layers[layer].links[neighbor->layers[layer].worst_idx] = new_node; - - /* Update the worst link in the target node, at this point - * the link that we replaced may no longer be the worst. */ - hnsw_update_worst_neighbor(index,neighbor,layer); - - // Add new node -> candidate link. - uint32_t new_links = new_node->layers[layer].num_links; - new_node->layers[layer].links[new_links] = neighbor; - new_node->layers[layer].num_links++; - - // Update new node worst link. - hnsw_update_worst_neighbor_on_add(index,new_node,layer,new_links,dist); - } -} - -/* This function implements node reconnection after a node deletion in HNSW. - * When a node is deleted, other nodes at the specified layer lose one - * connection (all the neighbors of the deleted node). This function attempts - * to pair such nodes together in a way that maximizes connection quality - * among the M nodes that were former neighbors of our deleted node. - * - * The algorithm works by first building a distance matrix among the nodes: - * - * N0 N1 N2 N3 - * N0 0 1.2 0.4 0.9 - * N1 1.2 0 0.8 0.5 - * N2 0.4 0.8 0 1.1 - * N3 0.9 0.5 1.1 0 - * - * For each potential pairing (i,j) we compute a score that combines: - * 1. The direct cosine distance between the two nodes - * 2. The average distance to other nodes that would no longer be - * available for pairing if we select this pair - * - * We want to balance local node-to-node requirements and global requirements. - * For instance sometimes connecting A with B, while optimal, would leave - * C and D to be connected without other choices, and this could be a very - * bad connection. Maybe instead A and C and B and D are both relatively high - * quality connections. - * - * The formula used to calculate the score of each connection is: - * - * score[i,j] = W1*(2-distance[i,j]) + W2*((new_avg_i + new_avg_j)/2) - * where new_avg_x is the average of distances in row x excluding distance[i,j] - * - * So the score is directly proportional to the SIMILARITY of the two nodes - * and also directly proportional to the DISTANCE of the potential other - * connections that we lost by pairign i,j. So we have a cost for missed - * opportunities, or better, in this case, a reward if the missing - * opportunities are not so good (big average distance). - * - * W1 and W2 are weights (defaults: 0.7 and 0.3) that determine the relative - * importance of immediate connection quality vs future pairing potential. - * - * After the initial pairing phase, any nodes that couldn't be paired - * (due to odd count or existing connections) are handled by searching - * the broader graph using the standard HNSW neighbor selection logic. - */ -void hnsw_reconnect_nodes(HNSW *index, hnswNode **nodes, int count, uint32_t layer) { - if (count <= 0) return; - debugmsg("Reconnecting %d nodes\n", count); - - /* Step 1: Build the distance matrix between all nodes. - * Since distance(i,j) = distance(j,i), we only compute the upper triangle - * and mirror it to the lower triangle. */ - float *distances = hmalloc((unsigned long) count * count * sizeof(float)); - if (!distances) return; - - for (int i = 0; i < count; i++) { - distances[i*count + i] = 0; // Distance to self is 0 - for (int j = i+1; j < count; j++) { - float dist = hnsw_distance(index, nodes[i], nodes[j]); - distances[i*count + j] = dist; // Upper triangle. - distances[j*count + i] = dist; // Lower triangle. - } - } - - /* Step 2: Calculate row averages (will be used in scoring): - * please note that we just calculate row averages and not - * columns averages since the matrix is symmetrical, so those - * are the same: check the image in the top comment if you have any - * doubt about this. */ - float *row_avgs = hmalloc(count * sizeof(float)); - if (!row_avgs) { - hfree(distances); - return; - } - - for (int i = 0; i < count; i++) { - float sum = 0; - int valid_count = 0; - for (int j = 0; j < count; j++) { - if (i != j) { - sum += distances[i*count + j]; - valid_count++; - } - } - row_avgs[i] = valid_count ? sum / valid_count : 0; - } - - /* Step 3: Build scoring matrix. What we do here is to combine how - * good is a given i,j nodes connection, with how badly connecting - * i,j will affect the remaining quality of connections left to - * pair the other nodes. */ - float *scores = hmalloc((unsigned long) count * count * sizeof(float)); - if (!scores) { - hfree(distances); - hfree(row_avgs); - return; - } - - /* Those weights were obtained manually... No guarantee that they - * are optimal. However with these values the algorithm is certain - * better than its greedy version that just attempts to pick the - * best pair each time (verified experimentally). */ - const float W1 = 0.7; // Weight for immediate distance. - const float W2 = 0.3; // Weight for future potential. - - for (int i = 0; i < count; i++) { - for (int j = 0; j < count; j++) { - if (i == j) { - scores[i*count + j] = -1; // Invalid pairing. - continue; - } - - // Check for existing connection between i and j. - int already_linked = 0; - for (uint32_t k = 0; k < nodes[i]->layers[layer].num_links; k++) - { - if (nodes[i]->layers[layer].links[k] == nodes[j]) { - scores[i*count + j] = -1; // Already linked. - already_linked = 1; - break; - } - } - if (already_linked) continue; - - float dist = distances[i*count + j]; - - /* Calculate new averages excluding this pair. - * Handle edge case where we might have too few elements. - * Note that it would be not very smart to recompute the average - * each time scanning the row, we can remove the element - * and adjust the average without it. */ - float new_avg_i = 0, new_avg_j = 0; - if (count > 2) { - new_avg_i = (row_avgs[i] * (count-1) - dist) / (count-2); - new_avg_j = (row_avgs[j] * (count-1) - dist) / (count-2); - } - - /* Final weighted score: the more similar i,j, the better - * the score. The more distant are the pairs we lose by - * connecting i,j, the better the score. */ - scores[i*count + j] = W1*(2-dist) + W2*((new_avg_i + new_avg_j)/2); - } - } - - // Step 5: Pair nodes greedily based on scores. - int *used = hmalloc(count*sizeof(int)); - memset(used,0,count*sizeof(int)); - if (!used) { - hfree(distances); - hfree(row_avgs); - hfree(scores); - return; - } - - /* Scan the matrix looking each time for the potential - * link with the best score. */ - while(1) { - float max_score = -1; - int best_j = -1, best_i = -1; - - // Seek best score i,j values. - for (int i = 0; i < count; i++) { - if (used[i]) continue; // Already connected. - - /* No space left? Not possible after a node deletion but makes - * this function more future-proof. */ - if (nodes[i]->layers[layer].num_links >= - nodes[i]->layers[layer].max_links) continue; - - for (int j = 0; j < count; j++) { - if (i == j) continue; // Same node, skip. - if (used[j]) continue; // Already connected. - float score = scores[i*count + j]; - if (score < 0) continue; // Invalid link. - - /* If the target node has space, and its score is better - * than any other seen so far... remember it is the best. */ - if (score > max_score && - nodes[j]->layers[layer].num_links < - nodes[j]->layers[layer].max_links) - { - // Track the best connection found so far. - max_score = score; - best_j = j; - best_i = i; - } - } - } - - // Possible link found? Connect i and j. - if (best_j != -1) { - debugmsg("[%d] linking %d with %d: %f\n", layer, (int)best_i, (int)best_j, max_score); - // Link i -> j. - int link_idx = nodes[best_i]->layers[layer].num_links; - nodes[best_i]->layers[layer].links[link_idx] = nodes[best_j]; - nodes[best_i]->layers[layer].num_links++; - - // Update worst distance if needed. - float dist = distances[best_i*count + best_j]; - hnsw_update_worst_neighbor_on_add(index,nodes[best_i],layer,link_idx,dist); - - // Link j -> i. - link_idx = nodes[best_j]->layers[layer].num_links; - nodes[best_j]->layers[layer].links[link_idx] = nodes[best_i]; - nodes[best_j]->layers[layer].num_links++; - - // Update worst distance if needed. - hnsw_update_worst_neighbor_on_add(index,nodes[best_j],layer,link_idx,dist); - - // Mark connection as used. - used[best_i] = used[best_j] = 1; - } else { - break; // No more valid connections available. - } - } - - /* Step 6: Handle remaining unpaired nodes using the standard HNSW - * neighbor selection. */ - for (int i = 0; i < count; i++) { - if (used[i]) continue; - - // Skip if node is already at max connections. - if (nodes[i]->layers[layer].num_links >= - nodes[i]->layers[layer].max_links) - continue; - - debugmsg("[%d] Force linking %d\n", layer, i); - - /* First, try with local nodes as candidates. - * Some candidate may have space. */ - pqueue *candidates = pq_new(count); - if (!candidates) continue; - - /* Add all the local nodes having some space as candidates - * to be linked with this node. */ - for (int j = 0; j < count; j++) { - if (i != j && // Must not be itself. - nodes[j]->layers[layer].num_links < // Must not be full. - nodes[j]->layers[layer].max_links) - { - float dist = distances[i*count + j]; - pq_push(candidates, nodes[j], dist); - } - } - - /* Try local candidates first with aggressive = 1. - * So we will link only if there is space. - * We want one link more than the links we already have. */ - uint32_t wanted_links = nodes[i]->layers[layer].num_links+1; - if (candidates->count > 0) { - select_neighbors(index, candidates, nodes[i], layer, - wanted_links, 1); - debugmsg("Final links after attempt with local nodes: %d (wanted: %d)\n", (int)nodes[i]->layers[layer].num_links, wanted_links); - } - - // If still no connection, search the broader graph. - if (nodes[i]->layers[layer].num_links != wanted_links) { - debugmsg("No force linking possible with local candidates\n"); - pq_free(candidates); - - // Find entry point for target layer by descending through levels. - hnswNode *curr_ep = index->enter_point; - for (uint32_t lc = index->max_level; lc > layer; lc--) { - pqueue *results = search_layer(index, nodes[i], curr_ep, 1, lc, 0); - if (results) { - if (results->count > 0) { - curr_ep = pq_get_node(results,0); - } - pq_free(results); - } - } - - if (curr_ep) { - /* Search this layer for candidates. - * Use the default EF_C in this case, since it's not an - * "insert" operation, and we don't know the user - * specified "EF". */ - candidates = search_layer(index, nodes[i], curr_ep, HNSW_EF_C, layer, 0); - if (candidates) { - /* Try to connect with aggressiveness proportional to the - * node linking condition. */ - int aggressiveness = - (nodes[i]->layers[layer].num_links > index->M / 2) - ? 1 : 2; - select_neighbors(index, candidates, nodes[i], layer, - wanted_links, aggressiveness); - debugmsg("Final links with broader search: %d (wanted: %d)\n", (int)nodes[i]->layers[layer].num_links, wanted_links); - pq_free(candidates); - } - } - } else { - pq_free(candidates); - } - } - - // Cleanup. - hfree(distances); - hfree(row_avgs); - hfree(scores); - hfree(used); -} - -/* This is an helper function in order to support node deletion. - * It's goal is just to: - * - * 1. Remove the node from the bidirectional links of neighbors in the graph. - * 2. Remove the node from the linked list of nodes. - * 3. Fix the entry point in the graph. We just select one of the neighbors - * of the deleted node at a lower level. If none is found, we do - * a full scan. - * 4. The node itself amd its aux value field are NOT freed. It's up to the - * caller to do it, by using hnsw_node_free(). - * 5. The node associated value (node->value) is NOT freed. - * - * Why this function will not free the node? Because in node updates it - * could be a good idea to reuse the node allocation for different reasons - * (currently not implemented). - * In general it is more future-proof to be able to reuse the node if - * needed. Right now this library reuses the node only when links are - * not touched (see hnsw_update() for more information). */ -void hnsw_unlink_node(HNSW *index, hnswNode *node) { - if (!index || !node) return; - - index->version++; // This node may be missing in an already compiled list - // of neighbors. Make optimistic concurrent inserts fail. - - /* Remove all bidirectional links at each level. - * Note that in this implementation all the - * links are guaranteed to be bedirectional. */ - - /* For each level of the deleted node... */ - for (uint32_t level = 0; level <= node->level; level++) { - /* For each linked node of the deleted node... */ - for (uint32_t i = 0; i < node->layers[level].num_links; i++) { - hnswNode *linked = node->layers[level].links[i]; - /* Find and remove the backlink in the linked node */ - for (uint32_t j = 0; j < linked->layers[level].num_links; j++) { - if (linked->layers[level].links[j] == node) { - /* Remove by shifting remaining links left */ - memmove(&linked->layers[level].links[j], - &linked->layers[level].links[j + 1], - (linked->layers[level].num_links - j - 1) * sizeof(hnswNode*)); - linked->layers[level].num_links--; - hnsw_update_worst_neighbor_on_remove(index,linked,level,j); - break; - } - } - } - } - - /* Update cursors pointing at this element. */ - if (index->cursors) hnsw_cursor_element_deleted(index,node); - - /* Update the previous node's next pointer. */ - if (node->prev) { - node->prev->next = node->next; - } else { - /* If there's no previous node, this is the head. */ - index->head = node->next; - } - - /* Update the next node's prev pointer. */ - if (node->next) node->next->prev = node->prev; - - /* Update node count. */ - index->node_count--; - - /* If this node was the enter_point, we need to update it. */ - if (node == index->enter_point) { - /* Reset entry point - we'll find a new one (unless the HNSW is - * now empty) */ - index->enter_point = NULL; - index->max_level = 0; - - /* Step 1: Try to find a replacement by scanning levels - * from top to bottom. Under normal conditions, if there is - * any other node at the same level, we have a link. Anyway - * we descend levels to find any neighbor at the higher level - * possible. */ - for (int level = node->level; level >= 0; level--) { - if (node->layers[level].num_links > 0) { - index->enter_point = node->layers[level].links[0]; - break; - } - } - - /* Step 2: If no links were found at any level, do a full scan. - * This should never happen in practice if the HNSW is not - * empty. */ - if (!index->enter_point) { - uint32_t new_max_level = 0; - hnswNode *current = index->head; - - while (current) { - if (current != node && current->level >= new_max_level) { - new_max_level = current->level; - index->enter_point = current; - } - current = current->next; - } - } - - /* Update max_level. */ - if (index->enter_point) - index->max_level = index->enter_point->level; - } - - /* Clear the node's links but don't free the node itself */ - node->prev = node->next = NULL; -} - -/* Higher level API for hnsw_unlink_node() + hnsw_reconnect_nodes() actual work. - * This will get the write lock, will delete the node, free it, - * reconnect the node neighbors among themselves, and unlock again. - * If free_value function pointer is not NULL, then the function provided is - * used to free node->value. - * - * The function returns 0 on error (inability to acquire the lock), otherwise - * 1 is returned. */ -int hnsw_delete_node(HNSW *index, hnswNode *node, void(*free_value)(void*value)) { - if (pthread_rwlock_wrlock(&index->global_lock) != 0) return 0; - hnsw_unlink_node(index,node); - if (free_value && node->value) free_value(node->value); - - /* Relink all the nodes orphaned of this node link. - * Do it for all the levels. */ - for (unsigned int j = 0; j <= node->level; j++) { - hnsw_reconnect_nodes(index, node->layers[j].links, - node->layers[j].num_links, j); - } - hnsw_node_free(node); - pthread_rwlock_unlock(&index->global_lock); - return 1; -} - -/* ============================ Threaded API ================================ - * Concurrent readers should use the following API to get a slot assigned - * (and a lock, too), do their read-only call, and unlock the slot. - * - * There is a reason why read operations don't implement opaque transparent - * locking directly on behalf of the user: when we return a result set - * with hnsw_search(), we report a set of nodes. The caller will do something - * with the nodes and the associated values, so the unlocking of the - * slot should happen AFTER the result was already used, otherwise we may - * have changes to the HNSW nodes as the result is being accessed. */ - -/* Try to acquire a read slot. Returns the slot number (0 to HNSW_MAX_THREADS-1) - * on success, -1 on error (pthread mutex errors). */ -int hnsw_acquire_read_slot(HNSW *index) { - /* First try a non-blocking approach on all slots. */ - for (uint32_t i = 0; i < HNSW_MAX_THREADS; i++) { - if (pthread_mutex_trylock(&index->slot_locks[i]) == 0) { - if (pthread_rwlock_rdlock(&index->global_lock) != 0) { - pthread_mutex_unlock(&index->slot_locks[i]); - return -1; - } - return i; - } - } - - /* All trylock attempts failed, use atomic increment to select slot. */ - uint32_t slot = index->next_slot++ % HNSW_MAX_THREADS; - - /* Try to lock the selected slot. */ - if (pthread_mutex_lock(&index->slot_locks[slot]) != 0) return -1; - - /* Get read lock. */ - if (pthread_rwlock_rdlock(&index->global_lock) != 0) { - pthread_mutex_unlock(&index->slot_locks[slot]); - return -1; - } - - return slot; -} - -/* Release a previously acquired read slot: note that it is important that - * nodes returned by hnsw_search() are accessed while the read lock is - * still active, to be sure that nodes are not freed. */ -void hnsw_release_read_slot(HNSW *index, int slot) { - if (slot < 0 || slot >= HNSW_MAX_THREADS) return; - pthread_rwlock_unlock(&index->global_lock); - pthread_mutex_unlock(&index->slot_locks[slot]); -} - -/* ============================ Nodes insertion ============================= - * We have an optimistic API separating the read-only candidates search - * and the write side (actual node insertion). We internally also use - * this API to provide the plain hnsw_insert() function for code unification. */ - -struct InsertContext { - pqueue *level_queues[HNSW_MAX_LEVEL]; /* Candidates for each level. */ - hnswNode *node; /* Pre-allocated node ready for insertion */ - uint64_t version; /* Index version at preparation time. This is used - * for CAS-like locking during change commit. */ -}; - -/* Optimistic insertion API. - * - * WARNING: Note that this is an internal function: users should call - * hnsw_prepare_insert() instead. - * - * This is how it works: you use hnsw_prepare_insert() and it will return - * a context where good candidate neighbors are already pre-selected. - * This step only uses read locks. - * - * Then finally you try to actually commit the new node with - * hnsw_try_commit_insert(): this time we will require a write lock, but - * for less time than it would be otherwise needed if using directly - * hnsw_insert(). When you try to commit the write, if no node was deleted in - * the meantime, your operation will succeed, otherwise it will fail, and - * you should try to just use the hnsw_insert() API, since there is - * contention. - * - * See hnsw_node_new() for information about 'vector' and 'qvector' - * arguments, and which one to pass. */ -InsertContext *hnsw_prepare_insert_nolock(HNSW *index, const float *vector, - const int8_t *qvector, float qrange, uint64_t id, - int slot, int ef) -{ - InsertContext *ctx = hmalloc(sizeof(*ctx)); - if (!ctx) return NULL; - - memset(ctx, 0, sizeof(*ctx)); - ctx->version = index->version; - - /* Crete a new node that we may be able to insert into the - * graph later, when calling the commit function. */ - uint32_t level = random_level(); - ctx->node = hnsw_node_new(index, id, vector, qvector, qrange, level, 1); - if (!ctx->node) { - hfree(ctx); - return NULL; - } - - hnswNode *curr_ep = index->enter_point; - - /* Empty graph, no need to collect candidates. */ - if (curr_ep == NULL) return ctx; - - /* Phase 1: Find good entry point on the highest level of the new - * node we are going to insert. */ - for (unsigned int lc = index->max_level; lc > level; lc--) { - pqueue *results = search_layer(index, ctx->node, curr_ep, 1, lc, slot); - - if (results) { - if (results->count > 0) curr_ep = pq_get_node(results,0); - pq_free(results); - } - } - - /* Phase 2: Collect a set of potential connections for each layer of - * the new node. */ - for (int lc = MIN(level, index->max_level); lc >= 0; lc--) { - pqueue *candidates = - search_layer(index, ctx->node, curr_ep, ef, lc, slot); - - if (!candidates) continue; - curr_ep = (candidates->count > 0) ? pq_get_node(candidates,0) : curr_ep; - ctx->level_queues[lc] = candidates; - } - - return ctx; -} - -/* External API for hnsw_prepare_insert_nolock(), handling locking. */ -InsertContext *hnsw_prepare_insert(HNSW *index, const float *vector, - const int8_t *qvector, float qrange, uint64_t id, - int ef) -{ - InsertContext *ctx; - int slot = hnsw_acquire_read_slot(index); - ctx = hnsw_prepare_insert_nolock(index,vector,qvector,qrange,id,slot,ef); - hnsw_release_read_slot(index,slot); - return ctx; -} - -/* Free an insert context and all its resources. */ -void hnsw_free_insert_context(InsertContext *ctx) { - if (!ctx) return; - for (uint32_t i = 0; i < HNSW_MAX_LEVEL; i++) { - if (ctx->level_queues[i]) pq_free(ctx->level_queues[i]); - } - if (ctx->node) hnsw_node_free(ctx->node); - hfree(ctx); -} - -/* Commit a prepared insert operation. This function is a low level API that - * should not be called by the user. See instead hnsw_try_commit_insert(), that - * will perform the CAS check and acquire the write lock. - * - * See the top comment in hnsw_prepare_insert() for more information - * on the optimistic insertion API. - * - * This function can't fail and always returns the pointer to the - * just inserted node. Out of memory is not possible since no critical - * allocation is never performed in this code path: we populate links - * on already allocated nodes. */ -hnswNode *hnsw_commit_insert_nolock(HNSW *index, InsertContext *ctx, void *value) { - hnswNode *node = ctx->node; - node->value = value; - - /* Handle first node case. */ - if (index->enter_point == NULL) { - index->version++; // First node, make concurrent inserts fail. - index->enter_point = node; - index->max_level = node->level; - hnsw_add_node(index, node); - ctx->node = NULL; // So hnsw_free_insert_context() will not free it. - hnsw_free_insert_context(ctx); - return node; - } - - /* Connect the node with near neighbors at each level. */ - for (int lc = MIN(node->level,index->max_level); lc >= 0; lc--) { - if (ctx->level_queues[lc] == NULL) continue; - - /* Try to provide index->M connections to our node. The call - * is not guaranteed to be able to provide all the links we would - * like to have for the new node: they must be bi-directional, obey - * certain quality checks, and so forth, so later there are further - * calls to force the hand a bit if needed. - * - * Let's start with aggressiveness = 0. */ - select_neighbors(index, ctx->level_queues[lc], node, lc, index->M, 0); - - /* Layer 0 and too few connections? Let's be more aggressive. */ - if (lc == 0 && node->layers[0].num_links < index->M/2) { - select_neighbors(index, ctx->level_queues[lc], node, lc, - index->M, 1); - - /* Still too few connections? Let's go to - * aggressiveness level '2' in linking strategy. */ - if (node->layers[0].num_links < index->M/4) { - select_neighbors(index, ctx->level_queues[lc], node, lc, - index->M/4, 2); - } - } - } - - /* If new node level is higher than current max, update entry point. */ - if (node->level > index->max_level) { - index->version++; // Entry point changed, make concurrent inserts fail. - index->enter_point = node; - index->max_level = node->level; - } - - /* Add node to the linked list. */ - hnsw_add_node(index, node); - ctx->node = NULL; // So hnsw_free_insert_context() will not free the node. - hnsw_free_insert_context(ctx); - return node; -} - -/* If the context obtained with hnsw_prepare_insert() is still valid - * (nodes not deleted in the meantime) then add the new node to the HNSW - * index and return its pointer. Otherwise NULL is returned and the operation - * should be either performed with the blocking API hnsw_insert() or attempted - * again. */ -hnswNode *hnsw_try_commit_insert(HNSW *index, InsertContext *ctx, void *value) { - /* Check if the version changed since preparation. Note that we - * should access index->version under the write lock in order to - * be sure we can safely commit the write: this is just a fast-path - * in order to return ASAP without acquiring the write lock in case - * the version changed. */ - if (ctx->version != index->version) { - hnsw_free_insert_context(ctx); - return NULL; - } - - /* Try to acquire write lock. */ - if (pthread_rwlock_wrlock(&index->global_lock) != 0) { - hnsw_free_insert_context(ctx); - return NULL; - } - - /* Check version again under write lock. */ - if (ctx->version != index->version) { - pthread_rwlock_unlock(&index->global_lock); - hnsw_free_insert_context(ctx); - return NULL; - } - - /* Commit the change: note that it's up to hnsw_commit_insert_nolock() - * to free the insertion context. */ - hnswNode *node = hnsw_commit_insert_nolock(index, ctx, value); - - /* Release the write lock. */ - pthread_rwlock_unlock(&index->global_lock); - return node; -} - -/* Insert a new element into the graph. - * See hnsw_node_new() for information about 'vector' and 'qvector' - * arguments, and which one to pass. - * - * Return NULL on out of memory during insert. Otherwise the newly - * inserted node pointer is returned. */ -hnswNode *hnsw_insert(HNSW *index, const float *vector, const int8_t *qvector, float qrange, uint64_t id, void *value, int ef) { - /* Write lock. We acquire the write lock even for the prepare() - * operation (that is a read-only operation) since we want this function - * to don't fail in the check-and-set stage of commit(). - * - * Basically here we are using the optimistic API in a non-optimistinc - * way in order to have a single insertion code in the implementation. */ - if (pthread_rwlock_wrlock(&index->global_lock) != 0) return NULL; - - // Prepare the insertion - note we pass slot 0 since we're single threaded. - InsertContext *ctx = hnsw_prepare_insert_nolock(index, vector, qvector, - qrange, id, 0, ef); - if (!ctx) { - pthread_rwlock_unlock(&index->global_lock); - return NULL; - } - - // Commit the prepared insertion without version checking. - hnswNode *node = hnsw_commit_insert_nolock(index, ctx, value); - - // Release write lock and return our node pointer. - pthread_rwlock_unlock(&index->global_lock); - return node; -} - -/* Helper function for qsort call in hnsw_should_reuse_node(). */ -static int compare_floats(const float *a, const float *b) { - if (*a < *b) return 1; - if (*a > *b) return -1; - return 0; -} - -/* This function determines if a node can be reused with a new vector by: - * - * 1. Computing average of worst 25% of current distances. - * 2. Checking if at least 50% of new distances stay below this threshold. - * 3. Requiring a minimum number of links for the check to be meaningful. - * - * This check is useful when we want to just update a node that already - * exists in the graph. Often the new vector is a learned embedding generated - * by some model, and the embedding represents some document that perhaps - * changed just slightly compared to the past, so the new embedding will - * be very nearby. We need to find a way do determine if the current node - * neighbors (practically speaking its location in the grapb) are good - * enough even with the new vector. - * - * XXX: this function needs improvements: successive updates to the same - * node with more and more distant vectors will make the node drift away - * from its neighbors. One of the additional metrics used could be - * neighbor-to-neighbor distance, that represents a more absolute check - * of fit for the new vector. */ -int hnsw_should_reuse_node(HNSW *index, hnswNode *node, int is_normalized, const float *new_vector) { - /* Step 1: Not enough links? Advice to avoid reuse. */ - const uint32_t min_links_for_reuse = 4; - uint32_t layer0_connections = node->layers[0].num_links; - if (layer0_connections < min_links_for_reuse) return 0; - - /* Step2: get all current distances and run our heuristic. */ - float *old_distances = hmalloc(sizeof(float) * layer0_connections); - if (!old_distances) return 0; - - // Temporary node with the new vector, to simplify the next logic. - hnswNode tmp_node; - if (hnsw_init_tmp_node(index,&tmp_node,is_normalized,new_vector) == 0) { - hfree(old_distances); - return 0; - } - - /* Get old dinstances and sort them to access the 25% worst - * (bigger) ones. */ - for (uint32_t i = 0; i < layer0_connections; i++) { - old_distances[i] = hnsw_distance(index, node, node->layers[0].links[i]); - } - qsort(old_distances, layer0_connections, sizeof(float), - (int (*)(const void*, const void*))(&compare_floats)); - - uint32_t count = (layer0_connections+3)/4; // 25% approx to larger int. - if (count > layer0_connections) count = layer0_connections; // Futureproof. - float worst_avg = 0; - - // Compute average of 25% worst dinstances. - for (uint32_t i = 0; i < count; i++) worst_avg += old_distances[i]; - worst_avg /= count; - hfree(old_distances); - - // Count how many new distances stay below the threshold. - uint32_t good_distances = 0; - for (uint32_t i = 0; i < layer0_connections; i++) { - float new_dist = hnsw_distance(index, &tmp_node, node->layers[0].links[i]); - if (new_dist <= worst_avg) good_distances++; - } - hnsw_free_tmp_node(&tmp_node,new_vector); - - /* At least 50% of the nodes should pass our quality test, for the - * node to be reused. */ - return good_distances >= layer0_connections/2; -} - -/** - * Return a random node from the HNSW graph. - * - * This function performs a random walk starting from the entry point, - * using only level 0 connections for navigation. It uses log^2(N) steps - * to ensure proper mixing time. - */ - -hnswNode *hnsw_random_node(HNSW *index, int slot) { - if (index->node_count == 0 || index->enter_point == NULL) - return NULL; - - (void)slot; // Unused, but we need the caller to acquire the lock. - - /* First phase: descend from max level to level 0 taking random paths. - * Note that we don't need a more conservative log^2(N) steps for - * proper mixing, since we already descend to a random cluster here. */ - hnswNode *current = index->enter_point; - for (uint32_t level = index->max_level; level > 0; level--) { - /* If current node doesn't have this level or no links, continue - * to lower level. */ - if (current->level < level || current->layers[level].num_links == 0) - continue; - - /* Choose random neighbor at this level. */ - uint32_t rand_neighbor = rand() % current->layers[level].num_links; - current = current->layers[level].links[rand_neighbor]; - } - - /* Second phase: at level 0, take log(N) * c random steps. */ - const int c = 3; // Multiplier for more thorough exploration. - double logN = log2(index->node_count + 1); - uint32_t num_walks = (uint32_t)(logN * c); - - /* Avoid the ping-pong effect: imagine there are just two nodes and - * the number of walks selected is even. We will select always the - * first element of the graph; conversely, if it is odd, we will always - * select the other element. One way to add more selection randomness is - * to randomly add '1' or '0' to the number of walks to perform. */ - num_walks += rand() & 1; - - // Perform random walk at level 0. - for (uint32_t i = 0; i < num_walks; i++) { - if (current->layers[0].num_links == 0) return current; - - // Choose random neighbor. - uint32_t rand_neighbor = rand() % current->layers[0].num_links; - current = current->layers[0].links[rand_neighbor]; - } - return current; -} - -/* ============================= Serialization ============================== - * - * TO SERIALIZE - * ============ - * - * To serialize on disk, you need to persist the vector dimension, number - * of elements, and the quantization type index->quant_type. These are - * global values for the whole index. - * - * Then, to serialize each node: - * - * call hnsw_serialize_node() with each node you find in the linked list - * of nodes, starting at index->head (each node has a next pointer). - * The function will return an hnswSerNode structure, you will need - * to store the following on disk (for each node): - * - * - The sernode->vector data, that is sernode->vector_size bytes. - * - The sernode->params array, that points to an array of uint64_t - * integers. There are sernode->params_count total items. These - * parameters contain everything there is to need about your node: how - * many levels it has, its ID, the list of neighbors for each level (as node - * IDs), and so forth. - * - * You need to to save your own node->value in some way as well, but it already - * belongs to the user of the API, since, for this library, it's just a pointer, - * so the user should know how to serialized its private data. - * - * RELOADING FROM DISK / NET - * ========================= - * - * When reloading nodes, you first load the index vector dimension and - * quantization type, and create the index with: - * - * HNSW *hnsw_new(uint32_t vector_dim, uint32_t quant_type); - * - * Then you load back, for each node (you stored how many nodes you had) - * the vector and the params array / count. - * You also load the value associated with your node. - * - * At this point you add back the loaded elements into the index with: - * - * hnsw_insert_serialized(HNSW *index, void *vector, uint64_t params, - * uint32_t params_len, void *value); - * - * Once you added all the nodes back, you need to resolve the pointers - * (since so far they are added just with the node IDs as reference), so - * you call: - * - * hnsw_deserialize_index(index); - * - * The index is now ready to be used like if it has been always in memory. - * - * DESIGN NOTES - * ============ - * - * Why this API does not just give you a binary blob to save? Because in - * many systems (and in Redis itself) to save integers / floats can have - * more interesting encodings that just storing a 64 bit value. Many vector - * indexes will be small, and their IDs will be small numbers, so the storage - * system can exploit that and use less disk space, less network bandwidth - * and so forth. - * - * How is the data stored in these arrays of numbers? Oh well, we have - * things that are obviously numbers like node ID, number of levels for the - * node and so forth. Also each of our nodes have an unique incremental ID, - * so we can store a node set of links in terms of linked node IDs. This - * data is put directly in the loaded node pointer space! We just cast the - * integer to the pointer (so THIS IS NOT SAFE for 32 bit systems). Then - * we want to translate such IDs into pointers. To do that, we build an - * hash table, then scan all the nodes again and fix all the links converting - * the ID to the pointer. */ - -/* History of serialization versions: - * version 0: the first implementation, lacking worst node id/info. - * version 1: includes worst link id/info. */ -#define HNSW_SERIALIZATION_VERSION 1 - -/* This is a special worst link index that is set when loading a serialized - * node with version 0 (this version of the serialization lacked explicit - * information about the worst link index/distance). This way, later, the - * function that fixes a deserialized index will know to compute the worst - * index info at runtime. */ -#define HNSW_SER_WORSTLINK_MISSING UINT32_MAX - -/* Return the serialized node information as specified in the top comment - * above. Note that the returned information is true as long as the node - * provided is not deleted or modified, so this function should be called - * when there are no concurrent writes. - * - * The function hnsw_serialize_node() should be called in order to - * free the result of this function. */ -hnswSerNode *hnsw_serialize_node(HNSW *index, hnswNode *node) { - /* The first step is calculating the number of uint64_t parameters - * that we need in order to serialize the node. */ - uint32_t num_params = 0; - num_params += 2; // node ID, number of layers. - for (uint32_t i = 0; i <= node->level; i++) { - num_params += 2; // max_links and num_links info for this layer. - num_params += node->layers[i].num_links; // The IDs of linked nodes. - num_params += 1; // worst link id/distance parameter. - } - - /* We use another 64bit value to store two floats that are about - * the vector: l2 and quantization range (that is only used if the - * vector is quantized). */ - num_params++; - - /* Allocate the return object and the parameters array. */ - hnswSerNode *sn = hmalloc(sizeof(hnswSerNode)); - if (sn == NULL) return NULL; - sn->params = hmalloc(sizeof(uint64_t)*num_params); - if (sn->params == NULL) { - hfree(sn); - return NULL; - } - - /* Fill data. */ - sn->params_count = num_params; - sn->vector = node->vector; - sn->vector_size = hnsw_quants_bytes(index); - - uint32_t param_idx = 0; - sn->params[param_idx++] = node->id; - /* The second parameter contains information about the serialization - * version of this node, the node level and some unused field: - * - * +--------+--------+--------+--------+ - * |VVVVVVVV|........|........|LLLLLLLL| - * +--------+--------+--------+--------+ - * - * V is the version, 8 bits. - * L is the node level, 8 bits (but actually 16 is the max so far). - * The middle two bytes are reserved for future uses. */ - sn->params[param_idx] = node->level & 0xff; - sn->params[param_idx] |= HNSW_SERIALIZATION_VERSION << 24; - param_idx++; - for (uint32_t i = 0; i <= node->level; i++) { - sn->params[param_idx++] = node->layers[i].num_links; - sn->params[param_idx++] = node->layers[i].max_links; - for (uint32_t j = 0; j < node->layers[i].num_links; j++) { - sn->params[param_idx++] = node->layers[i].links[j]->id; - } - /* Since version 1: pack and store worst_idx and worst_distance. */ - uint32_t worst_distance_bits; - memcpy(&worst_distance_bits, &node->layers[i].worst_distance, - sizeof(float)); - uint64_t wi = - (((uint64_t)worst_distance_bits) << 32) | node->layers[i].worst_idx; - sn->params[param_idx++] = wi; - } - - /* Store l2 and range as uint32_t, in a way that is endian-safe. - * Note that in big endian archs both are reversed: integers and - * also the bytes of floats, so they will match. */ - uint64_t l2_and_range; - uint32_t l2_bits, range_bits; - memcpy(&l2_bits,&node->l2,sizeof(float)); - memcpy(&range_bits,&node->quants_range,sizeof(float)); - l2_and_range = ((uint64_t)range_bits<<32) | l2_bits; - - sn->params[param_idx++] = l2_and_range; - - /* Better safe than sorry: */ - assert(param_idx == num_params); - return sn; -} - -/* This is needed in order to free hnsw_serialize_node() returned - * structure. */ -void hnsw_free_serialized_node(hnswSerNode *sn) { - hfree(sn->params); - hfree(sn); -} - -/* Load a serialized node. See the top comment in this section of code - * for the documentation about how to use this. - * - * The function returns NULL both on out of memory and if the remaining - * parameters length does not match the number of links or other items - * to load. */ -hnswNode *hnsw_insert_serialized(HNSW *index, void *vector, uint64_t *params, uint32_t params_len, void *value) -{ - if (params_len < 2) return NULL; - - uint64_t id = params[0]; - /* Check the node serialization function for the specific layout - * of param[1] fields. */ - uint32_t level = params[1] & 0xff; // Node level. - uint32_t version = (params[1] & 0xff000000) >> 24; // Format version. - - if (version > HNSW_SERIALIZATION_VERSION) return NULL; - int has_worst_link_info = version > 0; - - /* Keep track of maximum ID seen while loading. */ - if (id >= index->last_id) index->last_id = id; - - /* Create node, passing vector data directly based on quantization type. */ - hnswNode *node; - if (index->quant_type != HNSW_QUANT_NONE) { - node = hnsw_node_new(index, id, NULL, vector, 0, level, 0); - } else { - node = hnsw_node_new(index, id, vector, NULL, 0, level, 0); - } - if (!node) return NULL; - - /* Load params array into the node. */ - uint32_t param_idx = 2; - for (uint32_t i = 0; i <= level; i++) { - /* Sanity check. */ - if (param_idx + 2 + has_worst_link_info > params_len) { - hnsw_node_free(node); - return NULL; - } - - uint32_t num_links = params[param_idx++]; - uint32_t max_links = params[param_idx++]; - - /* Sanity check: links should be less than max links and - * in general a reasonable amount. */ - if (num_links > max_links || max_links > HNSW_MAX_M*4) { - hnsw_node_free(node); - return NULL; - } - - /* If max_links is larger than current allocation, reallocate. - * It could happen in select_neighbors() that we over-allocate the - * node under very unlikely to happen conditions. */ - if (max_links > node->layers[i].max_links) { - hnswNode **new_links = hrealloc(node->layers[i].links, - sizeof(hnswNode*) * max_links); - if (!new_links) { - hnsw_node_free(node); - return NULL; - } - node->layers[i].links = new_links; - node->layers[i].max_links = max_links; - } - node->layers[i].num_links = num_links; - - /* Sanity check. */ - if (param_idx + num_links + has_worst_link_info > params_len) { - hnsw_node_free(node); - return NULL; - } - - /* Fill links for this layer with the IDs. Note that this - * is going to not work in 32 bit systems. Deleting / adding-back - * nodes can produce IDs larger than 2^32-1 even if we can't never - * fit more than 2^32 nodes in a 32 bit system. */ - for (uint32_t j = 0; j < num_links; j++) - node->layers[i].links[j] = (hnswNode*)params[param_idx++]; - - if (has_worst_link_info) { - uint64_t wi = params[param_idx++]; - uint32_t worst_idx = wi & 0xffffffff; - uint32_t worst_distance_bits = wi >> 32; - float worst_distance; - memcpy(&worst_distance,&worst_distance_bits,sizeof(float)); - node->layers[i].worst_idx = worst_idx; - node->layers[i].worst_distance = worst_distance; - - // Sanity check the worst ID range. - if (node->layers[i].num_links > 0 && - node->layers[i].worst_idx >= node->layers[i].num_links) - { - hnsw_node_free(node); - return NULL; - } - } else { - node->layers[i].worst_idx = HNSW_SER_WORSTLINK_MISSING; - node->layers[i].worst_distance = 0; - } - } - - /* Get l2 and quantization range. */ - if (param_idx >= params_len) { - hnsw_node_free(node); - return NULL; - } - - /* Load l2 and range packed into an uint64_t in an endian safe way. */ - uint64_t l2_and_range = params[param_idx]; - uint32_t l2_bits, range_bits; - l2_bits = l2_and_range & 0xffffffff; - range_bits = l2_and_range >> 32; - memcpy(&node->l2, &l2_bits, sizeof(float)); - memcpy(&node->quants_range, &range_bits, sizeof(float)); - - node->value = value; - hnsw_add_node(index, node); - - /* Keep track of higher node level and set the entry point to the - * greatest level node seen so far: thanks to this check we don't - * need to remember what our entry point was during serialization. */ - if (index->enter_point == NULL || level > index->max_level) { - index->max_level = level; - index->enter_point = node; - } - return node; -} - -/* Integer hashing, used by hnsw_deserialize_index(). - * MurmurHash3's 64-bit finalizer function. */ -uint64_t hnsw_hash_node_id(uint64_t id) { - id ^= id >> 33; - id *= 0xff51afd7ed558ccd; - id ^= id >> 33; - id *= 0xc4ceb9fe1a85ec53; - id ^= id >> 33; - return id; -} - -/* Helper for duplicated link detection in hnsw_deserialize_index(). */ -static int qsort_compare_pointers(const void *aptr, const void *bptr) { - uintptr_t a = *((uintptr_t*)aptr); - uintptr_t b = *((uintptr_t*)bptr); - if (a > b) return 1; - if (a < b) return -1; - return 0; -} - -/* Fix pointers of neighbors nodes: after loading the serialized nodes, the - * neighbors links are just IDs (casted to pointers), instead of the actual - * pointers. We need to resolve IDs into pointers. - * - * The two integers salt0 and salt1 are used to make the internal state - * of the function unguessable to an external attacker, in order to protect - * from corruptions. Show be two random numbers from /dev/urandom if possible - * otherwise can be just 0,0 if the application is not security critical and - * never processes untrusted inputs. - * - * Return 0 on error (out of memory or some ID that can't be resolved), 1 on - * success. */ -int hnsw_deserialize_index(HNSW *index, uint64_t salt0, uint64_t salt1) { - /* We will use simple linear probing, so over-allocating is a good - * idea: anyway this flat array of pointers will consume a fraction - * of the memory of the loaded index. */ - uint64_t min_size = index->node_count*2; - uint64_t table_size = 1; - while(table_size < min_size) table_size <<= 1; - - hnswNode **table = hmalloc(sizeof(hnswNode*) * table_size); - if (table == NULL) return 0; - memset(table,0,sizeof(hnswNode*) * table_size); - - /* First pass: populate the ID -> pointer hash table. */ - hnswNode *node = index->head; - while(node) { - uint64_t bucket = hnsw_hash_node_id(node->id) & (table_size-1); - for (uint64_t j = 0; j < table_size; j++) { - if (table[bucket] == NULL) { - table[bucket] = node; - break; - } - bucket = (bucket+1) & (table_size-1); - } - node = node->next; - } - - /* Second pass: fix pointers of all the neighbors links. - * As we scan and fix the links, we also compute the accumulator - * register "reciprocal", that is used in order to guarantee that all - * the links are reciprocal. - * - * This is how it works, we hash (using a strong hash function) the - * following key for each link that we see from A to B (or vice versa): - * - * hash(salt || A || B || link-level) - * - * We always sort A and B, so the same link from A to B and from B to A - * will hash the same. The we xor the result into the 128 bit accumulator. - * If each link has its own backlink, the accumulator is guaranteed to - * be zero at the end. - * - * Collisions are extremely unlikely to happen, and an external attacker - * can't easily control the hash function output, since the salt is - * unknown, and also there would be to control the pointers. - * - * This algorithm is O(1) for each node so it is basically free for - * us, as we scan the list of nodes, and runs on constant and very - * small memory. */ - uint64_t accumulator[2] = {0,0}; - - node = index->head; // Rewind. - while(node) { - uint64_t this_node_id = node->id; - for (uint32_t i = 0; i <= node->level; i++) { - // Check if there are duplicated links: those are - // also corruptions of the on-disk serialization format. - if (node->layers[i].num_links > 0) { - qsort(node->layers[i].links, node->layers[i].num_links, - sizeof(void*), qsort_compare_pointers); - for (uint32_t j = 0; j < node->layers[i].num_links-1; j++) { - if (node->layers[i].links[j] == node->layers[i].links[j+1]) - goto corrupted; - } - } - - // Resolve pointers. - for (uint32_t j = 0; j < node->layers[i].num_links; j++) { - uint64_t linked_id = (uint64_t) node->layers[i].links[j]; - - // We can't link to our own node. - if (linked_id == this_node_id) goto corrupted; - - // Compute accumulator for reciprocal links check. - uint64_t mixed_h1, mixed_h2; - secure_pair_mixer_128(salt0, salt1, this_node_id, linked_id, (uint64_t)i, &mixed_h1, &mixed_h2); - - accumulator[0] ^= mixed_h1; - accumulator[1] ^= mixed_h2; - - // Fix links. - uint64_t bucket = hnsw_hash_node_id(linked_id) & (table_size-1); - hnswNode *neighbor = NULL; - for (uint64_t k = 0; k < table_size; k++) { - if (table[bucket] && table[bucket]->id == linked_id) { - neighbor = table[bucket]; - break; - } - bucket = (bucket+1) & (table_size-1); - } - - /* The neighbor must exist and also exist at the right - * level. */ - if (neighbor == NULL || neighbor->level < i) { - /* Unresolved link! Either a bug in this code - * or broken serialization data. */ - goto corrupted; - } - node->layers[i].links[j] = neighbor; - } - - /* The worst link information was missing from older - * serialization formats. Compute it on the fly if needed. */ - if (node->layers[i].worst_idx == HNSW_SER_WORSTLINK_MISSING) { - hnsw_update_worst_neighbor(index,node,i); - } - } - node = node->next; - } - - /* Check that links are reciprocal, otherwise fail. */ - if (accumulator[0] || accumulator[1]) goto corrupted; - - /* Everything fine. Return success. */ - hfree(table); - return 1; - -corrupted: - /* Some corruption error detected. */ - hfree(table); - return 0; -} - -/* ================================ Iterator ================================ */ - -/* Get a cursor that can be used as argument of hnsw_cursor_next() to iterate - * all the elements that remain there from the start to the end of the - * iteration, excluding newly added elements. - * - * The function returns NULL on out of memory. */ -hnswCursor *hnsw_cursor_init(HNSW *index) { - if (pthread_rwlock_wrlock(&index->global_lock) != 0) return NULL; - hnswCursor *cursor = hmalloc(sizeof(*cursor)); - if (cursor == NULL) { - pthread_rwlock_unlock(&index->global_lock); - return NULL; - } - cursor->index = index; - cursor->next = index->cursors; - cursor->current = index->head; - index->cursors = cursor; - pthread_rwlock_unlock(&index->global_lock); - return cursor; -} - -/* Free the cursor. Can be called both at the end of the iteration, when - * hnsw_cursor_next() returned NULL, or before. */ -void hnsw_cursor_free(hnswCursor *cursor) { - HNSW *index = cursor->index; - if (pthread_rwlock_wrlock(&index->global_lock) != 0) { - // No easy way to recover from that. We will leak memory. - return; - } - - hnswCursor *x = index->cursors; - hnswCursor *prev = NULL; - while(x) { - if (x == cursor) { - if (prev) - prev->next = cursor->next; - else - index->cursors = cursor->next; - hfree(cursor); - break; - } - prev = x; - x = x->next; - } - pthread_rwlock_unlock(&index->global_lock); -} - -/* Acquire a lock to use the cursor. Returns 1 if the lock was acquired - * with success, otherwise zero is returned. The returned element is - * protected after calling hnsw_cursor_next() for all the time required to - * access it, then hnsw_cursor_release_lock() should be called in order - * to unlock the HNSW index. */ -int hnsw_cursor_acquire_lock(hnswCursor *cursor) { - return pthread_rwlock_rdlock(&cursor->index->global_lock) == 0; -} - -/* Release the cursor lock, see hnsw_cursor_acquire_lock() top comment - * for more information. */ -void hnsw_cursor_release_lock(hnswCursor *cursor) { - pthread_rwlock_unlock(&cursor->index->global_lock); -} - -/* Return the next element of the HNSW. See hnsw_cursor_init() for - * the guarantees of the function. */ -hnswNode *hnsw_cursor_next(hnswCursor *cursor) { - hnswNode *ret = cursor->current; - if (ret) cursor->current = ret->next; - return ret; -} - -/* Called by hnsw_unlink_node() if there is at least an active cursor. - * Will scan the cursors to see if any cursor is going to yield this - * one, and in this case, updates the current element to the next. */ -void hnsw_cursor_element_deleted(HNSW *index, hnswNode *deleted) { - hnswCursor *x = index->cursors; - while(x) { - if (x->current == deleted) x->current = deleted->next; - x = x->next; - } -} - -/* ============================ Debugging stuff ============================= */ - -/* Show stats about nodes connections. */ -void hnsw_print_stats(HNSW *index) { - if (!index || !index->head) { - printf("Empty index or NULL pointer passed\n"); - return; - } - - long long total_links = 0; - int min_links = -1; // We'll set this to first node's count. - int isolated_nodes = 0; - uint32_t node_count = 0; - - // Iterate through all nodes using the linked list. - hnswNode *current = index->head; - while (current) { - // Count total links for this node across all layers. - int node_total_links = 0; - for (uint32_t layer = 0; layer <= current->level; layer++) - node_total_links += current->layers[layer].num_links; - - // Update statistics. - total_links += node_total_links; - - // Initialize or update minimum links. - if (min_links == -1 || node_total_links < min_links) { - min_links = node_total_links; - } - - // Check if node is isolated (no links at all). - if (node_total_links == 0) isolated_nodes++; - - node_count++; - current = current->next; - } - - // Print statistics - printf("HNSW Graph Statistics:\n"); - printf("----------------------\n"); - printf("Total nodes: %u\n", node_count); - if (node_count > 0) { - printf("Average links per node: %.2f\n", - (float)total_links / node_count); - printf("Minimum links in a single node: %d\n", min_links); - printf("Number of isolated nodes: %d (%.1f%%)\n", - isolated_nodes, - (float)isolated_nodes * 100 / node_count); - } -} - -/* Validate graph connectivity and link reciprocity. Takes pointers to store results: - * - connected_nodes: will contain number of reachable nodes from entry point. - * - reciprocal_links: will contain 1 if all links are reciprocal, 0 otherwise. - * Returns 0 on success, -1 on error (NULL parameters and such). - */ -int hnsw_validate_graph(HNSW *index, uint64_t *connected_nodes, int *reciprocal_links) { - if (!index || !connected_nodes || !reciprocal_links) return -1; - if (!index->enter_point) { - *connected_nodes = 0; - *reciprocal_links = 1; // Empty graph is valid. - return 0; - } - - // Initialize connectivity check. - index->current_epoch[0]++; - *connected_nodes = 0; - *reciprocal_links = 1; - - // Initialize node stack. - uint64_t stack_size = index->node_count; - hnswNode **stack = hmalloc(sizeof(hnswNode*) * stack_size); - if (!stack) return -1; - uint64_t stack_top = 0; - - // Start from entry point. - index->enter_point->visited_epoch[0] = index->current_epoch[0]; - (*connected_nodes)++; - stack[stack_top++] = index->enter_point; - - // Process all reachable nodes. - while (stack_top > 0) { - hnswNode *current = stack[--stack_top]; - - // Explore all neighbors at each level. - for (uint32_t level = 0; level <= current->level; level++) { - for (uint64_t i = 0; i < current->layers[level].num_links; i++) { - hnswNode *neighbor = current->layers[level].links[i]; - - // Check reciprocity. - int found_backlink = 0; - for (uint64_t j = 0; j < neighbor->layers[level].num_links; j++) { - if (neighbor->layers[level].links[j] == current) { - found_backlink = 1; - break; - } - } - if (!found_backlink) { - *reciprocal_links = 0; - } - - // If we haven't visited this neighbor yet. - if (neighbor->visited_epoch[0] != index->current_epoch[0]) { - neighbor->visited_epoch[0] = index->current_epoch[0]; - (*connected_nodes)++; - if (stack_top < stack_size) { - stack[stack_top++] = neighbor; - } else { - // This should never happen in a valid graph. - hfree(stack); - return -1; - } - } - } - } - } - - hfree(stack); - - // Now scan for unreachable nodes and print debug info. - printf("\nUnreachable nodes debug information:\n"); - printf("=====================================\n"); - - hnswNode *current = index->head; - while (current) { - if (current->visited_epoch[0] != index->current_epoch[0]) { - printf("\nUnreachable node found:\n"); - printf("- Node pointer: %p\n", (void*)current); - printf("- Node ID: %llu\n", (unsigned long long)current->id); - printf("- Node level: %u\n", current->level); - - // Print info about all its links at each level. - for (uint32_t level = 0; level <= current->level; level++) { - printf(" Level %u links (%u):\n", level, - current->layers[level].num_links); - for (uint64_t i = 0; i < current->layers[level].num_links; i++) { - hnswNode *neighbor = current->layers[level].links[i]; - // Check reciprocity for this specific link - int found_backlink = 0; - for (uint64_t j = 0; j < neighbor->layers[level].num_links; j++) { - if (neighbor->layers[level].links[j] == current) { - found_backlink = 1; - break; - } - } - printf(" - Link %llu: pointer=%p, id=%llu, visited=%s,recpr=%s\n", - (unsigned long long)i, (void*)neighbor, - (unsigned long long)neighbor->id, - neighbor->visited_epoch[0] == index->current_epoch[0] ? - "yes" : "no", - found_backlink ? "yes" : "no"); - } - } - } - current = current->next; - } - - printf("Total connected nodes: %llu\n", (unsigned long long)*connected_nodes); - printf("All links are bi-directiona? %s\n", (*reciprocal_links)?"yes":"no"); - return 0; -} - -/* Test graph recall ability by verifying each node can be found searching - * for its own vector. This helps validate that the majority of nodes are - * properly connected and easily reachable in the graph structure. Every - * unreachable node is reported. - * - * Normally only a small percentage of nodes will be not reachable when - * visited. This is expected and part of the statistical properties - * of HNSW. This happens especially with entries that have an ambiguous - * meaning in the represented space, and are across two or multiple clusters - * of items. - * - * The function works by: - * 1. Iterating through all nodes in the linked list - * 2. Using each node's vector to perform a search with specified EF - * 3. Verifying the node can find itself as nearest neighbor - * 4. Collecting and reporting statistics about reachability - * - * This is just a debugging function that reports stuff in the standard - * output, part of the implementation because this kind of functions - * provide some visibility on what happens inside the HNSW. - */ -void hnsw_test_graph_recall(HNSW *index, int test_ef, int verbose) { - // Stats - uint32_t total_nodes = 0; - uint32_t unreachable_nodes = 0; - uint32_t perfectly_reachable = 0; // Node finds itself as first result - - // For storing search results - hnswNode **neighbors = hmalloc(sizeof(hnswNode*) * test_ef); - float *distances = hmalloc(sizeof(float) * test_ef); - float *test_vector = hmalloc(sizeof(float) * index->vector_dim); - if (!neighbors || !distances || !test_vector) { - hfree(neighbors); - hfree(distances); - hfree(test_vector); - return; - } - - // Get a read slot for searching (even if it's highly unlikely that - // this test will be run threaded...). - int slot = hnsw_acquire_read_slot(index); - if (slot < 0) { - hfree(neighbors); - hfree(distances); - return; - } - - printf("\nTesting graph recall\n"); - printf("====================\n"); - - // Process one node at a time using the linked list - hnswNode *current = index->head; - while (current) { - total_nodes++; - - // If using quantization, we need to reconstruct the normalized vector - if (index->quant_type == HNSW_QUANT_Q8) { - int8_t *quants = current->vector; - // Reconstruct normalized vector from quantized data - for (uint32_t j = 0; j < index->vector_dim; j++) { - test_vector[j] = (quants[j] * current->quants_range) / 127; - } - } else if (index->quant_type == HNSW_QUANT_NONE) { - memcpy(test_vector,current->vector,sizeof(float)*index->vector_dim); - } else { - assert(0 && "Quantization type not supported."); - } - - // Search using the node's own vector with high ef - int found = hnsw_search(index, test_vector, test_ef, neighbors, - distances, slot, 1); - - if (found == 0) continue; // Empty HNSW? - - // Look for the node itself in the results - int found_self = 0; - int self_position = -1; - for (int i = 0; i < found; i++) { - if (neighbors[i] == current) { - found_self = 1; - self_position = i; - break; - } - } - - if (!found_self || self_position != 0) { - unreachable_nodes++; - if (verbose) { - if (!found_self) - printf("\nNode %s cannot find itself:\n", (char*)current->value); - else - printf("\nNode %s is not top result:\n", (char*)current->value); - printf("- Node ID: %llu\n", (unsigned long long)current->id); - printf("- Node level: %u\n", current->level); - printf("- Found %d neighbors but self not among them\n", found); - printf("- Closest neighbor distance: %f\n", distances[0]); - printf("- Neighbors: "); - for (uint32_t i = 0; i < current->layers[0].num_links; i++) { - printf("%s ", (char*)current->layers[0].links[i]->value); - } - printf("\n"); - printf("\nFound instead: "); - for (int j = 0; j < found && j < 10; j++) { - printf("%s ", (char*)neighbors[j]->value); - } - printf("\n"); - } - } else { - perfectly_reachable++; - } - current = current->next; - } - - // Release read slot - hnsw_release_read_slot(index, slot); - - // Free resources - hfree(neighbors); - hfree(distances); - hfree(test_vector); - - // Print final statistics - printf("Total nodes tested: %u\n", total_nodes); - printf("Perfectly reachable nodes: %u (%.1f%%)\n", - perfectly_reachable, - total_nodes ? (float)perfectly_reachable * 100 / total_nodes : 0); - printf("Unreachable/suboptimal nodes: %u (%.1f%%)\n", - unreachable_nodes, - total_nodes ? (float)unreachable_nodes * 100 / total_nodes : 0); -} - -/* Return exact K-NN items by performing a linear scan of all nodes. - * This function has the same signature as hnsw_search_with_filter() but - * instead of using the graph structure, it scans all nodes to find the - * true nearest neighbors. - * - * Note that neighbors and distances arrays must have space for at least 'k' items. - * norm_query should be set to 1 if the query vector is already normalized. - * - * If the filter_callback is passed, only elements passing the specified filter - * are returned. The slot parameter is ignored but kept for API consistency. */ -int hnsw_ground_truth_with_filter - (HNSW *index, const float *query_vector, uint32_t k, - hnswNode **neighbors, float *distances, uint32_t slot, - int query_vector_is_normalized, - int (*filter_callback)(void *value, void *privdata), - void *filter_privdata) -{ - /* Note that we don't really use the slot here: it's a linear scan. - * Yet we want the user to acquire the slot as this will hold the - * global lock in read only mode. */ - (void) slot; - - /* Take our query vector into a temporary node. */ - hnswNode query; - if (hnsw_init_tmp_node(index, &query, query_vector_is_normalized, query_vector) == 0) return -1; - - /* Accumulate best results into a priority queue. */ - pqueue *results = pq_new(k); - if (!results) { - hnsw_free_tmp_node(&query, query_vector); - return -1; - } - - /* Scan all nodes linearly. */ - hnswNode *current = index->head; - while (current) { - /* Apply filter if needed. */ - if (filter_callback && - !filter_callback(current->value, filter_privdata)) - { - current = current->next; - continue; - } - - /* Calculate distance to query. */ - float dist = hnsw_distance(index, &query, current); - - /* Add to results to pqueue. Will be accepted only if better than - * the current worse or pqueue not full. */ - pq_push(results, current, dist); - current = current->next; - } - - /* Copy results to output arrays. */ - uint32_t found = MIN(k, results->count); - for (uint32_t i = 0; i < found; i++) { - neighbors[i] = pq_get_node(results, i); - if (distances) distances[i] = pq_get_distance(results, i); - } - - /* Clean up. */ - pq_free(results); - hnsw_free_tmp_node(&query, query_vector); - return found; -} |
