diff options
Diffstat (limited to 'examples/redis-unstable/modules/vector-sets/tests/reduce.py')
| -rw-r--r-- | examples/redis-unstable/modules/vector-sets/tests/reduce.py | 71 |
1 files changed, 71 insertions, 0 deletions
diff --git a/examples/redis-unstable/modules/vector-sets/tests/reduce.py b/examples/redis-unstable/modules/vector-sets/tests/reduce.py new file mode 100644 index 0000000..e39164f --- /dev/null +++ b/examples/redis-unstable/modules/vector-sets/tests/reduce.py @@ -0,0 +1,71 @@ +from test import TestCase, fill_redis_with_vectors, generate_random_vector + +class Reduce(TestCase): + def getname(self): + return "Dimension Reduction" + + def estimated_runtime(self): + return 0.2 + + def test(self): + original_dim = 100 + reduced_dim = 80 + count = 1000 + k = 50 # Number of nearest neighbors to check + + # Fill Redis with vectors using REDUCE and get reference data + data = fill_redis_with_vectors(self.redis, self.test_key, count, original_dim, reduced_dim) + + # Verify dimension is reduced + dim = self.redis.execute_command('VDIM', self.test_key) + assert dim == reduced_dim, f"Expected dimension {reduced_dim}, got {dim}" + + # Generate query vector and get nearest neighbors using Redis + query_vec = generate_random_vector(original_dim) + redis_raw = self.redis.execute_command('VSIM', self.test_key, 'VALUES', + original_dim, *[str(x) for x in query_vec], + 'COUNT', k, 'WITHSCORES') + + # Convert Redis results to dict + redis_results = {} + for i in range(0, len(redis_raw), 2): + key = redis_raw[i].decode() + score = float(redis_raw[i+1]) + redis_results[key] = score + + # Get results from linear scan with original vectors + linear_results = data.find_k_nearest(query_vec, k) + linear_items = {name: score for name, score in linear_results} + + # Compare overlap between reduced and non-reduced results + redis_set = set(redis_results.keys()) + linear_set = set(linear_items.keys()) + overlap = len(redis_set & linear_set) + overlap_ratio = overlap / k + + # With random projection, we expect some loss of accuracy but should + # maintain at least some similarity structure. + # Note that gaussian distribution is the worse with this test, so + # in real world practice, things will be better. + min_expected_overlap = 0.1 # At least 10% overlap in top-k + assert overlap_ratio >= min_expected_overlap, \ + f"Dimension reduction lost too much structure. Only {overlap_ratio*100:.1f}% overlap in top {k}" + + # For items that appear in both results, scores should be reasonably correlated + common_items = redis_set & linear_set + for item in common_items: + redis_score = redis_results[item] + linear_score = linear_items[item] + # Allow for some deviation due to dimensionality reduction + assert abs(redis_score - linear_score) < 0.2, \ + f"Score mismatch too high for {item}: Redis={redis_score:.3f} Linear={linear_score:.3f}" + + # If test fails, print comparison for debugging + if overlap_ratio < min_expected_overlap: + print("\nLow overlap in results. Details:") + print("\nTop results from linear scan (original vectors):") + for name, score in linear_results: + print(f"{name}: {score:.3f}") + print("\nTop results from Redis (reduced vectors):") + for item, score in sorted(redis_results.items(), key=lambda x: x[1], reverse=True): + print(f"{item}: {score:.3f}") |
