author     Mitja Felicijan <mitja.felicijan@gmail.com>  2026-01-21 22:40:55 +0100
committer  Mitja Felicijan <mitja.felicijan@gmail.com>  2026-01-21 22:40:55 +0100
commit     5d8dfe892a2ea89f706ee140c3bdcfd89fe03fda (patch)
tree       1acdfa5220cd13b7be43a2a01368e80d306473ca /examples/redis-unstable/src/crccombine.c
parent     c7ab12bba64d9c20ccd79b132dac475f7bc3923e (diff)
download   crep-5d8dfe892a2ea89f706ee140c3bdcfd89fe03fda.tar.gz
Add Redis source code for testing
Diffstat (limited to 'examples/redis-unstable/src/crccombine.c')
-rw-r--r--  examples/redis-unstable/src/crccombine.c  252
1 file changed, 252 insertions(+), 0 deletions(-)
diff --git a/examples/redis-unstable/src/crccombine.c b/examples/redis-unstable/src/crccombine.c
new file mode 100644
index 0000000..20add37
--- /dev/null
+++ b/examples/redis-unstable/src/crccombine.c
@@ -0,0 +1,252 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <strings.h>
+#if defined(__i386__) || defined(__x86_64__)
+#include <immintrin.h>
+#endif
+#include "crccombine.h"
+
+/* Copyright (C) 2013 Mark Adler
+ * Copyright (C) 2019-2024 Josiah Carlson
+ * Portions originally from: crc64.c Version 1.4 16 Dec 2013 Mark Adler
+ * Modifications by Josiah Carlson <josiah.carlson@gmail.com>
+ * - Added implementation variations with sample timings for gf_matrix_times*()
+ * - Most folks would be best using gf2_matrix_times_vec or
+ * gf2_matrix_times_vec2, unless some processor does AVX2 fast.
+ * - This is the implementation of the MERGE_CRC macro defined in
+ * crcspeed.c (which calls crc_combine()), and is a specialization of the
+ *   generic crc_combine() (and related code from the 2013 edition of Mark
+ *   Adler's crc64.c) for the sake of clarity and performance.
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the author be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Mark Adler
+ madler@alumni.caltech.edu
+*/
+
+/* Poor man's static assertion: if VVV is false, the constant division by
+ * zero draws a compile-time diagnostic; the increment keeps `test` used. */
+#define STATIC_ASSERT(VVV) do { int test = 1 / (VVV); test++; } while (0)
+
+#if !(defined(__i386__) || defined(__x86_64__))
+
+/* This cuts 40% of the time vs bit-by-bit. */
+
+uint64_t gf2_matrix_times_switch(uint64_t *mat, uint64_t vec) {
+ /*
+ * Without using any vector math, this handles 4 bits at a time,
+ * and saves 40+% of the time compared to the bit-by-bit version. Use if you
+ * have no vector compile option available to you. With cache, we see:
+ * E5-2670 ~1-2us to extend ~1 meg 64 bit hash
+ */
+ uint64_t sum;
+
+ sum = 0;
+ while (vec) {
+ /* reversing the case order is ~10% slower on Xeon E5-2670 */
+ switch (vec & 15) {
+ case 15:
+ sum ^= *mat ^ *(mat+1) ^ *(mat+2) ^ *(mat+3);
+ break;
+ case 14:
+ sum ^= *(mat+1) ^ *(mat+2) ^ *(mat+3);
+ break;
+ case 13:
+ sum ^= *mat ^ *(mat+2) ^ *(mat+3);
+ break;
+ case 12:
+ sum ^= *(mat+2) ^ *(mat+3);
+ break;
+ case 11:
+ sum ^= *mat ^ *(mat+1) ^ *(mat+3);
+ break;
+ case 10:
+ sum ^= *(mat+1) ^ *(mat+3);
+ break;
+ case 9:
+ sum ^= *mat ^ *(mat+3);
+ break;
+ case 8:
+ sum ^= *(mat+3);
+ break;
+ case 7:
+ sum ^= *mat ^ *(mat+1) ^ *(mat+2);
+ break;
+ case 6:
+ sum ^= *(mat+1) ^ *(mat+2);
+ break;
+ case 5:
+ sum ^= *mat ^ *(mat+2);
+ break;
+ case 4:
+ sum ^= *(mat+2);
+ break;
+ case 3:
+ sum ^= *mat ^ *(mat+1);
+ break;
+ case 2:
+ sum ^= *(mat+1);
+ break;
+ case 1:
+ sum ^= *mat;
+ break;
+ default:
+ break;
+ }
+ vec >>= 4;
+ mat += 4;
+ }
+ return sum;
+}
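+
+/* For reference, the classic bit-by-bit multiply from Mark Adler's 2013
+ * crc64.c, which the 40+% saving above is measured against. Kept here only
+ * as an illustrative sketch; nothing in the build references it. */
+static inline uint64_t gf2_matrix_times_bit(uint64_t *mat, uint64_t vec) {
+    uint64_t sum = 0;
+    while (vec) {
+        if (vec & 1) /* XOR in this matrix row iff the low bit is set */
+            sum ^= *mat;
+        vec >>= 1;
+        mat++;
+    }
+    return sum;
+}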
+
+#define CRC_MULTIPLY gf2_matrix_times_switch
+
+#else
+
+/*
+ Warning: here there be dragons involving vector math, and macros to save us
+ from repeating the same information over and over.
+*/
+
+uint64_t gf2_matrix_times_vec2(uint64_t *mat, uint64_t vec) {
+ /*
+ * Uses xmm registers on x86, works basically everywhere fast, doing
+ * cycles of movdqa, mov, shr, pand, and, pxor, at least on gcc 8.
+ * It is 9-11x faster than the original bit-by-bit implementation.
+ * E5-2670 ~29us to extend ~1 meg 64 bit hash
+ * i3-8130U ~22us to extend ~1 meg 64 bit hash
+ */
+ v2uq sum = {0, 0},
+ *mv2 = (v2uq*)mat;
+ /* this table allows us to eliminate conditions during gf2_matrix_times_vec2() */
+ static v2uq masks2[4] = {
+ {0,0},
+ {-1,0},
+ {0,-1},
+ {-1,-1},
+ };
+
+ /* Almost as beautiful as gf2_matrix_times_vec, but only half as many
+ * bits per step, so we need 2 per chunk4 operation. Faster in my tests. */
+
+#define DO_CHUNK4() \
+ sum ^= (*mv2++) & masks2[vec & 3]; \
+ vec >>= 2; \
+ sum ^= (*mv2++) & masks2[vec & 3]; \
+ vec >>= 2
+
+#define DO_CHUNK16() \
+ DO_CHUNK4(); \
+ DO_CHUNK4(); \
+ DO_CHUNK4(); \
+ DO_CHUNK4()
+
+ DO_CHUNK16();
+ DO_CHUNK16();
+ DO_CHUNK16();
+ DO_CHUNK16();
+
+ STATIC_ASSERT(sizeof(uint64_t) == 8);
+ STATIC_ASSERT(sizeof(long long unsigned int) == 8);
+ return sum[0] ^ sum[1];
+}
+
+#undef DO_CHUNK16
+#undef DO_CHUNK4
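+
+/* A scalar rendering of the masks2 trick above (a sketch, not used by the
+ * build): each bit of vec selects an all-ones or all-zeros mask, replacing
+ * the branch of the classic bit-by-bit multiply with an AND. */
+static inline uint64_t gf2_matrix_times_masked(uint64_t *mat, uint64_t vec) {
+    uint64_t sum = 0;
+    for (int n = 0; n < 64; n++) {
+        sum ^= mat[n] & (0 - (vec & 1)); /* mask is 0 or UINT64_MAX */
+        vec >>= 1;
+    }
+    return sum;
+}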
+
+#define CRC_MULTIPLY gf2_matrix_times_vec2
+#endif
+
+static void gf2_matrix_square(uint64_t *square, uint64_t *mat, uint8_t dim) {
+ unsigned n;
+
+ for (n = 0; n < dim; n++)
+ square[n] = CRC_MULTIPLY(mat, mat[n]);
+}
+
+/* Turns out our Redis / Jones CRC cycles at this point, so we can support
+ * more than 64 bits of extension if we want. Trivially. */
+static uint64_t combine_cache[64][64];
+
+/* Mark Adler has some amazing updates to crc.c in his crcany repository. I
+ * like static caches, and not worrying about finding cycles generally. We are
+ * okay to spend the 32k of memory here, leaving the algorithm unchanged from
+ * how it was a decade ago, and be happy that it costs <200 microseconds to
+ * init, and that subsequent calls to the combine function take under 100
+ * nanoseconds. We also note that the crcany/crc.c code applies to any CRC, and
+ * we are currently targeting one: Jones CRC64.
+ */
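+
+/* Concretely (a summary derived from init_combine_cache() below):
+ * combine_cache[n] ends up holding the 64x64 GF(2) operator that advances a
+ * CRC across 2^n zero bytes, so any length is handled via its binary
+ * expansion, and the table weighs 64 * 64 * 8 bytes = 32768 bytes, the
+ * "32k" noted above. */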
+
+void init_combine_cache(uint64_t poly, uint8_t dim) {
+ unsigned n, cache_num = 0;
+ combine_cache[1][0] = poly;
+ int prev = 1;
+ uint64_t row = 1;
+    for (n = 1; n < dim; n++) {
+ combine_cache[1][n] = row;
+ row <<= 1;
+ }
+
+ gf2_matrix_square(combine_cache[0], combine_cache[1], dim);
+ gf2_matrix_square(combine_cache[1], combine_cache[0], dim);
+
+ /* do/while to overwrite the first two layers, they are not used, but are
+ * re-generated in the last two layers for the Redis polynomial */
+ do {
+ gf2_matrix_square(combine_cache[cache_num], combine_cache[cache_num + prev], dim);
+ prev = -1;
+ } while (++cache_num < 64);
+}
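+
+/* Tracing the chain above: entering the do/while, combine_cache[1] holds the
+ * operator for 4 zero bits. Iteration 0 squares it into combine_cache[0]
+ * (8 zero bits, one zero byte), iteration 1 squares that back into
+ * combine_cache[1] (2 zero bytes), and each later layer doubles again, so
+ * combine_cache[n] covers 2^n zero bytes. */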
+
+/* Return the CRC-64 of two sequential blocks, where crc1 is the CRC-64 of the
+ * first block, crc2 is the CRC-64 of the second block, and len2 is the length
+ * of the second block.
+ *
+ * If you want reflections on your CRCs, do them outside, before / after.
+ * WARNING: if you enable USE_STATIC_COMBINE_CACHE to make this fast, you MUST
+ * ALWAYS USE THE SAME POLYNOMIAL, otherwise you will get the wrong results.
+ * You MAY bzero() the static combine_cache array, which will induce a
+ * re-cache on the next call as a work-around, but at that point maybe just
+ * parameterize the cached models like Mark Adler does in modern crcany/crc.c.
+ */
+uint64_t crc64_combine(uint64_t crc1, uint64_t crc2, uintmax_t len2, uint64_t poly, uint8_t dim) {
+ /* degenerate case */
+ if (len2 == 0)
+ return crc1;
+
+ unsigned cache_num = 0;
+ if (combine_cache[0][0] == 0) {
+ init_combine_cache(poly, dim);
+ }
+
+ /* apply len2 zeros to crc1 (first square will put the operator for one
+ zero byte, eight zero bits, in even) */
+    do {
+ /* apply zeros operator for this bit of len2 */
+ if (len2 & 1)
+ crc1 = CRC_MULTIPLY(combine_cache[cache_num], crc1);
+ len2 >>= 1;
+ cache_num = (cache_num + 1) & 63;
+ /* if no more bits set, then done */
+ } while (len2 != 0);
+
+ /* return combined crc */
+ crc1 ^= crc2;
+ return crc1;
+}
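+
+/* Usage sketch: hypothetical buffers, with jones_poly standing in for the
+ * Jones CRC-64 polynomial used by the crc64() implementation elsewhere in
+ * this tree, and dim = 64 for a 64-bit CRC:
+ *
+ *   uint64_t c1 = crc64(0, buf1, len1);
+ *   uint64_t c2 = crc64(0, buf2, len2);
+ *   uint64_t all = crc64_combine(c1, c2, len2, jones_poly, 64);
+ *   // all now equals crc64(0, buf1 followed by buf2, len1 + len2)
+ */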
+
+#undef CRC_MULTIPLY