diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-01-21 22:40:55 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-01-21 22:40:55 +0100 |
| commit | 5d8dfe892a2ea89f706ee140c3bdcfd89fe03fda (patch) | |
| tree | 1acdfa5220cd13b7be43a2a01368e80d306473ca /examples/redis-unstable/src/crccombine.c | |
| parent | c7ab12bba64d9c20ccd79b132dac475f7bc3923e (diff) | |
| download | crep-5d8dfe892a2ea89f706ee140c3bdcfd89fe03fda.tar.gz | |
Add Redis source code for testing
Diffstat (limited to 'examples/redis-unstable/src/crccombine.c')
| -rw-r--r-- | examples/redis-unstable/src/crccombine.c | 252 |
1 file changed, 252 insertions, 0 deletions
diff --git a/examples/redis-unstable/src/crccombine.c b/examples/redis-unstable/src/crccombine.c new file mode 100644 index 0000000..20add37 --- /dev/null +++ b/examples/redis-unstable/src/crccombine.c @@ -0,0 +1,252 @@ +#include <stdint.h> +#include <stdio.h> +#include <strings.h> +#if defined(__i386__) || defined(__X86_64__) +#include <immintrin.h> +#endif +#include "crccombine.h" + +/* Copyright (C) 2013 Mark Adler + * Copyright (C) 2019-2024 Josiah Carlson + * Portions originally from: crc64.c Version 1.4 16 Dec 2013 Mark Adler + * Modifications by Josiah Carlson <josiah.carlson@gmail.com> + * - Added implementation variations with sample timings for gf_matrix_times*() + * - Most folks would be best using gf2_matrix_times_vec or + * gf2_matrix_times_vec2, unless some processor does AVX2 fast. + * - This is the implementation of the MERGE_CRC macro defined in + * crcspeed.c (which calls crc_combine()), and is a specialization of the + * generic crc_combine() (and related from the 2013 edition of Mark Adler's + * crc64.c)) for the sake of clarity and performance. + + This software is provided 'as-is', without any express or implied + warranty. In no event will the author be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. 
+ + Mark Adler + madler@alumni.caltech.edu +*/ + +#define STATIC_ASSERT(VVV) do {int test = 1 / (VVV);test++;} while (0) + +#if !((defined(__i386__) || defined(__X86_64__))) + +/* This cuts 40% of the time vs bit-by-bit. */ + +uint64_t gf2_matrix_times_switch(uint64_t *mat, uint64_t vec) { + /* + * Without using any vector math, this handles 4 bits at a time, + * and saves 40+% of the time compared to the bit-by-bit version. Use if you + * have no vector compile option available to you. With cache, we see: + * E5-2670 ~1-2us to extend ~1 meg 64 bit hash + */ + uint64_t sum; + + sum = 0; + while (vec) { + /* reversing the case order is ~10% slower on Xeon E5-2670 */ + switch (vec & 15) { + case 15: + sum ^= *mat ^ *(mat+1) ^ *(mat+2) ^ *(mat+3); + break; + case 14: + sum ^= *(mat+1) ^ *(mat+2) ^ *(mat+3); + break; + case 13: + sum ^= *mat ^ *(mat+2) ^ *(mat+3); + break; + case 12: + sum ^= *(mat+2) ^ *(mat+3); + break; + case 11: + sum ^= *mat ^ *(mat+1) ^ *(mat+3); + break; + case 10: + sum ^= *(mat+1) ^ *(mat+3); + break; + case 9: + sum ^= *mat ^ *(mat+3); + break; + case 8: + sum ^= *(mat+3); + break; + case 7: + sum ^= *mat ^ *(mat+1) ^ *(mat+2); + break; + case 6: + sum ^= *(mat+1) ^ *(mat+2); + break; + case 5: + sum ^= *mat ^ *(mat+2); + break; + case 4: + sum ^= *(mat+2); + break; + case 3: + sum ^= *mat ^ *(mat+1); + break; + case 2: + sum ^= *(mat+1); + break; + case 1: + sum ^= *mat; + break; + default: + break; + } + vec >>= 4; + mat += 4; + } + return sum; +} + +#define CRC_MULTIPLY gf2_matrix_times_switch + +#else + +/* + Warning: here there be dragons involving vector math, and macros to save us + from repeating the same information over and over. +*/ + +uint64_t gf2_matrix_times_vec2(uint64_t *mat, uint64_t vec) { + /* + * Uses xmm registers on x86, works basically everywhere fast, doing + * cycles of movqda, mov, shr, pand, and, pxor, at least on gcc 8. + * Is 9-11x faster than original. 
+ * E5-2670 ~29us to extend ~1 meg 64 bit hash + * i3-8130U ~22us to extend ~1 meg 64 bit hash + */ + v2uq sum = {0, 0}, + *mv2 = (v2uq*)mat; + /* this table allows us to eliminate conditions during gf2_matrix_times_vec2() */ + static v2uq masks2[4] = { + {0,0}, + {-1,0}, + {0,-1}, + {-1,-1}, + }; + + /* Almost as beautiful as gf2_matrix_times_vec, but only half as many + * bits per step, so we need 2 per chunk4 operation. Faster in my tests. */ + +#define DO_CHUNK4() \ + sum ^= (*mv2++) & masks2[vec & 3]; \ + vec >>= 2; \ + sum ^= (*mv2++) & masks2[vec & 3]; \ + vec >>= 2 + +#define DO_CHUNK16() \ + DO_CHUNK4(); \ + DO_CHUNK4(); \ + DO_CHUNK4(); \ + DO_CHUNK4() + + DO_CHUNK16(); + DO_CHUNK16(); + DO_CHUNK16(); + DO_CHUNK16(); + + STATIC_ASSERT(sizeof(uint64_t) == 8); + STATIC_ASSERT(sizeof(long long unsigned int) == 8); + return sum[0] ^ sum[1]; +} + +#undef DO_CHUNK16 +#undef DO_CHUNK4 + +#define CRC_MULTIPLY gf2_matrix_times_vec2 +#endif + +static void gf2_matrix_square(uint64_t *square, uint64_t *mat, uint8_t dim) { + unsigned n; + + for (n = 0; n < dim; n++) + square[n] = CRC_MULTIPLY(mat, mat[n]); +} + +/* Turns out our Redis / Jones CRC cycles at this point, so we can support + * more than 64 bits of extension if we want. Trivially. */ +static uint64_t combine_cache[64][64]; + +/* Mark Adler has some amazing updates to crc.c in his crcany repository. I + * like static caches, and not worrying about finding cycles generally. We are + * okay to spend the 32k of memory here, leaving the algorithm unchanged from + * as it was a decade ago, and be happy that it costs <200 microseconds to + * init, and that subsequent calls to the combine function take under 100 + * nanoseconds. We also note that the crcany/crc.c code applies to any CRC, and + * we are currently targeting one: Jones CRC64. 
+ */ + +void init_combine_cache(uint64_t poly, uint8_t dim) { + unsigned n, cache_num = 0; + combine_cache[1][0] = poly; + int prev = 1; + uint64_t row = 1; + for (n = 1; n < dim; n++) + { + combine_cache[1][n] = row; + row <<= 1; + } + + gf2_matrix_square(combine_cache[0], combine_cache[1], dim); + gf2_matrix_square(combine_cache[1], combine_cache[0], dim); + + /* do/while to overwrite the first two layers, they are not used, but are + * re-generated in the last two layers for the Redis polynomial */ + do { + gf2_matrix_square(combine_cache[cache_num], combine_cache[cache_num + prev], dim); + prev = -1; + } while (++cache_num < 64); +} + +/* Return the CRC-64 of two sequential blocks, where crc1 is the CRC-64 of the + * first block, crc2 is the CRC-64 of the second block, and len2 is the length + * of the second block. + * + * If you want reflections on your CRCs; do them outside before / after. + * WARNING: if you enable USE_STATIC_COMBINE_CACHE to make this fast, you MUST + * ALWAYS USE THE SAME POLYNOMIAL, otherwise you will get the wrong results. + * You MAY bzero() the even/odd static arrays, which will induce a re-cache on + * next call as a work-around, but ... maybe just parameterize the cached + * models at that point like Mark Adler does in modern crcany/crc.c . 
+ */ +uint64_t crc64_combine(uint64_t crc1, uint64_t crc2, uintmax_t len2, uint64_t poly, uint8_t dim) { + /* degenerate case */ + if (len2 == 0) + return crc1; + + unsigned cache_num = 0; + if (combine_cache[0][0] == 0) { + init_combine_cache(poly, dim); + } + + /* apply len2 zeros to crc1 (first square will put the operator for one + zero byte, eight zero bits, in even) */ + do + { + /* apply zeros operator for this bit of len2 */ + if (len2 & 1) + crc1 = CRC_MULTIPLY(combine_cache[cache_num], crc1); + len2 >>= 1; + cache_num = (cache_num + 1) & 63; + /* if no more bits set, then done */ + } while (len2 != 0); + + /* return combined crc */ + crc1 ^= crc2; + return crc1; +} + +#undef CRC_MULTIPLY |
