summaryrefslogtreecommitdiff
path: root/examples/redis-unstable/src/ebuckets.h
diff options
context:
space:
mode:
authorMitja Felicijan <mitja.felicijan@gmail.com>2026-01-21 22:40:55 +0100
committerMitja Felicijan <mitja.felicijan@gmail.com>2026-01-21 22:40:55 +0100
commit5d8dfe892a2ea89f706ee140c3bdcfd89fe03fda (patch)
tree1acdfa5220cd13b7be43a2a01368e80d306473ca /examples/redis-unstable/src/ebuckets.h
parentc7ab12bba64d9c20ccd79b132dac475f7bc3923e (diff)
downloadcrep-5d8dfe892a2ea89f706ee140c3bdcfd89fe03fda.tar.gz
Add Redis source code for testing
Diffstat (limited to 'examples/redis-unstable/src/ebuckets.h')
-rw-r--r--examples/redis-unstable/src/ebuckets.h336
1 files changed, 336 insertions, 0 deletions
diff --git a/examples/redis-unstable/src/ebuckets.h b/examples/redis-unstable/src/ebuckets.h
new file mode 100644
index 0000000..383caf0
--- /dev/null
+++ b/examples/redis-unstable/src/ebuckets.h
@@ -0,0 +1,336 @@
+/*
+ * Copyright Redis Ltd. 2024 - present
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ *
+ *
+ * WHAT IS EBUCKETS?
+ * -----------------
+ * ebuckets is being used to store items that are set with expiration-time. It
+ * supports the basic API of add, remove and active expiration. The implementation
+ * of it is based on rax-tree, or plain linked-list when small. The expiration time
+ * of the items are used as the key to traverse rax-tree.
+ *
+ * Instead of holding a distinct item in each leaf of the rax-tree we can aggregate
+ * items into small segments and hold it in each leaf. This way we can avoid
+ * frequent modification of the rax-tree, since many of the modifications
+ * will be done only at the segment level. It will also save memory because
+ * rax-tree can be costly, around 40 bytes per leaf (with rax-key limited to 6
+ * bytes). Whereas each additional item in the segment will cost the size of the
+ * 'next' pointer in a list (8 bytes) and few more bytes for maintenance of the
+ * segment.
+ *
+ * EBUCKETS STRUCTURE
+ * ------------------
+ * The ebuckets data structure is organized in a hierarchical manner as follows:
+ *
+ * 1. ebuckets: This is the top-level data structure. It can be either a rax tree
+ * or a plain linked list. It contains one or more buckets, each representing
+ * an interval in time.
+ *
+ * 2. bucket: Each bucket represents an interval in time and contains one or more
+ * segments. The key in the rax-tree for each bucket represents low
+ * bound expiration-time for the items within this bucket. The key of the
+ * following bucket represents the upper bound expiration-time.
+ *
+ * 3. segment: Each segment within a bucket can hold up to `EB_SEG_MAX_ITEMS`
+ * items as a linked list. If there are more, the segment will try to
+ * split the bucket. To avoid wasting memory, it is a singly linked list (only
+ * next-item pointer). It is a cyclic linked-list to allow efficient removal of
+ * items from the middle of the segment without traversing the rax tree.
+ *
+ * 4. item: Each item that is stored in ebuckets should embed the ExpireMeta
+ * struct and supply getter function (see EbucketsType.getExpireMeta). This
+ * struct holds the expire-time of the item and few more fields that are used
+ * to maintain the segments data-structure.
+ *
+ * SPLITTING BUCKET
+ * ----------------
+ * Each segment can hold up-to `EB_SEG_MAX_ITEMS` items. On insertion of new
+ * item, it will try to split the segment. Here is an example For adding item
+ * with expiration of 42 to a segment that already reached its maximum capacity
+ * which will cause to split of the segment and in turn split of the bucket as
+ * well to a finer grained ranges:
+ *
+ * BUCKETS BUCKETS
+ * [ 00-10 ] -> size(Seg0) = 11 ==> [ 00-10 ] -> size(Seg0) = 11
+ * [ 11-76 ] -> size(Seg1) = 16 [ 11-36 ] -> size(Seg1) = 9
+ * [ 37-76 ] -> size(Seg2) = 7
+ *
+ * EXTENDING BUCKET
+ * ----------------
+ * In the example above, the reason it wasn't split evenly is that Seg1 must have
+ * been holding items with same TTL and they must reside together in the same
+ * bucket after the split. Which brings us to another important point. If there
+ * is a segment that reached its maximum capacity and all the items have same
+ * expiration-time key, then we cannot split the bucket but aggregate all the
+ * items, with same expiration time key, by allocating an extended-segment and
+ * chain it to the first segment in visited bucket. In that sense, extended
+ * segments will only hold items with same expiration-time key.
+ *
+ * BUCKETS BUCKETS
+ * [ 00-10 ] -> size(Seg0)=11 ==> [ 00-10 ] -> size(Seg0)=11
+ * [ 11-12 ] -> size(Seg1)=16 [ 11-12 ] -> size(Seg1)=1 -> size(Seg2)=16
+ *
+ * LIMITING RAX TREE DEPTH
+ * -----------------------
+ * The rax tree is basically a B-tree and its depth is bounded by the sizeof of
+ * the key. Holding 6 bytes for expiration-time key is more than enough to represent
+ * unix-time in msec, and in turn the depth of the tree is limited to 6 levels.
+ * At a first glance it might look sufficient but we need take into consideration
+ * the heavyweight maintenance and traversal of each node in the B-tree.
+ *
+ * And so, we can further prune the tree such that holding keys with msec precision
+ * in the tree doesn't bring with it much value. The active-expiration operation can
+ * live with deletion of expired items, say, older than 1 sec, which means the size
+ * of time-expiration keys to the rax tree become no more than ~4.5 bytes and we
+ * also get rid of the "noisy" bits which most probably will cause to yet another
+ * branching and modification of the rax tree in case of items with time-expiration
+ * difference of less than 1 second. The lazy expiration will still be precise and
+ * without compromise on accuracy because the exact expiration-time is kept
+ * attached as well to each item, in `ExpireMeta`, and each traversal of item with
+ * expiration will behave as expected down to the msec. Take care to configure
+ * `EB_BUCKET_KEY_PRECISION` according to your needs.
+ *
+ * EBUCKET KEY
+ * -----------
+ * Taking into account configured value of `EB_BUCKET_KEY_PRECISION`, two items
+ * with expiration-time t1 and t2 will be considered to have the same key in the
+ * rax-tree/buckets if and only if:
+ *
+ * EB_BUCKET_KEY(t1) == EB_BUCKET_KEY(t2)
+ *
+ * EBUCKETS CREATION
+ * -----------------
+ * To avoid the cost of allocating rax data-structure for only few elements,
+ * ebuckets will start as a simple linked-list and only when it reaches some
+ * threshold, it will be converted to rax.
+ *
+ * TODO
+ * ----
+ * - ebRemove() optimize to merge small segments into one segment.
+ * - ebAdd() Fix pathological case of cascade addition of items into rax such
+ * that their values are smaller/bigger than visited extended-segment which ends
+ * up with multiple segments with a single item in each segment.
+ */
+
+#ifndef __EBUCKETS_H
+#define __EBUCKETS_H
+
+#include <stdlib.h>
+#include <sys/types.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include "rax.h"
+
+/*
+ * EB_BUCKET_KEY_PRECISION - Defines the number of bits to ignore from the
+ * expiration-time when mapping to buckets. The higher the value, the more items
+ * with similar expiration-time will be aggregated into the same bucket. The lower
+ * the value, the more "accurate" the active expiration of buckets will be.
+ *
+ * Note that the accurate time expiration of each item is preserved anyway and
+ * enforced by lazy expiration. It only impacts the active expiration that will
+ * be able to work on buckets older than (1<<EB_BUCKET_KEY_PRECISION) msec ago.
+ * For example if EB_BUCKET_KEY_PRECISION is 10, then active expiration
+ * will work only on buckets that already got expired at least 1sec ago.
+ *
+ * The idea of it is to trim the rax tree depth, avoid having too many branches,
+ * and reduce frequent modifications of the tree to the minimum.
+ */
+#define EB_BUCKET_KEY_PRECISION 0 /* TBD: modify to 10 */
+
+/* From expiration time to bucket-key */
+#define EB_BUCKET_KEY(exptime) ((exptime) >> EB_BUCKET_KEY_PRECISION)
+
+
+#define EB_EXPIRE_TIME_MAX ((uint64_t)0x0000FFFFFFFFFFFF) /* Maximum expire-time. */
+#define EB_EXPIRE_TIME_INVALID (EB_EXPIRE_TIME_MAX+1) /* assumed bigger than max */
+
+/* Handler to ebuckets DS. Pointer to a list, rax or NULL (empty DS). See also ebIsList(). */
+typedef void *ebuckets;
+
+/* Users of ebuckets will store `eItem` which is just a void pointer to their
+ * element. In addition, eItem should embed the ExpireMeta struct and supply
+ * getter function (see EbucketsType.getExpireMeta).
+ */
+typedef void *eItem;
+
+/* This struct Should be embedded inside `eItem` and must be aligned in memory. */
+typedef struct ExpireMeta {
+ /* 48bits of unix-time in msec. This value is sufficient to represent, in
+ * unix-time, until the date of 02 August, 10889
+ */
+ uint32_t expireTimeLo; /* Low bits of expireTime. */
+ uint16_t expireTimeHi; /* High bits of expireTime. */
+
+ unsigned int lastInSegment : 1; /* Last item in segment. If set, then 'next' will
+ point to the NextSegHdr, unless lastItemBucket=1
+ then it will point to segment header of the
+ current segment. */
+ unsigned int firstItemBucket : 1; /* First item in bucket. This flag assist
+ to manipulate segments directly without
+ the need to traverse from start the
+ rax tree */
+ unsigned int lastItemBucket : 1; /* Last item in bucket. This flag assist
+ to manipulate segments directly without
+ the need to traverse from start the
+ rax tree */
+ unsigned int numItems : 5; /* Only first item in segment will maintain
+ this value. */
+
+ unsigned int trash : 1; /* This flag indicates whether the ExpireMeta
+ associated with the item is leftover.
+ There is always a potential to reuse the
+ item after removal/deletion. Note that,
+ the user can still safely O(1) TTL lookup
+ a given item and verify whether attached
+ TTL is valid or leftover. See function
+ ebGetExpireTime(). */
+
+ unsigned int userData : 3; /* ebuckets can be used to store in same
+ instance few different types of items,
+ such as, listpack and hash. This field
+ is reserved to store such identification
+ associated with the item and can help
+ to distinct on delete or expire callback.
+ It is not used by ebuckets internally and
+ should be maintained by the user */
+
+ unsigned int reserved : 4;
+
+ void *next; /* - If not last item in segment then next
+ points to next eItem (lastInSegment=0).
+ - If last in segment but not last in
+ bucket (lastItemBucket=0) then it
+ points to next segment header.
+ - If last in bucket then it points to
+ current segment header (Can be either
+ of type FirstSegHdr or NextSegHdr). */
+} ExpireMeta;
+
+/* Each instance of ebuckets need to have corresponding EbucketsType that holds
+ * the necessary callbacks and configuration to operate correctly on the type
+ * of items that are stored in it. Conceptually it should have hold reference
+ * from ebuckets instance to this type, but to save memory we will pass it as
+ * an argument to each API call. */
+typedef struct EbucketsType {
+ /* getter to extract the ExpireMeta from the item */
+ ExpireMeta* (*getExpireMeta)(const eItem item);
+
+ /* Called during ebDestroy(). Set to NULL if not needed. */
+ void (*onDeleteItem)(eItem item, void *ctx);
+
+ /* Is addresses of items are odd in memory. It is taken into consideration
+ * and used by ebuckets to know how to distinct between ebuckets pointer to
+ * rax versus a pointer to item which is head of list. */
+ unsigned int itemsAddrAreOdd;
+} EbucketsType;
+
+/* Returned value by `onExpireItem` callback to indicate the action to be taken by
+ * ebExpire(). */
+typedef enum ExpireAction {
+ ACT_REMOVE_EXP_ITEM=0, /* Remove the item from ebuckets. */
+ ACT_UPDATE_EXP_ITEM, /* Re-insert the item with updated expiration-time.
+ Before returning this value, the cb need to
+ update expiration time of the item by assisting
+ function ebSetMetaExpTime(). The item will be
+ kept aside and will be added again to ebuckets
+ at the end of ebExpire() */
+ ACT_STOP_ACTIVE_EXP /* Stop active-expiration. It will assume that
+ provided 'item' wasn't deleted by the callback. */
+} ExpireAction;
+
+/* ExpireInfo is used to pass input and output parameters to ebExpire(). */
+typedef struct ExpireInfo {
+ /* onExpireItem - Called during active-expiration by ebExpire() */
+ ExpireAction (*onExpireItem)(eItem item, void *ctx);
+
+ uint64_t maxToExpire; /* [INPUT ] Limit of number expired items to scan */
+ void *ctx; /* [INPUT ] context to pass to onExpireItem */
+ uint64_t now; /* [INPUT ] Current time in msec. */
+ uint64_t itemsExpired; /* [OUTPUT] Returns the number of expired or updated items. */
+ uint64_t nextExpireTime; /* [OUTPUT] Next expiration time. Returns
+ EB_EXPIRE_TIME_INVALID if none left. */
+} ExpireInfo;
+
+/* Iterator to traverse ebuckets items */
+typedef struct EbucketsIterator {
+ /* private data of iterator */
+ ebuckets eb;
+ EbucketsType *type;
+ raxIterator raxIter;
+ int isRax;
+
+ /* public read only */
+ eItem currItem; /* Current item ref. Use ebGetMetaExpTime()
+ on `currItem` to get expiration time.*/
+ uint64_t itemsCurrBucket; /* Number of items in current bucket. */
+} EbucketsIterator;
+
+typedef void *(ebDefragAllocFunction)(void *ptr);
+typedef void *(ebDefragAllocItemFunction)(void *ptr, void *privdata);
+typedef struct {
+ ebDefragAllocFunction *defragAlloc; /* Used for rax nodes, segment etc. */
+ ebDefragAllocItemFunction *defragItem; /* Defrag-realloc eitem */
+} ebDefragFunctions;
+
+/* ebuckets API */
+
+static inline ebuckets ebCreate(void) { return NULL; } /* Empty ebuckets */
+
+void ebDestroy(ebuckets *eb, EbucketsType *type, void *deletedItemsCbCtx);
+
+void ebExpire(ebuckets *eb, EbucketsType *type, ExpireInfo *info);
+
+uint64_t ebExpireDryRun(ebuckets eb, EbucketsType *type, uint64_t now);
+
+static inline int ebIsEmpty(ebuckets eb) { return eb == NULL; }
+
+uint64_t ebGetNextTimeToExpire(ebuckets eb, EbucketsType *type);
+
+uint64_t ebGetMaxExpireTime(ebuckets eb, EbucketsType *type, int accurate);
+
+uint64_t ebGetTotalItems(ebuckets eb, EbucketsType *type);
+
+/* Item related API */
+
+int ebRemove(ebuckets *eb, EbucketsType *type, eItem item);
+
+int ebAdd(ebuckets *eb, EbucketsType *type, eItem item, uint64_t expireTime);
+
+uint64_t ebGetExpireTime(EbucketsType *type, eItem item);
+
+void ebStart(EbucketsIterator *iter, ebuckets eb, EbucketsType *type);
+
+void ebStop(EbucketsIterator *iter);
+
+int ebNext(EbucketsIterator *iter);
+
+int ebNextBucket(EbucketsIterator *iter);
+
+int ebScanDefrag(ebuckets *eb, EbucketsType *type, unsigned long *cursor,
+ ebDefragFunctions *defragfns, void *privdata);
+
+static inline uint64_t ebGetMetaExpTime(ExpireMeta *expMeta) {
+ return (((uint64_t)(expMeta)->expireTimeHi << 32) | (expMeta)->expireTimeLo);
+}
+
+static inline void ebSetMetaExpTime(ExpireMeta *expMeta, uint64_t t) {
+ expMeta->expireTimeLo = (uint32_t)(t&0xFFFFFFFF);
+ expMeta->expireTimeHi = (uint16_t)((t) >> 32);
+}
+
+/* Debug API */
+
+void ebValidate(ebuckets eb, EbucketsType *type);
+
+void ebPrint(ebuckets eb, EbucketsType *type);
+
+#ifdef REDIS_TEST
+int ebucketsTest(int argc, char *argv[], int flags);
+#endif
+
+#endif /* __EBUCKETS_H */