From 5d8dfe892a2ea89f706ee140c3bdcfd89fe03fda Mon Sep 17 00:00:00 2001 From: Mitja Felicijan Date: Wed, 21 Jan 2026 22:40:55 +0100 Subject: Add Redis source code for testing --- examples/redis-unstable/src/.gitignore | 5 + examples/redis-unstable/src/Makefile | 563 + examples/redis-unstable/src/acl.c | 3313 ++++ examples/redis-unstable/src/adlist.c | 395 + examples/redis-unstable/src/adlist.h | 80 + examples/redis-unstable/src/ae.c | 511 + examples/redis-unstable/src/ae.h | 118 + examples/redis-unstable/src/ae_epoll.c | 119 + examples/redis-unstable/src/ae_evport.c | 323 + examples/redis-unstable/src/ae_kqueue.c | 183 + examples/redis-unstable/src/ae_select.c | 90 + examples/redis-unstable/src/anet.c | 812 + examples/redis-unstable/src/anet.h | 58 + examples/redis-unstable/src/aof.c | 2921 ++++ examples/redis-unstable/src/asciilogo.h | 27 + examples/redis-unstable/src/atomicvar.h | 186 + examples/redis-unstable/src/bio.c | 445 + examples/redis-unstable/src/bio.h | 47 + examples/redis-unstable/src/bitops.c | 2037 +++ examples/redis-unstable/src/blocked.c | 787 + examples/redis-unstable/src/call_reply.c | 540 + examples/redis-unstable/src/call_reply.h | 40 + examples/redis-unstable/src/childinfo.c | 163 + examples/redis-unstable/src/chk.c | 822 + examples/redis-unstable/src/chk.h | 89 + examples/redis-unstable/src/cli_commands.c | 13 + examples/redis-unstable/src/cli_commands.h | 46 + examples/redis-unstable/src/cli_common.c | 424 + examples/redis-unstable/src/cli_common.h | 59 + examples/redis-unstable/src/cluster.c | 2263 +++ examples/redis-unstable/src/cluster.h | 354 + examples/redis-unstable/src/cluster_asm.c | 3602 +++++ examples/redis-unstable/src/cluster_asm.h | 57 + examples/redis-unstable/src/cluster_legacy.c | 6581 ++++++++ examples/redis-unstable/src/cluster_legacy.h | 385 + examples/redis-unstable/src/cluster_slot_stats.c | 373 + examples/redis-unstable/src/cluster_slot_stats.h | 33 + examples/redis-unstable/src/commands.c | 13 + 
examples/redis-unstable/src/commands.def | 11962 ++++++++++++++ examples/redis-unstable/src/commands.h | 40 + examples/redis-unstable/src/commands/README.md | 15 + examples/redis-unstable/src/commands/acl-cat.json | 42 + .../redis-unstable/src/commands/acl-deluser.json | 33 + .../redis-unstable/src/commands/acl-dryrun.json | 47 + .../redis-unstable/src/commands/acl-genpass.json | 28 + .../redis-unstable/src/commands/acl-getuser.json | 91 + examples/redis-unstable/src/commands/acl-help.json | 23 + examples/redis-unstable/src/commands/acl-list.json | 25 + examples/redis-unstable/src/commands/acl-load.json | 21 + examples/redis-unstable/src/commands/acl-log.json | 90 + examples/redis-unstable/src/commands/acl-save.json | 25 + .../redis-unstable/src/commands/acl-setuser.json | 47 + .../redis-unstable/src/commands/acl-users.json | 25 + .../redis-unstable/src/commands/acl-whoami.json | 21 + examples/redis-unstable/src/commands/acl.json | 12 + examples/redis-unstable/src/commands/append.json | 53 + examples/redis-unstable/src/commands/asking.json | 19 + examples/redis-unstable/src/commands/auth.json | 43 + .../redis-unstable/src/commands/bgrewriteaof.json | 19 + examples/redis-unstable/src/commands/bgsave.json | 40 + examples/redis-unstable/src/commands/bitcount.json | 87 + examples/redis-unstable/src/commands/bitfield.json | 159 + .../redis-unstable/src/commands/bitfield_ro.json | 69 + examples/redis-unstable/src/commands/bitop.json | 119 + examples/redis-unstable/src/commands/bitpos.json | 106 + examples/redis-unstable/src/commands/blmove.json | 117 + examples/redis-unstable/src/commands/blmpop.json | 105 + examples/redis-unstable/src/commands/blpop.json | 80 + examples/redis-unstable/src/commands/brpop.json | 79 + .../redis-unstable/src/commands/brpoplpush.json | 96 + examples/redis-unstable/src/commands/bzmpop.json | 117 + examples/redis-unstable/src/commands/bzpopmax.json | 85 + examples/redis-unstable/src/commands/bzpopmin.json | 85 + 
.../src/commands/client-caching.json | 41 + .../src/commands/client-getname.json | 32 + .../src/commands/client-getredir.json | 37 + .../redis-unstable/src/commands/client-help.json | 26 + .../redis-unstable/src/commands/client-id.json | 24 + .../redis-unstable/src/commands/client-info.json | 27 + .../redis-unstable/src/commands/client-kill.json | 170 + .../redis-unstable/src/commands/client-list.json | 105 + .../src/commands/client-no-evict.json | 42 + .../src/commands/client-no-touch.json | 40 + .../redis-unstable/src/commands/client-pause.json | 54 + .../redis-unstable/src/commands/client-reply.json | 47 + .../src/commands/client-setinfo.json | 45 + .../src/commands/client-setname.json | 33 + .../src/commands/client-tracking.json | 80 + .../src/commands/client-trackinginfo.json | 80 + .../src/commands/client-unblock.json | 56 + .../src/commands/client-unpause.json | 24 + examples/redis-unstable/src/commands/client.json | 12 + .../src/commands/cluster-addslots.json | 26 + .../src/commands/cluster-addslotsrange.json | 36 + .../src/commands/cluster-bumpepoch.json | 33 + .../commands/cluster-count-failure-reports.json | 29 + .../src/commands/cluster-countkeysinslot.json | 25 + .../src/commands/cluster-delslots.json | 26 + .../src/commands/cluster-delslotsrange.json | 36 + .../src/commands/cluster-failover.json | 38 + .../src/commands/cluster-flushslots.json | 19 + .../src/commands/cluster-forget.json | 25 + .../src/commands/cluster-getkeysinslot.json | 35 + .../redis-unstable/src/commands/cluster-help.json | 22 + .../redis-unstable/src/commands/cluster-info.json | 21 + .../src/commands/cluster-keyslot.json | 25 + .../redis-unstable/src/commands/cluster-links.json | 60 + .../redis-unstable/src/commands/cluster-meet.json | 41 + .../src/commands/cluster-migration.json | 141 + .../redis-unstable/src/commands/cluster-myid.json | 18 + .../src/commands/cluster-myshardid.json | 22 + .../redis-unstable/src/commands/cluster-nodes.json | 21 + 
.../src/commands/cluster-replicas.json | 32 + .../src/commands/cluster-replicate.json | 25 + .../redis-unstable/src/commands/cluster-reset.json | 38 + .../src/commands/cluster-saveconfig.json | 19 + .../src/commands/cluster-set-config-epoch.json | 25 + .../src/commands/cluster-setslot.json | 54 + .../src/commands/cluster-shards.json | 90 + .../src/commands/cluster-slaves.json | 37 + .../src/commands/cluster-slot-stats.json | 114 + .../redis-unstable/src/commands/cluster-slots.json | 136 + .../src/commands/cluster-syncslots.json | 117 + examples/redis-unstable/src/commands/cluster.json | 9 + .../redis-unstable/src/commands/command-count.json | 23 + .../redis-unstable/src/commands/command-docs.json | 211 + .../src/commands/command-getkeys.json | 39 + .../src/commands/command-getkeysandflags.json | 55 + .../redis-unstable/src/commands/command-help.json | 26 + .../redis-unstable/src/commands/command-info.json | 213 + .../redis-unstable/src/commands/command-list.json | 55 + examples/redis-unstable/src/commands/command.json | 21 + .../redis-unstable/src/commands/config-get.json | 36 + .../redis-unstable/src/commands/config-help.json | 22 + .../src/commands/config-resetstat.json | 24 + .../src/commands/config-rewrite.json | 24 + .../redis-unstable/src/commands/config-set.json | 47 + examples/redis-unstable/src/commands/config.json | 9 + examples/redis-unstable/src/commands/copy.json | 91 + examples/redis-unstable/src/commands/dbsize.json | 25 + examples/redis-unstable/src/commands/debug.json | 20 + examples/redis-unstable/src/commands/decr.json | 50 + examples/redis-unstable/src/commands/decrby.json | 54 + examples/redis-unstable/src/commands/del.json | 53 + examples/redis-unstable/src/commands/delex.json | 89 + examples/redis-unstable/src/commands/digest.json | 56 + examples/redis-unstable/src/commands/discard.json | 23 + examples/redis-unstable/src/commands/dump.json | 58 + examples/redis-unstable/src/commands/echo.json | 28 + 
examples/redis-unstable/src/commands/eval.json | 69 + examples/redis-unstable/src/commands/eval_ro.json | 68 + examples/redis-unstable/src/commands/evalsha.json | 68 + .../redis-unstable/src/commands/evalsha_ro.json | 67 + examples/redis-unstable/src/commands/exec.json | 31 + examples/redis-unstable/src/commands/exists.json | 58 + examples/redis-unstable/src/commands/expire.json | 94 + examples/redis-unstable/src/commands/expireat.json | 94 + .../redis-unstable/src/commands/expiretime.json | 61 + examples/redis-unstable/src/commands/failover.json | 54 + examples/redis-unstable/src/commands/fcall.json | 69 + examples/redis-unstable/src/commands/fcall_ro.json | 68 + examples/redis-unstable/src/commands/flushall.json | 55 + examples/redis-unstable/src/commands/flushdb.json | 55 + .../src/commands/function-delete.json | 31 + .../redis-unstable/src/commands/function-dump.json | 21 + .../src/commands/function-flush.json | 44 + .../redis-unstable/src/commands/function-help.json | 25 + .../redis-unstable/src/commands/function-kill.json | 25 + .../redis-unstable/src/commands/function-list.json | 87 + .../redis-unstable/src/commands/function-load.json | 39 + .../src/commands/function-restore.json | 54 + .../src/commands/function-stats.json | 81 + examples/redis-unstable/src/commands/function.json | 9 + examples/redis-unstable/src/commands/geoadd.json | 98 + examples/redis-unstable/src/commands/geodist.json | 91 + examples/redis-unstable/src/commands/geohash.json | 56 + examples/redis-unstable/src/commands/geopos.json | 76 + .../redis-unstable/src/commands/georadius.json | 270 + .../redis-unstable/src/commands/georadius_ro.json | 205 + .../src/commands/georadiusbymember.json | 265 + .../src/commands/georadiusbymember_ro.json | 200 + .../redis-unstable/src/commands/geosearch.json | 267 + .../src/commands/geosearchstore.json | 228 + examples/redis-unstable/src/commands/get.json | 56 + examples/redis-unstable/src/commands/getbit.json | 59 + 
examples/redis-unstable/src/commands/getdel.json | 57 + examples/redis-unstable/src/commands/getex.json | 90 + examples/redis-unstable/src/commands/getrange.json | 55 + examples/redis-unstable/src/commands/getset.json | 67 + examples/redis-unstable/src/commands/hdel.json | 59 + examples/redis-unstable/src/commands/hello.json | 111 + examples/redis-unstable/src/commands/hexists.json | 59 + examples/redis-unstable/src/commands/hexpire.json | 118 + .../redis-unstable/src/commands/hexpireat.json | 118 + .../redis-unstable/src/commands/hexpiretime.json | 84 + examples/redis-unstable/src/commands/hget.json | 60 + examples/redis-unstable/src/commands/hgetall.json | 53 + examples/redis-unstable/src/commands/hgetdel.json | 78 + examples/redis-unstable/src/commands/hgetex.json | 111 + examples/redis-unstable/src/commands/hincrby.json | 58 + .../redis-unstable/src/commands/hincrbyfloat.json | 58 + examples/redis-unstable/src/commands/hkeys.json | 54 + examples/redis-unstable/src/commands/hlen.json | 47 + examples/redis-unstable/src/commands/hmget.json | 64 + examples/redis-unstable/src/commands/hmset.json | 68 + .../redis-unstable/src/commands/hotkeys-get.json | 51 + .../redis-unstable/src/commands/hotkeys-reset.json | 18 + .../redis-unstable/src/commands/hotkeys-start.json | 80 + .../redis-unstable/src/commands/hotkeys-stop.json | 19 + examples/redis-unstable/src/commands/hotkeys.json | 10 + examples/redis-unstable/src/commands/hpersist.json | 83 + examples/redis-unstable/src/commands/hpexpire.json | 118 + .../redis-unstable/src/commands/hpexpireat.json | 118 + .../redis-unstable/src/commands/hpexpiretime.json | 84 + examples/redis-unstable/src/commands/hpttl.json | 87 + .../redis-unstable/src/commands/hrandfield.json | 101 + examples/redis-unstable/src/commands/hscan.json | 87 + examples/redis-unstable/src/commands/hset.json | 70 + examples/redis-unstable/src/commands/hsetex.json | 132 + examples/redis-unstable/src/commands/hsetnx.json | 65 + 
examples/redis-unstable/src/commands/hstrlen.json | 52 + examples/redis-unstable/src/commands/httl.json | 87 + examples/redis-unstable/src/commands/hvals.json | 53 + examples/redis-unstable/src/commands/incr.json | 50 + examples/redis-unstable/src/commands/incrby.json | 54 + .../redis-unstable/src/commands/incrbyfloat.json | 54 + examples/redis-unstable/src/commands/info.json | 41 + examples/redis-unstable/src/commands/keys.json | 34 + examples/redis-unstable/src/commands/lastsave.json | 26 + .../src/commands/latency-doctor.json | 26 + .../redis-unstable/src/commands/latency-graph.json | 32 + .../redis-unstable/src/commands/latency-help.json | 22 + .../src/commands/latency-histogram.json | 54 + .../src/commands/latency-history.json | 49 + .../src/commands/latency-latest.json | 49 + .../redis-unstable/src/commands/latency-reset.json | 33 + examples/redis-unstable/src/commands/latency.json | 9 + examples/redis-unstable/src/commands/lcs.json | 127 + examples/redis-unstable/src/commands/lindex.json | 59 + examples/redis-unstable/src/commands/linsert.json | 85 + examples/redis-unstable/src/commands/llen.json | 48 + examples/redis-unstable/src/commands/lmove.json | 104 + examples/redis-unstable/src/commands/lmpop.json | 100 + examples/redis-unstable/src/commands/lolwut.json | 25 + examples/redis-unstable/src/commands/lpop.json | 77 + examples/redis-unstable/src/commands/lpos.json | 85 + examples/redis-unstable/src/commands/lpush.json | 60 + examples/redis-unstable/src/commands/lpushx.json | 61 + examples/redis-unstable/src/commands/lrange.json | 58 + examples/redis-unstable/src/commands/lrem.json | 56 + examples/redis-unstable/src/commands/lset.json | 55 + examples/redis-unstable/src/commands/ltrim.json | 54 + .../redis-unstable/src/commands/memory-doctor.json | 20 + .../redis-unstable/src/commands/memory-help.json | 22 + .../src/commands/memory-malloc-stats.json | 20 + .../redis-unstable/src/commands/memory-purge.json | 18 + 
.../redis-unstable/src/commands/memory-stats.json | 136 + .../redis-unstable/src/commands/memory-usage.json | 58 + examples/redis-unstable/src/commands/memory.json | 9 + examples/redis-unstable/src/commands/mget.json | 63 + examples/redis-unstable/src/commands/migrate.json | 181 + .../redis-unstable/src/commands/module-help.json | 22 + .../redis-unstable/src/commands/module-list.json | 47 + .../redis-unstable/src/commands/module-load.json | 32 + .../redis-unstable/src/commands/module-loadex.json | 51 + .../redis-unstable/src/commands/module-unload.json | 26 + examples/redis-unstable/src/commands/module.json | 9 + examples/redis-unstable/src/commands/monitor.json | 16 + examples/redis-unstable/src/commands/move.json | 61 + examples/redis-unstable/src/commands/mset.json | 62 + examples/redis-unstable/src/commands/msetex.json | 124 + examples/redis-unstable/src/commands/msetnx.json | 67 + examples/redis-unstable/src/commands/multi.json | 23 + .../src/commands/object-encoding.json | 58 + .../redis-unstable/src/commands/object-freq.json | 50 + .../redis-unstable/src/commands/object-help.json | 25 + .../src/commands/object-idletime.json | 50 + .../src/commands/object-refcount.json | 50 + examples/redis-unstable/src/commands/object.json | 9 + examples/redis-unstable/src/commands/persist.json | 56 + examples/redis-unstable/src/commands/pexpire.json | 94 + .../redis-unstable/src/commands/pexpireat.json | 94 + .../redis-unstable/src/commands/pexpiretime.json | 61 + examples/redis-unstable/src/commands/pfadd.json | 63 + examples/redis-unstable/src/commands/pfcount.json | 50 + examples/redis-unstable/src/commands/pfdebug.json | 52 + examples/redis-unstable/src/commands/pfmerge.json | 73 + .../redis-unstable/src/commands/pfselftest.json | 22 + examples/redis-unstable/src/commands/ping.json | 40 + examples/redis-unstable/src/commands/psetex.json | 60 + .../redis-unstable/src/commands/psubscribe.json | 24 + examples/redis-unstable/src/commands/psync.json | 25 + 
examples/redis-unstable/src/commands/pttl.json | 70 + examples/redis-unstable/src/commands/publish.json | 33 + .../src/commands/pubsub-channels.json | 31 + .../redis-unstable/src/commands/pubsub-help.json | 22 + .../redis-unstable/src/commands/pubsub-numpat.json | 21 + .../redis-unstable/src/commands/pubsub-numsub.json | 28 + .../src/commands/pubsub-shardchannels.json | 31 + .../src/commands/pubsub-shardnumsub.json | 28 + examples/redis-unstable/src/commands/pubsub.json | 9 + .../redis-unstable/src/commands/punsubscribe.json | 25 + examples/redis-unstable/src/commands/quit.json | 29 + .../redis-unstable/src/commands/randomkey.json | 34 + examples/redis-unstable/src/commands/readonly.json | 21 + .../redis-unstable/src/commands/readwrite.json | 21 + examples/redis-unstable/src/commands/rename.json | 72 + examples/redis-unstable/src/commands/renamenx.json | 86 + examples/redis-unstable/src/commands/replconf.json | 23 + .../redis-unstable/src/commands/replicaof.json | 59 + examples/redis-unstable/src/commands/reset.json | 24 + .../src/commands/restore-asking.json | 102 + examples/redis-unstable/src/commands/restore.json | 98 + examples/redis-unstable/src/commands/role.json | 134 + examples/redis-unstable/src/commands/rpop.json | 76 + .../redis-unstable/src/commands/rpoplpush.json | 85 + examples/redis-unstable/src/commands/rpush.json | 61 + examples/redis-unstable/src/commands/rpushx.json | 61 + examples/redis-unstable/src/commands/sadd.json | 60 + examples/redis-unstable/src/commands/save.json | 19 + examples/redis-unstable/src/commands/scan.json | 72 + examples/redis-unstable/src/commands/scard.json | 48 + .../redis-unstable/src/commands/script-debug.json | 43 + .../redis-unstable/src/commands/script-exists.json | 44 + .../redis-unstable/src/commands/script-flush.json | 50 + .../redis-unstable/src/commands/script-help.json | 25 + .../redis-unstable/src/commands/script-kill.json | 25 + .../redis-unstable/src/commands/script-load.json | 32 + 
examples/redis-unstable/src/commands/script.json | 9 + examples/redis-unstable/src/commands/sdiff.json | 55 + .../redis-unstable/src/commands/sdiffstore.json | 73 + examples/redis-unstable/src/commands/select.json | 27 + .../src/commands/sentinel-ckquorum.json | 26 + .../src/commands/sentinel-config.json | 121 + .../src/commands/sentinel-debug.json | 49 + .../src/commands/sentinel-failover.json | 25 + .../src/commands/sentinel-flushconfig.json | 20 + .../commands/sentinel-get-master-addr-by-name.json | 38 + .../redis-unstable/src/commands/sentinel-help.json | 24 + .../src/commands/sentinel-info-cache.json | 64 + .../commands/sentinel-is-master-down-by-addr.json | 61 + .../src/commands/sentinel-master.json | 29 + .../src/commands/sentinel-masters.json | 26 + .../src/commands/sentinel-monitor.json | 37 + .../redis-unstable/src/commands/sentinel-myid.json | 20 + .../src/commands/sentinel-pending-scripts.json | 52 + .../src/commands/sentinel-remove.json | 25 + .../src/commands/sentinel-replicas.json | 32 + .../src/commands/sentinel-reset.json | 26 + .../src/commands/sentinel-sentinels.json | 32 + .../redis-unstable/src/commands/sentinel-set.json | 40 + .../src/commands/sentinel-simulate-failure.json | 52 + .../src/commands/sentinel-slaves.json | 37 + examples/redis-unstable/src/commands/sentinel.json | 14 + examples/redis-unstable/src/commands/set.json | 180 + examples/redis-unstable/src/commands/setbit.json | 64 + examples/redis-unstable/src/commands/setex.json | 60 + examples/redis-unstable/src/commands/setnx.json | 66 + examples/redis-unstable/src/commands/setrange.json | 57 + examples/redis-unstable/src/commands/sflush.json | 75 + examples/redis-unstable/src/commands/shutdown.json | 69 + examples/redis-unstable/src/commands/sinter.json | 55 + .../redis-unstable/src/commands/sintercard.json | 60 + .../redis-unstable/src/commands/sinterstore.json | 73 + .../redis-unstable/src/commands/sismember.json | 59 + examples/redis-unstable/src/commands/slaveof.json | 64 + 
.../redis-unstable/src/commands/slowlog-get.json | 74 + .../redis-unstable/src/commands/slowlog-help.json | 22 + .../redis-unstable/src/commands/slowlog-len.json | 26 + .../redis-unstable/src/commands/slowlog-reset.json | 23 + examples/redis-unstable/src/commands/slowlog.json | 9 + examples/redis-unstable/src/commands/smembers.json | 54 + .../redis-unstable/src/commands/smismember.json | 66 + examples/redis-unstable/src/commands/smove.json | 84 + examples/redis-unstable/src/commands/sort.json | 162 + examples/redis-unstable/src/commands/sort_ro.json | 132 + examples/redis-unstable/src/commands/spop.json | 80 + examples/redis-unstable/src/commands/spublish.json | 51 + .../redis-unstable/src/commands/srandmember.json | 83 + examples/redis-unstable/src/commands/srem.json | 60 + examples/redis-unstable/src/commands/sscan.json | 81 + .../redis-unstable/src/commands/ssubscribe.json | 42 + examples/redis-unstable/src/commands/strlen.json | 48 + .../redis-unstable/src/commands/subscribe.json | 25 + examples/redis-unstable/src/commands/substr.json | 60 + examples/redis-unstable/src/commands/sunion.json | 55 + .../redis-unstable/src/commands/sunionstore.json | 73 + .../redis-unstable/src/commands/sunsubscribe.json | 43 + examples/redis-unstable/src/commands/swapdb.json | 31 + examples/redis-unstable/src/commands/sync.json | 15 + examples/redis-unstable/src/commands/time.json | 28 + examples/redis-unstable/src/commands/touch.json | 53 + .../redis-unstable/src/commands/trimslots.json | 48 + examples/redis-unstable/src/commands/ttl.json | 70 + examples/redis-unstable/src/commands/type.json | 55 + examples/redis-unstable/src/commands/unlink.json | 54 + .../redis-unstable/src/commands/unsubscribe.json | 25 + examples/redis-unstable/src/commands/unwatch.json | 23 + examples/redis-unstable/src/commands/wait.json | 35 + examples/redis-unstable/src/commands/waitaof.json | 52 + examples/redis-unstable/src/commands/watch.json | 50 + examples/redis-unstable/src/commands/xack.json | 58 + 
examples/redis-unstable/src/commands/xackdel.json | 109 + examples/redis-unstable/src/commands/xadd.json | 231 + .../redis-unstable/src/commands/xautoclaim.json | 158 + examples/redis-unstable/src/commands/xcfgset.json | 79 + examples/redis-unstable/src/commands/xclaim.json | 138 + examples/redis-unstable/src/commands/xdel.json | 54 + examples/redis-unstable/src/commands/xdelex.json | 104 + .../redis-unstable/src/commands/xgroup-create.json | 86 + .../src/commands/xgroup-createconsumer.json | 64 + .../src/commands/xgroup-delconsumer.json | 57 + .../src/commands/xgroup-destroy.json | 59 + .../redis-unstable/src/commands/xgroup-help.json | 25 + .../redis-unstable/src/commands/xgroup-setid.json | 79 + examples/redis-unstable/src/commands/xgroup.json | 9 + .../src/commands/xinfo-consumers.json | 80 + .../redis-unstable/src/commands/xinfo-groups.json | 92 + .../redis-unstable/src/commands/xinfo-help.json | 25 + .../redis-unstable/src/commands/xinfo-stream.json | 414 + examples/redis-unstable/src/commands/xinfo.json | 9 + examples/redis-unstable/src/commands/xlen.json | 48 + examples/redis-unstable/src/commands/xpending.json | 190 + examples/redis-unstable/src/commands/xrange.json | 87 + examples/redis-unstable/src/commands/xread.json | 107 + .../redis-unstable/src/commands/xreadgroup.json | 180 + .../redis-unstable/src/commands/xrevrange.json | 86 + examples/redis-unstable/src/commands/xsetid.json | 72 + examples/redis-unstable/src/commands/xtrim.json | 134 + examples/redis-unstable/src/commands/zadd.json | 144 + examples/redis-unstable/src/commands/zcard.json | 47 + examples/redis-unstable/src/commands/zcount.json | 56 + examples/redis-unstable/src/commands/zdiff.json | 85 + .../redis-unstable/src/commands/zdiffstore.json | 77 + examples/redis-unstable/src/commands/zincrby.json | 58 + examples/redis-unstable/src/commands/zinter.json | 115 + .../redis-unstable/src/commands/zintercard.json | 60 + .../redis-unstable/src/commands/zinterstore.json | 108 + 
.../redis-unstable/src/commands/zlexcount.json | 57 + examples/redis-unstable/src/commands/zmpop.json | 111 + examples/redis-unstable/src/commands/zmscore.json | 65 + examples/redis-unstable/src/commands/zpopmax.json | 89 + examples/redis-unstable/src/commands/zpopmin.json | 89 + .../redis-unstable/src/commands/zrandmember.json | 101 + examples/redis-unstable/src/commands/zrange.json | 137 + .../redis-unstable/src/commands/zrangebylex.json | 80 + .../redis-unstable/src/commands/zrangebyscore.json | 119 + .../redis-unstable/src/commands/zrangestore.json | 118 + examples/redis-unstable/src/commands/zrank.json | 86 + examples/redis-unstable/src/commands/zrem.json | 60 + .../src/commands/zremrangebylex.json | 55 + .../src/commands/zremrangebyrank.json | 55 + .../src/commands/zremrangebyscore.json | 55 + .../redis-unstable/src/commands/zrevrange.json | 94 + .../src/commands/zrevrangebylex.json | 80 + .../src/commands/zrevrangebyscore.json | 118 + examples/redis-unstable/src/commands/zrevrank.json | 86 + examples/redis-unstable/src/commands/zscan.json | 81 + examples/redis-unstable/src/commands/zscore.json | 60 + examples/redis-unstable/src/commands/zunion.json | 115 + .../redis-unstable/src/commands/zunionstore.json | 107 + examples/redis-unstable/src/config.c | 3775 +++++ examples/redis-unstable/src/config.h | 375 + examples/redis-unstable/src/connection.c | 209 + examples/redis-unstable/src/connection.h | 471 + examples/redis-unstable/src/connhelpers.h | 68 + examples/redis-unstable/src/crc16.c | 88 + examples/redis-unstable/src/crc16_slottable.h | 836 + examples/redis-unstable/src/crc64.c | 371 + examples/redis-unstable/src/crc64.h | 13 + examples/redis-unstable/src/crccombine.c | 252 + examples/redis-unstable/src/crccombine.h | 10 + examples/redis-unstable/src/crcspeed.c | 410 + examples/redis-unstable/src/crcspeed.h | 62 + examples/redis-unstable/src/db.c | 3793 +++++ examples/redis-unstable/src/debug.c | 2849 ++++ examples/redis-unstable/src/debugmacro.h | 26 + 
examples/redis-unstable/src/defrag.c | 1985 +++ examples/redis-unstable/src/dict.c | 2340 +++ examples/redis-unstable/src/dict.h | 319 + examples/redis-unstable/src/ebuckets.c | 2725 ++++ examples/redis-unstable/src/ebuckets.h | 336 + examples/redis-unstable/src/endianconv.c | 109 + examples/redis-unstable/src/endianconv.h | 69 + examples/redis-unstable/src/entry.c | 408 + examples/redis-unstable/src/entry.h | 141 + examples/redis-unstable/src/estore.c | 496 + examples/redis-unstable/src/estore.h | 91 + examples/redis-unstable/src/eval.c | 1760 +++ examples/redis-unstable/src/eventnotifier.c | 98 + examples/redis-unstable/src/eventnotifier.h | 34 + examples/redis-unstable/src/evict.c | 764 + examples/redis-unstable/src/expire.c | 932 ++ examples/redis-unstable/src/fmacros.h | 56 + examples/redis-unstable/src/fmtargs.h | 213 + examples/redis-unstable/src/function_lua.c | 513 + examples/redis-unstable/src/functions.c | 1138 ++ examples/redis-unstable/src/functions.h | 127 + examples/redis-unstable/src/fwtree.c | 237 + examples/redis-unstable/src/fwtree.h | 71 + examples/redis-unstable/src/geo.c | 1006 ++ examples/redis-unstable/src/geo.h | 22 + examples/redis-unstable/src/geohash.c | 299 + examples/redis-unstable/src/geohash.h | 135 + examples/redis-unstable/src/geohash_helper.c | 280 + examples/redis-unstable/src/geohash_helper.h | 65 + examples/redis-unstable/src/hotkeys.c | 614 + examples/redis-unstable/src/hyperloglog.c | 2099 +++ examples/redis-unstable/src/intset.c | 566 + examples/redis-unstable/src/intset.h | 58 + examples/redis-unstable/src/iothread.c | 955 ++ examples/redis-unstable/src/keymeta.c | 935 ++ examples/redis-unstable/src/keymeta.h | 182 + examples/redis-unstable/src/kvstore.c | 1171 ++ examples/redis-unstable/src/kvstore.h | 159 + examples/redis-unstable/src/latency.c | 721 + examples/redis-unstable/src/latency.h | 88 + examples/redis-unstable/src/lazyfree.c | 362 + examples/redis-unstable/src/listpack.c | 3334 ++++ 
examples/redis-unstable/src/listpack.h | 97 + examples/redis-unstable/src/listpack_malloc.h | 29 + examples/redis-unstable/src/localtime.c | 103 + examples/redis-unstable/src/logreqres.c | 347 + examples/redis-unstable/src/lolwut.c | 172 + examples/redis-unstable/src/lolwut.h | 35 + examples/redis-unstable/src/lolwut5.c | 157 + examples/redis-unstable/src/lolwut6.c | 181 + examples/redis-unstable/src/lolwut8.c | 179 + examples/redis-unstable/src/lzf.h | 100 + examples/redis-unstable/src/lzfP.h | 190 + examples/redis-unstable/src/lzf_c.c | 309 + examples/redis-unstable/src/lzf_d.c | 191 + examples/redis-unstable/src/memory_prefetch.c | 413 + examples/redis-unstable/src/memory_prefetch.h | 26 + examples/redis-unstable/src/memtest.c | 347 + examples/redis-unstable/src/mkreleasehdr.sh | 16 + examples/redis-unstable/src/module.c | 15545 +++++++++++++++++++ examples/redis-unstable/src/modules/.gitignore | 2 + examples/redis-unstable/src/modules/Makefile | 69 + examples/redis-unstable/src/modules/helloacl.c | 190 + examples/redis-unstable/src/modules/helloblock.c | 198 + examples/redis-unstable/src/modules/hellocluster.c | 98 + examples/redis-unstable/src/modules/hellodict.c | 111 + examples/redis-unstable/src/modules/hellohook.c | 72 + examples/redis-unstable/src/modules/hellotimer.c | 55 + examples/redis-unstable/src/modules/hellotype.c | 342 + examples/redis-unstable/src/modules/helloworld.c | 601 + examples/redis-unstable/src/monotonic.c | 247 + examples/redis-unstable/src/monotonic.h | 61 + examples/redis-unstable/src/mstr.c | 528 + examples/redis-unstable/src/mstr.h | 227 + examples/redis-unstable/src/mt19937-64.c | 187 + examples/redis-unstable/src/mt19937-64.h | 87 + examples/redis-unstable/src/multi.c | 509 + examples/redis-unstable/src/networking.c | 5743 +++++++ examples/redis-unstable/src/notify.c | 129 + examples/redis-unstable/src/object.c | 1821 +++ examples/redis-unstable/src/object.h | 194 + examples/redis-unstable/src/pqsort.c | 185 + 
examples/redis-unstable/src/pqsort.h | 20 + examples/redis-unstable/src/pubsub.c | 768 + examples/redis-unstable/src/quicklist.c | 3658 +++++ examples/redis-unstable/src/quicklist.h | 218 + examples/redis-unstable/src/rand.c | 93 + examples/redis-unstable/src/rand.h | 18 + examples/redis-unstable/src/rax.c | 2098 +++ examples/redis-unstable/src/rax.h | 204 + examples/redis-unstable/src/rax_malloc.h | 28 + examples/redis-unstable/src/rdb.c | 4483 ++++++ examples/redis-unstable/src/rdb.h | 170 + examples/redis-unstable/src/redis-benchmark.c | 2028 +++ examples/redis-unstable/src/redis-check-aof.c | 589 + examples/redis-unstable/src/redis-check-rdb.c | 451 + examples/redis-unstable/src/redis-cli.c | 11143 +++++++++++++ examples/redis-unstable/src/redis-trib.rb | 129 + examples/redis-unstable/src/redisassert.c | 61 + examples/redis-unstable/src/redisassert.h | 35 + examples/redis-unstable/src/redismodule.h | 1885 +++ examples/redis-unstable/src/release.c | 49 + examples/redis-unstable/src/replication.c | 5387 +++++++ examples/redis-unstable/src/resp_parser.c | 209 + examples/redis-unstable/src/resp_parser.h | 74 + examples/redis-unstable/src/rio.c | 640 + examples/redis-unstable/src/rio.h | 188 + examples/redis-unstable/src/script.c | 699 + examples/redis-unstable/src/script.h | 95 + examples/redis-unstable/src/script_lua.c | 1767 +++ examples/redis-unstable/src/script_lua.h | 68 + examples/redis-unstable/src/sds.c | 1565 ++ examples/redis-unstable/src/sds.h | 340 + examples/redis-unstable/src/sdsalloc.h | 33 + examples/redis-unstable/src/sentinel.c | 5474 +++++++ examples/redis-unstable/src/server.c | 7941 ++++++++++ examples/redis-unstable/src/server.h | 4478 ++++++ examples/redis-unstable/src/setcpuaffinity.c | 155 + examples/redis-unstable/src/setproctitle.c | 331 + examples/redis-unstable/src/sha1.c | 239 + examples/redis-unstable/src/sha1.h | 27 + examples/redis-unstable/src/sha256.c | 163 + examples/redis-unstable/src/sha256.h | 35 + 
examples/redis-unstable/src/siphash.c | 373 + examples/redis-unstable/src/slowlog.c | 186 + examples/redis-unstable/src/slowlog.h | 31 + examples/redis-unstable/src/socket.c | 468 + examples/redis-unstable/src/solarisfixes.h | 34 + examples/redis-unstable/src/sort.c | 665 + examples/redis-unstable/src/sparkline.c | 162 + examples/redis-unstable/src/sparkline.h | 39 + examples/redis-unstable/src/stream.h | 208 + examples/redis-unstable/src/strl.c | 86 + examples/redis-unstable/src/syncio.c | 125 + examples/redis-unstable/src/syscheck.c | 354 + examples/redis-unstable/src/syscheck.h | 26 + examples/redis-unstable/src/t_hash.c | 4068 +++++ examples/redis-unstable/src/t_list.c | 1465 ++ examples/redis-unstable/src/t_set.c | 1863 +++ examples/redis-unstable/src/t_stream.c | 5755 +++++++ examples/redis-unstable/src/t_string.c | 1217 ++ examples/redis-unstable/src/t_zset.c | 5024 ++++++ examples/redis-unstable/src/testhelp.h | 44 + examples/redis-unstable/src/threads_mngr.c | 171 + examples/redis-unstable/src/threads_mngr.h | 49 + examples/redis-unstable/src/timeout.c | 183 + examples/redis-unstable/src/tls.c | 1295 ++ examples/redis-unstable/src/tracking.c | 666 + examples/redis-unstable/src/tsan.sup | 21 + examples/redis-unstable/src/unix.c | 219 + examples/redis-unstable/src/util.c | 1774 +++ examples/redis-unstable/src/util.h | 102 + examples/redis-unstable/src/valgrind.sup | 26 + examples/redis-unstable/src/version.h | 2 + examples/redis-unstable/src/ziplist.c | 2665 ++++ examples/redis-unstable/src/ziplist.h | 74 + examples/redis-unstable/src/zipmap.c | 524 + examples/redis-unstable/src/zipmap.h | 34 + examples/redis-unstable/src/zmalloc.c | 1217 ++ examples/redis-unstable/src/zmalloc.h | 168 + 637 files changed, 228441 insertions(+) create mode 100644 examples/redis-unstable/src/.gitignore create mode 100644 examples/redis-unstable/src/Makefile create mode 100644 examples/redis-unstable/src/acl.c create mode 100644 examples/redis-unstable/src/adlist.c create mode 
100644 examples/redis-unstable/src/adlist.h create mode 100644 examples/redis-unstable/src/ae.c create mode 100644 examples/redis-unstable/src/ae.h create mode 100644 examples/redis-unstable/src/ae_epoll.c create mode 100644 examples/redis-unstable/src/ae_evport.c create mode 100644 examples/redis-unstable/src/ae_kqueue.c create mode 100644 examples/redis-unstable/src/ae_select.c create mode 100644 examples/redis-unstable/src/anet.c create mode 100644 examples/redis-unstable/src/anet.h create mode 100644 examples/redis-unstable/src/aof.c create mode 100644 examples/redis-unstable/src/asciilogo.h create mode 100644 examples/redis-unstable/src/atomicvar.h create mode 100644 examples/redis-unstable/src/bio.c create mode 100644 examples/redis-unstable/src/bio.h create mode 100644 examples/redis-unstable/src/bitops.c create mode 100644 examples/redis-unstable/src/blocked.c create mode 100644 examples/redis-unstable/src/call_reply.c create mode 100644 examples/redis-unstable/src/call_reply.h create mode 100644 examples/redis-unstable/src/childinfo.c create mode 100644 examples/redis-unstable/src/chk.c create mode 100644 examples/redis-unstable/src/chk.h create mode 100644 examples/redis-unstable/src/cli_commands.c create mode 100644 examples/redis-unstable/src/cli_commands.h create mode 100644 examples/redis-unstable/src/cli_common.c create mode 100644 examples/redis-unstable/src/cli_common.h create mode 100644 examples/redis-unstable/src/cluster.c create mode 100644 examples/redis-unstable/src/cluster.h create mode 100644 examples/redis-unstable/src/cluster_asm.c create mode 100644 examples/redis-unstable/src/cluster_asm.h create mode 100644 examples/redis-unstable/src/cluster_legacy.c create mode 100644 examples/redis-unstable/src/cluster_legacy.h create mode 100644 examples/redis-unstable/src/cluster_slot_stats.c create mode 100644 examples/redis-unstable/src/cluster_slot_stats.h create mode 100644 examples/redis-unstable/src/commands.c create mode 100644 
examples/redis-unstable/src/commands.def create mode 100644 examples/redis-unstable/src/commands.h create mode 100644 examples/redis-unstable/src/commands/README.md create mode 100644 examples/redis-unstable/src/commands/acl-cat.json create mode 100644 examples/redis-unstable/src/commands/acl-deluser.json create mode 100644 examples/redis-unstable/src/commands/acl-dryrun.json create mode 100644 examples/redis-unstable/src/commands/acl-genpass.json create mode 100644 examples/redis-unstable/src/commands/acl-getuser.json create mode 100644 examples/redis-unstable/src/commands/acl-help.json create mode 100644 examples/redis-unstable/src/commands/acl-list.json create mode 100644 examples/redis-unstable/src/commands/acl-load.json create mode 100644 examples/redis-unstable/src/commands/acl-log.json create mode 100644 examples/redis-unstable/src/commands/acl-save.json create mode 100644 examples/redis-unstable/src/commands/acl-setuser.json create mode 100644 examples/redis-unstable/src/commands/acl-users.json create mode 100644 examples/redis-unstable/src/commands/acl-whoami.json create mode 100644 examples/redis-unstable/src/commands/acl.json create mode 100644 examples/redis-unstable/src/commands/append.json create mode 100644 examples/redis-unstable/src/commands/asking.json create mode 100644 examples/redis-unstable/src/commands/auth.json create mode 100644 examples/redis-unstable/src/commands/bgrewriteaof.json create mode 100644 examples/redis-unstable/src/commands/bgsave.json create mode 100644 examples/redis-unstable/src/commands/bitcount.json create mode 100644 examples/redis-unstable/src/commands/bitfield.json create mode 100644 examples/redis-unstable/src/commands/bitfield_ro.json create mode 100644 examples/redis-unstable/src/commands/bitop.json create mode 100644 examples/redis-unstable/src/commands/bitpos.json create mode 100644 examples/redis-unstable/src/commands/blmove.json create mode 100644 examples/redis-unstable/src/commands/blmpop.json create mode 
100644 examples/redis-unstable/src/commands/blpop.json create mode 100644 examples/redis-unstable/src/commands/brpop.json create mode 100644 examples/redis-unstable/src/commands/brpoplpush.json create mode 100644 examples/redis-unstable/src/commands/bzmpop.json create mode 100644 examples/redis-unstable/src/commands/bzpopmax.json create mode 100644 examples/redis-unstable/src/commands/bzpopmin.json create mode 100644 examples/redis-unstable/src/commands/client-caching.json create mode 100644 examples/redis-unstable/src/commands/client-getname.json create mode 100644 examples/redis-unstable/src/commands/client-getredir.json create mode 100644 examples/redis-unstable/src/commands/client-help.json create mode 100644 examples/redis-unstable/src/commands/client-id.json create mode 100644 examples/redis-unstable/src/commands/client-info.json create mode 100644 examples/redis-unstable/src/commands/client-kill.json create mode 100644 examples/redis-unstable/src/commands/client-list.json create mode 100644 examples/redis-unstable/src/commands/client-no-evict.json create mode 100644 examples/redis-unstable/src/commands/client-no-touch.json create mode 100644 examples/redis-unstable/src/commands/client-pause.json create mode 100644 examples/redis-unstable/src/commands/client-reply.json create mode 100644 examples/redis-unstable/src/commands/client-setinfo.json create mode 100644 examples/redis-unstable/src/commands/client-setname.json create mode 100644 examples/redis-unstable/src/commands/client-tracking.json create mode 100644 examples/redis-unstable/src/commands/client-trackinginfo.json create mode 100644 examples/redis-unstable/src/commands/client-unblock.json create mode 100644 examples/redis-unstable/src/commands/client-unpause.json create mode 100644 examples/redis-unstable/src/commands/client.json create mode 100644 examples/redis-unstable/src/commands/cluster-addslots.json create mode 100644 examples/redis-unstable/src/commands/cluster-addslotsrange.json create mode 
100644 examples/redis-unstable/src/commands/cluster-bumpepoch.json create mode 100644 examples/redis-unstable/src/commands/cluster-count-failure-reports.json create mode 100644 examples/redis-unstable/src/commands/cluster-countkeysinslot.json create mode 100644 examples/redis-unstable/src/commands/cluster-delslots.json create mode 100644 examples/redis-unstable/src/commands/cluster-delslotsrange.json create mode 100644 examples/redis-unstable/src/commands/cluster-failover.json create mode 100644 examples/redis-unstable/src/commands/cluster-flushslots.json create mode 100644 examples/redis-unstable/src/commands/cluster-forget.json create mode 100644 examples/redis-unstable/src/commands/cluster-getkeysinslot.json create mode 100644 examples/redis-unstable/src/commands/cluster-help.json create mode 100644 examples/redis-unstable/src/commands/cluster-info.json create mode 100644 examples/redis-unstable/src/commands/cluster-keyslot.json create mode 100644 examples/redis-unstable/src/commands/cluster-links.json create mode 100644 examples/redis-unstable/src/commands/cluster-meet.json create mode 100644 examples/redis-unstable/src/commands/cluster-migration.json create mode 100644 examples/redis-unstable/src/commands/cluster-myid.json create mode 100644 examples/redis-unstable/src/commands/cluster-myshardid.json create mode 100644 examples/redis-unstable/src/commands/cluster-nodes.json create mode 100644 examples/redis-unstable/src/commands/cluster-replicas.json create mode 100644 examples/redis-unstable/src/commands/cluster-replicate.json create mode 100644 examples/redis-unstable/src/commands/cluster-reset.json create mode 100644 examples/redis-unstable/src/commands/cluster-saveconfig.json create mode 100644 examples/redis-unstable/src/commands/cluster-set-config-epoch.json create mode 100644 examples/redis-unstable/src/commands/cluster-setslot.json create mode 100644 examples/redis-unstable/src/commands/cluster-shards.json create mode 100644 
examples/redis-unstable/src/commands/cluster-slaves.json create mode 100644 examples/redis-unstable/src/commands/cluster-slot-stats.json create mode 100644 examples/redis-unstable/src/commands/cluster-slots.json create mode 100644 examples/redis-unstable/src/commands/cluster-syncslots.json create mode 100644 examples/redis-unstable/src/commands/cluster.json create mode 100644 examples/redis-unstable/src/commands/command-count.json create mode 100644 examples/redis-unstable/src/commands/command-docs.json create mode 100644 examples/redis-unstable/src/commands/command-getkeys.json create mode 100644 examples/redis-unstable/src/commands/command-getkeysandflags.json create mode 100644 examples/redis-unstable/src/commands/command-help.json create mode 100644 examples/redis-unstable/src/commands/command-info.json create mode 100644 examples/redis-unstable/src/commands/command-list.json create mode 100644 examples/redis-unstable/src/commands/command.json create mode 100644 examples/redis-unstable/src/commands/config-get.json create mode 100644 examples/redis-unstable/src/commands/config-help.json create mode 100644 examples/redis-unstable/src/commands/config-resetstat.json create mode 100644 examples/redis-unstable/src/commands/config-rewrite.json create mode 100644 examples/redis-unstable/src/commands/config-set.json create mode 100644 examples/redis-unstable/src/commands/config.json create mode 100644 examples/redis-unstable/src/commands/copy.json create mode 100644 examples/redis-unstable/src/commands/dbsize.json create mode 100644 examples/redis-unstable/src/commands/debug.json create mode 100644 examples/redis-unstable/src/commands/decr.json create mode 100644 examples/redis-unstable/src/commands/decrby.json create mode 100644 examples/redis-unstable/src/commands/del.json create mode 100644 examples/redis-unstable/src/commands/delex.json create mode 100644 examples/redis-unstable/src/commands/digest.json create mode 100644 
examples/redis-unstable/src/commands/discard.json create mode 100644 examples/redis-unstable/src/commands/dump.json create mode 100644 examples/redis-unstable/src/commands/echo.json create mode 100644 examples/redis-unstable/src/commands/eval.json create mode 100644 examples/redis-unstable/src/commands/eval_ro.json create mode 100644 examples/redis-unstable/src/commands/evalsha.json create mode 100644 examples/redis-unstable/src/commands/evalsha_ro.json create mode 100644 examples/redis-unstable/src/commands/exec.json create mode 100644 examples/redis-unstable/src/commands/exists.json create mode 100644 examples/redis-unstable/src/commands/expire.json create mode 100644 examples/redis-unstable/src/commands/expireat.json create mode 100644 examples/redis-unstable/src/commands/expiretime.json create mode 100644 examples/redis-unstable/src/commands/failover.json create mode 100644 examples/redis-unstable/src/commands/fcall.json create mode 100644 examples/redis-unstable/src/commands/fcall_ro.json create mode 100644 examples/redis-unstable/src/commands/flushall.json create mode 100644 examples/redis-unstable/src/commands/flushdb.json create mode 100644 examples/redis-unstable/src/commands/function-delete.json create mode 100644 examples/redis-unstable/src/commands/function-dump.json create mode 100644 examples/redis-unstable/src/commands/function-flush.json create mode 100644 examples/redis-unstable/src/commands/function-help.json create mode 100644 examples/redis-unstable/src/commands/function-kill.json create mode 100644 examples/redis-unstable/src/commands/function-list.json create mode 100644 examples/redis-unstable/src/commands/function-load.json create mode 100644 examples/redis-unstable/src/commands/function-restore.json create mode 100644 examples/redis-unstable/src/commands/function-stats.json create mode 100644 examples/redis-unstable/src/commands/function.json create mode 100644 examples/redis-unstable/src/commands/geoadd.json create mode 100644 
examples/redis-unstable/src/commands/geodist.json create mode 100644 examples/redis-unstable/src/commands/geohash.json create mode 100644 examples/redis-unstable/src/commands/geopos.json create mode 100644 examples/redis-unstable/src/commands/georadius.json create mode 100644 examples/redis-unstable/src/commands/georadius_ro.json create mode 100644 examples/redis-unstable/src/commands/georadiusbymember.json create mode 100644 examples/redis-unstable/src/commands/georadiusbymember_ro.json create mode 100644 examples/redis-unstable/src/commands/geosearch.json create mode 100644 examples/redis-unstable/src/commands/geosearchstore.json create mode 100644 examples/redis-unstable/src/commands/get.json create mode 100644 examples/redis-unstable/src/commands/getbit.json create mode 100644 examples/redis-unstable/src/commands/getdel.json create mode 100644 examples/redis-unstable/src/commands/getex.json create mode 100644 examples/redis-unstable/src/commands/getrange.json create mode 100644 examples/redis-unstable/src/commands/getset.json create mode 100644 examples/redis-unstable/src/commands/hdel.json create mode 100644 examples/redis-unstable/src/commands/hello.json create mode 100644 examples/redis-unstable/src/commands/hexists.json create mode 100644 examples/redis-unstable/src/commands/hexpire.json create mode 100644 examples/redis-unstable/src/commands/hexpireat.json create mode 100644 examples/redis-unstable/src/commands/hexpiretime.json create mode 100644 examples/redis-unstable/src/commands/hget.json create mode 100644 examples/redis-unstable/src/commands/hgetall.json create mode 100644 examples/redis-unstable/src/commands/hgetdel.json create mode 100644 examples/redis-unstable/src/commands/hgetex.json create mode 100644 examples/redis-unstable/src/commands/hincrby.json create mode 100644 examples/redis-unstable/src/commands/hincrbyfloat.json create mode 100644 examples/redis-unstable/src/commands/hkeys.json create mode 100644 
examples/redis-unstable/src/commands/hlen.json create mode 100644 examples/redis-unstable/src/commands/hmget.json create mode 100644 examples/redis-unstable/src/commands/hmset.json create mode 100644 examples/redis-unstable/src/commands/hotkeys-get.json create mode 100644 examples/redis-unstable/src/commands/hotkeys-reset.json create mode 100644 examples/redis-unstable/src/commands/hotkeys-start.json create mode 100644 examples/redis-unstable/src/commands/hotkeys-stop.json create mode 100644 examples/redis-unstable/src/commands/hotkeys.json create mode 100644 examples/redis-unstable/src/commands/hpersist.json create mode 100644 examples/redis-unstable/src/commands/hpexpire.json create mode 100644 examples/redis-unstable/src/commands/hpexpireat.json create mode 100644 examples/redis-unstable/src/commands/hpexpiretime.json create mode 100644 examples/redis-unstable/src/commands/hpttl.json create mode 100644 examples/redis-unstable/src/commands/hrandfield.json create mode 100644 examples/redis-unstable/src/commands/hscan.json create mode 100644 examples/redis-unstable/src/commands/hset.json create mode 100644 examples/redis-unstable/src/commands/hsetex.json create mode 100644 examples/redis-unstable/src/commands/hsetnx.json create mode 100644 examples/redis-unstable/src/commands/hstrlen.json create mode 100644 examples/redis-unstable/src/commands/httl.json create mode 100644 examples/redis-unstable/src/commands/hvals.json create mode 100644 examples/redis-unstable/src/commands/incr.json create mode 100644 examples/redis-unstable/src/commands/incrby.json create mode 100644 examples/redis-unstable/src/commands/incrbyfloat.json create mode 100644 examples/redis-unstable/src/commands/info.json create mode 100644 examples/redis-unstable/src/commands/keys.json create mode 100644 examples/redis-unstable/src/commands/lastsave.json create mode 100644 examples/redis-unstable/src/commands/latency-doctor.json create mode 100644 
examples/redis-unstable/src/commands/latency-graph.json create mode 100644 examples/redis-unstable/src/commands/latency-help.json create mode 100644 examples/redis-unstable/src/commands/latency-histogram.json create mode 100644 examples/redis-unstable/src/commands/latency-history.json create mode 100644 examples/redis-unstable/src/commands/latency-latest.json create mode 100644 examples/redis-unstable/src/commands/latency-reset.json create mode 100644 examples/redis-unstable/src/commands/latency.json create mode 100644 examples/redis-unstable/src/commands/lcs.json create mode 100644 examples/redis-unstable/src/commands/lindex.json create mode 100644 examples/redis-unstable/src/commands/linsert.json create mode 100644 examples/redis-unstable/src/commands/llen.json create mode 100644 examples/redis-unstable/src/commands/lmove.json create mode 100644 examples/redis-unstable/src/commands/lmpop.json create mode 100644 examples/redis-unstable/src/commands/lolwut.json create mode 100644 examples/redis-unstable/src/commands/lpop.json create mode 100644 examples/redis-unstable/src/commands/lpos.json create mode 100644 examples/redis-unstable/src/commands/lpush.json create mode 100644 examples/redis-unstable/src/commands/lpushx.json create mode 100644 examples/redis-unstable/src/commands/lrange.json create mode 100644 examples/redis-unstable/src/commands/lrem.json create mode 100644 examples/redis-unstable/src/commands/lset.json create mode 100644 examples/redis-unstable/src/commands/ltrim.json create mode 100644 examples/redis-unstable/src/commands/memory-doctor.json create mode 100644 examples/redis-unstable/src/commands/memory-help.json create mode 100644 examples/redis-unstable/src/commands/memory-malloc-stats.json create mode 100644 examples/redis-unstable/src/commands/memory-purge.json create mode 100644 examples/redis-unstable/src/commands/memory-stats.json create mode 100644 examples/redis-unstable/src/commands/memory-usage.json create mode 100644 
examples/redis-unstable/src/commands/memory.json create mode 100644 examples/redis-unstable/src/commands/mget.json create mode 100644 examples/redis-unstable/src/commands/migrate.json create mode 100644 examples/redis-unstable/src/commands/module-help.json create mode 100644 examples/redis-unstable/src/commands/module-list.json create mode 100644 examples/redis-unstable/src/commands/module-load.json create mode 100644 examples/redis-unstable/src/commands/module-loadex.json create mode 100644 examples/redis-unstable/src/commands/module-unload.json create mode 100644 examples/redis-unstable/src/commands/module.json create mode 100644 examples/redis-unstable/src/commands/monitor.json create mode 100644 examples/redis-unstable/src/commands/move.json create mode 100644 examples/redis-unstable/src/commands/mset.json create mode 100644 examples/redis-unstable/src/commands/msetex.json create mode 100644 examples/redis-unstable/src/commands/msetnx.json create mode 100644 examples/redis-unstable/src/commands/multi.json create mode 100644 examples/redis-unstable/src/commands/object-encoding.json create mode 100644 examples/redis-unstable/src/commands/object-freq.json create mode 100644 examples/redis-unstable/src/commands/object-help.json create mode 100644 examples/redis-unstable/src/commands/object-idletime.json create mode 100644 examples/redis-unstable/src/commands/object-refcount.json create mode 100644 examples/redis-unstable/src/commands/object.json create mode 100644 examples/redis-unstable/src/commands/persist.json create mode 100644 examples/redis-unstable/src/commands/pexpire.json create mode 100644 examples/redis-unstable/src/commands/pexpireat.json create mode 100644 examples/redis-unstable/src/commands/pexpiretime.json create mode 100644 examples/redis-unstable/src/commands/pfadd.json create mode 100644 examples/redis-unstable/src/commands/pfcount.json create mode 100644 examples/redis-unstable/src/commands/pfdebug.json create mode 100644 
examples/redis-unstable/src/commands/pfmerge.json create mode 100644 examples/redis-unstable/src/commands/pfselftest.json create mode 100644 examples/redis-unstable/src/commands/ping.json create mode 100644 examples/redis-unstable/src/commands/psetex.json create mode 100644 examples/redis-unstable/src/commands/psubscribe.json create mode 100644 examples/redis-unstable/src/commands/psync.json create mode 100644 examples/redis-unstable/src/commands/pttl.json create mode 100644 examples/redis-unstable/src/commands/publish.json create mode 100644 examples/redis-unstable/src/commands/pubsub-channels.json create mode 100644 examples/redis-unstable/src/commands/pubsub-help.json create mode 100644 examples/redis-unstable/src/commands/pubsub-numpat.json create mode 100644 examples/redis-unstable/src/commands/pubsub-numsub.json create mode 100644 examples/redis-unstable/src/commands/pubsub-shardchannels.json create mode 100644 examples/redis-unstable/src/commands/pubsub-shardnumsub.json create mode 100644 examples/redis-unstable/src/commands/pubsub.json create mode 100644 examples/redis-unstable/src/commands/punsubscribe.json create mode 100644 examples/redis-unstable/src/commands/quit.json create mode 100644 examples/redis-unstable/src/commands/randomkey.json create mode 100644 examples/redis-unstable/src/commands/readonly.json create mode 100644 examples/redis-unstable/src/commands/readwrite.json create mode 100644 examples/redis-unstable/src/commands/rename.json create mode 100644 examples/redis-unstable/src/commands/renamenx.json create mode 100644 examples/redis-unstable/src/commands/replconf.json create mode 100644 examples/redis-unstable/src/commands/replicaof.json create mode 100644 examples/redis-unstable/src/commands/reset.json create mode 100644 examples/redis-unstable/src/commands/restore-asking.json create mode 100644 examples/redis-unstable/src/commands/restore.json create mode 100644 examples/redis-unstable/src/commands/role.json create mode 100644 
examples/redis-unstable/src/commands/rpop.json create mode 100644 examples/redis-unstable/src/commands/rpoplpush.json create mode 100644 examples/redis-unstable/src/commands/rpush.json create mode 100644 examples/redis-unstable/src/commands/rpushx.json create mode 100644 examples/redis-unstable/src/commands/sadd.json create mode 100644 examples/redis-unstable/src/commands/save.json create mode 100644 examples/redis-unstable/src/commands/scan.json create mode 100644 examples/redis-unstable/src/commands/scard.json create mode 100644 examples/redis-unstable/src/commands/script-debug.json create mode 100644 examples/redis-unstable/src/commands/script-exists.json create mode 100644 examples/redis-unstable/src/commands/script-flush.json create mode 100644 examples/redis-unstable/src/commands/script-help.json create mode 100644 examples/redis-unstable/src/commands/script-kill.json create mode 100644 examples/redis-unstable/src/commands/script-load.json create mode 100644 examples/redis-unstable/src/commands/script.json create mode 100644 examples/redis-unstable/src/commands/sdiff.json create mode 100644 examples/redis-unstable/src/commands/sdiffstore.json create mode 100644 examples/redis-unstable/src/commands/select.json create mode 100644 examples/redis-unstable/src/commands/sentinel-ckquorum.json create mode 100644 examples/redis-unstable/src/commands/sentinel-config.json create mode 100644 examples/redis-unstable/src/commands/sentinel-debug.json create mode 100644 examples/redis-unstable/src/commands/sentinel-failover.json create mode 100644 examples/redis-unstable/src/commands/sentinel-flushconfig.json create mode 100644 examples/redis-unstable/src/commands/sentinel-get-master-addr-by-name.json create mode 100644 examples/redis-unstable/src/commands/sentinel-help.json create mode 100644 examples/redis-unstable/src/commands/sentinel-info-cache.json create mode 100644 examples/redis-unstable/src/commands/sentinel-is-master-down-by-addr.json create mode 100644 
examples/redis-unstable/src/commands/sentinel-master.json create mode 100644 examples/redis-unstable/src/commands/sentinel-masters.json create mode 100644 examples/redis-unstable/src/commands/sentinel-monitor.json create mode 100644 examples/redis-unstable/src/commands/sentinel-myid.json create mode 100644 examples/redis-unstable/src/commands/sentinel-pending-scripts.json create mode 100644 examples/redis-unstable/src/commands/sentinel-remove.json create mode 100644 examples/redis-unstable/src/commands/sentinel-replicas.json create mode 100644 examples/redis-unstable/src/commands/sentinel-reset.json create mode 100644 examples/redis-unstable/src/commands/sentinel-sentinels.json create mode 100644 examples/redis-unstable/src/commands/sentinel-set.json create mode 100644 examples/redis-unstable/src/commands/sentinel-simulate-failure.json create mode 100644 examples/redis-unstable/src/commands/sentinel-slaves.json create mode 100644 examples/redis-unstable/src/commands/sentinel.json create mode 100644 examples/redis-unstable/src/commands/set.json create mode 100644 examples/redis-unstable/src/commands/setbit.json create mode 100644 examples/redis-unstable/src/commands/setex.json create mode 100644 examples/redis-unstable/src/commands/setnx.json create mode 100644 examples/redis-unstable/src/commands/setrange.json create mode 100644 examples/redis-unstable/src/commands/sflush.json create mode 100644 examples/redis-unstable/src/commands/shutdown.json create mode 100644 examples/redis-unstable/src/commands/sinter.json create mode 100644 examples/redis-unstable/src/commands/sintercard.json create mode 100644 examples/redis-unstable/src/commands/sinterstore.json create mode 100644 examples/redis-unstable/src/commands/sismember.json create mode 100644 examples/redis-unstable/src/commands/slaveof.json create mode 100644 examples/redis-unstable/src/commands/slowlog-get.json create mode 100644 examples/redis-unstable/src/commands/slowlog-help.json create mode 100644 
examples/redis-unstable/src/commands/slowlog-len.json create mode 100644 examples/redis-unstable/src/commands/slowlog-reset.json create mode 100644 examples/redis-unstable/src/commands/slowlog.json create mode 100644 examples/redis-unstable/src/commands/smembers.json create mode 100644 examples/redis-unstable/src/commands/smismember.json create mode 100644 examples/redis-unstable/src/commands/smove.json create mode 100644 examples/redis-unstable/src/commands/sort.json create mode 100644 examples/redis-unstable/src/commands/sort_ro.json create mode 100644 examples/redis-unstable/src/commands/spop.json create mode 100644 examples/redis-unstable/src/commands/spublish.json create mode 100644 examples/redis-unstable/src/commands/srandmember.json create mode 100644 examples/redis-unstable/src/commands/srem.json create mode 100644 examples/redis-unstable/src/commands/sscan.json create mode 100644 examples/redis-unstable/src/commands/ssubscribe.json create mode 100644 examples/redis-unstable/src/commands/strlen.json create mode 100644 examples/redis-unstable/src/commands/subscribe.json create mode 100644 examples/redis-unstable/src/commands/substr.json create mode 100644 examples/redis-unstable/src/commands/sunion.json create mode 100644 examples/redis-unstable/src/commands/sunionstore.json create mode 100644 examples/redis-unstable/src/commands/sunsubscribe.json create mode 100644 examples/redis-unstable/src/commands/swapdb.json create mode 100644 examples/redis-unstable/src/commands/sync.json create mode 100644 examples/redis-unstable/src/commands/time.json create mode 100644 examples/redis-unstable/src/commands/touch.json create mode 100644 examples/redis-unstable/src/commands/trimslots.json create mode 100644 examples/redis-unstable/src/commands/ttl.json create mode 100644 examples/redis-unstable/src/commands/type.json create mode 100644 examples/redis-unstable/src/commands/unlink.json create mode 100644 examples/redis-unstable/src/commands/unsubscribe.json create mode 
100644 examples/redis-unstable/src/commands/unwatch.json create mode 100644 examples/redis-unstable/src/commands/wait.json create mode 100644 examples/redis-unstable/src/commands/waitaof.json create mode 100644 examples/redis-unstable/src/commands/watch.json create mode 100644 examples/redis-unstable/src/commands/xack.json create mode 100644 examples/redis-unstable/src/commands/xackdel.json create mode 100644 examples/redis-unstable/src/commands/xadd.json create mode 100644 examples/redis-unstable/src/commands/xautoclaim.json create mode 100644 examples/redis-unstable/src/commands/xcfgset.json create mode 100644 examples/redis-unstable/src/commands/xclaim.json create mode 100644 examples/redis-unstable/src/commands/xdel.json create mode 100644 examples/redis-unstable/src/commands/xdelex.json create mode 100644 examples/redis-unstable/src/commands/xgroup-create.json create mode 100644 examples/redis-unstable/src/commands/xgroup-createconsumer.json create mode 100644 examples/redis-unstable/src/commands/xgroup-delconsumer.json create mode 100644 examples/redis-unstable/src/commands/xgroup-destroy.json create mode 100644 examples/redis-unstable/src/commands/xgroup-help.json create mode 100644 examples/redis-unstable/src/commands/xgroup-setid.json create mode 100644 examples/redis-unstable/src/commands/xgroup.json create mode 100644 examples/redis-unstable/src/commands/xinfo-consumers.json create mode 100644 examples/redis-unstable/src/commands/xinfo-groups.json create mode 100644 examples/redis-unstable/src/commands/xinfo-help.json create mode 100644 examples/redis-unstable/src/commands/xinfo-stream.json create mode 100644 examples/redis-unstable/src/commands/xinfo.json create mode 100644 examples/redis-unstable/src/commands/xlen.json create mode 100644 examples/redis-unstable/src/commands/xpending.json create mode 100644 examples/redis-unstable/src/commands/xrange.json create mode 100644 examples/redis-unstable/src/commands/xread.json create mode 100644 
examples/redis-unstable/src/commands/xreadgroup.json create mode 100644 examples/redis-unstable/src/commands/xrevrange.json create mode 100644 examples/redis-unstable/src/commands/xsetid.json create mode 100644 examples/redis-unstable/src/commands/xtrim.json create mode 100644 examples/redis-unstable/src/commands/zadd.json create mode 100644 examples/redis-unstable/src/commands/zcard.json create mode 100644 examples/redis-unstable/src/commands/zcount.json create mode 100644 examples/redis-unstable/src/commands/zdiff.json create mode 100644 examples/redis-unstable/src/commands/zdiffstore.json create mode 100644 examples/redis-unstable/src/commands/zincrby.json create mode 100644 examples/redis-unstable/src/commands/zinter.json create mode 100644 examples/redis-unstable/src/commands/zintercard.json create mode 100644 examples/redis-unstable/src/commands/zinterstore.json create mode 100644 examples/redis-unstable/src/commands/zlexcount.json create mode 100644 examples/redis-unstable/src/commands/zmpop.json create mode 100644 examples/redis-unstable/src/commands/zmscore.json create mode 100644 examples/redis-unstable/src/commands/zpopmax.json create mode 100644 examples/redis-unstable/src/commands/zpopmin.json create mode 100644 examples/redis-unstable/src/commands/zrandmember.json create mode 100644 examples/redis-unstable/src/commands/zrange.json create mode 100644 examples/redis-unstable/src/commands/zrangebylex.json create mode 100644 examples/redis-unstable/src/commands/zrangebyscore.json create mode 100644 examples/redis-unstable/src/commands/zrangestore.json create mode 100644 examples/redis-unstable/src/commands/zrank.json create mode 100644 examples/redis-unstable/src/commands/zrem.json create mode 100644 examples/redis-unstable/src/commands/zremrangebylex.json create mode 100644 examples/redis-unstable/src/commands/zremrangebyrank.json create mode 100644 examples/redis-unstable/src/commands/zremrangebyscore.json create mode 100644 
examples/redis-unstable/src/commands/zrevrange.json create mode 100644 examples/redis-unstable/src/commands/zrevrangebylex.json create mode 100644 examples/redis-unstable/src/commands/zrevrangebyscore.json create mode 100644 examples/redis-unstable/src/commands/zrevrank.json create mode 100644 examples/redis-unstable/src/commands/zscan.json create mode 100644 examples/redis-unstable/src/commands/zscore.json create mode 100644 examples/redis-unstable/src/commands/zunion.json create mode 100644 examples/redis-unstable/src/commands/zunionstore.json create mode 100644 examples/redis-unstable/src/config.c create mode 100644 examples/redis-unstable/src/config.h create mode 100644 examples/redis-unstable/src/connection.c create mode 100644 examples/redis-unstable/src/connection.h create mode 100644 examples/redis-unstable/src/connhelpers.h create mode 100644 examples/redis-unstable/src/crc16.c create mode 100644 examples/redis-unstable/src/crc16_slottable.h create mode 100644 examples/redis-unstable/src/crc64.c create mode 100644 examples/redis-unstable/src/crc64.h create mode 100644 examples/redis-unstable/src/crccombine.c create mode 100644 examples/redis-unstable/src/crccombine.h create mode 100644 examples/redis-unstable/src/crcspeed.c create mode 100644 examples/redis-unstable/src/crcspeed.h create mode 100644 examples/redis-unstable/src/db.c create mode 100644 examples/redis-unstable/src/debug.c create mode 100644 examples/redis-unstable/src/debugmacro.h create mode 100644 examples/redis-unstable/src/defrag.c create mode 100644 examples/redis-unstable/src/dict.c create mode 100644 examples/redis-unstable/src/dict.h create mode 100644 examples/redis-unstable/src/ebuckets.c create mode 100644 examples/redis-unstable/src/ebuckets.h create mode 100644 examples/redis-unstable/src/endianconv.c create mode 100644 examples/redis-unstable/src/endianconv.h create mode 100644 examples/redis-unstable/src/entry.c create mode 100644 examples/redis-unstable/src/entry.h create mode 
100644 examples/redis-unstable/src/estore.c create mode 100644 examples/redis-unstable/src/estore.h create mode 100644 examples/redis-unstable/src/eval.c create mode 100644 examples/redis-unstable/src/eventnotifier.c create mode 100644 examples/redis-unstable/src/eventnotifier.h create mode 100644 examples/redis-unstable/src/evict.c create mode 100644 examples/redis-unstable/src/expire.c create mode 100644 examples/redis-unstable/src/fmacros.h create mode 100644 examples/redis-unstable/src/fmtargs.h create mode 100644 examples/redis-unstable/src/function_lua.c create mode 100644 examples/redis-unstable/src/functions.c create mode 100644 examples/redis-unstable/src/functions.h create mode 100644 examples/redis-unstable/src/fwtree.c create mode 100644 examples/redis-unstable/src/fwtree.h create mode 100644 examples/redis-unstable/src/geo.c create mode 100644 examples/redis-unstable/src/geo.h create mode 100644 examples/redis-unstable/src/geohash.c create mode 100644 examples/redis-unstable/src/geohash.h create mode 100644 examples/redis-unstable/src/geohash_helper.c create mode 100644 examples/redis-unstable/src/geohash_helper.h create mode 100644 examples/redis-unstable/src/hotkeys.c create mode 100644 examples/redis-unstable/src/hyperloglog.c create mode 100644 examples/redis-unstable/src/intset.c create mode 100644 examples/redis-unstable/src/intset.h create mode 100644 examples/redis-unstable/src/iothread.c create mode 100644 examples/redis-unstable/src/keymeta.c create mode 100644 examples/redis-unstable/src/keymeta.h create mode 100644 examples/redis-unstable/src/kvstore.c create mode 100644 examples/redis-unstable/src/kvstore.h create mode 100644 examples/redis-unstable/src/latency.c create mode 100644 examples/redis-unstable/src/latency.h create mode 100644 examples/redis-unstable/src/lazyfree.c create mode 100644 examples/redis-unstable/src/listpack.c create mode 100644 examples/redis-unstable/src/listpack.h create mode 100644 
examples/redis-unstable/src/listpack_malloc.h create mode 100644 examples/redis-unstable/src/localtime.c create mode 100644 examples/redis-unstable/src/logreqres.c create mode 100644 examples/redis-unstable/src/lolwut.c create mode 100644 examples/redis-unstable/src/lolwut.h create mode 100644 examples/redis-unstable/src/lolwut5.c create mode 100644 examples/redis-unstable/src/lolwut6.c create mode 100644 examples/redis-unstable/src/lolwut8.c create mode 100644 examples/redis-unstable/src/lzf.h create mode 100644 examples/redis-unstable/src/lzfP.h create mode 100644 examples/redis-unstable/src/lzf_c.c create mode 100644 examples/redis-unstable/src/lzf_d.c create mode 100644 examples/redis-unstable/src/memory_prefetch.c create mode 100644 examples/redis-unstable/src/memory_prefetch.h create mode 100644 examples/redis-unstable/src/memtest.c create mode 100755 examples/redis-unstable/src/mkreleasehdr.sh create mode 100644 examples/redis-unstable/src/module.c create mode 100644 examples/redis-unstable/src/modules/.gitignore create mode 100644 examples/redis-unstable/src/modules/Makefile create mode 100644 examples/redis-unstable/src/modules/helloacl.c create mode 100644 examples/redis-unstable/src/modules/helloblock.c create mode 100644 examples/redis-unstable/src/modules/hellocluster.c create mode 100644 examples/redis-unstable/src/modules/hellodict.c create mode 100644 examples/redis-unstable/src/modules/hellohook.c create mode 100644 examples/redis-unstable/src/modules/hellotimer.c create mode 100644 examples/redis-unstable/src/modules/hellotype.c create mode 100644 examples/redis-unstable/src/modules/helloworld.c create mode 100644 examples/redis-unstable/src/monotonic.c create mode 100644 examples/redis-unstable/src/monotonic.h create mode 100644 examples/redis-unstable/src/mstr.c create mode 100644 examples/redis-unstable/src/mstr.h create mode 100644 examples/redis-unstable/src/mt19937-64.c create mode 100644 examples/redis-unstable/src/mt19937-64.h create mode 
100644 examples/redis-unstable/src/multi.c create mode 100644 examples/redis-unstable/src/networking.c create mode 100644 examples/redis-unstable/src/notify.c create mode 100644 examples/redis-unstable/src/object.c create mode 100644 examples/redis-unstable/src/object.h create mode 100644 examples/redis-unstable/src/pqsort.c create mode 100644 examples/redis-unstable/src/pqsort.h create mode 100644 examples/redis-unstable/src/pubsub.c create mode 100644 examples/redis-unstable/src/quicklist.c create mode 100644 examples/redis-unstable/src/quicklist.h create mode 100644 examples/redis-unstable/src/rand.c create mode 100644 examples/redis-unstable/src/rand.h create mode 100644 examples/redis-unstable/src/rax.c create mode 100644 examples/redis-unstable/src/rax.h create mode 100644 examples/redis-unstable/src/rax_malloc.h create mode 100644 examples/redis-unstable/src/rdb.c create mode 100644 examples/redis-unstable/src/rdb.h create mode 100644 examples/redis-unstable/src/redis-benchmark.c create mode 100644 examples/redis-unstable/src/redis-check-aof.c create mode 100644 examples/redis-unstable/src/redis-check-rdb.c create mode 100644 examples/redis-unstable/src/redis-cli.c create mode 100755 examples/redis-unstable/src/redis-trib.rb create mode 100644 examples/redis-unstable/src/redisassert.c create mode 100644 examples/redis-unstable/src/redisassert.h create mode 100644 examples/redis-unstable/src/redismodule.h create mode 100644 examples/redis-unstable/src/release.c create mode 100644 examples/redis-unstable/src/replication.c create mode 100644 examples/redis-unstable/src/resp_parser.c create mode 100644 examples/redis-unstable/src/resp_parser.h create mode 100644 examples/redis-unstable/src/rio.c create mode 100644 examples/redis-unstable/src/rio.h create mode 100644 examples/redis-unstable/src/script.c create mode 100644 examples/redis-unstable/src/script.h create mode 100644 examples/redis-unstable/src/script_lua.c create mode 100644 
examples/redis-unstable/src/script_lua.h create mode 100644 examples/redis-unstable/src/sds.c create mode 100644 examples/redis-unstable/src/sds.h create mode 100644 examples/redis-unstable/src/sdsalloc.h create mode 100644 examples/redis-unstable/src/sentinel.c create mode 100644 examples/redis-unstable/src/server.c create mode 100644 examples/redis-unstable/src/server.h create mode 100644 examples/redis-unstable/src/setcpuaffinity.c create mode 100644 examples/redis-unstable/src/setproctitle.c create mode 100644 examples/redis-unstable/src/sha1.c create mode 100644 examples/redis-unstable/src/sha1.h create mode 100644 examples/redis-unstable/src/sha256.c create mode 100644 examples/redis-unstable/src/sha256.h create mode 100644 examples/redis-unstable/src/siphash.c create mode 100644 examples/redis-unstable/src/slowlog.c create mode 100644 examples/redis-unstable/src/slowlog.h create mode 100644 examples/redis-unstable/src/socket.c create mode 100644 examples/redis-unstable/src/solarisfixes.h create mode 100644 examples/redis-unstable/src/sort.c create mode 100644 examples/redis-unstable/src/sparkline.c create mode 100644 examples/redis-unstable/src/sparkline.h create mode 100644 examples/redis-unstable/src/stream.h create mode 100644 examples/redis-unstable/src/strl.c create mode 100644 examples/redis-unstable/src/syncio.c create mode 100644 examples/redis-unstable/src/syscheck.c create mode 100644 examples/redis-unstable/src/syscheck.h create mode 100644 examples/redis-unstable/src/t_hash.c create mode 100644 examples/redis-unstable/src/t_list.c create mode 100644 examples/redis-unstable/src/t_set.c create mode 100644 examples/redis-unstable/src/t_stream.c create mode 100644 examples/redis-unstable/src/t_string.c create mode 100644 examples/redis-unstable/src/t_zset.c create mode 100644 examples/redis-unstable/src/testhelp.h create mode 100644 examples/redis-unstable/src/threads_mngr.c create mode 100644 examples/redis-unstable/src/threads_mngr.h create mode 
100644 examples/redis-unstable/src/timeout.c create mode 100644 examples/redis-unstable/src/tls.c create mode 100644 examples/redis-unstable/src/tracking.c create mode 100644 examples/redis-unstable/src/tsan.sup create mode 100644 examples/redis-unstable/src/unix.c create mode 100644 examples/redis-unstable/src/util.c create mode 100644 examples/redis-unstable/src/util.h create mode 100644 examples/redis-unstable/src/valgrind.sup create mode 100644 examples/redis-unstable/src/version.h create mode 100644 examples/redis-unstable/src/ziplist.c create mode 100644 examples/redis-unstable/src/ziplist.h create mode 100644 examples/redis-unstable/src/zipmap.c create mode 100644 examples/redis-unstable/src/zipmap.h create mode 100644 examples/redis-unstable/src/zmalloc.c create mode 100644 examples/redis-unstable/src/zmalloc.h (limited to 'examples/redis-unstable/src') diff --git a/examples/redis-unstable/src/.gitignore b/examples/redis-unstable/src/.gitignore new file mode 100644 index 0000000..aee7aac --- /dev/null +++ b/examples/redis-unstable/src/.gitignore @@ -0,0 +1,5 @@ +*.gcda +*.gcno +*.gcov +redis.info +lcov-html diff --git a/examples/redis-unstable/src/Makefile b/examples/redis-unstable/src/Makefile new file mode 100644 index 0000000..24b9a39 --- /dev/null +++ b/examples/redis-unstable/src/Makefile @@ -0,0 +1,563 @@ +# Redis Makefile +# Copyright (c) 2011-Present, Redis Ltd. +# All rights reserved. +# +# Licensed under your choice of (a) the Redis Source Available License 2.0 +# (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the +# GNU Affero General Public License v3 (AGPLv3). +# +# The Makefile composes the final FINAL_CFLAGS and FINAL_LDFLAGS using +# what is needed for Redis plus the standard CFLAGS and LDFLAGS passed. +# However when building the dependencies (Jemalloc, Lua, Hiredis, ...) 
+# CFLAGS and LDFLAGS are propagated to the dependencies, so to pass +# flags only to be used when compiling / linking Redis itself REDIS_CFLAGS +# and REDIS_LDFLAGS are used instead (this is the case of 'make gcov'). +# +# Dependencies are stored in the Makefile.dep file. To rebuild this file +# Just use 'make dep', but this is only needed by developers. + +release_hdr := $(shell sh -c './mkreleasehdr.sh') +uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') +uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') +CLANG := $(findstring clang,$(shell sh -c '$(CC) --version | head -1')) + +# Optimization flags. To override, the OPTIMIZATION variable can be passed, but +# some automatic defaults are added to it. To specify optimization flags +# explicitly without any defaults added, pass the OPT variable instead. +OPTIMIZATION?=-O3 +ifeq ($(OPTIMIZATION),-O3) + ifeq (clang,$(CLANG)) + OPTIMIZATION+=-flto + else + OPTIMIZATION+=-flto=auto + endif +endif +ifneq ($(OPTIMIZATION),-O0) + OPTIMIZATION+=-fno-omit-frame-pointer +endif +DEPENDENCY_TARGETS=hiredis linenoise lua hdr_histogram fpconv fast_float xxhash +NODEPS:=clean distclean + +# Default settings +STD=-pedantic -DREDIS_STATIC='' + +# Use -Wno-c11-extensions on clang, either where explicitly used or on +# platforms we can assume it's being used. +ifeq (clang,$(CLANG)) + STD+=-Wno-c11-extensions +else +ifneq (,$(findstring FreeBSD,$(uname_S))) + STD+=-Wno-c11-extensions +endif +endif +WARN=-Wall -W -Wno-missing-field-initializers -Werror=deprecated-declarations -Wstrict-prototypes +OPT=$(OPTIMIZATION) + +SKIP_VEC_SETS?=no +# Detect if the compiler supports C11 _Atomic. +# NUMBER_SIGN_CHAR is a workaround to support both GNU Make 4.3 and older versions. 
+NUMBER_SIGN_CHAR := \# +C11_ATOMIC := $(shell sh -c 'echo "$(NUMBER_SIGN_CHAR)include " > foo.c; \ + $(CC) -std=gnu11 -c foo.c -o foo.o > /dev/null 2>&1; \ + if [ -f foo.o ]; then echo "yes"; rm foo.o; fi; rm foo.c') +ifeq ($(C11_ATOMIC),yes) + STD+=-std=gnu11 +else + SKIP_VEC_SETS=yes + STD+=-std=c99 +endif + +PREFIX?=/usr/local +INSTALL_BIN=$(PREFIX)/bin +INSTALL=install +PKG_CONFIG?=pkg-config + +ifndef PYTHON +PYTHON := $(shell which python3 || which python) +endif + +# Default allocator defaults to Jemalloc on Linux and libc otherwise +MALLOC=libc +ifeq ($(uname_S),Linux) + MALLOC=jemalloc +endif + +# To get ARM stack traces if Redis crashes we need a special C flag. +ifneq (,$(filter aarch64 armv%,$(uname_M))) + CFLAGS+=-funwind-tables +endif + +# Backwards compatibility for selecting an allocator +ifeq ($(USE_TCMALLOC),yes) + MALLOC=tcmalloc +endif + +ifeq ($(USE_TCMALLOC_MINIMAL),yes) + MALLOC=tcmalloc_minimal +endif + +ifeq ($(USE_JEMALLOC),yes) + MALLOC=jemalloc +endif + +ifeq ($(USE_JEMALLOC),no) + MALLOC=libc +endif + +ifdef SANITIZER +ifeq ($(SANITIZER),address) + MALLOC=libc + CFLAGS+=-fsanitize=address -fno-sanitize-recover=all -fno-omit-frame-pointer + LDFLAGS+=-fsanitize=address +else +ifeq ($(SANITIZER),undefined) + MALLOC=libc + CFLAGS+=-fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer + LDFLAGS+=-fsanitize=undefined +else +ifeq ($(SANITIZER),thread) + CFLAGS+=-fsanitize=thread -fno-sanitize-recover=all -fno-omit-frame-pointer + LDFLAGS+=-fsanitize=thread +else +ifeq ($(SANITIZER),memory) +ifeq (clang, $(CLANG)) + export CXX:=clang + export LD:=clang + MALLOC=libc # MSan provides its own allocator so make sure not to use jemalloc as they clash + CFLAGS+=-fsanitize=memory -fsanitize-memory-track-origins=2 -fno-sanitize-recover=all -fno-omit-frame-pointer + LDFLAGS+=-fsanitize=memory +else + $(error "MemorySanitizer needs to be compiled and linked with clang. 
Please use CC=clang") +endif +else + $(error "unknown sanitizer=${SANITIZER}") +endif +endif +endif +endif +endif + +# Special case of forcing defrag to run even though we have no Jemlloc support +ifeq ($(DEBUG_DEFRAG), force) + CFLAGS +=-DDEBUG_DEFRAG_FORCE +else ifeq ($(DEBUG_DEFRAG), fully) + CFLAGS +=-DDEBUG_DEFRAG_FORCE -DDEBUG_DEFRAG_FULLY +endif + +# Override default settings if possible +-include .make-settings + +FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(REDIS_CFLAGS) +FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(REDIS_LDFLAGS) $(DEBUG) +FINAL_LIBS=-lm -lstdc++ +DEBUG=-g -ggdb + +# Linux ARM32 needs -latomic at linking time +ifneq (,$(findstring armv,$(uname_M))) + FINAL_LIBS+=-latomic +endif + +ifeq ($(uname_S),SunOS) + # SunOS + ifeq ($(findstring -m32,$(FINAL_CFLAGS)),) + CFLAGS+=-m64 + endif + ifeq ($(findstring -m32,$(FINAL_LDFLAGS)),) + LDFLAGS+=-m64 + endif + DEBUG=-g + DEBUG_FLAGS=-g + export CFLAGS LDFLAGS DEBUG DEBUG_FLAGS + INSTALL=cp -pf + FINAL_CFLAGS+= -D__EXTENSIONS__ -D_XPG6 + FINAL_LIBS+= -ldl -lnsl -lsocket -lresolv -lpthread -lrt + ifeq ($(USE_BACKTRACE),yes) + FINAL_CFLAGS+= -DUSE_BACKTRACE + endif +else +ifeq ($(uname_S),Darwin) + # Darwin + FINAL_LIBS+= -ldl + # Homebrew's OpenSSL is not linked to /usr/local to avoid + # conflicts with the system's LibreSSL installation so it + # must be referenced explicitly during build. 
+ifeq ($(uname_M),arm64) + # Homebrew arm64 uses /opt/homebrew as HOMEBREW_PREFIX + OPENSSL_PREFIX?=/opt/homebrew/opt/openssl +else + # Homebrew x86/ppc uses /usr/local as HOMEBREW_PREFIX + OPENSSL_PREFIX?=/usr/local/opt/openssl +endif +else +ifeq ($(uname_S),AIX) + # AIX + FINAL_LDFLAGS+= -Wl,-bexpall + FINAL_LIBS+=-ldl -pthread -lcrypt -lbsd +else +ifeq ($(uname_S),OpenBSD) + # OpenBSD + FINAL_LIBS+= -lpthread + ifeq ($(USE_BACKTRACE),yes) + FINAL_CFLAGS+= -DUSE_BACKTRACE -I/usr/local/include + FINAL_LDFLAGS+= -L/usr/local/lib + FINAL_LIBS+= -lexecinfo + endif + +else +ifeq ($(uname_S),NetBSD) + # NetBSD + FINAL_LIBS+= -lpthread + ifeq ($(USE_BACKTRACE),yes) + FINAL_CFLAGS+= -DUSE_BACKTRACE -I/usr/pkg/include + FINAL_LDFLAGS+= -L/usr/pkg/lib + FINAL_LIBS+= -lexecinfo + endif +else +ifeq ($(uname_S),FreeBSD) + # FreeBSD + FINAL_LIBS+= -lpthread -lexecinfo +else +ifeq ($(uname_S),DragonFly) + # DragonFly + FINAL_LIBS+= -lpthread -lexecinfo +else +ifeq ($(uname_S),OpenBSD) + # OpenBSD + FINAL_LIBS+= -lpthread -lexecinfo +else +ifeq ($(uname_S),NetBSD) + # NetBSD + FINAL_LIBS+= -lpthread -lexecinfo +else +ifeq ($(uname_S),Haiku) + # Haiku + FINAL_CFLAGS+= -DBSD_SOURCE + FINAL_LDFLAGS+= -lbsd -lnetwork + FINAL_LIBS+= -lpthread +else + # All the other OSes (notably Linux) + FINAL_LDFLAGS+= -rdynamic + FINAL_LIBS+=-ldl -pthread -lrt +endif +endif +endif +endif +endif +endif +endif +endif +endif +endif + +ifdef OPENSSL_PREFIX + OPENSSL_CFLAGS=-I$(OPENSSL_PREFIX)/include + OPENSSL_LDFLAGS=-L$(OPENSSL_PREFIX)/lib + # Also export OPENSSL_PREFIX so it ends up in deps sub-Makefiles + export OPENSSL_PREFIX +endif + +# Include paths to dependencies +FINAL_CFLAGS+= -I../deps/hiredis -I../deps/linenoise -I../deps/lua/src -I../deps/hdr_histogram -I../deps/fpconv -I../deps/fast_float -I../deps/xxhash + +# Determine systemd support and/or build preference (defaulting to auto-detection) +BUILD_WITH_SYSTEMD=no +LIBSYSTEMD_LIBS=-lsystemd + +# If 'USE_SYSTEMD' in the environment is 
neither "no" nor "yes", try to +# auto-detect libsystemd's presence and link accordingly. +ifneq ($(USE_SYSTEMD),no) + LIBSYSTEMD_PKGCONFIG := $(shell $(PKG_CONFIG) --exists libsystemd && echo $$?) +# If libsystemd cannot be detected, continue building without support for it +# (unless a later check tells us otherwise) +ifeq ($(LIBSYSTEMD_PKGCONFIG),0) + BUILD_WITH_SYSTEMD=yes + LIBSYSTEMD_LIBS=$(shell $(PKG_CONFIG) --libs libsystemd) +endif +endif + +# If 'USE_SYSTEMD' is set to "yes" use pkg-config if available or fall back to +# default -lsystemd. +ifeq ($(USE_SYSTEMD),yes) + BUILD_WITH_SYSTEMD=yes +endif + +ifeq ($(BUILD_WITH_SYSTEMD),yes) + FINAL_LIBS+=$(LIBSYSTEMD_LIBS) + FINAL_CFLAGS+= -DHAVE_LIBSYSTEMD +endif + +ifeq ($(MALLOC),tcmalloc) + FINAL_CFLAGS+= -DUSE_TCMALLOC + FINAL_LIBS+= -ltcmalloc +endif + +ifeq ($(MALLOC),tcmalloc_minimal) + FINAL_CFLAGS+= -DUSE_TCMALLOC + FINAL_LIBS+= -ltcmalloc_minimal +endif + +ifeq ($(MALLOC),jemalloc) + DEPENDENCY_TARGETS+= jemalloc + FINAL_CFLAGS+= -DUSE_JEMALLOC -I../deps/jemalloc/include + FINAL_LIBS := ../deps/jemalloc/lib/libjemalloc.a $(FINAL_LIBS) +endif + +# LIBSSL & LIBCRYPTO +LIBSSL_LIBS= +LIBSSL_PKGCONFIG := $(shell $(PKG_CONFIG) --exists libssl && echo $$?) +ifeq ($(LIBSSL_PKGCONFIG),0) + LIBSSL_LIBS=$(shell $(PKG_CONFIG) --libs libssl) +else + LIBSSL_LIBS=-lssl +endif +LIBCRYPTO_LIBS= +LIBCRYPTO_PKGCONFIG := $(shell $(PKG_CONFIG) --exists libcrypto && echo $$?) 
+ifeq ($(LIBCRYPTO_PKGCONFIG),0) + LIBCRYPTO_LIBS=$(shell $(PKG_CONFIG) --libs libcrypto) +else + LIBCRYPTO_LIBS=-lcrypto +endif + +BUILD_NO:=0 +BUILD_YES:=1 +BUILD_MODULE:=2 +ifeq ($(BUILD_TLS),yes) + FINAL_CFLAGS+=-DUSE_OPENSSL=$(BUILD_YES) $(OPENSSL_CFLAGS) -DBUILD_TLS_MODULE=$(BUILD_NO) + FINAL_LDFLAGS+=$(OPENSSL_LDFLAGS) + FINAL_LIBS += ../deps/hiredis/libhiredis_ssl.a $(LIBSSL_LIBS) $(LIBCRYPTO_LIBS) +endif + +TLS_MODULE= +TLS_MODULE_NAME:=redis-tls$(PROG_SUFFIX).so +TLS_MODULE_CFLAGS:=$(FINAL_CFLAGS) +ifeq ($(BUILD_TLS),module) + FINAL_CFLAGS+=-DUSE_OPENSSL=$(BUILD_MODULE) $(OPENSSL_CFLAGS) + TLS_CLIENT_LIBS = ../deps/hiredis/libhiredis_ssl.a $(LIBSSL_LIBS) $(LIBCRYPTO_LIBS) + TLS_MODULE=$(TLS_MODULE_NAME) + TLS_MODULE_CFLAGS+=-DUSE_OPENSSL=$(BUILD_MODULE) $(OPENSSL_CFLAGS) -DBUILD_TLS_MODULE=$(BUILD_MODULE) +endif + +ifneq ($(SKIP_VEC_SETS),yes) + vpath %.c ../modules/vector-sets + REDIS_VEC_SETS_OBJ=hnsw.o vset.o vset_config.o + FINAL_CFLAGS+=-DINCLUDE_VEC_SETS=1 +endif + +ifndef V + define MAKE_INSTALL + @printf ' %b %b\n' $(LINKCOLOR)INSTALL$(ENDCOLOR) $(BINCOLOR)$(1)$(ENDCOLOR) 1>&2 + @$(INSTALL) $(1) $(2) + endef +else + define MAKE_INSTALL + $(INSTALL) $(1) $(2) + endef +endif + +REDIS_CC=$(QUIET_CC)$(CC) $(FINAL_CFLAGS) +REDIS_LD=$(QUIET_LINK)$(CC) $(FINAL_LDFLAGS) +REDIS_INSTALL=$(QUIET_INSTALL)$(INSTALL) + +CCCOLOR="\033[34m" +LINKCOLOR="\033[34;1m" +SRCCOLOR="\033[33m" +BINCOLOR="\033[37;1m" +MAKECOLOR="\033[32;1m" +ENDCOLOR="\033[0m" + +ifndef V +QUIET_CC = @printf ' %b %b\n' $(CCCOLOR)CC$(ENDCOLOR) $(SRCCOLOR)$@$(ENDCOLOR) 1>&2; +QUIET_GEN = @printf ' %b %b\n' $(CCCOLOR)GEN$(ENDCOLOR) $(SRCCOLOR)$@$(ENDCOLOR) 1>&2; +QUIET_LINK = @printf ' %b %b\n' $(LINKCOLOR)LINK$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR) 1>&2; +QUIET_INSTALL = @printf ' %b %b\n' $(LINKCOLOR)INSTALL$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR) 1>&2; +endif + +ifneq (, $(findstring LOG_REQ_RES, $(REDIS_CFLAGS))) + COMMANDS_DEF_FILENAME=commands_with_reply_schema + 
GEN_COMMANDS_FLAGS=--with-reply-schema +else + COMMANDS_DEF_FILENAME=commands + GEN_COMMANDS_FLAGS= +endif + +REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) +REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) +REDIS_SERVER_OBJ=threads_mngr.o memory_prefetch.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o eventnotifier.o iothread.o mstr.o entry.o kvstore.o fwtree.o estore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_asm.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o lolwut8.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o keymeta.o chk.o hotkeys.o +REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) +REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o +REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) +REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o dict.o zmalloc.o redisassert.o release.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o +REDIS_CHECK_RDB_NAME=redis-check-rdb$(PROG_SUFFIX) +REDIS_CHECK_AOF_NAME=redis-check-aof$(PROG_SUFFIX) +ALL_SOURCES=$(sort $(patsubst 
%.o,%.c,$(REDIS_SERVER_OBJ) $(REDIS_VEC_SETS_OBJ) $(REDIS_CLI_OBJ) $(REDIS_BENCHMARK_OBJ))) + +all: $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) $(REDIS_CLI_NAME) $(REDIS_BENCHMARK_NAME) $(REDIS_CHECK_RDB_NAME) $(REDIS_CHECK_AOF_NAME) $(TLS_MODULE) module_tests + @echo "" + @echo "Hint: It's a good idea to run 'make test' ;)" + @echo "" + +Makefile.dep: + -$(REDIS_CC) -MM $(ALL_SOURCES) > Makefile.dep 2> /dev/null || true + +ifeq (0, $(words $(findstring $(MAKECMDGOALS), $(NODEPS)))) +-include Makefile.dep +endif + +.PHONY: all + +module_tests: $(REDIS_SERVER_NAME) + $(MAKE) -C ../tests/modules + +.PHONY: module_tests + +persist-settings: distclean + echo STD=$(STD) >> .make-settings + echo WARN=$(WARN) >> .make-settings + echo OPT=$(OPT) >> .make-settings + echo MALLOC=$(MALLOC) >> .make-settings + echo BUILD_TLS=$(BUILD_TLS) >> .make-settings + echo USE_SYSTEMD=$(USE_SYSTEMD) >> .make-settings + echo CFLAGS=$(CFLAGS) >> .make-settings + echo LDFLAGS=$(LDFLAGS) >> .make-settings + echo REDIS_CFLAGS=$(REDIS_CFLAGS) >> .make-settings + echo REDIS_LDFLAGS=$(REDIS_LDFLAGS) >> .make-settings + echo PREV_FINAL_CFLAGS=$(FINAL_CFLAGS) >> .make-settings + echo PREV_FINAL_LDFLAGS=$(FINAL_LDFLAGS) >> .make-settings + -(cd ../deps && $(MAKE) $(DEPENDENCY_TARGETS)) + +.PHONY: persist-settings + +# Prerequisites target +.make-prerequisites: + @touch $@ + +# Clean everything, persist settings and build dependencies if anything changed +ifneq ($(strip $(PREV_FINAL_CFLAGS)), $(strip $(FINAL_CFLAGS))) +.make-prerequisites: persist-settings +endif + +ifneq ($(strip $(PREV_FINAL_LDFLAGS)), $(strip $(FINAL_LDFLAGS))) +.make-prerequisites: persist-settings +endif + +# redis-server +$(REDIS_SERVER_NAME): $(REDIS_SERVER_OBJ) $(REDIS_VEC_SETS_OBJ) + $(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/lua/src/liblua.a ../deps/hdr_histogram/libhdrhistogram.a ../deps/fpconv/libfpconv.a ../deps/fast_float/libfast_float.a ../deps/xxhash/libxxhash.a $(FINAL_LIBS) + +# redis-sentinel 
+$(REDIS_SENTINEL_NAME): $(REDIS_SERVER_NAME) + $(REDIS_INSTALL) $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) + +# redis-check-rdb +$(REDIS_CHECK_RDB_NAME): $(REDIS_SERVER_NAME) + $(REDIS_INSTALL) $(REDIS_SERVER_NAME) $(REDIS_CHECK_RDB_NAME) + +# redis-check-aof +$(REDIS_CHECK_AOF_NAME): $(REDIS_SERVER_NAME) + $(REDIS_INSTALL) $(REDIS_SERVER_NAME) $(REDIS_CHECK_AOF_NAME) + +# redis-tls.so +$(TLS_MODULE_NAME): $(REDIS_SERVER_NAME) + $(QUIET_CC)$(CC) -o $@ tls.c -shared -fPIC $(TLS_MODULE_CFLAGS) $(TLS_CLIENT_LIBS) + +# redis-cli +$(REDIS_CLI_NAME): $(REDIS_CLI_OBJ) + $(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/linenoise/linenoise.o ../deps/hdr_histogram/libhdrhistogram.a $(FINAL_LIBS) $(TLS_CLIENT_LIBS) + +# redis-benchmark +$(REDIS_BENCHMARK_NAME): $(REDIS_BENCHMARK_OBJ) + $(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/hdr_histogram/libhdrhistogram.a $(FINAL_LIBS) $(TLS_CLIENT_LIBS) + +DEP = $(REDIS_SERVER_OBJ:%.o=%.d) $(REDIS_VEC_SETS_OBJ:%.o=%.d) $(REDIS_CLI_OBJ:%.o=%.d) $(REDIS_BENCHMARK_OBJ:%.o=%.d) +-include $(DEP) + +# Because the jemalloc.h header is generated as a part of the jemalloc build, +# building it should complete before building any other object. Instead of +# depending on a single artifact, build all dependencies first. +%.o: %.c .make-prerequisites + $(REDIS_CC) -MMD -o $@ -c $< + +# The following files are checked in and don't normally need to be rebuilt. They +# are built only if python is available and their prereqs are modified. 
+ifneq (,$(PYTHON)) +$(COMMANDS_DEF_FILENAME).def: commands/*.json ../utils/generate-command-code.py + $(QUIET_GEN)$(PYTHON) ../utils/generate-command-code.py $(GEN_COMMANDS_FLAGS) + +fmtargs.h: ../utils/generate-fmtargs.py + $(QUITE_GEN)sed '/Everything below this line/,$$d' $@ > $@.tmp + $(QUITE_GEN)$(PYTHON) ../utils/generate-fmtargs.py >> $@.tmp + $(QUITE_GEN)mv $@.tmp $@ +endif + +commands.c: $(COMMANDS_DEF_FILENAME).def + +clean: + rm -rf $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) $(REDIS_CLI_NAME) $(REDIS_BENCHMARK_NAME) $(REDIS_CHECK_RDB_NAME) $(REDIS_CHECK_AOF_NAME) *.o *.gcda *.gcno *.gcov redis.info lcov-html Makefile.dep *.so + rm -f $(DEP) + -(cd ../tests/modules && $(MAKE) clean) + +.PHONY: clean + +distclean: clean + -(cd ../deps && $(MAKE) distclean) + -(cd modules && $(MAKE) clean) + -(cd ../tests/modules && $(MAKE) clean) + -(rm -f .make-*) + +.PHONY: distclean + +test: $(REDIS_SERVER_NAME) $(REDIS_CHECK_AOF_NAME) $(REDIS_CLI_NAME) $(REDIS_BENCHMARK_NAME) module_tests + @(cd ..; ./runtest) + +test-modules: $(REDIS_SERVER_NAME) + @(cd ..; ./runtest-moduleapi) + +test-sentinel: $(REDIS_SENTINEL_NAME) $(REDIS_CLI_NAME) + @(cd ..; ./runtest-sentinel) + +test-cluster: $(REDIS_SERVER_NAME) $(REDIS_CLI_NAME) + @(cd ..; ./runtest-cluster) + +check: test + +lcov: + @lcov --version + $(MAKE) gcov + @(set -e; cd ..; ./runtest) + @geninfo -o redis.info . 
+ @genhtml --legend -o lcov-html redis.info + +.PHONY: lcov + +bench: $(REDIS_BENCHMARK_NAME) + ./$(REDIS_BENCHMARK_NAME) + +32bit: + @echo "" + @echo "WARNING: if it fails under Linux you probably need to install libc6-dev-i386" + @echo "" + $(MAKE) CFLAGS="-m32" LDFLAGS="-m32" SKIP_VEC_SETS="yes" + +gcov: + $(MAKE) REDIS_CFLAGS="-fprofile-arcs -ftest-coverage -DCOVERAGE_TEST" REDIS_LDFLAGS="-fprofile-arcs -ftest-coverage" + +noopt: + $(MAKE) OPTIMIZATION="-O0" + +valgrind: + $(MAKE) OPTIMIZATION="-O0" MALLOC="libc" + +helgrind: + $(MAKE) OPTIMIZATION="-O0" MALLOC="libc" CFLAGS="-D__ATOMIC_VAR_FORCE_SYNC_MACROS" REDIS_CFLAGS="-I/usr/local/include" REDIS_LDFLAGS="-L/usr/local/lib" + +install: all + @mkdir -p $(INSTALL_BIN) + $(call MAKE_INSTALL,$(REDIS_SERVER_NAME),$(INSTALL_BIN)) + $(call MAKE_INSTALL,$(REDIS_BENCHMARK_NAME),$(INSTALL_BIN)) + $(call MAKE_INSTALL,$(REDIS_CLI_NAME),$(INSTALL_BIN)) + @ln -sf $(REDIS_SERVER_NAME) $(INSTALL_BIN)/$(REDIS_CHECK_RDB_NAME) + @ln -sf $(REDIS_SERVER_NAME) $(INSTALL_BIN)/$(REDIS_CHECK_AOF_NAME) + @ln -sf $(REDIS_SERVER_NAME) $(INSTALL_BIN)/$(REDIS_SENTINEL_NAME) + +uninstall: + rm -f $(INSTALL_BIN)/{$(REDIS_SERVER_NAME),$(REDIS_BENCHMARK_NAME),$(REDIS_CLI_NAME),$(REDIS_CHECK_RDB_NAME),$(REDIS_CHECK_AOF_NAME),$(REDIS_SENTINEL_NAME)} diff --git a/examples/redis-unstable/src/acl.c b/examples/redis-unstable/src/acl.c new file mode 100644 index 0000000..37d504d --- /dev/null +++ b/examples/redis-unstable/src/acl.c @@ -0,0 +1,3313 @@ +/* + * Copyright (c) 2018-Present, Redis Ltd. + * All rights reserved. + * + * Copyright (c) 2024-present, Valkey contributors. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ + +#include "server.h" +#include "cluster.h" +#include "sha256.h" +#include +#include + +/* ============================================================================= + * Global state for ACLs + * ==========================================================================*/ + +rax *Users; /* Table mapping usernames to user structures. */ + +user *DefaultUser; /* Global reference to the default user. + Every new connection is associated to it, if no + AUTH or HELLO is used to authenticate with a + different user. */ + +list *UsersToLoad; /* This is a list of users found in the configuration file + that we'll need to load in the final stage of Redis + initialization, after all the modules are already + loaded. Every list element is a NULL terminated + array of SDS pointers: the first is the user name, + all the remaining pointers are ACL rules in the same + format as ACLSetUser(). */ +list *ACLLog; /* Our security log, the user is able to inspect that + using the ACL LOG command .*/ + +long long ACLLogEntryCount = 0; /* Number of ACL log entries created */ + +static rax *commandId = NULL; /* Command name to id mapping */ + +static unsigned long nextid = 0; /* Next command id that has not been assigned */ + +#define ACL_MAX_CATEGORIES 64 /* Maximum number of command categories */ + +struct ACLCategoryItem { + char *name; + uint64_t flag; +} ACLDefaultCommandCategories[] = { /* See redis.conf for details on each category. 
*/ + {"keyspace", ACL_CATEGORY_KEYSPACE}, + {"read", ACL_CATEGORY_READ}, + {"write", ACL_CATEGORY_WRITE}, + {"set", ACL_CATEGORY_SET}, + {"sortedset", ACL_CATEGORY_SORTEDSET}, + {"list", ACL_CATEGORY_LIST}, + {"hash", ACL_CATEGORY_HASH}, + {"string", ACL_CATEGORY_STRING}, + {"bitmap", ACL_CATEGORY_BITMAP}, + {"hyperloglog", ACL_CATEGORY_HYPERLOGLOG}, + {"geo", ACL_CATEGORY_GEO}, + {"stream", ACL_CATEGORY_STREAM}, + {"pubsub", ACL_CATEGORY_PUBSUB}, + {"admin", ACL_CATEGORY_ADMIN}, + {"fast", ACL_CATEGORY_FAST}, + {"slow", ACL_CATEGORY_SLOW}, + {"blocking", ACL_CATEGORY_BLOCKING}, + {"dangerous", ACL_CATEGORY_DANGEROUS}, + {"connection", ACL_CATEGORY_CONNECTION}, + {"transaction", ACL_CATEGORY_TRANSACTION}, + {"scripting", ACL_CATEGORY_SCRIPTING}, + {NULL,0} /* Terminator. */ +}; + +static struct ACLCategoryItem *ACLCommandCategories = NULL; +static size_t nextCommandCategory = 0; /* Index of the next command category to be added */ + +/* Implements the ability to add to the list of ACL categories at runtime. Since each ACL category + * also requires a bit in the acl_categories flag, there is a limit to the number that can be added. + * The new ACL categories occupy the remaining bits of acl_categories flag, other than the bits + * occupied by the default ACL command categories. + * + * The optional `flag` argument allows the assignment of the `acl_categories` flag bit to the ACL category. + * When adding a new category, except for the default ACL command categories, this arguments should be `0` + * to allow the function to assign the next available `acl_categories` flag bit to the new ACL category. + * + * returns 1 -> Added, 0 -> Failed (out of space) + * + * This function is present here to gain access to the ACLCommandCategories array and add a new ACL category. 
+ */ +int ACLAddCommandCategory(const char *name, uint64_t flag) { + if (nextCommandCategory >= ACL_MAX_CATEGORIES) return 0; + ACLCommandCategories[nextCommandCategory].name = zstrdup(name); + ACLCommandCategories[nextCommandCategory].flag = flag != 0 ? flag : (1ULL<>4)]; + hex[j*2+1] = cset[(hash[j]&0xF)]; + } + return sdsnewlen(hex,HASH_PASSWORD_LEN); +} + +/* Given a hash and the hash length, returns C_OK if it is a valid password + * hash, or C_ERR otherwise. */ +int ACLCheckPasswordHash(unsigned char *hash, int hashlen) { + if (hashlen != HASH_PASSWORD_LEN) { + return C_ERR; + } + + /* Password hashes can only be characters that represent + * hexadecimal values, which are numbers and lowercase + * characters 'a' through 'f'. */ + for(int i = 0; i < HASH_PASSWORD_LEN; i++) { + char c = hash[i]; + if ((c < 'a' || c > 'f') && (c < '0' || c > '9')) { + return C_ERR; + } + } + return C_OK; +} + +/* ============================================================================= + * Low level ACL API + * ==========================================================================*/ + +/* Return 1 if the specified string contains spaces or null characters. + * We do this for usernames and key patterns for simpler rewriting of + * ACL rules, presentation on ACL list, and to avoid subtle security bugs + * that may arise from parsing the rules in presence of escapes. + * The function returns 0 if the string has no spaces. */ +int ACLStringHasSpaces(const char *s, size_t len) { + for (size_t i = 0; i < len; i++) { + if (isspace(s[i]) || s[i] == 0) return 1; + } + return 0; +} + +/* Given the category name the command returns the corresponding flag, or + * zero if there is no match. */ +uint64_t ACLGetCommandCategoryFlagByName(const char *name) { + for (int j = 0; ACLCommandCategories[j].flag != 0; j++) { + if (!strcasecmp(name,ACLCommandCategories[j].name)) { + return ACLCommandCategories[j].flag; + } + } + return 0; /* No match. 
 */
}

/* Method for searching for a user within a list of user definitions. The
 * list contains an array of user arguments, and we are only
 * searching the first argument, the username, for a match. */
int ACLListMatchLoadedUser(void *definition, void *user) {
    sds *user_definition = definition;
    return sdscmp(user_definition[0], user) == 0;
}

/* Method for passwords/pattern comparison used for the user->passwords list
 * so that we can search for items with listSearchKey(). */
int ACLListMatchSds(void *a, void *b) {
    return sdscmp(a,b) == 0;
}

/* Method to free list elements from ACL users password/patterns lists. */
void ACLListFreeSds(void *item) {
    sdsfreegeneric(item);
}

/* Method to duplicate list elements from ACL users password/patterns lists. */
void *ACLListDupSds(void *item) {
    return sdsdup(item);
}

/* Structure used for handling key patterns with different key
 * based permissions. */
typedef struct {
    int flags; /* The ACL key permission types for this key pattern */
    sds pattern; /* The pattern to match keys against */
} keyPattern;

/* Create a new key pattern. Takes ownership of the provided 'pattern' sds:
 * it is released by ACLKeyPatternFree(). */
keyPattern *ACLKeyPatternCreate(sds pattern, int flags) {
    keyPattern *new = (keyPattern *) zmalloc(sizeof(keyPattern));
    new->pattern = pattern;
    new->flags = flags;
    return new;
}

/* Free a key pattern and internal structures. */
void ACLKeyPatternFree(keyPattern *pattern) {
    sdsfree(pattern->pattern);
    zfree(pattern);
}

/* Method for passwords/pattern comparison used for the user->passwords list
 * so that we can search for items with listSearchKey(). */
int ACLListMatchKeyPattern(void *a, void *b) {
    return sdscmp(((keyPattern *) a)->pattern,((keyPattern *) b)->pattern) == 0;
}

/* Method to free list elements from ACL users password/patterns lists. */
void ACLListFreeKeyPattern(void *item) {
    ACLKeyPatternFree(item);
}

/* Method to duplicate list elements from ACL users password/patterns lists.
 */
void *ACLListDupKeyPattern(void *item) {
    keyPattern *old = (keyPattern *) item;
    return ACLKeyPatternCreate(sdsdup(old->pattern), old->flags);
}

/* Append the string representation of a key pattern onto the
 * provided base string: "~", "%R~" or "%W~" followed by the pattern. */
sds sdsCatPatternString(sds base, keyPattern *pat) {
    if (pat->flags == ACL_ALL_PERMISSION) {
        base = sdscatlen(base,"~",1);
    } else if (pat->flags == ACL_READ_PERMISSION) {
        base = sdscatlen(base,"%R~",3);
    } else if (pat->flags == ACL_WRITE_PERMISSION) {
        base = sdscatlen(base,"%W~",3);
    } else {
        serverPanic("Invalid key pattern flag detected");
    }
    return sdscatsds(base, pat->pattern);
}

/* Create an empty selector with the provided set of initial
 * flags. The selector will be default have no permissions. */
aclSelector *ACLCreateSelector(int flags) {
    aclSelector *selector = zmalloc(sizeof(aclSelector));
    selector->flags = flags | server.acl_pubsub_default;
    selector->patterns = listCreate();
    selector->channels = listCreate();
    selector->allowed_firstargs = NULL;
    selector->command_rules = sdsempty();

    listSetMatchMethod(selector->patterns,ACLListMatchKeyPattern);
    listSetFreeMethod(selector->patterns,ACLListFreeKeyPattern);
    listSetDupMethod(selector->patterns,ACLListDupKeyPattern);
    listSetMatchMethod(selector->channels,ACLListMatchSds);
    listSetFreeMethod(selector->channels,ACLListFreeSds);
    listSetDupMethod(selector->channels,ACLListDupSds);
    memset(selector->allowed_commands,0,sizeof(selector->allowed_commands));

    return selector;
}

/* Cleanup the provided selector, including all interior structures. */
void ACLFreeSelector(aclSelector *selector) {
    listRelease(selector->patterns);
    listRelease(selector->channels);
    sdsfree(selector->command_rules);
    ACLResetFirstArgs(selector);
    zfree(selector);
}

/* Create an exact copy of the provided selector.
 */
aclSelector *ACLCopySelector(aclSelector *src) {
    aclSelector *dst = zmalloc(sizeof(aclSelector));
    dst->flags = src->flags;
    dst->patterns = listDup(src->patterns);
    dst->channels = listDup(src->channels);
    dst->command_rules = sdsdup(src->command_rules);
    memcpy(dst->allowed_commands,src->allowed_commands,
           sizeof(dst->allowed_commands));
    dst->allowed_firstargs = NULL;
    /* Copy the allowed first-args array of array of SDS strings. */
    if (src->allowed_firstargs) {
        for (int j = 0; j < USER_COMMAND_BITS_COUNT; j++) {
            if (!(src->allowed_firstargs[j])) continue;
            for (int i = 0; src->allowed_firstargs[j][i]; i++) {
                ACLAddAllowedFirstArg(dst, j, src->allowed_firstargs[j][i]);
            }
        }
    }
    return dst;
}

/* List method for freeing a selector */
void ACLListFreeSelector(void *a) {
    ACLFreeSelector((aclSelector *) a);
}

/* List method for duplicating a selector */
void *ACLListDuplicateSelector(void *src) {
    return ACLCopySelector((aclSelector *)src);
}

/* All users have an implicit root selector which
 * provides backwards compatibility to the old ACLs-
 * permissions. The root selector is always the head of u->selectors. */
aclSelector *ACLUserGetRootSelector(user *u) {
    serverAssert(listLength(u->selectors));
    aclSelector *s = (aclSelector *) listNodeValue(listFirst(u->selectors));
    serverAssert(s->flags & SELECTOR_FLAG_ROOT);
    return s;
}

/* Create a new user with the specified name, store it in the list
 * of users (the Users global radix tree), and returns a reference to
 * the structure representing the user.
 *
 * If the user with such name already exists NULL is returned.
 */
user *ACLCreateUser(const char *name, size_t namelen) {
    if (raxFind(Users,(unsigned char*)name,namelen,NULL)) return NULL;
    user *u = zmalloc(sizeof(*u));
    u->name = sdsnewlen(name,namelen);
    /* New users start disabled and with RESTORE payload sanitization on. */
    atomicSet(u->flags, USER_FLAG_DISABLED | USER_FLAG_SANITIZE_PAYLOAD);
    u->passwords = listCreate();
    u->acl_string = NULL;
    listSetMatchMethod(u->passwords,ACLListMatchSds);
    listSetFreeMethod(u->passwords,ACLListFreeSds);
    listSetDupMethod(u->passwords,ACLListDupSds);

    u->selectors = listCreate();
    listSetFreeMethod(u->selectors,ACLListFreeSelector);
    listSetDupMethod(u->selectors,ACLListDuplicateSelector);

    /* Add the initial root selector */
    aclSelector *s = ACLCreateSelector(SELECTOR_FLAG_ROOT);
    listAddNodeHead(u->selectors, s);

    raxInsert(Users,(unsigned char*)name,namelen,u,NULL);
    return u;
}

/* This function should be called when we need an unlinked "fake" user
 * we can use in order to validate ACL rules or for other similar reasons.
 * The user will not get linked to the Users radix tree. The returned
 * user should be released with ACLFreeUser() as usually. */
user *ACLCreateUnlinkedUser(void) {
    char username[64];
    /* Probe numbered names until one is free; the user is created in the
     * Users tree and then immediately removed so it remains unlinked. */
    for (int j = 0; ; j++) {
        snprintf(username,sizeof(username),"__fakeuser:%d__",j);
        user *fakeuser = ACLCreateUser(username,strlen(username));
        if (fakeuser == NULL) continue;
        int retval = raxRemove(Users,(unsigned char*) username,
                               strlen(username),NULL);
        serverAssert(retval != 0);
        return fakeuser;
    }
}

/* Release the memory used by the user structure. Note that this function
 * will not remove the user from the Users global radix tree. */
void ACLFreeUser(user *u) {
    sdsfree(u->name);
    if (u->acl_string) {
        decrRefCount(u->acl_string);
        u->acl_string = NULL;
    }
    listRelease(u->passwords);
    listRelease(u->selectors);
    zfree(u);
}

/* Generic version of ACLFreeUser.
 */
void ACLFreeUserGeneric(void *u) {
    ACLFreeUser((user *)u);
}

/* When a user is deleted we need to cycle the active
 * connections in order to kill all the pending ones that
 * are authenticated with such user. */
void ACLFreeUserAndKillClients(user *u) {
    listIter li;
    listNode *ln;
    listRewind(server.clients,&li);
    while ((ln = listNext(&li)) != NULL) {
        client *c = listNodeValue(ln);
        if (c->user == u) {
            /* We'll free the connection asynchronously, so
             * in theory to set a different user is not needed.
             * However if there are bugs in Redis, soon or later
             * this may result in some security hole: it's much
             * more defensive to set the default user and put
             * it in non authenticated mode. */
            deauthenticateAndCloseClient(c);
        }
    }
    ACLFreeUser(u);
}

/* Copy the user ACL rules from the source user 'src' to the destination
 * user 'dst' so that at the end of the process they'll have exactly the
 * same rules (but the names will continue to be the original ones). */
void ACLCopyUser(user *dst, user *src) {
    listRelease(dst->passwords);
    listRelease(dst->selectors);
    dst->passwords = listDup(src->passwords);
    dst->selectors = listDup(src->selectors);
    dst->flags = src->flags;
    /* Drop dst's cached ACL description, then share src's cached one
     * (if any) by bumping its reference count. */
    if (dst->acl_string) {
        decrRefCount(dst->acl_string);
    }
    dst->acl_string = src->acl_string;
    if (dst->acl_string) {
        /* if src is NULL, we set it to NULL, if not, need to increment reference count */
        incrRefCount(dst->acl_string);
    }
}

/* Given a command ID, this function set by reference 'word' and 'bit'
 * so that user->allowed_commands[word] will address the right word
 * where the corresponding bit for the provided ID is stored, and
 * so that user->allowed_commands[word]&bit will identify that specific
 * bit. The function returns C_ERR in case the specified ID overflows
 * the bitmap in the user representation.
 */
int ACLGetCommandBitCoordinates(uint64_t id, uint64_t *word, uint64_t *bit) {
    if (id >= USER_COMMAND_BITS_COUNT) return C_ERR;
    *word = id / sizeof(uint64_t) / 8;
    *bit = 1ULL << (id % (sizeof(uint64_t) * 8));
    return C_OK;
}

/* Check if the specified command bit is set for the specified user.
 * The function returns 1 if the bit is set or 0 if it is not.
 * Note that this function does not check the ALLCOMMANDS flag of the user
 * but just the lowlevel bitmask.
 *
 * If the bit overflows the user internal representation, zero is returned
 * in order to disallow the execution of the command in such edge case. */
int ACLGetSelectorCommandBit(const aclSelector *selector, unsigned long id) {
    uint64_t word, bit;
    if (ACLGetCommandBitCoordinates(id,&word,&bit) == C_ERR) return 0;
    return (selector->allowed_commands[word] & bit) != 0;
}

/* When +@all or allcommands is given, we set a reserved bit as well that we
 * can later test, to see if the user has the right to execute "future commands",
 * that is, commands loaded later via modules. */
int ACLSelectorCanExecuteFutureCommands(aclSelector *selector) {
    return ACLGetSelectorCommandBit(selector,USER_COMMAND_BITS_COUNT-1);
}

/* Set the specified command bit for the specified user to 'value' (0 or 1).
 * If the bit overflows the user internal representation, no operation
 * is performed. As a side effect of calling this function with a value of
 * zero, the user flag ALLCOMMANDS is cleared since it is no longer possible
 * to skip the command bit explicit test. */
void ACLSetSelectorCommandBit(aclSelector *selector, unsigned long id, int value) {
    uint64_t word, bit;
    if (ACLGetCommandBitCoordinates(id,&word,&bit) == C_ERR) return;
    if (value) {
        selector->allowed_commands[word] |= bit;
    } else {
        selector->allowed_commands[word] &= ~bit;
        selector->flags &= ~SELECTOR_FLAG_ALLCOMMANDS;
    }
}

/* Remove a rule from the retained command rules.
Always match rules
 * verbatim, but also remove subcommand rules if we are adding or removing the
 * entire command. */
void ACLSelectorRemoveCommandRule(aclSelector *selector, sds new_rule) {
    size_t new_len = sdslen(new_rule);
    char *existing_rule = selector->command_rules;

    /* Loop over the existing rules, trying to find a rule that "matches"
     * the new rule. If we find a match, then remove the command from the string by
     * copying the later rules over it. */
    while(existing_rule[0]) {
        /* The first character of the rule is +/-, which we don't need to compare. */
        char *copy_position = existing_rule;
        existing_rule += 1;

        /* Assume a trailing space after a command is part of the command, like '+get ', so trim it
         * as well if the command is removed. */
        char *rule_end = strchr(existing_rule, ' ');
        if (!rule_end) {
            /* This is the last rule, so move it to the end of the string. */
            rule_end = existing_rule + strlen(existing_rule);

            /* This approach can leave a trailing space if the last rule is removed,
             * but only if it's not the first rule, so handle that case. */
            if (copy_position != selector->command_rules) copy_position -= 1;
        }
        char *copy_end = rule_end;
        if (*copy_end == ' ') copy_end++;

        /* Exact match or the rule we are comparing is a subcommand denoted by '|' */
        size_t existing_len = rule_end - existing_rule;
        if (!memcmp(existing_rule, new_rule, min(existing_len, new_len))) {
            if ((existing_len == new_len) || (existing_len > new_len && (existing_rule[new_len]) == '|')) {
                /* Copy the remaining rules starting at the next rule to replace the rule to be
                 * deleted, including the terminating NULL character. */
                memmove(copy_position, copy_end, strlen(copy_end) + 1);
                /* Re-scan from the copied-over position: the next rule now
                 * starts where the removed one was. */
                existing_rule = copy_position;
                continue;
            }
        }
        existing_rule = copy_end;
    }

    /* There is now extra padding at the end of the rules, so clean that up.
 */
    sdsupdatelen(selector->command_rules);
}

/* This function is responsible for updating the command_rules struct so that relative ordering of
 * commands and categories is maintained and can be reproduced without loss. */
void ACLUpdateCommandRules(aclSelector *selector, const char *rule, int allow) {
    sds new_rule = sdsnew(rule);
    sdstolower(new_rule);

    /* Remove any previous occurrence of this rule first, so the re-appended
     * rule reflects the latest application order. */
    ACLSelectorRemoveCommandRule(selector, new_rule);
    if (sdslen(selector->command_rules)) selector->command_rules = sdscat(selector->command_rules, " ");
    selector->command_rules = sdscatfmt(selector->command_rules, allow ? "+%S" : "-%S", new_rule);
    sdsfree(new_rule);
}

/* This function is used to allow/block a specific command.
 * Allowing/blocking a container command also applies for its subcommands */
void ACLChangeSelectorPerm(aclSelector *selector, struct redisCommand *cmd, int allow) {
    unsigned long id = cmd->id;
    ACLSetSelectorCommandBit(selector,id,allow);
    ACLResetFirstArgsForCommand(selector,id);
    if (cmd->subcommands_dict) {
        dictEntry *de;
        dictIterator di;
        dictInitSafeIterator(&di, cmd->subcommands_dict);
        while((de = dictNext(&di)) != NULL) {
            struct redisCommand *sub = (struct redisCommand *)dictGetVal(de);
            ACLSetSelectorCommandBit(selector,sub->id,allow);
        }
        dictResetIterator(&di);
    }
}

/* This is like ACLSetSelectorCommandBit(), but instead of setting the specified
 * ID, it will check all the commands in the category specified as argument,
 * and will set all the bits corresponding to such commands to the specified
 * value. Since the category passed by the user may be non existing, the
 * function returns C_ERR if the category was not found, or C_OK if it was
 * found and the operation was performed.
 */
void ACLSetSelectorCommandBitsForCategory(dict *commands, aclSelector *selector, uint64_t cflag, int value) {
    dictIterator di;
    dictEntry *de;
    dictInitIterator(&di, commands);
    while ((de = dictNext(&di)) != NULL) {
        struct redisCommand *cmd = dictGetVal(de);
        if (cmd->acl_categories & cflag) {
            ACLChangeSelectorPerm(selector,cmd,value);
        }
        if (cmd->subcommands_dict) {
            /* Recurse so subcommands carrying the category flag are also set. */
            ACLSetSelectorCommandBitsForCategory(cmd->subcommands_dict, selector, cflag, value);
        }
    }
    dictResetIterator(&di);
}

/* This function is responsible for recomputing the command bits for all selectors of the existing users.
 * It uses the 'command_rules', a string representation of the ordered categories and commands,
 * to recompute the command bits. */
void ACLRecomputeCommandBitsFromCommandRulesAllUsers(void) {
    raxIterator ri;
    raxStart(&ri,Users);
    raxSeek(&ri,"^",NULL,0);
    while(raxNext(&ri)) {
        user *u = ri.data;
        listIter li;
        listNode *ln;
        listRewind(u->selectors,&li);
        while((ln = listNext(&li))) {
            aclSelector *selector = (aclSelector *) listNodeValue(ln);
            int argc = 0;
            sds *argv = sdssplitargs(selector->command_rules, &argc);
            serverAssert(argv != NULL);
            /* Checking selector's permissions for all commands to start with a clean state. */
            if (ACLSelectorCanExecuteFutureCommands(selector)) {
                int res = ACLSetSelector(selector,"+@all",-1);
                serverAssert(res == C_OK);
            } else {
                int res = ACLSetSelector(selector,"-@all",-1);
                serverAssert(res == C_OK);
            }

            /* Apply all of the commands and categories to this selector.
 */
            for(int i = 0; i < argc; i++) {
                int res = ACLSetSelector(selector, argv[i], sdslen(argv[i]));
                serverAssert(res == C_OK);
            }
            sdsfreesplitres(argv, argc);
        }
    }
    raxStop(&ri);

}

/* Apply a "@category" token (the leading '+'/'-' already stripped by the
 * caller, so 'category' starts with '@'; +1 skips the '@') to the selector:
 * record the rule and set/clear the command bits of the whole category.
 * Returns C_ERR if the category name does not exist, C_OK otherwise. */
int ACLSetSelectorCategory(aclSelector *selector, const char *category, int allow) {
    uint64_t cflag = ACLGetCommandCategoryFlagByName(category + 1);
    if (!cflag) return C_ERR;

    ACLUpdateCommandRules(selector, category, allow);

    /* Set the actual command bits on the selector. */
    ACLSetSelectorCommandBitsForCategory(server.orig_commands, selector, cflag, allow);
    return C_OK;
}

/* Recursive helper for ACLCountCategoryBitsForSelector(): walk 'commands'
 * (and subcommand tables) counting allowed/denied bits for the category. */
void ACLCountCategoryBitsForCommands(dict *commands, aclSelector *selector, unsigned long *on, unsigned long *off, uint64_t cflag) {
    dictIterator di;
    dictEntry *de;
    dictInitIterator(&di, commands);
    while ((de = dictNext(&di)) != NULL) {
        struct redisCommand *cmd = dictGetVal(de);
        if (cmd->acl_categories & cflag) {
            if (ACLGetSelectorCommandBit(selector,cmd->id))
                (*on)++;
            else
                (*off)++;
        }
        if (cmd->subcommands_dict) {
            ACLCountCategoryBitsForCommands(cmd->subcommands_dict, selector, on, off, cflag);
        }
    }
    dictResetIterator(&di);
}

/* Return the number of commands allowed (on) and denied (off) for the user 'u'
 * in the subset of commands flagged with the specified category name.
 * If the category name is not valid, C_ERR is returned, otherwise C_OK is
 * returned and on and off are populated by reference. */
int ACLCountCategoryBitsForSelector(aclSelector *selector, unsigned long *on, unsigned long *off,
                                    const char *category)
{
    uint64_t cflag = ACLGetCommandCategoryFlagByName(category);
    if (!cflag) return C_ERR;

    *on = *off = 0;
    ACLCountCategoryBitsForCommands(server.orig_commands, selector, on, off, cflag);
    return C_OK;
}

/* This function returns an SDS string representing the specified selector ACL
 * rules related to command execution, in the same format you could set them
 * back using ACL SETUSER.
The function will return just the set of rules needed
 * to recreate the user commands bitmap, without including other user flags such
 * as on/off, passwords and so forth. The returned string always starts with
 * the +@all or -@all rule, depending on the user bitmap, and is followed, if
 * needed, by the other rules needed to narrow or extend what the user can do. */
sds ACLDescribeSelectorCommandRules(aclSelector *selector) {
    sds rules = sdsempty();

    /* We use this fake selector as a "sanity" check to make sure the rules
     * we generate have the same bitmap as those on the current selector. */
    aclSelector *fake_selector = ACLCreateSelector(0);

    /* Here we want to understand if we should start with +@all or -@all.
     * Note that when starting with +@all and subtracting, the user
     * will be able to execute future commands, while -@all and adding will just
     * allow the user the run the selected commands and/or categories.
     * How do we test for that? We use the trick of a reserved command ID bit
     * that is set only by +@all (and its alias "allcommands"). */
    if (ACLSelectorCanExecuteFutureCommands(selector)) {
        rules = sdscat(rules,"+@all ");
        ACLSetSelector(fake_selector,"+@all",-1);
    } else {
        rules = sdscat(rules,"-@all ");
        ACLSetSelector(fake_selector,"-@all",-1);
    }

    /* Apply all of the commands and categories to the fake selector. */
    int argc = 0;
    sds *argv = sdssplitargs(selector->command_rules, &argc);
    serverAssert(argv != NULL);

    for(int i = 0; i < argc; i++) {
        int res = ACLSetSelector(fake_selector, argv[i], -1);
        serverAssert(res == C_OK);
    }
    if (sdslen(selector->command_rules)) {
        rules = sdscatfmt(rules, "%S ", selector->command_rules);
    }
    sdsfreesplitres(argv, argc);

    /* Trim the final useless space.
 */
    sdsrange(rules,0,-2);

    /* This is technically not needed, but we want to verify that now the
     * predicted bitmap is exactly the same as the user bitmap, and abort
     * otherwise, because aborting is better than a security risk in this
     * code path. */
    if (memcmp(fake_selector->allowed_commands,
               selector->allowed_commands,
               sizeof(selector->allowed_commands)) != 0)
    {
        serverLog(LL_WARNING,
            "CRITICAL ERROR: User ACLs don't match final bitmap: '%s'",
            redactLogCstr(rules));
        serverPanic("No bitmap match in ACLDescribeSelectorCommandRules()");
    }
    ACLFreeSelector(fake_selector);
    return rules;
}

/* Return an SDS description of a single selector: its key patterns, its
 * Pub/Sub channel patterns and finally its command rules. */
sds ACLDescribeSelector(aclSelector *selector) {
    listIter li;
    listNode *ln;
    sds res = sdsempty();
    /* Key patterns. */
    if (selector->flags & SELECTOR_FLAG_ALLKEYS) {
        res = sdscatlen(res,"~* ",3);
    } else {
        listRewind(selector->patterns,&li);
        while((ln = listNext(&li))) {
            keyPattern *thispat = (keyPattern *)listNodeValue(ln);
            res = sdsCatPatternString(res, thispat);
            res = sdscatlen(res," ",1);
        }
    }

    /* Pub/sub channel patterns. */
    if (selector->flags & SELECTOR_FLAG_ALLCHANNELS) {
        res = sdscatlen(res,"&* ",3);
    } else {
        res = sdscatlen(res,"resetchannels ",14);
        listRewind(selector->channels,&li);
        while((ln = listNext(&li))) {
            sds thispat = listNodeValue(ln);
            res = sdscatlen(res,"&",1);
            res = sdscatsds(res,thispat);
            res = sdscatlen(res," ",1);
        }
    }

    /* Command rules. */
    sds rules = ACLDescribeSelectorCommandRules(selector);
    res = sdscatsds(res,rules);
    sdsfree(rules);
    return res;
}

/* This is similar to ACLDescribeSelectorCommandRules(), however instead of
 * describing just the user command rules, everything is described: user
 * flags, keys, passwords and finally the command rules obtained via
 * the ACLDescribeSelectorCommandRules() function. This is the function we call
 * when we want to rewrite the configuration files describing ACLs and
 * in order to show users with ACL LIST.
 */
robj *ACLDescribeUser(user *u) {
    /* Serve the cached description if one was already computed;
     * the caller receives its own reference. */
    if (u->acl_string) {
        incrRefCount(u->acl_string);
        return u->acl_string;
    }

    sds res = sdsempty();

    /* Flags. */
    for (int j = 0; ACLUserFlags[j].flag; j++) {
        if (u->flags & ACLUserFlags[j].flag) {
            res = sdscat(res,ACLUserFlags[j].name);
            res = sdscatlen(res," ",1);
        }
    }

    /* Passwords. */
    listIter li;
    listNode *ln;
    listRewind(u->passwords,&li);
    while((ln = listNext(&li))) {
        sds thispass = listNodeValue(ln);
        res = sdscatlen(res,"#",1);
        res = sdscatsds(res,thispass);
        res = sdscatlen(res," ",1);
    }

    /* Selectors (Commands and keys) */
    listRewind(u->selectors,&li);
    while((ln = listNext(&li))) {
        aclSelector *selector = (aclSelector *) listNodeValue(ln);
        sds default_perm = ACLDescribeSelector(selector);
        if (selector->flags & SELECTOR_FLAG_ROOT) {
            res = sdscatfmt(res, "%s", default_perm);
        } else {
            res = sdscatfmt(res, " (%s)", default_perm);
        }
        sdsfree(default_perm);
    }

    u->acl_string = createObject(OBJ_STRING, res);
    /* because we are returning it, have to increase count */
    incrRefCount(u->acl_string);

    return u->acl_string;
}

/* Get a command from the original command table, that is not affected
 * by the command renaming operations: we base all the ACL work from that
 * table, so that ACLs are valid regardless of command renaming. */
struct redisCommand *ACLLookupCommand(const char *name) {
    struct redisCommand *cmd;
    sds sdsname = sdsnew(name);
    cmd = lookupCommandBySdsLogic(server.orig_commands,sdsname);
    sdsfree(sdsname);
    return cmd;
}

/* Flush the array of allowed first-args for the specified user
 * and command ID.
 */
void ACLResetFirstArgsForCommand(aclSelector *selector, unsigned long id) {
    if (selector->allowed_firstargs && selector->allowed_firstargs[id]) {
        for (int i = 0; selector->allowed_firstargs[id][i]; i++)
            sdsfree(selector->allowed_firstargs[id][i]);
        zfree(selector->allowed_firstargs[id]);
        selector->allowed_firstargs[id] = NULL;
    }
}

/* Flush the entire table of first-args. This is useful on +@all, -@all
 * or similar to return back to the minimal memory usage (and checks to do)
 * for the user. */
void ACLResetFirstArgs(aclSelector *selector) {
    if (selector->allowed_firstargs == NULL) return;
    for (int j = 0; j < USER_COMMAND_BITS_COUNT; j++) {
        if (selector->allowed_firstargs[j]) {
            for (int i = 0; selector->allowed_firstargs[j][i]; i++)
                sdsfree(selector->allowed_firstargs[j][i]);
            zfree(selector->allowed_firstargs[j]);
        }
    }
    zfree(selector->allowed_firstargs);
    selector->allowed_firstargs = NULL;
}

/* Add a first-arg to the list of subcommands for the user 'u' and
 * the command id specified. */
void ACLAddAllowedFirstArg(aclSelector *selector, unsigned long id, const char *sub) {
    /* If this is the first first-arg to be configured for
     * this user, we have to allocate the first-args array. */
    if (selector->allowed_firstargs == NULL) {
        selector->allowed_firstargs = zcalloc(USER_COMMAND_BITS_COUNT * sizeof(sds*));
    }

    /* We also need to enlarge the allocation pointing to the
     * null terminated SDS array, to make space for this one.
     * To start check the current size, and while we are here
     * make sure the first-arg is not already specified inside. */
    long items = 0;
    if (selector->allowed_firstargs[id]) {
        while(selector->allowed_firstargs[id][items]) {
            /* If it's already here do not add it again. */
            if (!strcasecmp(selector->allowed_firstargs[id][items],sub))
                return;
            items++;
        }
    }

    /* Now we can make space for the new item (and the null term).
 */
    items += 2;
    selector->allowed_firstargs[id] = zrealloc(selector->allowed_firstargs[id], sizeof(sds)*items);
    selector->allowed_firstargs[id][items-2] = sdsnew(sub);
    selector->allowed_firstargs[id][items-1] = NULL;
}

/* Create an ACL selector from the given ACL operations, which should be
 * a list of space separate ACL operations that starts and ends
 * with parentheses.
 *
 * If any of the operations are invalid, NULL will be returned instead
 * and errno will be set corresponding to the interior error. */
aclSelector *aclCreateSelectorFromOpSet(const char *opset, size_t opsetlen) {
    serverAssert(opset[0] == '(' && opset[opsetlen - 1] == ')');
    aclSelector *s = ACLCreateSelector(0);

    int argc = 0;
    sds trimmed = sdsnewlen(opset + 1, opsetlen - 2);
    sds *argv = sdssplitargs(trimmed, &argc);
    for (int i = 0; i < argc; i++) {
        if (ACLSetSelector(s, argv[i], sdslen(argv[i])) == C_ERR) {
            /* errno is already set by ACLSetSelector(). */
            ACLFreeSelector(s);
            s = NULL;
            goto cleanup;
        }
    }

cleanup:
    sdsfreesplitres(argv, argc);
    sdsfree(trimmed);
    return s;
}

/* Set a selector's properties with the provided 'op'.
 * (NOTE(review): the angle-bracket placeholders in this comment were eaten by
 * text mangling; restored as in upstream — confirm.)
 *
 * +<command>   Allow the execution of that command.
 *              May be used with `|` for allowing subcommands (e.g "+config|get")
 * -<command>   Disallow the execution of that command.
 *              May be used with `|` for blocking subcommands (e.g "-config|set")
 * +@<category> Allow the execution of all the commands in such category
 *              with valid categories are like @admin, @set, @sortedset, ...
 *              and so forth, see the full list in the server.c file where
 *              the Redis command table is described and defined.
 *              The special category @all means all the commands, but currently
 *              present in the server, and that will be loaded in the future
 *              via modules.
 * +<command>|first-arg  Allow a specific first argument of an otherwise
 *                       disabled command. Note that this form is not
 *                       allowed as negative like -SELECT|1, but
 *                       only additive starting with "+".
 * allcommands  Alias for +@all.
Note that it implies the ability to execute
 *              all the future commands loaded via the modules system.
 * nocommands   Alias for -@all.
 * ~<pattern>   Add a pattern of keys that can be mentioned as part of
 *              commands. For instance ~* allows all the keys. The pattern
 *              is a glob-style pattern like the one of KEYS.
 *              It is possible to specify multiple patterns.
 * %R~<pattern> Add key read pattern that specifies which keys can be read
 *              from.
 * %W~<pattern> Add key write pattern that specifies which keys can be
 *              written to.
 * allkeys      Alias for ~*
 * resetkeys    Flush the list of allowed keys patterns.
 * &<pattern>   Add a pattern of channels that can be mentioned as part of
 *              Pub/Sub commands. For instance &* allows all the channels. The
 *              pattern is a glob-style pattern like the one of PSUBSCRIBE.
 *              It is possible to specify multiple patterns.
 * allchannels  Alias for &*
 * resetchannels Flush the list of allowed channel patterns.
 */
int ACLSetSelector(aclSelector *selector, const char* op, size_t oplen) {
    if (!strcasecmp(op,"allkeys") ||
        !strcasecmp(op,"~*"))
    {
        selector->flags |= SELECTOR_FLAG_ALLKEYS;
        listEmpty(selector->patterns);
    } else if (!strcasecmp(op,"resetkeys")) {
        selector->flags &= ~SELECTOR_FLAG_ALLKEYS;
        listEmpty(selector->patterns);
    } else if (!strcasecmp(op,"allchannels") ||
               !strcasecmp(op,"&*"))
    {
        selector->flags |= SELECTOR_FLAG_ALLCHANNELS;
        listEmpty(selector->channels);
    } else if (!strcasecmp(op,"resetchannels")) {
        selector->flags &= ~SELECTOR_FLAG_ALLCHANNELS;
        listEmpty(selector->channels);
    } else if (!strcasecmp(op,"allcommands") ||
               !strcasecmp(op,"+@all"))
    {
        memset(selector->allowed_commands,255,sizeof(selector->allowed_commands));
        selector->flags |= SELECTOR_FLAG_ALLCOMMANDS;
        sdsclear(selector->command_rules);
        ACLResetFirstArgs(selector);
    } else if (!strcasecmp(op,"nocommands") ||
               !strcasecmp(op,"-@all"))
    {
        memset(selector->allowed_commands,0,sizeof(selector->allowed_commands));
        selector->flags &= ~SELECTOR_FLAG_ALLCOMMANDS;
        sdsclear(selector->command_rules);
        ACLResetFirstArgs(selector);
    } else if (op[0] == '~' || op[0] == '%') {
        if (selector->flags & SELECTOR_FLAG_ALLKEYS) {
            errno = EEXIST;
            return C_ERR;
        }
        int flags = 0;
        size_t offset = 1;
        /* A '%' op carries an R/W permission prefix before the '~', e.g.
         * "%RW~pattern"; parse each letter at most once. */
        if (op[0] == '%') {
            int perm_ok = 1;
            for (; offset < oplen; offset++) {
                if (toupper(op[offset]) == 'R' && !(flags & ACL_READ_PERMISSION)) {
                    flags |= ACL_READ_PERMISSION;
                } else if (toupper(op[offset]) == 'W' && !(flags & ACL_WRITE_PERMISSION)) {
                    flags |= ACL_WRITE_PERMISSION;
                } else if (op[offset] == '~') {
                    offset++;
                    break;
                } else {
                    perm_ok = 0;
                    break;
                }
            }
            if (!flags || !perm_ok) {
                errno = EINVAL;
                return C_ERR;
            }
        } else {
            flags = ACL_ALL_PERMISSION;
        }

        if (ACLStringHasSpaces(op+offset,oplen-offset)) {
            errno = EINVAL;
            return C_ERR;
        }
        keyPattern *newpat = ACLKeyPatternCreate(sdsnewlen(op+offset,oplen-offset), flags);
        listNode *ln = listSearchKey(selector->patterns,newpat);
        /* Avoid re-adding the same key pattern multiple times. */
        if (ln == NULL) {
            listAddNodeTail(selector->patterns,newpat);
        } else {
            /* Pattern already present: merge the permission flags. */
            ((keyPattern *)listNodeValue(ln))->flags |= flags;
            ACLKeyPatternFree(newpat);
        }
        selector->flags &= ~SELECTOR_FLAG_ALLKEYS;
    } else if (op[0] == '&') {
        if (selector->flags & SELECTOR_FLAG_ALLCHANNELS) {
            /* NOTE: EISDIR is (ab)used here as "already covered by
             * allchannels"; callers only check for C_ERR. */
            errno = EISDIR;
            return C_ERR;
        }
        if (ACLStringHasSpaces(op+1,oplen-1)) {
            errno = EINVAL;
            return C_ERR;
        }
        sds newpat = sdsnewlen(op+1,oplen-1);
        listNode *ln = listSearchKey(selector->channels,newpat);
        /* Avoid re-adding the same channel pattern multiple times. */
        if (ln == NULL)
            listAddNodeTail(selector->channels,newpat);
        else
            sdsfree(newpat);
        selector->flags &= ~SELECTOR_FLAG_ALLCHANNELS;
    } else if (op[0] == '+' && op[1] != '@') {
        if (strrchr(op,'|') == NULL) {
            struct redisCommand *cmd = ACLLookupCommand(op+1);
            if (cmd == NULL) {
                errno = ENOENT;
                return C_ERR;
            }
            ACLChangeSelectorPerm(selector,cmd,1);
            ACLUpdateCommandRules(selector,cmd->fullname,1);
        } else {
            /* Split the command and subcommand parts. */
            char *copy = zstrdup(op+1);
            char *sub = strrchr(copy,'|');
            sub[0] = '\0';
            sub++;

            struct redisCommand *cmd = ACLLookupCommand(copy);

            /* Check if the command exists. We can't check the
             * first-arg to see if it is valid. */
            if (cmd == NULL) {
                zfree(copy);
                errno = ENOENT;
                return C_ERR;
            }

            /* We do not support allowing first-arg of a subcommand */
            if (cmd->parent) {
                zfree(copy);
                errno = ECHILD;
                return C_ERR;
            }

            /* The subcommand cannot be empty, so things like DEBUG|
             * are syntax errors of course. */
            if (strlen(sub) == 0) {
                zfree(copy);
                errno = EINVAL;
                return C_ERR;
            }

            if (cmd->subcommands_dict) {
                /* If user is trying to allow a valid subcommand we can just add its unique ID */
                cmd = ACLLookupCommand(op+1);
                if (cmd == NULL) {
                    zfree(copy);
                    errno = ENOENT;
                    return C_ERR;
                }
                ACLChangeSelectorPerm(selector,cmd,1);
            } else {
                /* If user is trying to use the ACL mech to block SELECT except SELECT 0 or
                 * block DEBUG except DEBUG OBJECT (DEBUG subcommands are not considered
                 * subcommands for now) we use the allowed_firstargs mechanism. */

                /* Add the first-arg to the list of valid ones. */
                serverLog(LL_WARNING, "Deprecation warning: Allowing a first arg of an otherwise "
                                      "blocked command is a misuse of ACL and may get disabled "
                                      "in the future (offender: +%s)", redactLogCstr(op+1));
                ACLAddAllowedFirstArg(selector,cmd->id,sub);
            }
            ACLUpdateCommandRules(selector,op+1,1);
            zfree(copy);
        }
    } else if (op[0] == '-' && op[1] != '@') {
        struct redisCommand *cmd = ACLLookupCommand(op+1);
        if (cmd == NULL) {
            errno = ENOENT;
            return C_ERR;
        }
        ACLChangeSelectorPerm(selector,cmd,0);
        ACLUpdateCommandRules(selector,cmd->fullname,0);
    } else if ((op[0] == '+' || op[0] == '-') && op[1] == '@') {
        int bitval = op[0] == '+' ? 1 : 0;
        if (ACLSetSelectorCategory(selector,op+1,bitval) == C_ERR) {
            errno = ENOENT;
            return C_ERR;
        }
    } else {
        errno = EINVAL;
        return C_ERR;
    }
    return C_OK;
}

/* Set user properties according to the string "op". The following
 * is a description of what different strings will do:
 * (NOTE(review): the angle-bracket placeholders in this comment were eaten by
 * text mangling; restored as in upstream — confirm.)
 *
 * on           Enable the user: it is possible to authenticate as this user.
 * off          Disable the user: it's no longer possible to authenticate
 *              with this user, however the already authenticated connections
 *              will still work.
 * skip-sanitize-payload    RESTORE dump-payload sanitization is skipped.
 * sanitize-payload         RESTORE dump-payload is sanitized (default).
 * ><password>  Add this password to the list of valid password for the user.
 *              For example >mypass will add "mypass" to the list.
 *              This directive clears the "nopass" flag (see later).
 * #<hash>      Add this password hash to the list of valid hashes for
 *              the user. This is useful if you have previously computed
 *              the hash, and don't want to store it in plaintext.
 *              This directive clears the "nopass" flag (see later).
 * <<password>  Remove this password from the list of valid passwords.
 * !<hash>      Remove this hashed password from the list of valid passwords.
 *              This is useful when you want to remove a password just by
 *              hash without knowing its plaintext version at all.
/* Set user properties according to the string "op". The following
 * is a description of what different strings will do:
 *
 * on                     Enable the user.
 * off                    Disable the user (already-authenticated connections
 *                        keep working).
 * skip-sanitize-payload  RESTORE dump-payload sanitization is skipped.
 * sanitize-payload       RESTORE dump-payload is sanitized (default).
 * ><password>            Add this password to the list of valid passwords;
 *                        clears the "nopass" flag.
 * #<hash>                Add this password hash to the list of valid hashes;
 *                        clears the "nopass" flag.
 * <<password>            Remove this password from the list of valid passwords.
 * !<hash>                Remove this hashed password (useful when only the
 *                        hash is known).
 * nopass                 All the set passwords of the user are removed, and
 *                        the user is flagged as requiring no password. If used
 *                        for the default user, every new connection is
 *                        immediately authenticated without AUTH. "resetpass"
 *                        clears this condition.
 * resetpass              Flush the list of allowed passwords and remove the
 *                        "nopass" status.
 * reset                  Performs: resetpass, resetkeys, resetchannels,
 *                        allchannels (if acl-pubsub-default is set), off,
 *                        sanitize-payload, clearselectors, -@all. The user
 *                        returns to the state it had just after creation.
 * (<rules>)              Create a new selector with the space separated rules
 *                        inside the parentheses and attach it to the user.
 * clearselectors         Remove all of the attached selectors (the "root"
 *                        permissions outside parentheses are untouched).
 *
 * Any other operation is a selector rule and is forwarded to
 * ACLSetSelector() to update the user's root selector.
 *
 * The 'op' string must be null terminated. 'oplen' should specify the length
 * of 'op' in case the caller passes binary data (for instance the >password
 * form may use a binary password); otherwise pass -1 and strlen() is used.
 *
 * The function returns C_OK if the action to perform was understood.
 * Otherwise C_ERR is returned and errno is set to:
 *
 * EINVAL:  opcode not understood or key/channel pattern invalid.
 * ENOENT:  command or category in a +/- rule is unknown.
 * EEXIST:  key pattern added after "*" was already added.
 * EISDIR:  channel pattern added after "*" was already added.
 * ENODEV:  password to remove does not exist.
 * EBADMSG: the hash being added is not a valid hash.
 * ECHILD:  attempt to allow a specific first argument of a subcommand. */
int ACLSetUser(user *u, const char *op, ssize_t oplen) {
    /* as we are changing the ACL, the old generated string is now invalid */
    if (u->acl_string) {
        decrRefCount(u->acl_string);
        u->acl_string = NULL;
    }

    if (oplen == -1) oplen = strlen(op);
    if (oplen == 0) return C_OK; /* Empty string is a no-operation. */
    if (!strcasecmp(op,"on")) {
        atomicSet(u->flags, (u->flags | USER_FLAG_ENABLED) & ~USER_FLAG_DISABLED);
    } else if (!strcasecmp(op,"off")) {
        atomicSet(u->flags, (u->flags | USER_FLAG_DISABLED) & ~USER_FLAG_ENABLED);
    } else if (!strcasecmp(op,"skip-sanitize-payload")) {
        atomicSet(u->flags, (u->flags | USER_FLAG_SANITIZE_PAYLOAD_SKIP) & ~USER_FLAG_SANITIZE_PAYLOAD);
    } else if (!strcasecmp(op,"sanitize-payload")) {
        atomicSet(u->flags, (u->flags | USER_FLAG_SANITIZE_PAYLOAD) & ~USER_FLAG_SANITIZE_PAYLOAD_SKIP);
    } else if (!strcasecmp(op,"nopass")) {
        atomicSet(u->flags, u->flags | USER_FLAG_NOPASS);
        listEmpty(u->passwords);
    } else if (!strcasecmp(op,"resetpass")) {
        atomicSet(u->flags, u->flags & ~USER_FLAG_NOPASS);
        listEmpty(u->passwords);
    } else if (op[0] == '>' || op[0] == '#') {
        /* Add a password: '>' takes the plaintext (hashed here), '#' takes a
         * precomputed hash that must first be validated. */
        sds newpass;
        if (op[0] == '>') {
            newpass = ACLHashPassword((unsigned char*)op+1,oplen-1);
        } else {
            if (ACLCheckPasswordHash((unsigned char*)op+1,oplen-1) == C_ERR) {
                errno = EBADMSG;
                return C_ERR;
            }
            newpass = sdsnewlen(op+1,oplen-1);
        }

        listNode *ln = listSearchKey(u->passwords,newpass);
        /* Avoid re-adding the same password multiple times. */
        if (ln == NULL)
            listAddNodeTail(u->passwords,newpass);
        else
            sdsfree(newpass);
        atomicSet(u->flags, u->flags & ~USER_FLAG_NOPASS);
    } else if (op[0] == '<' || op[0] == '!') {
        /* Remove a password, by plaintext ('<') or by hash ('!'). */
        sds delpass;
        if (op[0] == '<') {
            delpass = ACLHashPassword((unsigned char*)op+1,oplen-1);
        } else {
            if (ACLCheckPasswordHash((unsigned char*)op+1,oplen-1) == C_ERR) {
                errno = EBADMSG;
                return C_ERR;
            }
            delpass = sdsnewlen(op+1,oplen-1);
        }
        listNode *ln = listSearchKey(u->passwords,delpass);
        sdsfree(delpass);
        if (ln) {
            listDelNode(u->passwords,ln);
        } else {
            errno = ENODEV;
            return C_ERR;
        }
    } else if (op[0] == '(' && op[oplen - 1] == ')') {
        aclSelector *selector = aclCreateSelectorFromOpSet(op, oplen);
        if (!selector) {
            /* No errorno set, propagate it from interior error. */
            return C_ERR;
        }
        listAddNodeTail(u->selectors, selector);
        return C_OK;
    } else if (!strcasecmp(op,"clearselectors")) {
        listIter li;
        listNode *ln;
        listRewind(u->selectors,&li);
        /* There has to be a root selector */
        serverAssert(listNext(&li));
        while((ln = listNext(&li))) {
            listDelNode(u->selectors, ln);
        }
        return C_OK;
    } else if (!strcasecmp(op,"reset")) {
        /* "reset" is implemented recursively in terms of the primitive ops;
         * each step is asserted since they cannot fail on valid input. */
        serverAssert(ACLSetUser(u,"resetpass",-1) == C_OK);
        serverAssert(ACLSetUser(u,"resetkeys",-1) == C_OK);
        serverAssert(ACLSetUser(u,"resetchannels",-1) == C_OK);
        if (server.acl_pubsub_default & SELECTOR_FLAG_ALLCHANNELS)
            serverAssert(ACLSetUser(u,"allchannels",-1) == C_OK);
        serverAssert(ACLSetUser(u,"off",-1) == C_OK);
        serverAssert(ACLSetUser(u,"sanitize-payload",-1) == C_OK);
        serverAssert(ACLSetUser(u,"clearselectors",-1) == C_OK);
        serverAssert(ACLSetUser(u,"-@all",-1) == C_OK);
    } else {
        /* Not a user-level op: treat it as a rule for the root selector. */
        aclSelector *selector = ACLUserGetRootSelector(u);
        if (ACLSetSelector(selector, op, oplen) == C_ERR) {
            return C_ERR;
        }
    }
    return C_OK;
}
/* Map the errno value left behind by a failed ACLSetUser() call to a
 * human readable error message suitable for replying to the client or
 * reporting a configuration error. Unknown errno values fall back to a
 * generic "Wrong format" message. */
const char *ACLSetUserStringError(void) {
    switch (errno) {
    case ENOENT:
        return "Unknown command or category name in ACL";
    case EINVAL:
        return "Syntax error";
    case EEXIST:
        return "Adding a pattern after the * pattern (or the "
               "'allkeys' flag) is not valid and does not have any "
               "effect. Try 'resetkeys' to start with an empty "
               "list of patterns";
    case EISDIR:
        return "Adding a pattern after the * pattern (or the "
               "'allchannels' flag) is not valid and does not have any "
               "effect. Try 'resetchannels' to start with an empty "
               "list of channels";
    case ENODEV:
        return "The password you are trying to remove from the user does "
               "not exist";
    case EBADMSG:
        return "The password hash must be exactly 64 characters and contain "
               "only lowercase hexadecimal characters";
    case EALREADY:
        return "Duplicate user found. A user can only be defined once in "
               "config files";
    case ECHILD:
        return "Allowing first-arg of a subcommand is not supported";
    default:
        return "Wrong format";
    }
}
*/ + listIter li; + listNode *ln; + listRewind(u->passwords,&li); + sds hashed = ACLHashPassword(password->ptr,sdslen(password->ptr)); + while((ln = listNext(&li))) { + sds thispass = listNodeValue(ln); + if (!time_independent_strcmp(hashed, thispass, HASH_PASSWORD_LEN)) { + sdsfree(hashed); + return C_OK; + } + } + sdsfree(hashed); + + /* If we reached this point, no password matched. */ + errno = EINVAL; + return C_ERR; +} + +/* If `err` is provided, this is added as an error reply to the client. + * Otherwise, the standard Auth error is added as a reply. */ +void addAuthErrReply(client *c, robj *err) { + if (clientHasPendingReplies(c)) return; + if (!err) { + addReplyError(c, "-WRONGPASS invalid username-password pair or user is disabled."); + return; + } + addReplyError(c, err->ptr); +} + +/* This is like ACLCheckUserCredentials(), however if the user/pass + * are correct, the connection is put in authenticated state and the + * connection user reference is populated. + * + * The return value is AUTH_OK on success (valid username / password pair) & AUTH_ERR otherwise. */ +int checkPasswordBasedAuth(client *c, robj *username, robj *password) { + if (ACLCheckUserCredentials(username,password) == C_OK) { + c->authenticated = 1; + c->user = ACLGetUserByName(username->ptr,sdslen(username->ptr)); + moduleNotifyUserChanged(c); + return AUTH_OK; + } else { + addACLLogEntry(c,ACL_DENIED_AUTH,(c->flags & CLIENT_MULTI) ? ACL_LOG_CTX_MULTI : ACL_LOG_CTX_TOPLEVEL,0,username->ptr,NULL); + return AUTH_ERR; + } +} + +/* Attempt authenticating the user - first through module based authentication, + * and then, if needed, with normal password based authentication. + * Returns one of the following codes: + * AUTH_OK - Indicates that authentication succeeded. + * AUTH_ERR - Indicates that authentication failed. + * AUTH_BLOCKED - Indicates module authentication is in progress through a blocking implementation. 
+ */ +int ACLAuthenticateUser(client *c, robj *username, robj *password, robj **err) { + int result = checkModuleAuthentication(c, username, password, err); + /* If authentication was not handled by any Module, attempt normal password based auth. */ + if (result == AUTH_NOT_HANDLED) { + result = checkPasswordBasedAuth(c, username, password); + } + return result; +} + +/* For ACL purposes, every user has a bitmap with the commands that such + * user is allowed to execute. In order to populate the bitmap, every command + * should have an assigned ID (that is used to index the bitmap). This function + * creates such an ID: it uses sequential IDs, reusing the same ID for the same + * command name, so that a command retains the same ID in case of modules that + * are unloaded and later reloaded. + * + * The function does not take ownership of the 'cmdname' SDS string. + * */ +unsigned long ACLGetCommandID(sds cmdname) { + sds lowername = sdsdup(cmdname); + sdstolower(lowername); + if (commandId == NULL) commandId = raxNew(); + void *id; + if (raxFind(commandId,(unsigned char*)lowername,sdslen(lowername),&id)) { + sdsfree(lowername); + return (unsigned long)id; + } + raxInsert(commandId,(unsigned char*)lowername,strlen(lowername), + (void*)nextid,NULL); + sdsfree(lowername); + unsigned long thisid = nextid; + nextid++; + + /* We never assign the last bit in the user commands bitmap structure, + * this way we can later check if this bit is set, understanding if the + * current ACL for the user was created starting with a +@all to add all + * the possible commands and just subtracting other single commands or + * categories, or if, instead, the ACL was created just adding commands + * and command categories from scratch, not allowing future commands by + * default (loaded via modules). This is useful when rewriting the ACLs + * with ACL SAVE. */ + if (nextid == USER_COMMAND_BITS_COUNT-1) nextid++; + return thisid; +} + +/* Clear command id table and reset nextid to 0. 
/* Clear command id table and reset nextid to 0. */
void ACLClearCommandID(void) {
    if (commandId) raxFree(commandId);
    commandId = NULL;
    nextid = 0;
}

/* Return an username by its name, or NULL if the user does not exist. */
user *ACLGetUserByName(const char *name, size_t namelen) {
    void *myuser = NULL;
    raxFind(Users,(unsigned char*)name,namelen,&myuser);
    return myuser;
}

/* =============================================================================
 * ACL permission checks
 * ==========================================================================*/

/* Check if the key can be accessed by the selector.
 *
 * The keyspec access flags (ACCESS/INSERT/DELETE/UPDATE) are translated to
 * the ACL read/write permission bits; a pattern only matches if it carries
 * at least all of the required permissions.
 *
 * If the selector can access the key, ACL_OK is returned, otherwise
 * ACL_DENIED_KEY is returned. */
static int ACLSelectorCheckKey(aclSelector *selector, const char *key, int keylen, int keyspec_flags) {
    /* The selector can access any key */
    if (selector->flags & SELECTOR_FLAG_ALLKEYS) return ACL_OK;

    listIter li;
    listNode *ln;
    listRewind(selector->patterns,&li);

    int key_flags = 0;
    if (keyspec_flags & CMD_KEY_ACCESS) key_flags |= ACL_READ_PERMISSION;
    if (keyspec_flags & CMD_KEY_INSERT) key_flags |= ACL_WRITE_PERMISSION;
    if (keyspec_flags & CMD_KEY_DELETE) key_flags |= ACL_WRITE_PERMISSION;
    if (keyspec_flags & CMD_KEY_UPDATE) key_flags |= ACL_WRITE_PERMISSION;

    /* Is given key represent a prefix of a set of keys */
    int prefix = keyspec_flags & CMD_KEY_PREFIX;

    /* Test this key against every pattern. */
    while((ln = listNext(&li))) {
        keyPattern *pattern = listNodeValue(ln);
        /* Skip patterns that don't grant all the permissions we need. */
        if ((pattern->flags & key_flags) != key_flags)
            continue;
        size_t plen = sdslen(pattern->pattern);
        if (prefix) {
            if (prefixmatch(pattern->pattern,plen,key,keylen,0))
                return ACL_OK;
        } else {
            if (stringmatchlen(pattern->pattern, plen, key, keylen, 0))
                return ACL_OK;
        }
    }
    return ACL_DENIED_KEY;
}
For example, CMD_KEY_READ access requires either + * '%R~*', '~*', or allkeys to be granted to the selector. Returns 1 if all + * the access flags are satisfied with this selector or 0 otherwise. + */ +static int ACLSelectorHasUnrestrictedKeyAccess(aclSelector *selector, int flags) { + /* The selector can access any key */ + if (selector->flags & SELECTOR_FLAG_ALLKEYS) return 1; + + listIter li; + listNode *ln; + listRewind(selector->patterns,&li); + + int access_flags = 0; + if (flags & CMD_KEY_ACCESS) access_flags |= ACL_READ_PERMISSION; + if (flags & CMD_KEY_INSERT) access_flags |= ACL_WRITE_PERMISSION; + if (flags & CMD_KEY_DELETE) access_flags |= ACL_WRITE_PERMISSION; + if (flags & CMD_KEY_UPDATE) access_flags |= ACL_WRITE_PERMISSION; + + /* Test this key against every pattern. */ + while((ln = listNext(&li))) { + keyPattern *pattern = listNodeValue(ln); + if ((pattern->flags & access_flags) != access_flags) + continue; + if (!strcmp(pattern->pattern,"*")) { + return 1; + } + } + return 0; +} + +/* Checks a channel against a provided list of channels. The is_pattern + * argument should only be used when subscribing (not when publishing) + * and controls whether the input channel is evaluated as a channel pattern + * (like in PSUBSCRIBE) or a plain channel name (like in SUBSCRIBE). + * + * Note that a plain channel name like in PUBLISH or SUBSCRIBE can be + * matched against ACL channel patterns, but the pattern provided in PSUBSCRIBE + * can only be matched as a literal against an ACL pattern (using plain string compare). */ +static int ACLCheckChannelAgainstList(list *reference, const char *channel, int channellen, int is_pattern) { + listIter li; + listNode *ln; + + listRewind(reference, &li); + while((ln = listNext(&li))) { + sds pattern = listNodeValue(ln); + size_t plen = sdslen(pattern); + /* Channel patterns are matched literally against the channels in + * the list. Regular channels perform pattern matching. 
/* To prevent duplicate calls to getKeysResult, a cache is maintained
 * in between calls to the various selectors. */
typedef struct {
    int keys_init;       /* True once 'keys' holds a valid result. */
    getKeysResult keys;  /* Cached key extraction for the current command. */
} aclKeyResultCache;

/* Mark the cache as empty; no key extraction has been performed yet. */
void initACLKeyResultCache(aclKeyResultCache *cache) {
    cache->keys_init = 0;
}

/* Release the cached key extraction result, if any was produced. */
void cleanupACLKeyResultCache(aclKeyResultCache *cache) {
    if (cache->keys_init) getKeysFreeResult(&(cache->keys));
}

/* Check if the command is ready to be executed according to the
 * ACLs associated with the specified selector.
 *
 * If the selector can execute the command ACL_OK is returned, otherwise
 * ACL_DENIED_CMD, ACL_DENIED_KEY, or ACL_DENIED_CHANNEL is returned: the first in case the
 * command cannot be executed because the selector is not allowed to run such
 * command, the second and third if the command is denied because the selector is trying
 * to access a key or channel that are not among the specified patterns. */
static int ACLSelectorCheckCmd(aclSelector *selector, struct redisCommand *cmd, robj **argv, int argc, int *keyidxptr, aclKeyResultCache *cache) {
    uint64_t id = cmd->id;
    int ret;
    if (!(selector->flags & SELECTOR_FLAG_ALLCOMMANDS) && !(cmd->flags & CMD_NO_AUTH)) {
        /* If the bit is not set we have to check further, in case the
         * command is allowed just with that specific first argument. */
        if (ACLGetSelectorCommandBit(selector,id) == 0) {
            /* Check if the first argument matches. */
            if (argc < 2 ||
                selector->allowed_firstargs == NULL ||
                selector->allowed_firstargs[id] == NULL)
            {
                return ACL_DENIED_CMD;
            }

            /* Scan the NULL-terminated array of allowed first-args. */
            long subid = 0;
            while (1) {
                if (selector->allowed_firstargs[id][subid] == NULL)
                    return ACL_DENIED_CMD;
                /* For a subcommand the "first arg" is argv[2] (after the
                 * container name and the subcommand), otherwise argv[1]. */
                int idx = cmd->parent ? 2 : 1;
                if (!strcasecmp(argv[idx]->ptr,selector->allowed_firstargs[id][subid]))
                    break; /* First argument match found. Stop here. */
                subid++;
            }
        }
    }

    /* Check if the user can execute commands explicitly touching the keys
     * mentioned in the command arguments. */
    if (!(selector->flags & SELECTOR_FLAG_ALLKEYS) && doesCommandHaveKeys(cmd)) {
        /* Extract keys at most once per command; later selectors reuse
         * the cached result. */
        if (!(cache->keys_init)) {
            cache->keys = (getKeysResult) GETKEYS_RESULT_INIT;
            getKeysFromCommandWithSpecs(cmd, argv, argc, GET_KEYSPEC_DEFAULT, &(cache->keys));
            cache->keys_init = 1;
        }
        getKeysResult *result = &(cache->keys);
        keyReference *resultidx = result->keys;
        for (int j = 0; j < result->numkeys; j++) {
            int idx = resultidx[j].pos;
            ret = ACLSelectorCheckKey(selector, argv[idx]->ptr, sdslen(argv[idx]->ptr), resultidx[j].flags);
            if (ret != ACL_OK) {
                if (keyidxptr) *keyidxptr = resultidx[j].pos;
                return ret;
            }
        }
    }

    /* Check if the user can execute commands explicitly touching the channels
     * mentioned in the command arguments */
    const int channel_flags = CMD_CHANNEL_PUBLISH | CMD_CHANNEL_SUBSCRIBE;
    if (!(selector->flags & SELECTOR_FLAG_ALLCHANNELS) && doesCommandHaveChannelsWithFlags(cmd, channel_flags)) {
        getKeysResult channels = (getKeysResult) GETKEYS_RESULT_INIT;
        getChannelsFromCommand(cmd, argv, argc, &channels);
        keyReference *channelref = channels.keys;
        for (int j = 0; j < channels.numkeys; j++) {
            int idx = channelref[j].pos;
            if (!(channelref[j].flags & channel_flags)) continue;
            int is_pattern = channelref[j].flags & CMD_CHANNEL_PATTERN;
            int ret = ACLCheckChannelAgainstList(selector->channels, argv[idx]->ptr, sdslen(argv[idx]->ptr), is_pattern);
            if (ret != ACL_OK) {
                if (keyidxptr) *keyidxptr = channelref[j].pos;
                getKeysFreeResult(&channels);
                return ret;
            }
        }
        getKeysFreeResult(&channels);
    }
    return ACL_OK;
}
keyspec access flags. + * + * If the user can access the key, ACL_OK is returned, otherwise + * ACL_DENIED_KEY is returned. */ +int ACLUserCheckKeyPerm(user *u, const char *key, int keylen, int flags) { + listIter li; + listNode *ln; + + /* If there is no associated user, the connection can run anything. */ + if (u == NULL) return ACL_OK; + + /* Check all of the selectors */ + listRewind(u->selectors,&li); + while((ln = listNext(&li))) { + aclSelector *s = (aclSelector *) listNodeValue(ln); + if (ACLSelectorCheckKey(s, key, keylen, flags) == ACL_OK) { + return ACL_OK; + } + } + return ACL_DENIED_KEY; +} + +/* Checks if the user can execute the given command with the added restriction + * it must also have the access specified in flags to any key in the key space. + * For example, CMD_KEY_READ access requires either '%R~*', '~*', or allkeys to be + * granted in addition to the access required by the command. Returns 1 + * if the user has access or 0 otherwise. + */ +int ACLUserCheckCmdWithUnrestrictedKeyAccess(user *u, struct redisCommand *cmd, robj **argv, int argc, int flags) { + listIter li; + listNode *ln; + int local_idxptr; + + /* If there is no associated user, the connection can run anything. */ + if (u == NULL) return 1; + + /* For multiple selectors, we cache the key result in between selector + * calls to prevent duplicate lookups. */ + aclKeyResultCache cache; + initACLKeyResultCache(&cache); + + /* Check each selector sequentially */ + listRewind(u->selectors,&li); + while((ln = listNext(&li))) { + aclSelector *s = (aclSelector *) listNodeValue(ln); + int acl_retval = ACLSelectorCheckCmd(s, cmd, argv, argc, &local_idxptr, &cache); + if (acl_retval == ACL_OK && ACLSelectorHasUnrestrictedKeyAccess(s, flags)) { + cleanupACLKeyResultCache(&cache); + return 1; + } + } + cleanupACLKeyResultCache(&cache); + return 0; +} + +/* Check if the channel can be accessed by the client according to + * the ACLs associated with the specified user. 
/* Lower level API that checks if a specified user is able to execute a given command.
 *
 * If the command fails an ACL check, idxptr will be to set to the first argv entry that
 * causes the failure, either 0 if the command itself fails or the idx of the key/channel
 * that causes the failure */
int ACLCheckAllUserCommandPerm(user *u, struct redisCommand *cmd, robj **argv, int argc, getKeysResult *key_result, int *idxptr) {
    listIter li;
    listNode *ln;

    /* If there is no associated user, the connection can run anything. */
    if (u == NULL) return ACL_OK;

    /* Quick check if the user has all permissions, return early if so. */
    if (likely(listFirst(u->selectors) != NULL)) {
        aclSelector *s = listNodeValue(listFirst(u->selectors));
        const uint32_t all_perms = SELECTOR_FLAG_ALLCOMMANDS |
                                   SELECTOR_FLAG_ALLKEYS |
                                   SELECTOR_FLAG_ALLCHANNELS;
        if ((s->flags & all_perms) == all_perms) return ACL_OK;
    }

    /* We have to pick a single error to log, the logic for picking is as follows:
     * 1) If no selector can execute the command, return the command.
     * 2) Return the last key or channel that no selector could match. */
    int relevant_error = ACL_DENIED_CMD;
    int local_idxptr = 0, last_idx = 0;

    /* For multiple selectors, we cache the key result in between selector
     * calls to prevent duplicate lookups. If the caller already extracted
     * the keys, seed the cache with its result (and then the caller keeps
     * ownership: we must not free it here). */
    aclKeyResultCache cache;
    initACLKeyResultCache(&cache);
    if (key_result) {
        cache.keys = *key_result;
        cache.keys_init = 1;
    }

    /* Check each selector sequentially */
    listRewind(u->selectors,&li);
    while((ln = listNext(&li))) {
        aclSelector *s = (aclSelector *) listNodeValue(ln);
        int acl_retval = ACLSelectorCheckCmd(s, cmd, argv, argc, &local_idxptr, &cache);
        if (acl_retval == ACL_OK) {
            if (!key_result) cleanupACLKeyResultCache(&cache);
            return ACL_OK;
        }
        /* Prefer key/channel denials over plain command denials, and among
         * equal denials keep the one with the highest argv index. */
        if (acl_retval > relevant_error ||
            (acl_retval == relevant_error && local_idxptr > last_idx))
        {
            relevant_error = acl_retval;
            last_idx = local_idxptr;
        }
    }

    *idxptr = last_idx;
    if (!key_result) cleanupACLKeyResultCache(&cache);
    return relevant_error;
}

/* High level API for checking if a client can execute the queued up command */
int ACLCheckAllPerm(client *c, int *idxptr) {
    return ACLCheckAllUserCommandPerm(c->user, c->cmd, c->argv, c->argc, getClientCachedKeyResult(c), idxptr);
}
/* Check if the client should be killed because it is subscribed to channels that were
 * permitted in the past, are not in the `upcoming` channel list.
 * Returns 1 if the client violates the new permissions, 0 otherwise. */
int ACLShouldKillPubsubClient(client *c, list *upcoming) {
    robj *o;
    int kill = 0;

    if (getClientType(c) == CLIENT_TYPE_PUBSUB) {
        /* Check for pattern violations. Patterns (PSUBSCRIBE) are matched
         * literally against the allowed list, hence is_pattern=1. */
        dictIterator di;
        dictEntry *de;
        dictInitIterator(&di, c->pubsub_patterns);
        while (!kill && ((de = dictNext(&di)) != NULL)) {
            o = dictGetKey(de);
            int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 1);
            kill = (res == ACL_DENIED_CHANNEL);
        }
        dictResetIterator(&di);

        /* Check for channel violations. */
        if (!kill) {
            /* Check for global channels violation. */
            dictInitIterator(&di, c->pubsub_channels);

            while (!kill && ((de = dictNext(&di)) != NULL)) {
                o = dictGetKey(de);
                int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0);
                kill = (res == ACL_DENIED_CHANNEL);
            }
            dictResetIterator(&di);
        }
        if (!kill) {
            /* Check for shard channels violation. */
            dictInitIterator(&di, c->pubsubshard_channels);
            while (!kill && ((de = dictNext(&di)) != NULL)) {
                o = dictGetKey(de);
                int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0);
                kill = (res == ACL_DENIED_CHANNEL);
            }
            dictResetIterator(&di);
        }

        if (kill) {
            return 1;
        }
    }
    return 0;
}
*/ + listRewind(server.clients,&li); + while ((ln = listNext(&li)) != NULL) { + client *c = listNodeValue(ln); + if (c->user != original) + continue; + if (ACLShouldKillPubsubClient(c, channels)) + deauthenticateAndCloseClient(c); + } + + listRelease(channels); +} + +/* ============================================================================= + * ACL loading / saving functions + * ==========================================================================*/ + + +/* Selector definitions should be sent as a single argument, however + * we will be lenient and try to find selector definitions spread + * across multiple arguments since it makes for a simpler user experience + * for ACL SETUSER as well as when loading from conf files. + * + * This function takes in an array of ACL operators, excluding the username, + * and merges selector operations that are spread across multiple arguments. The return + * value is a new SDS array, with length set to the passed in merged_argc. Arguments + * that are untouched are still duplicated. If there is an unmatched parenthesis, NULL + * is returned and invalid_idx is set to the argument with the start of the opening + * parenthesis. 
*/ +sds *ACLMergeSelectorArguments(sds *argv, int argc, int *merged_argc, int *invalid_idx) { + *merged_argc = 0; + int open_bracket_start = -1; + + sds *acl_args = (sds *) zmalloc(sizeof(sds) * argc); + + sds selector = NULL; + for (int j = 0; j < argc; j++) { + char *op = argv[j]; + + if (open_bracket_start == -1 && + (op[0] == '(' && op[sdslen(op) - 1] != ')')) { + selector = sdsdup(argv[j]); + open_bracket_start = j; + continue; + } + + if (open_bracket_start != -1) { + selector = sdscatfmt(selector, " %s", op); + if (op[sdslen(op) - 1] == ')') { + open_bracket_start = -1; + acl_args[*merged_argc] = selector; + (*merged_argc)++; + } + continue; + } + + acl_args[*merged_argc] = sdsdup(argv[j]); + (*merged_argc)++; + } + + if (open_bracket_start != -1) { + for (int i = 0; i < *merged_argc; i++) sdsfree(acl_args[i]); + zfree(acl_args); + sdsfree(selector); + if (invalid_idx) *invalid_idx = open_bracket_start; + return NULL; + } + + return acl_args; +} + +/* takes an acl string already split on spaces and adds it to the given user + * if the user object is NULL, will create a user with the given username + * + * Returns an error as an sds string if the ACL string is not parsable + */ +sds ACLStringSetUser(user *u, sds username, sds *argv, int argc) { + serverAssert(u != NULL || username != NULL); + + sds error = NULL; + + int merged_argc = 0, invalid_idx = 0; + sds *acl_args = ACLMergeSelectorArguments(argv, argc, &merged_argc, &invalid_idx); + + if (!acl_args) { + return sdscatfmt(sdsempty(), + "Unmatched parenthesis in acl selector starting " + "at '%s'.", (char *) argv[invalid_idx]); + } + + /* Create a temporary user to validate and stage all changes against + * before applying to an existing user or creating a new user. If all + * arguments are valid the user parameters will all be applied together. + * If there are any errors then none of the changes will be applied. 
*/ + user *tempu = ACLCreateUnlinkedUser(); + if (u) { + ACLCopyUser(tempu, u); + } + + for (int j = 0; j < merged_argc; j++) { + if (ACLSetUser(tempu,acl_args[j],(ssize_t) sdslen(acl_args[j])) != C_OK) { + const char *errmsg = ACLSetUserStringError(); + error = sdscatfmt(sdsempty(), + "Error in ACL SETUSER modifier '%s': %s", + (char*)acl_args[j], errmsg); + goto cleanup; + } + } + + /* Existing pub/sub clients authenticated with the user may need to be + * disconnected if (some of) their channel permissions were revoked. */ + if (u) { + ACLKillPubsubClientsIfNeeded(tempu, u); + } + + /* Overwrite the user with the temporary user we modified above. */ + if (!u) { + u = ACLCreateUser(username,sdslen(username)); + } + serverAssert(u != NULL); + + ACLCopyUser(u, tempu); + +cleanup: + ACLFreeUser(tempu); + for (int i = 0; i < merged_argc; i++) { + sdsfree(acl_args[i]); + } + zfree(acl_args); + + return error; +} + +/* Given an argument vector describing a user in the form: + * + * user ... ACL rules and flags ... + * + * this function validates, and if the syntax is valid, appends + * the user definition to a list for later loading. + * + * The rules are tested for validity and if there obvious syntax errors + * the function returns C_ERR and does nothing, otherwise C_OK is returned + * and the user is appended to the list. + * + * Note that this function cannot stop in case of commands that are not found + * and, in that case, the error will be emitted later, because certain + * commands may be defined later once modules are loaded. + * + * When an error is detected and C_ERR is returned, the function populates + * by reference (if not set to NULL) the argc_err argument with the index + * of the argv vector that caused the error. 
 */
int ACLAppendUserForLoading(sds *argv, int argc, int *argc_err) {
    /* The line must be at least "user <name>". */
    if (argc < 2 || strcasecmp(argv[0],"user")) {
        if (argc_err) *argc_err = 0;
        return C_ERR;
    }

    /* A user may be defined only once in the configuration. */
    if (listSearchKey(UsersToLoad, argv[1])) {
        if (argc_err) *argc_err = 1;
        errno = EALREADY;
        return C_ERR;
    }

    /* Merged selectors before trying to process */
    int merged_argc;
    sds *acl_args = ACLMergeSelectorArguments(argv + 2, argc - 2, &merged_argc, argc_err);

    if (!acl_args) {
        return C_ERR;
    }

    /* Try to apply the user rules in a fake user to see if they
     * are actually valid. */
    user *fakeuser = ACLCreateUnlinkedUser();

    for (int j = 0; j < merged_argc; j++) {
        if (ACLSetUser(fakeuser,acl_args[j],sdslen(acl_args[j])) == C_ERR) {
            /* ENOENT means an unknown command: tolerated here because the
             * command may be provided by a module loaded later. */
            if (errno != ENOENT) {
                ACLFreeUser(fakeuser);
                if (argc_err) *argc_err = j;
                for (int i = 0; i < merged_argc; i++) sdsfree(acl_args[i]);
                zfree(acl_args);
                return C_ERR;
            }
        }
    }

    /* Rules look valid, let's append the user to the list. The stored vector
     * is [username, rule1, ..., ruleN, NULL]. */
    sds *copy = zmalloc(sizeof(sds)*(merged_argc + 2));
    copy[0] = sdsdup(argv[1]);
    for (int j = 0; j < merged_argc; j++) copy[j+1] = sdsdup(acl_args[j]);
    copy[merged_argc + 1] = NULL;
    listAddNodeTail(UsersToLoad,copy);
    ACLFreeUser(fakeuser);
    for (int i = 0; i < merged_argc; i++) sdsfree(acl_args[i]);
    zfree(acl_args);
    return C_OK;
}

/* This function will load the configured users appended to the server
 * configuration via ACLAppendUserForLoading(). On loading errors it will
 * log an error and return C_ERR, otherwise C_OK will be returned.
 */
int ACLLoadConfiguredUsers(void) {
    listIter li;
    listNode *ln;
    listRewind(UsersToLoad,&li);
    while ((ln = listNext(&li)) != NULL) {
        /* Each node is the NULL-terminated sds vector built by
         * ACLAppendUserForLoading(): [username, rule1, ..., NULL]. */
        sds *aclrules = listNodeValue(ln);
        sds username = aclrules[0];

        if (ACLStringHasSpaces(aclrules[0],sdslen(aclrules[0]))) {
            serverLog(LL_WARNING,"Spaces not allowed in ACL usernames");
            return C_ERR;
        }

        user *u = ACLCreateUser(username,sdslen(username));
        if (!u) {
            /* Only valid duplicate user is the default one. */
            serverAssert(!strcmp(username, "default"));
            u = ACLGetUserByName("default",7);
            ACLSetUser(u,"reset",-1);
        }

        /* Load every rule defined for this user. */
        for (int j = 1; aclrules[j]; j++) {
            if (ACLSetUser(u,aclrules[j],sdslen(aclrules[j])) != C_OK) {
                const char *errmsg = ACLSetUserStringError();
                serverLog(LL_WARNING,"Error loading ACL rule '%s' for "
                                     "the user named '%s': %s",
                          redactLogCstr(aclrules[j]),redactLogCstr(aclrules[0]),errmsg);
                return C_ERR;
            }
        }

        /* Having a disabled user in the configuration may be an error,
         * warn about it without returning any error to the caller. */
        if (u->flags & USER_FLAG_DISABLED) {
            serverLog(LL_NOTICE, "The user '%s' is disabled (there is no "
                                 "'on' modifier in the user description). Make "
                                 "sure this is not a configuration error.",
                      redactLogCstr(aclrules[0]));
        }
    }
    return C_OK;
}

/* This function loads the ACL from the specified filename: every line
 * is validated and should be either empty or in the format used to specify
 * users in the redis.conf configuration or in the ACL file, that is:
 *
 *   user ... rules ...
 *
 * Note that this function considers comments starting with '#' as errors
 * because the ACL file is meant to be rewritten, and comments would be
 * lost after the rewrite. Yet empty lines are allowed to avoid being too
 * strict.
 *
 * One important part of implementing ACL LOAD, that uses this function,
 * is to avoid ending with broken rules if the ACL file is invalid for
 * some reason, so the function will attempt to validate the rules
 * before loading each user. For every line that will be found broken the
 * function will collect an error message.
 *
 * IMPORTANT: If there is at least a single error, nothing will be loaded
 * and the rules will remain exactly as they were.
 *
 * At the end of the process, if no errors were found in the whole file then
 * NULL is returned. Otherwise an SDS string describing in a single line
 * a description of all the issues found is returned. */
sds ACLLoadFromFile(const char *filename) {
    FILE *fp;
    char buf[1024];

    /* Open the ACL file. */
    if ((fp = fopen(filename,"r")) == NULL) {
        sds errors = sdscatprintf(sdsempty(),
            "Error loading ACLs, opening file '%s': %s",
            filename, strerror(errno));
        return errors;
    }

    /* Load the whole file as a single string in memory. */
    sds acls = sdsempty();
    while(fgets(buf,sizeof(buf),fp) != NULL)
        acls = sdscat(acls,buf);
    fclose(fp);

    /* Split the file into lines and attempt to load each line. */
    int totlines;
    sds *lines, errors = sdsempty();
    lines = sdssplitlen(acls,strlen(acls),"\n",1,&totlines);
    sdsfree(acls);

    /* We do all the loading in a fresh instance of the Users radix tree,
     * so if there are errors loading the ACL file we can rollback to the
     * old version. */
    rax *old_users = Users;
    Users = raxNew();

    /* Load each line of the file. */
    for (int i = 0; i < totlines; i++) {
        sds *argv;
        int argc;
        int linenum = i+1;

        lines[i] = sdstrim(lines[i]," \t\r\n");

        /* Skip blank lines */
        if (lines[i][0] == '\0') continue;

        /* Split into arguments */
        argv = sdssplitlen(lines[i],sdslen(lines[i])," ",1,&argc);
        if (argv == NULL) {
            errors = sdscatprintf(errors,
                     "%s:%d: unbalanced quotes in acl line. ",
                     server.acl_filename, linenum);
            continue;
        }

        /* Skip this line if the resulting command vector is empty. */
        if (argc == 0) {
            sdsfreesplitres(argv,argc);
            continue;
        }

        /* The line should start with the "user" keyword. */
        if (strcmp(argv[0],"user") || argc < 2) {
            errors = sdscatprintf(errors,
                     "%s:%d should start with user keyword followed "
                     "by the username. ", server.acl_filename,
                     linenum);
            sdsfreesplitres(argv,argc);
            continue;
        }

        /* Spaces are not allowed in usernames. */
        if (ACLStringHasSpaces(argv[1],sdslen(argv[1]))) {
            errors = sdscatprintf(errors,
                     "'%s:%d: username '%s' contains invalid characters. ",
                     server.acl_filename, linenum, argv[1]);
            sdsfreesplitres(argv,argc);
            continue;
        }

        user *u = ACLCreateUser(argv[1],sdslen(argv[1]));

        /* If the user already exists we assume it's an error and abort. */
        if (!u) {
            errors = sdscatprintf(errors,"WARNING: Duplicate user '%s' found on line %d. ", argv[1], linenum);
            sdsfreesplitres(argv,argc);
            continue;
        }

        /* Finally process the options and validate they can
         * be cleanly applied to the user. If any option fails
         * to apply, the other values won't be applied since
         * all the pending changes will get dropped. */
        int merged_argc;
        sds *acl_args = ACLMergeSelectorArguments(argv + 2, argc - 2, &merged_argc, NULL);
        if (!acl_args) {
            errors = sdscatprintf(errors,
                    "%s:%d: Unmatched parenthesis in selector definition.",
                    server.acl_filename, linenum);
            /* NOTE(review): there is no 'continue' here, so the loop below
             * relies on merged_argc being 0 whenever acl_args is NULL --
             * otherwise it would dereference a NULL array. Verify that
             * ACLMergeSelectorArguments guarantees this on failure. */
        }

        int syntax_error = 0;
        for (int j = 0; j < merged_argc; j++) {
            acl_args[j] = sdstrim(acl_args[j],"\t\r\n");
            if (ACLSetUser(u,acl_args[j],sdslen(acl_args[j])) != C_OK) {
                const char *errmsg = ACLSetUserStringError();
                if (errno == ENOENT) {
                    /* For missing commands, we print out more information since
                     * it shouldn't contain any sensitive information. */
                    errors = sdscatprintf(errors,
                            "%s:%d: Error in applying operation '%s': %s. ",
                            server.acl_filename, linenum, acl_args[j], errmsg);
                } else if (syntax_error == 0) {
                    /* For all other errors, only print out the first error encountered
                     * since it might affect future operations. */
                    errors = sdscatprintf(errors,
                            "%s:%d: %s. ",
                            server.acl_filename, linenum, errmsg);
                    syntax_error = 1;
                }
            }
        }

        for (int i = 0; i < merged_argc; i++) sdsfree(acl_args[i]);
        zfree(acl_args);

        /* Apply the rule to the new users set only if so far there
         * are no errors, otherwise it's useless since we are going
         * to discard the new users set anyway. */
        if (sdslen(errors) != 0) {
            sdsfreesplitres(argv,argc);
            continue;
        }

        sdsfreesplitres(argv,argc);
    }

    sdsfreesplitres(lines,totlines);

    /* Check if we found errors and react accordingly. */
    if (sdslen(errors) == 0) {
        /* The default user pointer is referenced in different places: instead
         * of replacing such occurrences it is much simpler to copy the new
         * default user configuration in the old one. */
        user *new_default = ACLGetUserByName("default",7);
        if (!new_default) {
            new_default = ACLCreateDefaultUser();
        }

        ACLCopyUser(DefaultUser,new_default);
        ACLFreeUser(new_default);
        raxInsert(Users,(unsigned char*)"default",7,DefaultUser,NULL);
        raxRemove(old_users,(unsigned char*)"default",7,NULL);

        /* If there are some subscribers, we need to check if we need to drop some clients.
 */
        rax *user_channels = NULL;
        if (pubsubTotalSubscriptions() > 0) {
            user_channels = raxNew();
        }

        listIter li;
        listNode *ln;

        listRewind(server.clients,&li);
        while ((ln = listNext(&li)) != NULL) {
            client *c = listNodeValue(ln);
            /* a MASTER client can do everything (and user = NULL) so we can skip it */
            if (c->flags & CLIENT_MASTER)
                continue;
            user *original = c->user;
            list *channels = NULL;
            user *new = ACLGetUserByName(c->user->name, sdslen(c->user->name));
            /* Cache the upcoming channel list per user name, so the (possibly
             * expensive) computation runs once per user, not per client. */
            if (new && user_channels) {
                if (!raxFind(user_channels, (unsigned char*)(new->name), sdslen(new->name), (void**)&channels)) {
                    channels = getUpcomingChannelList(new, original);
                    raxInsert(user_channels, (unsigned char*)(new->name), sdslen(new->name), channels, NULL);
                }
            }
            /* When the new channel list is NULL, it means the new user's channel list is a superset of the old user's list. */
            if (!new || (channels && ACLShouldKillPubsubClient(c, channels))) {
                deauthenticateAndCloseClient(c);
                continue;
            }
            c->user = new;
        }

        if (user_channels)
            raxFreeWithCallback(user_channels, listReleaseGeneric);
        raxFreeWithCallback(old_users, ACLFreeUserGeneric);
        sdsfree(errors);
        return NULL;
    } else {
        /* On any error: discard the freshly built Users tree and restore the
         * previous one, leaving the active ACLs untouched. */
        raxFreeWithCallback(Users, ACLFreeUserGeneric);
        Users = old_users;
        errors = sdscat(errors,"WARNING: ACL errors detected, no change to the previously active ACL rules was performed");
        return errors;
    }
}

/* Generate a copy of the ACLs currently in memory in the specified filename.
 * Returns C_OK on success or C_ERR if there was an error during the I/O.
 * When C_ERR is returned a log is produced with hints about the issue. */
int ACLSaveToFile(const char *filename) {
    sds acl = sdsempty();
    int fd = -1;
    sds tmpfilename = NULL;
    int retval = C_ERR;

    /* Let's generate an SDS string containing the new version of the
     * ACL file.
 */
    raxIterator ri;
    raxStart(&ri,Users);
    raxSeek(&ri,"^",NULL,0);
    while(raxNext(&ri)) {
        user *u = ri.data;
        /* Return information in the configuration file format. */
        sds user = sdsnew("user ");
        user = sdscatsds(user,u->name);
        user = sdscatlen(user," ",1);
        robj *descr = ACLDescribeUser(u);
        user = sdscatsds(user,descr->ptr);
        decrRefCount(descr);
        acl = sdscatsds(acl,user);
        acl = sdscatlen(acl,"\n",1);
        sdsfree(user);
    }
    raxStop(&ri);

    /* Create a temp file with the new content. The pid/time suffix makes the
     * name unique, so a concurrent save cannot clobber our temp file. */
    tmpfilename = sdsnew(filename);
    tmpfilename = sdscatfmt(tmpfilename,".tmp-%i-%I",
        (int) getpid(),commandTimeSnapshot());
    if ((fd = open(tmpfilename,O_WRONLY|O_CREAT,0644)) == -1) {
        serverLog(LL_WARNING,"Opening temp ACL file for ACL SAVE: %s",
            strerror(errno));
        goto cleanup;
    }

    /* Write it. Short writes are retried; EINTR restarts the write. */
    size_t offset = 0;
    while (offset < sdslen(acl)) {
        ssize_t written_bytes = write(fd,acl + offset,sdslen(acl) - offset);
        if (written_bytes <= 0) {
            if (errno == EINTR) continue;
            serverLog(LL_WARNING,"Writing ACL file for ACL SAVE: %s",
                strerror(errno));
            goto cleanup;
        }
        offset += written_bytes;
    }
    if (redis_fsync(fd) == -1) {
        serverLog(LL_WARNING,"Syncing ACL file for ACL SAVE: %s",
            strerror(errno));
        goto cleanup;
    }
    close(fd); fd = -1;

    /* Let's replace the new file with the old one. The rename is atomic, and
     * the directory fsync makes the rename itself durable. */
    if (rename(tmpfilename,filename) == -1) {
        serverLog(LL_WARNING,"Renaming ACL file for ACL SAVE: %s",
            strerror(errno));
        goto cleanup;
    }
    if (fsyncFileDir(filename) == -1) {
        serverLog(LL_WARNING,"Syncing ACL directory for ACL SAVE: %s",
            strerror(errno));
        goto cleanup;
    }
    sdsfree(tmpfilename); tmpfilename = NULL;
    retval = C_OK; /* If we reached this point, everything is fine. */

cleanup:
    if (fd != -1) close(fd);
    if (tmpfilename) unlink(tmpfilename);
    sdsfree(tmpfilename);
    sdsfree(acl);
    return retval;
}

/* This function is called once the server is already running, modules are
 * loaded, and we are ready to start, in order to load the ACLs either from
 * the pending list of users defined in redis.conf, or from the ACL file.
 * The function will just exit with an error if the user is trying to mix
 * both the loading methods. */
void ACLLoadUsersAtStartup(void) {
    if (server.acl_filename[0] != '\0' && listLength(UsersToLoad) != 0) {
        serverLog(LL_WARNING,
            "Configuring Redis with users defined in redis.conf and at "
            "the same setting an ACL file path is invalid. This setup "
            "is very likely to lead to configuration errors and security "
            "holes, please define either an ACL file or declare users "
            "directly in your redis.conf, but not both.");
        exit(1);
    }

    if (ACLLoadConfiguredUsers() == C_ERR) {
        serverLog(LL_WARNING,
            "Critical error while loading ACLs. Exiting.");
        exit(1);
    }

    if (server.acl_filename[0] != '\0') {
        sds errors = ACLLoadFromFile(server.acl_filename);
        if (errors) {
            serverLog(LL_WARNING,
                "Aborting Redis startup because of ACL errors: %s", errors);
            sdsfree(errors);
            exit(1);
        }
    }
}

/* =============================================================================
 * ACL log
 * ==========================================================================*/

/* Entries closer in time than this (in milliseconds) may be merged. */
#define ACL_LOG_GROUPING_MAX_TIME_DELTA 60000

/* This structure defines an entry inside the ACL log. */
typedef struct ACLLogEntry {
    uint64_t count;     /* Number of times this happened recently. */
    int reason;         /* Reason for denying the command. ACL_DENIED_*. */
    int context;        /* Toplevel, Lua or MULTI/EXEC? ACL_LOG_CTX_*. */
    sds object;         /* The key name or command name. */
    sds username;       /* User the client is authenticated with. */
    mstime_t ctime;     /* Milliseconds time of last update to this entry.
 */
    sds cinfo;          /* Client info (last client if updated). */
    long long entry_id; /* The pair (entry_id, timestamp_created) is a unique identifier of this entry
                         * in case the node dies and is restarted, it can detect that if it's a new series. */
    mstime_t timestamp_created; /* UNIX time in milliseconds at the time of this entry's creation. */
} ACLLogEntry;

/* This function will check if ACL entries 'a' and 'b' are similar enough
 * that we should actually update the existing entry in our ACL log instead
 * of creating a new one. Similar means: same reason, same context, same
 * object and username, and close enough in time. */
int ACLLogMatchEntry(ACLLogEntry *a, ACLLogEntry *b) {
    if (a->reason != b->reason) return 0;
    if (a->context != b->context) return 0;
    mstime_t delta = a->ctime - b->ctime;
    if (delta < 0) delta = -delta;
    if (delta > ACL_LOG_GROUPING_MAX_TIME_DELTA) return 0;
    if (sdscmp(a->object,b->object) != 0) return 0;
    if (sdscmp(a->username,b->username) != 0) return 0;
    return 1;
}

/* Release an ACL log entry, including the owned sds fields. */
void ACLFreeLogEntry(void *leptr) {
    ACLLogEntry *le = leptr;
    sdsfree(le->object);
    sdsfree(le->username);
    sdsfree(le->cinfo);
    zfree(le);
}

/* Update the relevant counter by the reason */
void ACLUpdateInfoMetrics(int reason){
    if (reason == ACL_DENIED_AUTH) {
        server.acl_info.user_auth_failures++;
    } else if (reason == ACL_DENIED_CMD) {
        server.acl_info.invalid_cmd_accesses++;
    } else if (reason == ACL_DENIED_KEY) {
        server.acl_info.invalid_key_accesses++;
    } else if (reason == ACL_DENIED_CHANNEL) {
        server.acl_info.invalid_channel_accesses++;
    } else if (reason == ACL_INVALID_TLS_CERT_AUTH) {
        server.acl_info.acl_access_denied_tls_cert++;
    } else {
        serverPanic("Unknown ACL_DENIED encoding");
    }
}

/* Drop the oldest entries until the log fits server.acllog_max_len. */
static void trimACLLogEntriesToMaxLen(void) {
    while(listLength(ACLLog) > server.acllog_max_len) {
        listNode *ln = listLast(ACLLog);
        ACLLogEntry *le = listNodeValue(ln);
        ACLFreeLogEntry(le);
        listDelNode(ACLLog,ln);
    }
}

/* Adds a new entry in the ACL log, making sure to delete the old entry
 * if we reach the maximum length allowed for the log. This function attempts
 * to find similar entries in the current log in order to bump the counter of
 * the log entry instead of creating many entries for very similar ACL
 * rules issues.
 *
 * The argpos argument is used when the reason is ACL_DENIED_KEY or
 * ACL_DENIED_CHANNEL, since it allows the function to log the key or channel
 * name that caused the problem.
 *
 * The last 2 arguments are a manual override to be used, instead of any of the automatic
 * ones which depend on the client and reason arguments (use NULL for default).
 *
 * If `object` is not NULL, this functions takes over it.
 */
void addACLLogEntry(client *c, int reason, int context, int argpos, sds username, sds object) {
    /* Update ACL info metrics */
    ACLUpdateInfoMetrics(reason);

    /* Logging disabled: just make sure nothing lingers in the log. */
    if (server.acllog_max_len == 0) {
        trimACLLogEntriesToMaxLen();
        return;
    }

    /* Create a new entry. */
    struct ACLLogEntry *le = zmalloc(sizeof(*le));
    le->count = 1;
    le->reason = reason;
    le->username = sdsdup(username ? username : c->user->name);
    le->ctime = commandTimeSnapshot();
    le->entry_id = ACLLogEntryCount;
    le->timestamp_created = le->ctime;

    if (object) {
        /* Ownership of 'object' is transferred to the entry. */
        le->object = object;
    } else {
        switch(reason) {
            case ACL_DENIED_CMD: le->object = sdsdup(c->cmd->fullname); break;
            case ACL_DENIED_KEY: le->object = sdsdup(c->argv[argpos]->ptr); break;
            case ACL_DENIED_CHANNEL: le->object = sdsdup(c->argv[argpos]->ptr); break;
            case ACL_DENIED_AUTH: le->object = sdsdup(c->argv[0]->ptr); break;
            default: le->object = sdsempty();
        }
    }

    /* if we have a real client from the network, use it (could be missing on module timers) */
    client *realclient = server.current_client? server.current_client : c;

    le->cinfo = catClientInfoString(sdsempty(),realclient);
    le->context = context;

    /* Try to match this entry with past ones, to see if we can just
     * update an existing entry instead of creating a new one. */
    long toscan = 10; /* Do a limited work trying to find duplicated. */
    listIter li;
    listNode *ln;
    listRewind(ACLLog,&li);
    ACLLogEntry *match = NULL;
    while (toscan-- && (ln = listNext(&li)) != NULL) {
        ACLLogEntry *current = listNodeValue(ln);
        if (ACLLogMatchEntry(current,le)) {
            match = current;
            /* Move the matched entry to the head (most recent first). */
            listDelNode(ACLLog,ln);
            listAddNodeHead(ACLLog,current);
            break;
        }
    }

    /* If there is a match update the entry, otherwise add it as a
     * new one. */
    if (match) {
        /* We update a few fields of the existing entry and bump the
         * counter of events for this entry. */
        sdsfree(match->cinfo);
        match->cinfo = le->cinfo;
        match->ctime = le->ctime;
        match->count++;

        /* Release the old entry. cinfo is now owned by 'match', so it is
         * cleared first to avoid a double free in ACLFreeLogEntry(). */
        le->cinfo = NULL;
        ACLFreeLogEntry(le);
    } else {
        /* Add it to our list of entries. We'll have to trim the list
         * to its maximum size. */
        ACLLogEntryCount++; /* Incrementing the entry_id count to make each record in the log unique. */
        listAddNodeHead(ACLLog, le);
        trimACLLogEntriesToMaxLen();
    }
}

/* Build a human readable error message for an ACL denial. 'verbose' decides
 * whether the offending key/channel name ('errored_val') is included. The
 * returned sds is owned by the caller. */
sds getAclErrorMessage(int acl_res, user *user, struct redisCommand *cmd, sds errored_val, int verbose) {
    switch (acl_res) {
    case ACL_DENIED_CMD:
        return sdscatfmt(sdsempty(), "User %S has no permissions to run "
                                     "the '%S' command", user->name, cmd->fullname);
    case ACL_DENIED_KEY:
        if (verbose) {
            return sdscatfmt(sdsempty(), "User %S has no permissions to access "
                                         "the '%S' key", user->name, errored_val);
        } else {
            return sdsnew("No permissions to access a key");
        }
    case ACL_DENIED_CHANNEL:
        if (verbose) {
            return sdscatfmt(sdsempty(), "User %S has no permissions to access "
                                         "the '%S' channel", user->name, errored_val);
        } else {
            return sdsnew("No permissions to access a channel");
        }
    }
    serverPanic("Reached deadcode on getAclErrorMessage");
}

/* =============================================================================
 * ACL related commands
 * ==========================================================================*/

/* ACL CAT category: reply with every command (recursing into subcommands)
 * whose ACL categories include 'cflag', bumping *arraylen per reply. */
void aclCatWithFlags(client *c, dict *commands, uint64_t cflag, int *arraylen) {
    dictEntry *de;
    dictIterator di;
    dictInitIterator(&di, commands);
    while ((de = dictNext(&di)) != NULL) {
        struct redisCommand *cmd = dictGetVal(de);
        if (cmd->acl_categories & cflag) {
            addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname));
            (*arraylen)++;
        }

        if (cmd->subcommands_dict) {
            aclCatWithFlags(c, cmd->subcommands_dict, cflag, arraylen);
        }
    }
    dictResetIterator(&di);
}

/* Add the formatted response from a single selector to the ACL GETUSER
 * response. This function returns the number of fields added.
 *
 * Setting verbose to 1 means that the full qualifier for key and channel
 * permissions are shown.
+ */ +int aclAddReplySelectorDescription(client *c, aclSelector *s) { + listIter li; + listNode *ln; + + /* Commands */ + addReplyBulkCString(c,"commands"); + sds cmddescr = ACLDescribeSelectorCommandRules(s); + addReplyBulkSds(c,cmddescr); + + /* Key patterns */ + addReplyBulkCString(c,"keys"); + if (s->flags & SELECTOR_FLAG_ALLKEYS) { + addReplyBulkCBuffer(c,"~*",2); + } else { + sds dsl = sdsempty(); + listRewind(s->patterns,&li); + while((ln = listNext(&li))) { + keyPattern *thispat = (keyPattern *) listNodeValue(ln); + if (ln != listFirst(s->patterns)) dsl = sdscat(dsl, " "); + dsl = sdsCatPatternString(dsl, thispat); + } + addReplyBulkSds(c, dsl); + } + + /* Pub/sub patterns */ + addReplyBulkCString(c,"channels"); + if (s->flags & SELECTOR_FLAG_ALLCHANNELS) { + addReplyBulkCBuffer(c,"&*",2); + } else { + sds dsl = sdsempty(); + listRewind(s->channels,&li); + while((ln = listNext(&li))) { + sds thispat = listNodeValue(ln); + if (ln != listFirst(s->channels)) dsl = sdscat(dsl, " "); + dsl = sdscatfmt(dsl, "&%S", thispat); + } + addReplyBulkSds(c, dsl); + } + return 3; +} + +/* ACL -- show and modify the configuration of ACL users. + * ACL HELP + * ACL LOAD + * ACL SAVE + * ACL LIST + * ACL USERS + * ACL CAT [] + * ACL SETUSER ... acl rules ... + * ACL DELUSER [...] + * ACL GETUSER + * ACL GENPASS [] + * ACL WHOAMI + * ACL LOG [ | RESET] + */ +void aclCommand(client *c) { + char *sub = c->argv[1]->ptr; + if (!strcasecmp(sub,"setuser") && c->argc >= 3) { + /* Initially redact all of the arguments to not leak any information + * about the user. */ + for (int j = 2; j < c->argc; j++) { + redactClientCommandArgument(c, j); + } + + sds username = c->argv[2]->ptr; + /* Check username validity. 
*/ + if (ACLStringHasSpaces(username,sdslen(username))) { + addReplyError(c, "Usernames can't contain spaces or null characters"); + return; + } + + user *u = ACLGetUserByName(username,sdslen(username)); + + sds *temp_argv = zmalloc(c->argc * sizeof(sds)); + for (int i = 3; i < c->argc; i++) temp_argv[i-3] = c->argv[i]->ptr; + + sds error = ACLStringSetUser(u, username, temp_argv, c->argc - 3); + zfree(temp_argv); + if (error == NULL) { + addReply(c,shared.ok); + } else { + addReplyErrorSdsSafe(c, error); + } + return; + } else if (!strcasecmp(sub,"deluser") && c->argc >= 3) { + /* Initially redact all the arguments to not leak any information + * about the users. */ + for (int j = 2; j < c->argc; j++) redactClientCommandArgument(c, j); + + int deleted = 0; + for (int j = 2; j < c->argc; j++) { + sds username = c->argv[j]->ptr; + if (!strcmp(username,"default")) { + addReplyError(c,"The 'default' user cannot be removed"); + return; + } + } + + for (int j = 2; j < c->argc; j++) { + sds username = c->argv[j]->ptr; + user *u; + if (raxRemove(Users,(unsigned char*)username, + sdslen(username), + (void**)&u)) + { + ACLFreeUserAndKillClients(u); + deleted++; + } + } + addReplyLongLong(c,deleted); + } else if (!strcasecmp(sub,"getuser") && c->argc == 3) { + /* Redact the username to not leak any information about the user. 
*/ + redactClientCommandArgument(c, 2); + + user *u = ACLGetUserByName(c->argv[2]->ptr,sdslen(c->argv[2]->ptr)); + if (u == NULL) { + addReplyNull(c); + return; + } + + void *ufields = addReplyDeferredLen(c); + int fields = 3; + + /* Flags */ + addReplyBulkCString(c,"flags"); + void *deflen = addReplyDeferredLen(c); + int numflags = 0; + for (int j = 0; ACLUserFlags[j].flag; j++) { + if (u->flags & ACLUserFlags[j].flag) { + addReplyBulkCString(c,ACLUserFlags[j].name); + numflags++; + } + } + setDeferredSetLen(c,deflen,numflags); + + /* Passwords */ + addReplyBulkCString(c,"passwords"); + addReplyArrayLen(c,listLength(u->passwords)); + listIter li; + listNode *ln; + listRewind(u->passwords,&li); + while((ln = listNext(&li))) { + sds thispass = listNodeValue(ln); + addReplyBulkCBuffer(c,thispass,sdslen(thispass)); + } + /* Include the root selector at the top level for backwards compatibility */ + fields += aclAddReplySelectorDescription(c, ACLUserGetRootSelector(u)); + + /* Describe all of the selectors on this user, including duplicating the root selector */ + addReplyBulkCString(c,"selectors"); + addReplyArrayLen(c, listLength(u->selectors) - 1); + listRewind(u->selectors,&li); + serverAssert(listNext(&li)); + while((ln = listNext(&li))) { + void *slen = addReplyDeferredLen(c); + int sfields = aclAddReplySelectorDescription(c, (aclSelector *)listNodeValue(ln)); + setDeferredMapLen(c, slen, sfields); + } + setDeferredMapLen(c, ufields, fields); + } else if ((!strcasecmp(sub,"list") || !strcasecmp(sub,"users")) && + c->argc == 2) + { + int justnames = !strcasecmp(sub,"users"); + addReplyArrayLen(c,raxSize(Users)); + raxIterator ri; + raxStart(&ri,Users); + raxSeek(&ri,"^",NULL,0); + while(raxNext(&ri)) { + user *u = ri.data; + if (justnames) { + addReplyBulkCBuffer(c,u->name,sdslen(u->name)); + } else { + /* Return information in the configuration file format. 
*/ + sds config = sdsnew("user "); + config = sdscatsds(config,u->name); + config = sdscatlen(config," ",1); + robj *descr = ACLDescribeUser(u); + config = sdscatsds(config,descr->ptr); + decrRefCount(descr); + addReplyBulkSds(c,config); + } + } + raxStop(&ri); + } else if (!strcasecmp(sub,"whoami") && c->argc == 2) { + if (c->user != NULL) { + addReplyBulkCBuffer(c,c->user->name,sdslen(c->user->name)); + } else { + addReplyNull(c); + } + } else if (server.acl_filename[0] == '\0' && + (!strcasecmp(sub,"load") || !strcasecmp(sub,"save"))) + { + addReplyError(c,"This Redis instance is not configured to use an ACL file. You may want to specify users via the ACL SETUSER command and then issue a CONFIG REWRITE (assuming you have a Redis configuration file set) in order to store users in the Redis configuration."); + return; + } else if (!strcasecmp(sub,"load") && c->argc == 2) { + sds errors = ACLLoadFromFile(server.acl_filename); + if (errors == NULL) { + addReply(c,shared.ok); + } else { + addReplyError(c,errors); + sdsfree(errors); + } + } else if (!strcasecmp(sub,"save") && c->argc == 2) { + if (ACLSaveToFile(server.acl_filename) == C_OK) { + addReply(c,shared.ok); + } else { + addReplyError(c,"There was an error trying to save the ACLs. 
" + "Please check the server logs for more " + "information"); + } + } else if (!strcasecmp(sub,"cat") && c->argc == 2) { + void *dl = addReplyDeferredLen(c); + int j; + for (j = 0; ACLCommandCategories[j].flag != 0; j++) + addReplyBulkCString(c,ACLCommandCategories[j].name); + setDeferredArrayLen(c,dl,j); + } else if (!strcasecmp(sub,"cat") && c->argc == 3) { + uint64_t cflag = ACLGetCommandCategoryFlagByName(c->argv[2]->ptr); + if (cflag == 0) { + addReplyErrorFormat(c, "Unknown category '%.128s'", (char*)c->argv[2]->ptr); + return; + } + int arraylen = 0; + void *dl = addReplyDeferredLen(c); + aclCatWithFlags(c, server.orig_commands, cflag, &arraylen); + setDeferredArrayLen(c,dl,arraylen); + } else if (!strcasecmp(sub,"genpass") && (c->argc == 2 || c->argc == 3)) { + #define GENPASS_MAX_BITS 4096 + char pass[GENPASS_MAX_BITS/8*2]; /* Hex representation. */ + long bits = 256; /* By default generate 256 bits passwords. */ + + if (c->argc == 3 && getLongFromObjectOrReply(c,c->argv[2],&bits,NULL) + != C_OK) return; + + if (bits <= 0 || bits > GENPASS_MAX_BITS) { + addReplyErrorFormat(c, + "ACL GENPASS argument must be the number of " + "bits for the output password, a positive number " + "up to %d",GENPASS_MAX_BITS); + return; + } + + long chars = (bits+3)/4; /* Round to number of characters to emit. */ + getRandomHexChars(pass,chars); + addReplyBulkCBuffer(c,pass,chars); + } else if (!strcasecmp(sub,"log") && (c->argc == 2 || c->argc ==3)) { + long count = 10; /* Number of entries to emit by default. */ + + /* Parse the only argument that LOG may have: it could be either + * the number of entries the user wants to display, or alternatively + * the "RESET" command in order to flush the old entries. 
*/ + if (c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr,"reset")) { + listSetFreeMethod(ACLLog,ACLFreeLogEntry); + listEmpty(ACLLog); + listSetFreeMethod(ACLLog,NULL); + addReply(c,shared.ok); + return; + } else if (getLongFromObjectOrReply(c,c->argv[2],&count,NULL) + != C_OK) + { + return; + } + if (count < 0) count = 0; + } + + /* Fix the count according to the number of entries we got. */ + if ((size_t)count > listLength(ACLLog)) + count = listLength(ACLLog); + + addReplyArrayLen(c,count); + listIter li; + listNode *ln; + listRewind(ACLLog,&li); + mstime_t now = commandTimeSnapshot(); + while (count-- && (ln = listNext(&li)) != NULL) { + ACLLogEntry *le = listNodeValue(ln); + addReplyMapLen(c,10); + addReplyBulkCString(c,"count"); + addReplyLongLong(c,le->count); + + addReplyBulkCString(c,"reason"); + char *reasonstr; + switch(le->reason) { + case ACL_DENIED_CMD: reasonstr="command"; break; + case ACL_DENIED_KEY: reasonstr="key"; break; + case ACL_DENIED_CHANNEL: reasonstr="channel"; break; + case ACL_DENIED_AUTH: reasonstr="auth"; break; + case ACL_INVALID_TLS_CERT_AUTH: reasonstr = "tls-cert"; break; + default: reasonstr="unknown"; + } + addReplyBulkCString(c,reasonstr); + + addReplyBulkCString(c,"context"); + char *ctxstr; + switch(le->context) { + case ACL_LOG_CTX_TOPLEVEL: ctxstr="toplevel"; break; + case ACL_LOG_CTX_MULTI: ctxstr="multi"; break; + case ACL_LOG_CTX_LUA: ctxstr="lua"; break; + case ACL_LOG_CTX_MODULE: ctxstr="module"; break; + default: ctxstr="unknown"; + } + addReplyBulkCString(c,ctxstr); + + addReplyBulkCString(c,"object"); + addReplyBulkCBuffer(c,le->object,sdslen(le->object)); + addReplyBulkCString(c,"username"); + addReplyBulkCBuffer(c,le->username,sdslen(le->username)); + addReplyBulkCString(c,"age-seconds"); + double age = (double)(now - le->ctime)/1000; + addReplyDouble(c,age); + addReplyBulkCString(c,"client-info"); + addReplyBulkCBuffer(c,le->cinfo,sdslen(le->cinfo)); + addReplyBulkCString(c, "entry-id"); + addReplyLongLong(c, 
le->entry_id); + addReplyBulkCString(c, "timestamp-created"); + addReplyLongLong(c, le->timestamp_created); + addReplyBulkCString(c, "timestamp-last-updated"); + addReplyLongLong(c, le->ctime); + } + } else if (!strcasecmp(sub,"dryrun") && c->argc >= 4) { + struct redisCommand *cmd; + user *u = ACLGetUserByName(c->argv[2]->ptr,sdslen(c->argv[2]->ptr)); + if (u == NULL) { + addReplyErrorFormat(c, "User '%s' not found", (char *)c->argv[2]->ptr); + return; + } + + if ((cmd = lookupCommand(c->argv + 3, c->argc - 3)) == NULL) { + addReplyErrorFormat(c, "Command '%s' not found", (char *)c->argv[3]->ptr); + return; + } + + if ((cmd->arity > 0 && cmd->arity != c->argc-3) || + (c->argc-3 < -cmd->arity)) + { + addReplyErrorFormat(c,"wrong number of arguments for '%s' command", cmd->fullname); + return; + } + + int idx; + int result = ACLCheckAllUserCommandPerm(u, cmd, c->argv + 3, c->argc - 3, NULL, &idx); + if (result != ACL_OK) { + sds err = getAclErrorMessage(result, u, cmd, c->argv[idx+3]->ptr, 1); + addReplyBulkSds(c, err); + return; + } + + addReply(c,shared.ok); + } else if (c->argc == 2 && !strcasecmp(sub,"help")) { + const char *help[] = { +"CAT []", +" List all commands that belong to , or all command categories", +" when no category is specified.", +"DELUSER [ ...]", +" Delete a list of users.", +"DRYRUN [ ...]", +" Returns whether the user can execute the given command without executing the command.", +"GETUSER ", +" Get the user's details.", +"GENPASS []", +" Generate a secure 256-bit user password. 
The optional `bits` argument can", +" be used to specify a different size.", +"LIST", +" Show users details in config file format.", +"LOAD", +" Reload users from the ACL file.", +"LOG [ | RESET]", +" Show the ACL log entries.", +"SAVE", +" Save the current config to the ACL file.", +"SETUSER [ ...]", +" Create or modify a user with the specified attributes.", +"USERS", +" List all the registered usernames.", +"WHOAMI", +" Return the current connection username.", +NULL + }; + addReplyHelp(c,help); + } else { + addReplySubcommandSyntaxError(c); + } +} + +void addReplyCommandCategories(client *c, struct redisCommand *cmd) { + int flagcount = 0; + void *flaglen = addReplyDeferredLen(c); + for (int j = 0; ACLCommandCategories[j].flag != 0; j++) { + if (cmd->acl_categories & ACLCommandCategories[j].flag) { + addReplyStatusFormat(c, "@%s", ACLCommandCategories[j].name); + flagcount++; + } + } + setDeferredSetLen(c, flaglen, flagcount); +} + +/* When successful, initiates an internal connection, that is able to execute + * internal commands (see CMD_INTERNAL). */ +static void internalAuth(client *c) { + if (!server.cluster_enabled) { + addReplyError(c, "Cannot authenticate as an internal connection on non-cluster instances"); + return; + } + + sds password = c->argv[2]->ptr; + + /* Get internal secret. */ + size_t len = -1; + const char *internal_secret = clusterGetSecret(&len); + if (sdslen(password) != len) { + addReplyError(c, "-WRONGPASS invalid internal password"); + return; + } + if (!time_independent_strcmp((char *)internal_secret, (char *)password, len)) { + c->flags |= CLIENT_INTERNAL; + /* No further authentication is needed. */ + c->authenticated = 1; + /* Set the user to the unrestricted user, if it is not already set (default). 
*/ + if (c->user != NULL) { + c->user = NULL; + moduleNotifyUserChanged(c); + } + addReply(c, shared.ok); + } else { + addReplyError(c, "-WRONGPASS invalid internal password"); + } +} + +/* AUTH + * AUTH (Redis >= 6.0 form) + * + * When the user is omitted it means that we are trying to authenticate + * against the default user. */ +void authCommand(client *c) { + /* Only two or three argument forms are allowed. */ + if (c->argc > 3) { + addReplyErrorObject(c,shared.syntaxerr); + return; + } + /* Always redact the second argument */ + redactClientCommandArgument(c, 1); + + /* Handle the two different forms here. The form with two arguments + * will just use "default" as username. */ + robj *username, *password; + if (c->argc == 2) { + /* Mimic the old behavior of giving an error for the two argument + * form if no password is configured. */ + if (DefaultUser->flags & USER_FLAG_NOPASS) { + addReplyError(c,"AUTH called without any password " + "configured for the default user. Are you sure " + "your configuration is correct?"); + return; + } + + username = shared.default_username; + password = c->argv[1]; + } else { + username = c->argv[1]; + password = c->argv[2]; + redactClientCommandArgument(c, 2); + + /* Handle internal authentication commands. + * Note: No user-defined ACL user can have this username (no spaces + * allowed), thus no conflicts with ACL possible. */ + if (!strcmp(username->ptr, "internal connection")) { + internalAuth(c); + return; + } + } + + robj *err = NULL; + int result = ACLAuthenticateUser(c, username, password, &err); + if (result == AUTH_OK) { + addReply(c, shared.ok); + } else if (result == AUTH_ERR) { + addAuthErrReply(c, err); + } + if (err) decrRefCount(err); +} + +/* Set the password for the "default" ACL user. This implements supports for + * requirepass config, so passing in NULL will set the user to be nopass. 
*/ +void ACLUpdateDefaultUserPassword(sds password) { + ACLSetUser(DefaultUser,"resetpass",-1); + if (password) { + sds aclop = sdscatlen(sdsnew(">"), password, sdslen(password)); + ACLSetUser(DefaultUser,aclop,sdslen(aclop)); + sdsfree(aclop); + } else { + ACLSetUser(DefaultUser,"nopass",-1); + } +} diff --git a/examples/redis-unstable/src/adlist.c b/examples/redis-unstable/src/adlist.c new file mode 100644 index 0000000..d7ca5fb --- /dev/null +++ b/examples/redis-unstable/src/adlist.c @@ -0,0 +1,395 @@ +/* adlist.c - A generic doubly linked list implementation + * + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + + +#include +#include "adlist.h" +#include "zmalloc.h" + +/* Create a new list. The created list can be freed with + * listRelease(), but private value of every node need to be freed + * by the user before to call listRelease(), or by setting a free method using + * listSetFreeMethod. + * + * On error, NULL is returned. Otherwise the pointer to the new list. */ +list *listCreate(void) +{ + struct list *list; + + if ((list = zmalloc(sizeof(*list))) == NULL) + return NULL; + list->head = list->tail = NULL; + list->len = 0; + list->dup = NULL; + list->free = NULL; + list->match = NULL; + return list; +} + +/* Remove all the elements from the list without destroying the list itself. */ +void listEmpty(list *list) +{ + unsigned long len; + listNode *current, *next; + + current = list->head; + len = list->len; + while(len--) { + next = current->next; + if (list->free) list->free(current->value); + zfree(current); + current = next; + } + list->head = list->tail = NULL; + list->len = 0; +} + +/* Free the whole list. + * + * This function can't fail. 
*/ +void listRelease(list *list) +{ + if (!list) + return; + listEmpty(list); + zfree(list); +} + +/* Generic version of listRelease. */ +void listReleaseGeneric(void *list) { + listRelease((struct list*)list); +} + +/* Add a new node to the list, to head, containing the specified 'value' + * pointer as value. + * + * On error, NULL is returned and no operation is performed (i.e. the + * list remains unaltered). + * On success the 'list' pointer you pass to the function is returned. */ +list *listAddNodeHead(list *list, void *value) +{ + listNode *node; + + if ((node = zmalloc(sizeof(*node))) == NULL) + return NULL; + node->value = value; + listLinkNodeHead(list, node); + return list; +} + +/* + * Add a node that has already been allocated to the head of list + */ +void listLinkNodeHead(list* list, listNode *node) { + if (list->len == 0) { + list->head = list->tail = node; + node->prev = node->next = NULL; + } else { + node->prev = NULL; + node->next = list->head; + list->head->prev = node; + list->head = node; + } + list->len++; +} + +/* Add a new node to the list, to tail, containing the specified 'value' + * pointer as value. + * + * On error, NULL is returned and no operation is performed (i.e. the + * list remains unaltered). + * On success the 'list' pointer you pass to the function is returned. 
*/ +list *listAddNodeTail(list *list, void *value) +{ + listNode *node; + + if ((node = zmalloc(sizeof(*node))) == NULL) + return NULL; + node->value = value; + listLinkNodeTail(list, node); + return list; +} + +/* + * Add a node that has already been allocated to the tail of list + */ +void listLinkNodeTail(list *list, listNode *node) { + if (list->len == 0) { + list->head = list->tail = node; + node->prev = node->next = NULL; + } else { + node->prev = list->tail; + node->next = NULL; + list->tail->next = node; + list->tail = node; + } + list->len++; +} + +list *listInsertNode(list *list, listNode *old_node, void *value, int after) { + listNode *node; + + if ((node = zmalloc(sizeof(*node))) == NULL) + return NULL; + node->value = value; + if (after) { + node->prev = old_node; + node->next = old_node->next; + if (list->tail == old_node) { + list->tail = node; + } + } else { + node->next = old_node; + node->prev = old_node->prev; + if (list->head == old_node) { + list->head = node; + } + } + if (node->prev != NULL) { + node->prev->next = node; + } + if (node->next != NULL) { + node->next->prev = node; + } + list->len++; + return list; +} + +/* Remove the specified node from the specified list. + * The node is freed. If free callback is provided the value is freed as well. + * + * This function can't fail. */ +void listDelNode(list *list, listNode *node) +{ + listUnlinkNode(list, node); + if (list->free) list->free(node->value); + zfree(node); +} + +/* + * Remove the specified node from the list without freeing it. + */ +void listUnlinkNode(list *list, listNode *node) { + if (node->prev) + node->prev->next = node->next; + else + list->head = node->next; + if (node->next) + node->next->prev = node->prev; + else + list->tail = node->prev; + + node->next = NULL; + node->prev = NULL; + + list->len--; +} + +/* Returns a list iterator 'iter'. After the initialization every + * call to listNext() will return the next element of the list. + * + * This function can't fail. 
*/ +void listInitIterator(listIter *iter, list *list, int direction) +{ + if (direction == AL_START_HEAD) + iter->next = list->head; + else + iter->next = list->tail; + iter->direction = direction; +} + +/* Create an iterator in the list private iterator structure */ +void listRewind(list *list, listIter *li) { + li->next = list->head; + li->direction = AL_START_HEAD; +} + +void listRewindTail(list *list, listIter *li) { + li->next = list->tail; + li->direction = AL_START_TAIL; +} + +/* Return the next element of an iterator. + * It's valid to remove the currently returned element using + * listDelNode(), but not to remove other elements. + * + * The function returns a pointer to the next element of the list, + * or NULL if there are no more elements, so the classical usage + * pattern is: + * + * iter = listGetIterator(list,); + * while ((node = listNext(iter)) != NULL) { + * doSomethingWith(listNodeValue(node)); + * } + * + * */ +listNode *listNext(listIter *iter) +{ + listNode *current = iter->next; + + if (current != NULL) { + if (iter->direction == AL_START_HEAD) + iter->next = current->next; + else + iter->next = current->prev; + } + return current; +} + +/* Duplicate the whole list. On out of memory NULL is returned. + * On success a copy of the original list is returned. + * + * The 'Dup' method set with listSetDupMethod() function is used + * to copy the node value. Otherwise the same pointer value of + * the original node is used as value of the copied node. + * + * The original list both on success or error is never modified. 
*/ +list *listDup(list *orig) +{ + list *copy; + listIter iter; + listNode *node; + + if ((copy = listCreate()) == NULL) + return NULL; + copy->dup = orig->dup; + copy->free = orig->free; + copy->match = orig->match; + listRewind(orig, &iter); + while((node = listNext(&iter)) != NULL) { + void *value; + + if (copy->dup) { + value = copy->dup(node->value); + if (value == NULL) { + listRelease(copy); + return NULL; + } + } else { + value = node->value; + } + + if (listAddNodeTail(copy, value) == NULL) { + /* Free value if dup succeed but listAddNodeTail failed. */ + if (copy->free) copy->free(value); + + listRelease(copy); + return NULL; + } + } + return copy; +} + +/* Search the list for a node matching a given key. + * The match is performed using the 'match' method + * set with listSetMatchMethod(). If no 'match' method + * is set, the 'value' pointer of every node is directly + * compared with the 'key' pointer. + * + * On success the first matching node pointer is returned + * (search starts from head). If no matching node exists + * NULL is returned. */ +listNode *listSearchKey(list *list, void *key) +{ + listIter iter; + listNode *node; + + listRewind(list, &iter); + while((node = listNext(&iter)) != NULL) { + if (list->match) { + if (list->match(node->value, key)) { + return node; + } + } else { + if (key == node->value) { + return node; + } + } + } + return NULL; +} + +/* Return the element at the specified zero-based index + * where 0 is the head, 1 is the element next to head + * and so on. Negative integers are used in order to count + * from the tail, -1 is the last element, -2 the penultimate + * and so on. If the index is out of range NULL is returned. 
*/ +listNode *listIndex(list *list, long index) { + listNode *n; + + if (index < 0) { + index = (-index)-1; + n = list->tail; + while(index-- && n) n = n->prev; + } else { + n = list->head; + while(index-- && n) n = n->next; + } + return n; +} + +/* Rotate the list removing the tail node and inserting it to the head. */ +void listRotateTailToHead(list *list) { + if (listLength(list) <= 1) return; + + /* Detach current tail */ + listNode *tail = list->tail; + list->tail = tail->prev; + list->tail->next = NULL; + /* Move it as head */ + list->head->prev = tail; + tail->prev = NULL; + tail->next = list->head; + list->head = tail; +} + +/* Rotate the list removing the head node and inserting it to the tail. */ +void listRotateHeadToTail(list *list) { + if (listLength(list) <= 1) return; + + listNode *head = list->head; + /* Detach current head */ + list->head = head->next; + list->head->prev = NULL; + /* Move it as tail */ + list->tail->next = head; + head->next = NULL; + head->prev = list->tail; + list->tail = head; +} + +/* Add all the elements of the list 'o' at the end of the + * list 'l'. The list 'other' remains empty but otherwise valid. */ +void listJoin(list *l, list *o) { + if (o->len == 0) return; + + o->head->prev = l->tail; + + if (l->tail) + l->tail->next = o->head; + else + l->head = o->head; + + l->tail = o->tail; + l->len += o->len; + + /* Setup other as an empty list. */ + o->head = o->tail = NULL; + o->len = 0; +} + +/* Initializes the node's value and sets its pointers + * so that it is initially not a member of any list. + */ +void listInitNode(listNode *node, void *value) { + node->prev = NULL; + node->next = NULL; + node->value = value; +} diff --git a/examples/redis-unstable/src/adlist.h b/examples/redis-unstable/src/adlist.h new file mode 100644 index 0000000..bb0eed1 --- /dev/null +++ b/examples/redis-unstable/src/adlist.h @@ -0,0 +1,80 @@ +/* adlist.h - A generic doubly linked list implementation + * + * Copyright (c) 2006-Present, Redis Ltd. 
+ * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + +#ifndef __ADLIST_H__ +#define __ADLIST_H__ + +/* Node, List, and Iterator are the only data structures used currently. */ + +typedef struct listNode { + struct listNode *prev; + struct listNode *next; + void *value; +} listNode; + +typedef struct listIter { + listNode *next; + int direction; +} listIter; + +typedef struct list { + listNode *head; + listNode *tail; + void *(*dup)(void *ptr); + void (*free)(void *ptr); + int (*match)(void *ptr, void *key); + unsigned long len; +} list; + +/* Functions implemented as macros */ +#define listLength(l) ((l)->len) +#define listFirst(l) ((l)->head) +#define listLast(l) ((l)->tail) +#define listPrevNode(n) ((n)->prev) +#define listNextNode(n) ((n)->next) +#define listNodeValue(n) ((n)->value) + +#define listSetDupMethod(l,m) ((l)->dup = (m)) +#define listSetFreeMethod(l,m) ((l)->free = (m)) +#define listSetMatchMethod(l,m) ((l)->match = (m)) + +#define listGetDupMethod(l) ((l)->dup) +#define listGetFreeMethod(l) ((l)->free) +#define listGetMatchMethod(l) ((l)->match) + +/* Prototypes */ +list *listCreate(void); +void listRelease(list *list); +void listReleaseGeneric(void *list); +void listEmpty(list *list); +list *listAddNodeHead(list *list, void *value); +list *listAddNodeTail(list *list, void *value); +list *listInsertNode(list *list, listNode *old_node, void *value, int after); +void listDelNode(list *list, listNode *node); +void listInitIterator(listIter *iter, list *list, int direction); +listNode *listNext(listIter *iter); +list *listDup(list *orig); +listNode *listSearchKey(list *list, void *key); +listNode *listIndex(list *list, long index); +void listRewind(list *list, listIter *li); +void listRewindTail(list *list, listIter *li); +void listRotateTailToHead(list *list); +void 
listRotateHeadToTail(list *list); +void listJoin(list *l, list *o); +void listInitNode(listNode *node, void *value); +void listLinkNodeHead(list *list, listNode *node); +void listLinkNodeTail(list *list, listNode *node); +void listUnlinkNode(list *list, listNode *node); + +/* Directions for iterators */ +#define AL_START_HEAD 0 +#define AL_START_TAIL 1 + +#endif /* __ADLIST_H__ */ diff --git a/examples/redis-unstable/src/ae.c b/examples/redis-unstable/src/ae.c new file mode 100644 index 0000000..733c88d --- /dev/null +++ b/examples/redis-unstable/src/ae.c @@ -0,0 +1,511 @@ +/* A simple event-driven programming library. Originally I wrote this code + * for the Jim's event-loop (Jim is a Tcl interpreter) but later translated + * it in form of a library for easy reuse. + * + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + +#include "ae.h" +#include "anet.h" +#include "redisassert.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zmalloc.h" +#include "config.h" + +/* Include the best multiplexing layer supported by this system. + * The following should be ordered by performances, descending. */ +#ifdef HAVE_EVPORT +#include "ae_evport.c" +#else + #ifdef HAVE_EPOLL + #include "ae_epoll.c" + #else + #ifdef HAVE_KQUEUE + #include "ae_kqueue.c" + #else + #include "ae_select.c" + #endif + #endif +#endif + +#define INITIAL_EVENT 1024 +aeEventLoop *aeCreateEventLoop(int setsize) { + aeEventLoop *eventLoop; + int i; + + monotonicInit(); /* just in case the calling app didn't initialize */ + + if ((eventLoop = zmalloc(sizeof(*eventLoop))) == NULL) goto err; + eventLoop->nevents = setsize < INITIAL_EVENT ? 
setsize : INITIAL_EVENT; + eventLoop->events = zmalloc(sizeof(aeFileEvent)*eventLoop->nevents); + eventLoop->fired = zmalloc(sizeof(aeFiredEvent)*eventLoop->nevents); + if (eventLoop->events == NULL || eventLoop->fired == NULL) goto err; + eventLoop->setsize = setsize; + eventLoop->timeEventHead = NULL; + eventLoop->timeEventNextId = 0; + eventLoop->stop = 0; + eventLoop->maxfd = -1; + eventLoop->beforesleep = NULL; + eventLoop->aftersleep = NULL; + eventLoop->flags = 0; + memset(eventLoop->privdata, 0, sizeof(eventLoop->privdata)); + if (aeApiCreate(eventLoop) == -1) goto err; + /* Events with mask == AE_NONE are not set. So let's initialize the + * vector with it. */ + for (i = 0; i < eventLoop->nevents; i++) + eventLoop->events[i].mask = AE_NONE; + return eventLoop; + +err: + if (eventLoop) { + zfree(eventLoop->events); + zfree(eventLoop->fired); + zfree(eventLoop); + } + return NULL; +} + +/* Return the current set size. */ +int aeGetSetSize(aeEventLoop *eventLoop) { + return eventLoop->setsize; +} + +/* + * Tell the event processing to change the wait timeout as soon as possible. + * + * Note: it just means you turn on/off the global AE_DONT_WAIT. + */ +void aeSetDontWait(aeEventLoop *eventLoop, int noWait) { + if (noWait) + eventLoop->flags |= AE_DONT_WAIT; + else + eventLoop->flags &= ~AE_DONT_WAIT; +} + +/* Resize the maximum set size of the event loop. + * If the requested set size is smaller than the current set size, but + * there is already a file descriptor in use that is >= the requested + * set size minus one, AE_ERR is returned and the operation is not + * performed at all. + * + * Otherwise AE_OK is returned and the operation is successful. 
*/ +int aeResizeSetSize(aeEventLoop *eventLoop, int setsize) { + if (setsize == eventLoop->setsize) return AE_OK; + if (eventLoop->maxfd >= setsize) return AE_ERR; + if (aeApiResize(eventLoop,setsize) == -1) return AE_ERR; + + eventLoop->setsize = setsize; + + /* If the current allocated space is larger than the requested size, + * we need to shrink it to the requested size. */ + if (setsize < eventLoop->nevents) { + eventLoop->events = zrealloc(eventLoop->events,sizeof(aeFileEvent)*setsize); + eventLoop->fired = zrealloc(eventLoop->fired,sizeof(aeFiredEvent)*setsize); + eventLoop->nevents = setsize; + } + return AE_OK; +} + +void aeDeleteEventLoop(aeEventLoop *eventLoop) { + aeApiFree(eventLoop); + zfree(eventLoop->events); + zfree(eventLoop->fired); + + /* Free the time events list. */ + aeTimeEvent *next_te, *te = eventLoop->timeEventHead; + while (te) { + next_te = te->next; + if (te->finalizerProc) + te->finalizerProc(eventLoop, te->clientData); + zfree(te); + te = next_te; + } + zfree(eventLoop); +} + +void aeStop(aeEventLoop *eventLoop) { + eventLoop->stop = 1; +} + +int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask, + aeFileProc *proc, void *clientData) +{ + if (fd >= eventLoop->setsize) { + errno = ERANGE; + return AE_ERR; + } + + /* Resize the events and fired arrays if the file + * descriptor exceeds the current number of events. */ + if (unlikely(fd >= eventLoop->nevents)) { + int newnevents = eventLoop->nevents; + newnevents = (newnevents * 2 > fd + 1) ? newnevents * 2 : fd + 1; + newnevents = (newnevents > eventLoop->setsize) ? 
eventLoop->setsize : newnevents; + eventLoop->events = zrealloc(eventLoop->events, sizeof(aeFileEvent) * newnevents); + eventLoop->fired = zrealloc(eventLoop->fired, sizeof(aeFiredEvent) * newnevents); + + /* Initialize new slots with an AE_NONE mask */ + for (int i = eventLoop->nevents; i < newnevents; i++) + eventLoop->events[i].mask = AE_NONE; + eventLoop->nevents = newnevents; + } + + aeFileEvent *fe = &eventLoop->events[fd]; + + if (aeApiAddEvent(eventLoop, fd, mask) == -1) + return AE_ERR; + fe->mask |= mask; + if (mask & AE_READABLE) fe->rfileProc = proc; + if (mask & AE_WRITABLE) fe->wfileProc = proc; + fe->clientData = clientData; + if (fd > eventLoop->maxfd) + eventLoop->maxfd = fd; + return AE_OK; +} + +void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask) +{ + if (fd >= eventLoop->setsize) return; + aeFileEvent *fe = &eventLoop->events[fd]; + if (fe->mask == AE_NONE) return; + + /* We want to always remove AE_BARRIER if set when AE_WRITABLE + * is removed. */ + if (mask & AE_WRITABLE) mask |= AE_BARRIER; + + aeApiDelEvent(eventLoop, fd, mask); + fe->mask = fe->mask & (~mask); + if (fd == eventLoop->maxfd && fe->mask == AE_NONE) { + /* Update the max fd */ + int j; + + for (j = eventLoop->maxfd-1; j >= 0; j--) + if (eventLoop->events[j].mask != AE_NONE) break; + eventLoop->maxfd = j; + } +} + +void *aeGetFileClientData(aeEventLoop *eventLoop, int fd) { + if (fd >= eventLoop->setsize) return NULL; + aeFileEvent *fe = &eventLoop->events[fd]; + if (fe->mask == AE_NONE) return NULL; + + return fe->clientData; +} + +int aeGetFileEvents(aeEventLoop *eventLoop, int fd) { + if (fd >= eventLoop->setsize) return 0; + aeFileEvent *fe = &eventLoop->events[fd]; + + return fe->mask; +} + +long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, + aeTimeProc *proc, void *clientData, + aeEventFinalizerProc *finalizerProc) +{ + long long id = eventLoop->timeEventNextId++; + aeTimeEvent *te; + + te = zmalloc(sizeof(*te)); + if (te == NULL) 
return AE_ERR; + te->id = id; + te->when = getMonotonicUs() + milliseconds * 1000; + te->timeProc = proc; + te->finalizerProc = finalizerProc; + te->clientData = clientData; + te->prev = NULL; + te->next = eventLoop->timeEventHead; + te->refcount = 0; + if (te->next) + te->next->prev = te; + eventLoop->timeEventHead = te; + return id; +} + +int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id) +{ + aeTimeEvent *te = eventLoop->timeEventHead; + while(te) { + if (te->id == id) { + te->id = AE_DELETED_EVENT_ID; + return AE_OK; + } + te = te->next; + } + return AE_ERR; /* NO event with the specified ID found */ +} + +/* How many microseconds until the first timer should fire. + * If there are no timers, -1 is returned. + * + * Note that's O(N) since time events are unsorted. + * Possible optimizations (not needed by Redis so far, but...): + * 1) Insert the event in order, so that the nearest is just the head. + * Much better but still insertion or deletion of timers is O(N). + * 2) Use a skiplist to have this operation as O(1) and insertion as O(log(N)). + */ +static int64_t usUntilEarliestTimer(aeEventLoop *eventLoop) { + aeTimeEvent *te = eventLoop->timeEventHead; + if (te == NULL) return -1; + + aeTimeEvent *earliest = NULL; + while (te) { + if ((!earliest || te->when < earliest->when) && te->id != AE_DELETED_EVENT_ID) + earliest = te; + te = te->next; + } + + monotime now = getMonotonicUs(); + return (now >= earliest->when) ? 0 : earliest->when - now; +} + +/* Process time events */ +static int processTimeEvents(aeEventLoop *eventLoop) { + int processed = 0; + aeTimeEvent *te; + long long maxId; + + te = eventLoop->timeEventHead; + maxId = eventLoop->timeEventNextId-1; + monotime now = getMonotonicUs(); + while(te) { + long long id; + + /* Remove events scheduled for deletion. */ + if (te->id == AE_DELETED_EVENT_ID) { + aeTimeEvent *next = te->next; + /* If a reference exists for this timer event, + * don't free it. 
This is currently incremented + * for recursive timerProc calls */ + if (te->refcount) { + te = next; + continue; + } + if (te->prev) + te->prev->next = te->next; + else + eventLoop->timeEventHead = te->next; + if (te->next) + te->next->prev = te->prev; + if (te->finalizerProc) { + te->finalizerProc(eventLoop, te->clientData); + now = getMonotonicUs(); + } + zfree(te); + te = next; + continue; + } + + /* Make sure we don't process time events created by time events in + * this iteration. Note that this check is currently useless: we always + * add new timers on the head, however if we change the implementation + * detail, this check may be useful again: we keep it here for future + * defense. */ + if (te->id > maxId) { + te = te->next; + continue; + } + + if (te->when <= now) { + int retval; + + id = te->id; + te->refcount++; + retval = te->timeProc(eventLoop, id, te->clientData); + te->refcount--; + processed++; + now = getMonotonicUs(); + if (retval != AE_NOMORE) { + te->when = now + (monotime)retval * 1000; + } else { + te->id = AE_DELETED_EVENT_ID; + } + } + te = te->next; + } + return processed; +} + +/* Process every pending file event, then every pending time event + * (that may be registered by file event callbacks just processed). + * Without special flags the function sleeps until some file event + * fires, or when the next time event occurs (if any). + * + * If flags is 0, the function does nothing and returns. + * if flags has AE_ALL_EVENTS set, all the kind of events are processed. + * if flags has AE_FILE_EVENTS set, file events are processed. + * if flags has AE_TIME_EVENTS set, time events are processed. + * if flags has AE_DONT_WAIT set, the function returns ASAP once all + * the events that can be handled without a wait are processed. + * if flags has AE_CALL_AFTER_SLEEP set, the aftersleep callback is called. + * if flags has AE_CALL_BEFORE_SLEEP set, the beforesleep callback is called. + * + * The function returns the number of events processed. 
*/ +int aeProcessEvents(aeEventLoop *eventLoop, int flags) +{ + int processed = 0, numevents; + + /* Nothing to do? return ASAP */ + if (!(flags & AE_TIME_EVENTS) && !(flags & AE_FILE_EVENTS)) return 0; + + /* Note that we want to call aeApiPoll() even if there are no + * file events to process as long as we want to process time + * events, in order to sleep until the next time event is ready + * to fire. */ + if (eventLoop->maxfd != -1 || + ((flags & AE_TIME_EVENTS) && !(flags & AE_DONT_WAIT))) { + int j; + struct timeval tv, *tvp = NULL; /* NULL means infinite wait. */ + int64_t usUntilTimer; + + if (eventLoop->beforesleep != NULL && (flags & AE_CALL_BEFORE_SLEEP)) + eventLoop->beforesleep(eventLoop); + + /* The eventLoop->flags may be changed inside beforesleep. + * So we should check it after beforesleep be called. At the same time, + * the parameter flags always should have the highest priority. + * That is to say, once the parameter flag is set to AE_DONT_WAIT, + * no matter what value eventLoop->flags is set to, we should ignore it. */ + if ((flags & AE_DONT_WAIT) || (eventLoop->flags & AE_DONT_WAIT)) { + tv.tv_sec = tv.tv_usec = 0; + tvp = &tv; + } else if (flags & AE_TIME_EVENTS) { + usUntilTimer = usUntilEarliestTimer(eventLoop); + if (usUntilTimer >= 0) { + tv.tv_sec = usUntilTimer / 1000000; + tv.tv_usec = usUntilTimer % 1000000; + tvp = &tv; + } + } + /* Call the multiplexing API, will return only on timeout or when + * some event fires. */ + numevents = aeApiPoll(eventLoop, tvp); + + /* Don't process file events if not requested. */ + if (!(flags & AE_FILE_EVENTS)) { + numevents = 0; + } + + /* After sleep callback. */ + if (eventLoop->aftersleep != NULL && flags & AE_CALL_AFTER_SLEEP) + eventLoop->aftersleep(eventLoop); + + for (j = 0; j < numevents; j++) { + int fd = eventLoop->fired[j].fd; + aeFileEvent *fe = &eventLoop->events[fd]; + int mask = eventLoop->fired[j].mask; + int fired = 0; /* Number of events fired for current fd. 
*/ + + /* Normally we execute the readable event first, and the writable + * event later. This is useful as sometimes we may be able + * to serve the reply of a query immediately after processing the + * query. + * + * However if AE_BARRIER is set in the mask, our application is + * asking us to do the reverse: never fire the writable event + * after the readable. In such a case, we invert the calls. + * This is useful when, for instance, we want to do things + * in the beforeSleep() hook, like fsyncing a file to disk, + * before replying to a client. */ + int invert = fe->mask & AE_BARRIER; + + /* Note the "fe->mask & mask & ..." code: maybe an already + * processed event removed an element that fired and we still + * didn't processed, so we check if the event is still valid. + * + * Fire the readable event if the call sequence is not + * inverted. */ + if (!invert && fe->mask & mask & AE_READABLE) { + fe->rfileProc(eventLoop,fd,fe->clientData,mask); + fired++; + fe = &eventLoop->events[fd]; /* Refresh in case of resize. */ + } + + /* Fire the writable event. */ + if (fe->mask & mask & AE_WRITABLE) { + if (!fired || fe->wfileProc != fe->rfileProc) { + fe->wfileProc(eventLoop,fd,fe->clientData,mask); + fired++; + } + } + + /* If we have to invert the call, fire the readable event now + * after the writable one. */ + if (invert) { + fe = &eventLoop->events[fd]; /* Refresh in case of resize. 
*/ + if ((fe->mask & mask & AE_READABLE) && + (!fired || fe->wfileProc != fe->rfileProc)) + { + fe->rfileProc(eventLoop,fd,fe->clientData,mask); + fired++; + } + } + + processed++; + } + } + /* Check time events */ + if (flags & AE_TIME_EVENTS) + processed += processTimeEvents(eventLoop); + + return processed; /* return the number of processed file/time events */ +} + +/* Wait for milliseconds until the given file descriptor becomes + * writable/readable/exception */ +int aeWait(int fd, int mask, long long milliseconds) { + struct pollfd pfd; + int retmask = 0, retval; + + memset(&pfd, 0, sizeof(pfd)); + pfd.fd = fd; + if (mask & AE_READABLE) pfd.events |= POLLIN; + if (mask & AE_WRITABLE) pfd.events |= POLLOUT; + + if ((retval = poll(&pfd, 1, milliseconds))== 1) { + if (pfd.revents & POLLIN) retmask |= AE_READABLE; + if (pfd.revents & POLLOUT) retmask |= AE_WRITABLE; + if (pfd.revents & POLLERR) retmask |= AE_WRITABLE; + if (pfd.revents & POLLHUP) retmask |= AE_WRITABLE; + return retmask; + } else { + return retval; + } +} + +void aeMain(aeEventLoop *eventLoop) { + eventLoop->stop = 0; + while (!eventLoop->stop) { + aeProcessEvents(eventLoop, AE_ALL_EVENTS| + AE_CALL_BEFORE_SLEEP| + AE_CALL_AFTER_SLEEP); + } +} + +char *aeGetApiName(void) { + return aeApiName(); +} + +void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep) { + eventLoop->beforesleep = beforesleep; +} + +void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep) { + eventLoop->aftersleep = aftersleep; +} diff --git a/examples/redis-unstable/src/ae.h b/examples/redis-unstable/src/ae.h new file mode 100644 index 0000000..996d48b --- /dev/null +++ b/examples/redis-unstable/src/ae.h @@ -0,0 +1,118 @@ +/* A simple event-driven programming library. Originally I wrote this code + * for the Jim's event-loop (Jim is a Tcl interpreter) but later translated + * it in form of a library for easy reuse. + * + * Copyright (c) 2006-Present, Redis Ltd. 
+ * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + +#ifndef __AE_H__ +#define __AE_H__ + +#include "monotonic.h" + +#define AE_OK 0 +#define AE_ERR -1 + +#define AE_NONE 0 /* No events registered. */ +#define AE_READABLE 1 /* Fire when descriptor is readable. */ +#define AE_WRITABLE 2 /* Fire when descriptor is writable. */ +#define AE_BARRIER 4 /* With WRITABLE, never fire the event if the + READABLE event already fired in the same event + loop iteration. Useful when you want to persist + things to disk before sending replies, and want + to do that in a group fashion. */ + +#define AE_FILE_EVENTS (1<<0) +#define AE_TIME_EVENTS (1<<1) +#define AE_ALL_EVENTS (AE_FILE_EVENTS|AE_TIME_EVENTS) +#define AE_DONT_WAIT (1<<2) +#define AE_CALL_BEFORE_SLEEP (1<<3) +#define AE_CALL_AFTER_SLEEP (1<<4) + +#define AE_NOMORE -1 +#define AE_DELETED_EVENT_ID -1 + +/* Macros */ +#define AE_NOTUSED(V) ((void) V) + +struct aeEventLoop; + +/* Types and data structures */ +typedef void aeFileProc(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask); +typedef int aeTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData); +typedef void aeEventFinalizerProc(struct aeEventLoop *eventLoop, void *clientData); +typedef void aeBeforeSleepProc(struct aeEventLoop *eventLoop); + +/* File event structure */ +typedef struct aeFileEvent { + int mask; /* one of AE_(READABLE|WRITABLE|BARRIER) */ + aeFileProc *rfileProc; + aeFileProc *wfileProc; + void *clientData; +} aeFileEvent; + +/* Time event structure */ +typedef struct aeTimeEvent { + long long id; /* time event identifier. 
*/ + monotime when; + aeTimeProc *timeProc; + aeEventFinalizerProc *finalizerProc; + void *clientData; + struct aeTimeEvent *prev; + struct aeTimeEvent *next; + int refcount; /* refcount to prevent timer events from being + * freed in recursive time event calls. */ +} aeTimeEvent; + +/* A fired event */ +typedef struct aeFiredEvent { + int fd; + int mask; +} aeFiredEvent; + +/* State of an event based program */ +typedef struct aeEventLoop { + int maxfd; /* highest file descriptor currently registered */ + int setsize; /* max number of file descriptors tracked */ + long long timeEventNextId; + int nevents; /* Size of Registered events */ + aeFileEvent *events; /* Registered events */ + aeFiredEvent *fired; /* Fired events */ + aeTimeEvent *timeEventHead; + int stop; + void *apidata; /* This is used for polling API specific data */ + aeBeforeSleepProc *beforesleep; + aeBeforeSleepProc *aftersleep; + int flags; + void *privdata[2]; +} aeEventLoop; + +/* Prototypes */ +aeEventLoop *aeCreateEventLoop(int setsize); +void aeDeleteEventLoop(aeEventLoop *eventLoop); +void aeStop(aeEventLoop *eventLoop); +int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask, + aeFileProc *proc, void *clientData); +void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask); +int aeGetFileEvents(aeEventLoop *eventLoop, int fd); +void *aeGetFileClientData(aeEventLoop *eventLoop, int fd); +long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, + aeTimeProc *proc, void *clientData, + aeEventFinalizerProc *finalizerProc); +int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id); +int aeProcessEvents(aeEventLoop *eventLoop, int flags); +int aeWait(int fd, int mask, long long milliseconds); +void aeMain(aeEventLoop *eventLoop); +char *aeGetApiName(void); +void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep); +void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *aftersleep); +int aeGetSetSize(aeEventLoop 
*eventLoop); +int aeResizeSetSize(aeEventLoop *eventLoop, int setsize); +void aeSetDontWait(aeEventLoop *eventLoop, int noWait); + +#endif diff --git a/examples/redis-unstable/src/ae_epoll.c b/examples/redis-unstable/src/ae_epoll.c new file mode 100644 index 0000000..6b91661 --- /dev/null +++ b/examples/redis-unstable/src/ae_epoll.c @@ -0,0 +1,119 @@ +/* Linux epoll(2) based ae.c module + * + * Copyright (c) 2009-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + + +#include + +typedef struct aeApiState { + int epfd; + struct epoll_event *events; +} aeApiState; + +static int aeApiCreate(aeEventLoop *eventLoop) { + aeApiState *state = zmalloc(sizeof(aeApiState)); + + if (!state) return -1; + state->events = zmalloc(sizeof(struct epoll_event)*eventLoop->setsize); + if (!state->events) { + zfree(state); + return -1; + } + state->epfd = epoll_create(1024); /* 1024 is just a hint for the kernel */ + if (state->epfd == -1) { + zfree(state->events); + zfree(state); + return -1; + } + anetCloexec(state->epfd); + eventLoop->apidata = state; + return 0; +} + +static int aeApiResize(aeEventLoop *eventLoop, int setsize) { + aeApiState *state = eventLoop->apidata; + + state->events = zrealloc(state->events, sizeof(struct epoll_event)*setsize); + return 0; +} + +static void aeApiFree(aeEventLoop *eventLoop) { + aeApiState *state = eventLoop->apidata; + + close(state->epfd); + zfree(state->events); + zfree(state); +} + +static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + struct epoll_event ee = {0}; /* avoid valgrind warning */ + /* If the fd was already monitored for some event, we need a MOD + * operation. Otherwise we need an ADD operation. */ + int op = eventLoop->events[fd].mask == AE_NONE ? 
+ EPOLL_CTL_ADD : EPOLL_CTL_MOD; + + ee.events = 0; + mask |= eventLoop->events[fd].mask; /* Merge old events */ + if (mask & AE_READABLE) ee.events |= EPOLLIN; + if (mask & AE_WRITABLE) ee.events |= EPOLLOUT; + ee.data.fd = fd; + if (epoll_ctl(state->epfd,op,fd,&ee) == -1) return -1; + return 0; +} + +static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask) { + aeApiState *state = eventLoop->apidata; + struct epoll_event ee = {0}; /* avoid valgrind warning */ + int mask = eventLoop->events[fd].mask & (~delmask); + + ee.events = 0; + if (mask & AE_READABLE) ee.events |= EPOLLIN; + if (mask & AE_WRITABLE) ee.events |= EPOLLOUT; + ee.data.fd = fd; + if (mask != AE_NONE) { + epoll_ctl(state->epfd,EPOLL_CTL_MOD,fd,&ee); + } else { + /* Note, Kernel < 2.6.9 requires a non null event pointer even for + * EPOLL_CTL_DEL. */ + epoll_ctl(state->epfd,EPOLL_CTL_DEL,fd,&ee); + } +} + +static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) { + aeApiState *state = eventLoop->apidata; + int retval, numevents = 0; + + retval = epoll_wait(state->epfd,state->events,eventLoop->setsize, + tvp ? 
(tvp->tv_sec*1000 + (tvp->tv_usec + 999)/1000) : -1); + if (retval > 0) { + int j; + + numevents = retval; + for (j = 0; j < numevents; j++) { + int mask = 0; + struct epoll_event *e = state->events+j; + + if (e->events & EPOLLIN) mask |= AE_READABLE; + if (e->events & EPOLLOUT) mask |= AE_WRITABLE; + if (e->events & EPOLLERR) mask |= AE_WRITABLE|AE_READABLE; + if (e->events & EPOLLHUP) mask |= AE_WRITABLE|AE_READABLE; + eventLoop->fired[j].fd = e->data.fd; + eventLoop->fired[j].mask = mask; + } + } else if (retval == -1 && errno != EINTR) { + panic("aeApiPoll: epoll_wait, %s", strerror(errno)); + } + + return numevents; +} + +static char *aeApiName(void) { + return "epoll"; +} diff --git a/examples/redis-unstable/src/ae_evport.c b/examples/redis-unstable/src/ae_evport.c new file mode 100644 index 0000000..2598ca0 --- /dev/null +++ b/examples/redis-unstable/src/ae_evport.c @@ -0,0 +1,323 @@ +/* ae.c module for illumos event ports. + * + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#include +#include +#include + +#include +#include + +#include + +static int evport_debug = 0; + +/* + * This file implements the ae API using event ports, present on Solaris-based + * systems since Solaris 10. Using the event port interface, we associate file + * descriptors with the port. Each association also includes the set of poll(2) + * events that the consumer is interested in (e.g., POLLIN and POLLOUT). + * + * There's one tricky piece to this implementation: when we return events via + * aeApiPoll, the corresponding file descriptors become dissociated from the + * port. This is necessary because poll events are level-triggered, so if the + * fd didn't become dissociated, it would immediately fire another event since + * the underlying state hasn't changed yet. We must re-associate the file + * descriptor, but only after we know that our caller has actually read from it. + * The ae API does not tell us exactly when that happens, but we do know that + * it must happen by the time aeApiPoll is called again. Our solution is to + * keep track of the last fds returned by aeApiPoll and re-associate them next + * time aeApiPoll is invoked. + * + * To summarize, in this module, each fd association is EITHER (a) represented + * only via the in-kernel association OR (b) represented by pending_fds and + * pending_masks. 
(b) is only true for the last fds we returned from aeApiPoll, + * and only until we enter aeApiPoll again (at which point we restore the + * in-kernel association). + */ +#define MAX_EVENT_BATCHSZ 512 + +typedef struct aeApiState { + int portfd; /* event port */ + uint_t npending; /* # of pending fds */ + int pending_fds[MAX_EVENT_BATCHSZ]; /* pending fds */ + int pending_masks[MAX_EVENT_BATCHSZ]; /* pending fds' masks */ +} aeApiState; + +static int aeApiCreate(aeEventLoop *eventLoop) { + int i; + aeApiState *state = zmalloc(sizeof(aeApiState)); + if (!state) return -1; + + state->portfd = port_create(); + if (state->portfd == -1) { + zfree(state); + return -1; + } + anetCloexec(state->portfd); + + state->npending = 0; + + for (i = 0; i < MAX_EVENT_BATCHSZ; i++) { + state->pending_fds[i] = -1; + state->pending_masks[i] = AE_NONE; + } + + eventLoop->apidata = state; + return 0; +} + +static int aeApiResize(aeEventLoop *eventLoop, int setsize) { + (void) eventLoop; + (void) setsize; + /* Nothing to resize here. */ + return 0; +} + +static void aeApiFree(aeEventLoop *eventLoop) { + aeApiState *state = eventLoop->apidata; + + close(state->portfd); + zfree(state); +} + +static int aeApiLookupPending(aeApiState *state, int fd) { + uint_t i; + + for (i = 0; i < state->npending; i++) { + if (state->pending_fds[i] == fd) + return (i); + } + + return (-1); +} + +/* + * Helper function to invoke port_associate for the given fd and mask. + */ +static int aeApiAssociate(const char *where, int portfd, int fd, int mask) { + int events = 0; + int rv, err; + + if (mask & AE_READABLE) + events |= POLLIN; + if (mask & AE_WRITABLE) + events |= POLLOUT; + + if (evport_debug) + fprintf(stderr, "%s: port_associate(%d, 0x%x) = ", where, fd, events); + + rv = port_associate(portfd, PORT_SOURCE_FD, fd, events, + (void *)(uintptr_t)mask); + err = errno; + + if (evport_debug) + fprintf(stderr, "%d (%s)\n", rv, rv == 0 ? 
"no error" : strerror(err)); + + if (rv == -1) { + fprintf(stderr, "%s: port_associate: %s\n", where, strerror(err)); + + if (err == EAGAIN) + fprintf(stderr, "aeApiAssociate: event port limit exceeded."); + } + + return rv; +} + +static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + int fullmask, pfd; + + if (evport_debug) + fprintf(stderr, "aeApiAddEvent: fd %d mask 0x%x\n", fd, mask); + + /* + * Since port_associate's "events" argument replaces any existing events, we + * must be sure to include whatever events are already associated when + * we call port_associate() again. + */ + fullmask = mask | eventLoop->events[fd].mask; + pfd = aeApiLookupPending(state, fd); + + if (pfd != -1) { + /* + * This fd was recently returned from aeApiPoll. It should be safe to + * assume that the consumer has processed that poll event, but we play + * it safer by simply updating pending_mask. The fd will be + * re-associated as usual when aeApiPoll is called again. + */ + if (evport_debug) + fprintf(stderr, "aeApiAddEvent: adding to pending fd %d\n", fd); + state->pending_masks[pfd] |= fullmask; + return 0; + } + + return (aeApiAssociate("aeApiAddEvent", state->portfd, fd, fullmask)); +} + +static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + int fullmask, pfd; + + if (evport_debug) + fprintf(stderr, "del fd %d mask 0x%x\n", fd, mask); + + pfd = aeApiLookupPending(state, fd); + + if (pfd != -1) { + if (evport_debug) + fprintf(stderr, "deleting event from pending fd %d\n", fd); + + /* + * This fd was just returned from aeApiPoll, so it's not currently + * associated with the port. All we need to do is update + * pending_mask appropriately. + */ + state->pending_masks[pfd] &= ~mask; + + if (state->pending_masks[pfd] == AE_NONE) + state->pending_fds[pfd] = -1; + + return; + } + + /* + * The fd is currently associated with the port. 
Like with the add case + * above, we must look at the full mask for the file descriptor before + * updating that association. We don't have a good way of knowing what the + * events are without looking into the eventLoop state directly. We rely on + * the fact that our caller has already updated the mask in the eventLoop. + */ + + /* We always remove the specified events from the current mask, + * regardless of whether eventLoop->events[fd].mask has been updated yet. */ + fullmask = eventLoop->events[fd].mask & ~mask; + if (fullmask == AE_NONE) { + /* + * We're removing *all* events, so use port_dissociate to remove the + * association completely. Failure here indicates a bug. + */ + if (evport_debug) + fprintf(stderr, "aeApiDelEvent: port_dissociate(%d)\n", fd); + + if (port_dissociate(state->portfd, PORT_SOURCE_FD, fd) != 0) { + perror("aeApiDelEvent: port_dissociate"); + abort(); /* will not return */ + } + } else if (aeApiAssociate("aeApiDelEvent", state->portfd, fd, + fullmask) != 0) { + /* + * ENOMEM is a potentially transient condition, but the kernel won't + * generally return it unless things are really bad. EAGAIN indicates + * we've reached a resource limit, for which it doesn't make sense to + * retry (counter-intuitively). All other errors indicate a bug. In any + * of these cases, the best we can do is to abort. + */ + abort(); /* will not return */ + } +} + +static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) { + aeApiState *state = eventLoop->apidata; + struct timespec timeout, *tsp; + uint_t mask, i; + uint_t nevents; + port_event_t event[MAX_EVENT_BATCHSZ]; + + /* + * If we've returned fd events before, we must re-associate them with the + * port now, before calling port_get(). See the block comment at the top of + * this file for an explanation of why. + */ + for (i = 0; i < state->npending; i++) { + if (state->pending_fds[i] == -1) + /* This fd has since been deleted. 
*/ + continue; + + if (aeApiAssociate("aeApiPoll", state->portfd, + state->pending_fds[i], state->pending_masks[i]) != 0) { + /* See aeApiDelEvent for why this case is fatal. */ + abort(); + } + + state->pending_masks[i] = AE_NONE; + state->pending_fds[i] = -1; + } + + state->npending = 0; + + if (tvp != NULL) { + timeout.tv_sec = tvp->tv_sec; + timeout.tv_nsec = tvp->tv_usec * 1000; + tsp = &timeout; + } else { + tsp = NULL; + } + + /* + * port_getn can return with errno == ETIME having returned some events (!). + * So if we get ETIME, we check nevents, too. + */ + nevents = 1; + if (port_getn(state->portfd, event, MAX_EVENT_BATCHSZ, &nevents, + tsp) == -1 && (errno != ETIME || nevents == 0)) { + if (errno == ETIME || errno == EINTR) + return 0; + + /* Any other error indicates a bug. */ + panic("aeApiPoll: port_getn, %s", strerror(errno)); + } + + state->npending = nevents; + + for (i = 0; i < nevents; i++) { + mask = 0; + if (event[i].portev_events & POLLIN) + mask |= AE_READABLE; + if (event[i].portev_events & POLLOUT) + mask |= AE_WRITABLE; + + eventLoop->fired[i].fd = event[i].portev_object; + eventLoop->fired[i].mask = mask; + + if (evport_debug) + fprintf(stderr, "aeApiPoll: fd %d mask 0x%x\n", + (int)event[i].portev_object, mask); + + state->pending_fds[i] = event[i].portev_object; + state->pending_masks[i] = (uintptr_t)event[i].portev_user; + } + + return nevents; +} + +static char *aeApiName(void) { + return "evport"; +} diff --git a/examples/redis-unstable/src/ae_kqueue.c b/examples/redis-unstable/src/ae_kqueue.c new file mode 100644 index 0000000..ec10a5e --- /dev/null +++ b/examples/redis-unstable/src/ae_kqueue.c @@ -0,0 +1,183 @@ +/* Kqueue(2)-based ae.c module + * + * Copyright (C) 2009 Harish Mallipeddi - harish.mallipeddi@gmail.com + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#include +#include +#include + +typedef struct aeApiState { + int kqfd; + struct kevent *events; + + /* Events mask for merge read and write event. + * To reduce memory consumption, we use 2 bits to store the mask + * of an event, so that 1 byte will store the mask of 4 events. 
*/ + char *eventsMask; +} aeApiState; + +#define EVENT_MASK_MALLOC_SIZE(sz) (((sz) + 3) / 4) +#define EVENT_MASK_OFFSET(fd) ((fd) % 4 * 2) +#define EVENT_MASK_ENCODE(fd, mask) (((mask) & 0x3) << EVENT_MASK_OFFSET(fd)) + +static inline int getEventMask(const char *eventsMask, int fd) { + return (eventsMask[fd/4] >> EVENT_MASK_OFFSET(fd)) & 0x3; +} + +static inline void addEventMask(char *eventsMask, int fd, int mask) { + eventsMask[fd/4] |= EVENT_MASK_ENCODE(fd, mask); +} + +static inline void resetEventMask(char *eventsMask, int fd) { + eventsMask[fd/4] &= ~EVENT_MASK_ENCODE(fd, 0x3); +} + +static int aeApiCreate(aeEventLoop *eventLoop) { + aeApiState *state = zmalloc(sizeof(aeApiState)); + + if (!state) return -1; + state->events = zmalloc(sizeof(struct kevent)*eventLoop->setsize); + if (!state->events) { + zfree(state); + return -1; + } + state->kqfd = kqueue(); + if (state->kqfd == -1) { + zfree(state->events); + zfree(state); + return -1; + } + anetCloexec(state->kqfd); + state->eventsMask = zmalloc(EVENT_MASK_MALLOC_SIZE(eventLoop->setsize)); + memset(state->eventsMask, 0, EVENT_MASK_MALLOC_SIZE(eventLoop->setsize)); + eventLoop->apidata = state; + return 0; +} + +static int aeApiResize(aeEventLoop *eventLoop, int setsize) { + aeApiState *state = eventLoop->apidata; + + state->events = zrealloc(state->events, sizeof(struct kevent)*setsize); + state->eventsMask = zrealloc(state->eventsMask, EVENT_MASK_MALLOC_SIZE(setsize)); + memset(state->eventsMask, 0, EVENT_MASK_MALLOC_SIZE(setsize)); + return 0; +} + +static void aeApiFree(aeEventLoop *eventLoop) { + aeApiState *state = eventLoop->apidata; + + close(state->kqfd); + zfree(state->events); + zfree(state->eventsMask); + zfree(state); +} + +static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + struct kevent evs[2]; + int nch = 0; + + if (mask & AE_READABLE) EV_SET(evs + nch++, fd, EVFILT_READ, EV_ADD, 0, 0, NULL); + if (mask & AE_WRITABLE) EV_SET(evs + 
nch++, fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL); + + return kevent(state->kqfd, evs, nch, NULL, 0, NULL); +} + +static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + struct kevent evs[2]; + int nch = 0; + + if (mask & AE_READABLE) EV_SET(evs + nch++, fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); + if (mask & AE_WRITABLE) EV_SET(evs + nch++, fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL); + + kevent(state->kqfd, evs, nch, NULL, 0, NULL); +} + +static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) { + aeApiState *state = eventLoop->apidata; + int retval, numevents = 0; + + if (tvp != NULL) { + struct timespec timeout; + timeout.tv_sec = tvp->tv_sec; + timeout.tv_nsec = tvp->tv_usec * 1000; + retval = kevent(state->kqfd, NULL, 0, state->events, eventLoop->setsize, + &timeout); + } else { + retval = kevent(state->kqfd, NULL, 0, state->events, eventLoop->setsize, + NULL); + } + + if (retval > 0) { + int j; + + /* Normally we execute the read event first and then the write event. + * When the barrier is set, we will do it reverse. + * + * However, under kqueue, read and write events would be separate + * events, which would make it impossible to control the order of + * reads and writes. So we store the event's mask we've got and merge + * the same fd events later. */ + for (j = 0; j < retval; j++) { + struct kevent *e = state->events+j; + int fd = e->ident; + int mask = 0; + + if (e->filter == EVFILT_READ) mask = AE_READABLE; + else if (e->filter == EVFILT_WRITE) mask = AE_WRITABLE; + addEventMask(state->eventsMask, fd, mask); + } + + /* Re-traversal to merge read and write events, and set the fd's mask to + * 0 so that events are not added again when the fd is encountered again. 
*/ + numevents = 0; + for (j = 0; j < retval; j++) { + struct kevent *e = state->events+j; + int fd = e->ident; + int mask = getEventMask(state->eventsMask, fd); + + if (mask) { + eventLoop->fired[numevents].fd = fd; + eventLoop->fired[numevents].mask = mask; + resetEventMask(state->eventsMask, fd); + numevents++; + } + } + } else if (retval == -1 && errno != EINTR) { + panic("aeApiPoll: kevent, %s", strerror(errno)); + } + + return numevents; +} + +static char *aeApiName(void) { + return "kqueue"; +} diff --git a/examples/redis-unstable/src/ae_select.c b/examples/redis-unstable/src/ae_select.c new file mode 100644 index 0000000..208cc32 --- /dev/null +++ b/examples/redis-unstable/src/ae_select.c @@ -0,0 +1,90 @@ +/* Select()-based ae.c module. + * + * Copyright (c) 2009-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + + +#include +#include + +typedef struct aeApiState { + fd_set rfds, wfds; + /* We need to have a copy of the fd sets as it's not safe to reuse + * FD sets after select(). */ + fd_set _rfds, _wfds; +} aeApiState; + +static int aeApiCreate(aeEventLoop *eventLoop) { + aeApiState *state = zmalloc(sizeof(aeApiState)); + + if (!state) return -1; + FD_ZERO(&state->rfds); + FD_ZERO(&state->wfds); + eventLoop->apidata = state; + return 0; +} + +static int aeApiResize(aeEventLoop *eventLoop, int setsize) { + AE_NOTUSED(eventLoop); + /* Just ensure we have enough room in the fd_set type. 
*/ + if (setsize >= FD_SETSIZE) return -1; + return 0; +} + +static void aeApiFree(aeEventLoop *eventLoop) { + zfree(eventLoop->apidata); +} + +static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + + if (mask & AE_READABLE) FD_SET(fd,&state->rfds); + if (mask & AE_WRITABLE) FD_SET(fd,&state->wfds); + return 0; +} + +static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + + if (mask & AE_READABLE) FD_CLR(fd,&state->rfds); + if (mask & AE_WRITABLE) FD_CLR(fd,&state->wfds); +} + +static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) { + aeApiState *state = eventLoop->apidata; + int retval, j, numevents = 0; + + memcpy(&state->_rfds,&state->rfds,sizeof(fd_set)); + memcpy(&state->_wfds,&state->wfds,sizeof(fd_set)); + + retval = select(eventLoop->maxfd+1, + &state->_rfds,&state->_wfds,NULL,tvp); + if (retval > 0) { + for (j = 0; j <= eventLoop->maxfd; j++) { + int mask = 0; + aeFileEvent *fe = &eventLoop->events[j]; + + if (fe->mask == AE_NONE) continue; + if (fe->mask & AE_READABLE && FD_ISSET(j,&state->_rfds)) + mask |= AE_READABLE; + if (fe->mask & AE_WRITABLE && FD_ISSET(j,&state->_wfds)) + mask |= AE_WRITABLE; + eventLoop->fired[numevents].fd = j; + eventLoop->fired[numevents].mask = mask; + numevents++; + } + } else if (retval == -1 && errno != EINTR) { + panic("aeApiPoll: select, %s", strerror(errno)); + } + + return numevents; +} + +static char *aeApiName(void) { + return "select"; +} diff --git a/examples/redis-unstable/src/anet.c b/examples/redis-unstable/src/anet.c new file mode 100644 index 0000000..8b7b91e --- /dev/null +++ b/examples/redis-unstable/src/anet.c @@ -0,0 +1,812 @@ +/* anet.c -- Basic TCP socket stuff made a bit less boring + * + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. 
 *
 * Licensed under your choice of (a) the Redis Source Available License 2.0
 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
 * GNU Affero General Public License v3 (AGPLv3).
 */

#include "fmacros.h"

/* NOTE(review): system header names were stripped during extraction; the
 * bare includes below are preserved as found — restore from upstream
 * before compiling. */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "anet.h"
#include "config.h"
#include "util.h"

#define UNUSED(x) (void)(x)

/* printf-style formatting of an error message into 'err', an
 * ANET_ERR_LEN-sized buffer supplied by the caller. A NULL 'err'
 * silently disables error reporting. */
static void anetSetError(char *err, const char *fmt, ...)
{
    va_list ap;

    if (!err) return;
    va_start(ap, fmt);
    vsnprintf(err, ANET_ERR_LEN, fmt, ap);
    va_end(ap);
}

/* Return the pending socket error (SO_ERROR) for 'fd', or errno if the
 * getsockopt() query itself fails. */
int anetGetError(int fd) {
    int sockerr = 0;
    socklen_t errlen = sizeof(sockerr);

    if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &sockerr, &errlen) == -1)
        sockerr = errno;
    return sockerr;
}

/* Toggle O_NONBLOCK on 'fd'. Returns ANET_OK / ANET_ERR, writing a
 * description into 'err' on failure. */
int anetSetBlock(char *err, int fd, int non_block) {
    int flags;

    /* Set the socket blocking (if non_block is zero) or non-blocking.
     * Note that fcntl(2) for F_GETFL and F_SETFL can't be
     * interrupted by a signal. */
    if ((flags = fcntl(fd, F_GETFL)) == -1) {
        anetSetError(err, "fcntl(F_GETFL): %s", strerror(errno));
        return ANET_ERR;
    }

    /* Check if this flag has been set or unset, if so,
     * then there is no need to call fcntl to set/unset it again. */
    if (!!(flags & O_NONBLOCK) == !!non_block)
        return ANET_OK;

    if (non_block)
        flags |= O_NONBLOCK;
    else
        flags &= ~O_NONBLOCK;

    if (fcntl(fd, F_SETFL, flags) == -1) {
        anetSetError(err, "fcntl(F_SETFL,O_NONBLOCK): %s", strerror(errno));
        return ANET_ERR;
    }
    return ANET_OK;
}

/* Convenience wrapper: make 'fd' non-blocking. */
int anetNonBlock(char *err, int fd) {
    return anetSetBlock(err,fd,1);
}

/* Convenience wrapper: make 'fd' blocking. */
int anetBlock(char *err, int fd) {
    return anetSetBlock(err,fd,0);
}

/* Enable the FD_CLOEXEC on the given fd to avoid fd leaks.
 * This function should be invoked for fd's on specific places
 * where fork + execve system calls are called.
 */
/* Retries on EINTR; returns the fcntl() result (-1 on error) and is a
 * no-op if FD_CLOEXEC is already set. */
int anetCloexec(int fd) {
    int r;
    int flags;

    do {
        r = fcntl(fd, F_GETFD);
    } while (r == -1 && errno == EINTR);

    if (r == -1 || (r & FD_CLOEXEC))
        return r;

    flags = r | FD_CLOEXEC;

    do {
        r = fcntl(fd, F_SETFD, flags);
    } while (r == -1 && errno == EINTR);

    return r;
}

/* Enable TCP keep-alive mechanism to detect dead peers,
 * TCP_KEEPIDLE, TCP_KEEPINTVL and TCP_KEEPCNT will be set accordingly.
 * 'interval' is the idle time in seconds before the first probe.
 * Returns ANET_OK / ANET_ERR. */
int anetKeepAlive(char *err, int fd, int interval)
{
    int enabled = 1;
    if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &enabled, sizeof(enabled)))
    {
        anetSetError(err, "setsockopt SO_KEEPALIVE: %s", strerror(errno));
        return ANET_ERR;
    }

    int idle;
    int intvl;
    int cnt;

    /* There are platforms that are expected to support the full mechanism of TCP keep-alive,
     * we want the compiler to emit warnings of unused variables if the preprocessor directives
     * somehow fail, and other than those platforms, just omit these warnings if they happen.
     */
#if !(defined(_AIX) || defined(__APPLE__) || defined(__DragonFly__) || \
      defined(__FreeBSD__) || defined(__illumos__) || defined(__linux__) || \
      defined(__NetBSD__) || defined(__sun))
    UNUSED(interval);
    UNUSED(idle);
    UNUSED(intvl);
    UNUSED(cnt);
#endif

#ifdef __sun
    /* The implementation of TCP keep-alive on Solaris/SmartOS is a bit unusual
     * compared to other Unix-like systems.
     * Thus, we need to specialize it on Solaris.
     *
     * There are two keep-alive mechanisms on Solaris:
     * - By default, the first keep-alive probe is sent out after a TCP connection is idle for two hours.
     * If the peer does not respond to the probe within eight minutes, the TCP connection is aborted.
     * You can alter the interval for sending out the first probe using the socket option TCP_KEEPALIVE_THRESHOLD
     * in milliseconds or TCP_KEEPIDLE in seconds.
     * The system default is controlled by the TCP ndd parameter tcp_keepalive_interval. The minimum value is ten seconds.
     * The maximum is ten days, while the default is two hours. If you receive no response to the probe,
     * you can use the TCP_KEEPALIVE_ABORT_THRESHOLD socket option to change the time threshold for aborting a TCP connection.
     * The option value is an unsigned integer in milliseconds. The value zero indicates that TCP should never time out and
     * abort the connection when probing. The system default is controlled by the TCP ndd parameter tcp_keepalive_abort_interval.
     * The default is eight minutes.
     *
     * - The second implementation is activated if socket option TCP_KEEPINTVL and/or TCP_KEEPCNT are set.
     * The time between each consequent probes is set by TCP_KEEPINTVL in seconds.
     * The minimum value is ten seconds. The maximum is ten days, while the default is two hours.
     * The TCP connection will be aborted after certain amount of probes, which is set by TCP_KEEPCNT, without receiving response.
     */

    idle = interval;
    if (idle < 10) idle = 10; // kernel expects at least 10 seconds
    if (idle > 10*24*60*60) idle = 10*24*60*60; // kernel expects at most 10 days

    /* `TCP_KEEPIDLE`, `TCP_KEEPINTVL`, and `TCP_KEEPCNT` were not available on Solaris
     * until version 11.4, but let's take a chance here. */
#if defined(TCP_KEEPIDLE) && defined(TCP_KEEPINTVL) && defined(TCP_KEEPCNT)
    if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle))) {
        anetSetError(err, "setsockopt TCP_KEEPIDLE: %s\n", strerror(errno));
        return ANET_ERR;
    }

    intvl = idle/3;
    if (intvl < 10) intvl = 10; /* kernel expects at least 10 seconds */
    if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl))) {
        anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno));
        return ANET_ERR;
    }

    cnt = 3;
    if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt))) {
        anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno));
        return ANET_ERR;
    }
#else
    /* Fall back to the first implementation of tcp-alive mechanism for older Solaris,
     * simulate the tcp-alive mechanism on other platforms via `TCP_KEEPALIVE_THRESHOLD` + `TCP_KEEPALIVE_ABORT_THRESHOLD`.
     */
    idle *= 1000; // kernel expects milliseconds
    if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE_THRESHOLD, &idle, sizeof(idle))) {
        anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno));
        return ANET_ERR;
    }

    /* Note that the consequent probes will not be sent at equal intervals on Solaris,
     * but will be sent using the exponential backoff algorithm. */
    int time_to_abort = idle;
    if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE_ABORT_THRESHOLD, &time_to_abort, sizeof(time_to_abort))) {
        anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno));
        return ANET_ERR;
    }
#endif

    /* Solaris is fully configured at this point; the generic path below is
     * compiled out for __sun. */
    return ANET_OK;

#endif

#ifdef TCP_KEEPIDLE
    /* Default settings are more or less garbage, with the keepalive time
     * set to 7200 by default on Linux and other Unix-like systems.
     * Modify settings to make the feature actually useful. */

    /* Send first probe after interval. */
    idle = interval;
    if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle))) {
        anetSetError(err, "setsockopt TCP_KEEPIDLE: %s\n", strerror(errno));
        return ANET_ERR;
    }
#elif defined(TCP_KEEPALIVE)
    /* Darwin/macOS uses TCP_KEEPALIVE in place of TCP_KEEPIDLE. */
    idle = interval;
    if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, &idle, sizeof(idle))) {
        anetSetError(err, "setsockopt TCP_KEEPALIVE: %s\n", strerror(errno));
        return ANET_ERR;
    }
#endif

#ifdef TCP_KEEPINTVL
    /* Send next probes after the specified interval. Note that we set the
     * delay as interval / 3, as we send three probes before detecting
     * an error (see the next setsockopt call). */
    intvl = interval/3;
    if (intvl == 0) intvl = 1;
    if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl))) {
        anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno));
        return ANET_ERR;
    }
#endif

#ifdef TCP_KEEPCNT
    /* Consider the socket in error state after three we send three ACK
     * probes without getting a reply. */
    cnt = 3;
    if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt))) {
        anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno));
        return ANET_ERR;
    }
#endif

    return ANET_OK;
}

/* Set or clear TCP_NODELAY (Nagle's algorithm off/on). */
static int anetSetTcpNoDelay(char *err, int fd, int val)
{
    if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)) == -1)
    {
        anetSetError(err, "setsockopt TCP_NODELAY: %s", strerror(errno));
        return ANET_ERR;
    }
    return ANET_OK;
}

int anetEnableTcpNoDelay(char *err, int fd)
{
    return anetSetTcpNoDelay(err, fd, 1);
}

int anetDisableTcpNoDelay(char *err, int fd)
{
    return anetSetTcpNoDelay(err, fd, 0);
}

/* Set the socket send timeout (SO_SNDTIMEO socket option) to the specified
 * number of milliseconds, or disable it if the 'ms' argument is zero.
 */
int anetSendTimeout(char *err, int fd, long long ms) {
    struct timeval tv;

    tv.tv_sec = ms/1000;
    tv.tv_usec = (ms%1000)*1000;
    if (setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)) == -1) {
        anetSetError(err, "setsockopt SO_SNDTIMEO: %s", strerror(errno));
        return ANET_ERR;
    }
    return ANET_OK;
}

/* Set the socket receive timeout (SO_RCVTIMEO socket option) to the specified
 * number of milliseconds, or disable it if the 'ms' argument is zero. */
int anetRecvTimeout(char *err, int fd, long long ms) {
    struct timeval tv;

    tv.tv_sec = ms/1000;
    tv.tv_usec = (ms%1000)*1000;
    if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1) {
        anetSetError(err, "setsockopt SO_RCVTIMEO: %s", strerror(errno));
        return ANET_ERR;
    }
    return ANET_OK;
}

/* Resolve the hostname "host" and set the string representation of the
 * IP address into the buffer pointed by "ipbuf".
 *
 * If flags is set to ANET_IP_ONLY the function only resolves hostnames
 * that are actually already IPv4 or IPv6 addresses. This turns the function
 * into a validating / normalizing function.
 *
 * If the flag ANET_PREFER_IPV4 is set, IPv4 is preferred over IPv6.
 * If the flag ANET_PREFER_IPV6 is set, IPv6 is preferred over IPv4.
 * */
int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len,
                       int flags)
{
    struct addrinfo hints, *info;
    int rv;

    memset(&hints,0,sizeof(hints));
    if (flags & ANET_IP_ONLY) hints.ai_flags = AI_NUMERICHOST;
    hints.ai_family = AF_UNSPEC;
    if (flags & ANET_PREFER_IPV4 && !(flags & ANET_PREFER_IPV6)) {
        hints.ai_family = AF_INET;
    } else if (flags & ANET_PREFER_IPV6 && !(flags & ANET_PREFER_IPV4)) {
        hints.ai_family = AF_INET6;
    }
    hints.ai_socktype = SOCK_STREAM;  /* specify socktype to avoid dups */

    rv = getaddrinfo(host, NULL, &hints, &info);
    if (rv != 0 && hints.ai_family != AF_UNSPEC) {
        /* Try the other IP version. */
        hints.ai_family = (hints.ai_family == AF_INET) ? AF_INET6 : AF_INET;
        rv = getaddrinfo(host, NULL, &hints, &info);
    }
    if (rv != 0) {
        anetSetError(err, "%s", gai_strerror(rv));
        return ANET_ERR;
    }
    /* Only the first result returned by getaddrinfo() is used. */
    if (info->ai_family == AF_INET) {
        struct sockaddr_in *sa = (struct sockaddr_in *)info->ai_addr;
        inet_ntop(AF_INET, &(sa->sin_addr), ipbuf, ipbuf_len);
    } else {
        struct sockaddr_in6 *sa = (struct sockaddr_in6 *)info->ai_addr;
        inet_ntop(AF_INET6, &(sa->sin6_addr), ipbuf, ipbuf_len);
    }

    freeaddrinfo(info);
    return ANET_OK;
}

static int anetSetReuseAddr(char *err, int fd) {
    int yes = 1;
    /* Make sure connection-intensive things like the redis benchmark
     * will be able to close/open sockets a zillion of times */
    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) == -1) {
        anetSetError(err, "setsockopt SO_REUSEADDR: %s", strerror(errno));
        return ANET_ERR;
    }
    return ANET_OK;
}

/* Create a SOCK_STREAM socket in 'domain' with SO_REUSEADDR already set.
 * Returns the fd, or ANET_ERR on failure (the socket is closed). */
static int anetCreateSocket(char *err, int domain) {
    int s;
    if ((s = socket(domain, SOCK_STREAM, 0)) == -1) {
        anetSetError(err, "creating socket: %s", strerror(errno));
        return ANET_ERR;
    }

    /* Make sure connection-intensive things like the redis benchmark
     * will be able to close/open sockets a zillion of times */
    if (anetSetReuseAddr(err,s) == ANET_ERR) {
        close(s);
        return ANET_ERR;
    }
    return s;
}

#define ANET_CONNECT_NONE 0
#define ANET_CONNECT_NONBLOCK 1
#define ANET_CONNECT_BE_BINDING 2 /* Best effort binding.
 */
/* Core TCP connect helper: resolves 'addr:port', optionally binds to
 * 'source_addr', and honours the ANET_CONNECT_* flags. Returns the
 * connected (or in-progress, when non-blocking) fd, or ANET_ERR. */
static int anetTcpGenericConnect(char *err, const char *addr, int port,
                                 const char *source_addr, int flags)
{
    int s = ANET_ERR, rv;
    char portstr[6];  /* strlen("65535") + 1; */
    struct addrinfo hints, *servinfo, *bservinfo, *p, *b;

    snprintf(portstr,sizeof(portstr),"%d",port);
    memset(&hints,0,sizeof(hints));
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;

    if ((rv = getaddrinfo(addr,portstr,&hints,&servinfo)) != 0) {
        anetSetError(err, "%s", gai_strerror(rv));
        return ANET_ERR;
    }
    for (p = servinfo; p != NULL; p = p->ai_next) {
        /* Try to create the socket and to connect it.
         * If we fail in the socket() call, or on connect(), we retry with
         * the next entry in servinfo. */
        if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1)
            continue;
        if (anetSetReuseAddr(err,s) == ANET_ERR) goto error;
        if (flags & ANET_CONNECT_NONBLOCK && anetNonBlock(err,s) != ANET_OK)
            goto error;
        if (source_addr) {
            int bound = 0;
            /* Using getaddrinfo saves us from self-determining IPv4 vs IPv6 */
            if ((rv = getaddrinfo(source_addr, NULL, &hints, &bservinfo)) != 0)
            {
                anetSetError(err, "%s", gai_strerror(rv));
                goto error;
            }
            for (b = bservinfo; b != NULL; b = b->ai_next) {
                if (bind(s,b->ai_addr,b->ai_addrlen) != -1) {
                    bound = 1;
                    break;
                }
            }
            freeaddrinfo(bservinfo);
            if (!bound) {
                anetSetError(err, "bind: %s", strerror(errno));
                goto error;
            }
        }
        if (connect(s,p->ai_addr,p->ai_addrlen) == -1) {
            /* If the socket is non-blocking, it is ok for connect() to
             * return an EINPROGRESS error here. */
            if (errno == EINPROGRESS && flags & ANET_CONNECT_NONBLOCK)
                goto end;
            close(s);
            s = ANET_ERR;
            continue;
        }

        /* If we ended an iteration of the for loop without errors, we
         * have a connected socket. Let's return to the caller. */
        goto end;
    }
    if (p == NULL)
        anetSetError(err, "creating socket: %s", strerror(errno));

error:
    if (s != ANET_ERR) {
        close(s);
        s = ANET_ERR;
    }

end:
    freeaddrinfo(servinfo);

    /* Handle best effort binding: if a binding address was used, but it is
     * not possible to create a socket, try again without a binding address. */
    if (s == ANET_ERR && source_addr && (flags & ANET_CONNECT_BE_BINDING)) {
        return anetTcpGenericConnect(err,addr,port,NULL,flags);
    } else {
        return s;
    }
}

int anetTcpNonBlockConnect(char *err, const char *addr, int port)
{
    return anetTcpGenericConnect(err,addr,port,NULL,ANET_CONNECT_NONBLOCK);
}

int anetTcpNonBlockBestEffortBindConnect(char *err, const char *addr, int port,
                                         const char *source_addr)
{
    return anetTcpGenericConnect(err,addr,port,source_addr,
            ANET_CONNECT_NONBLOCK|ANET_CONNECT_BE_BINDING);
}

/* Connect to the Unix domain socket at 'path'. With
 * ANET_CONNECT_NONBLOCK an in-progress fd may be returned. */
int anetUnixGenericConnect(char *err, const char *path, int flags)
{
    int s;
    struct sockaddr_un sa;

    if ((s = anetCreateSocket(err,AF_LOCAL)) == ANET_ERR)
        return ANET_ERR;

    sa.sun_family = AF_LOCAL;
    redis_strlcpy(sa.sun_path,path,sizeof(sa.sun_path));
    if (flags & ANET_CONNECT_NONBLOCK) {
        if (anetNonBlock(err,s) != ANET_OK) {
            close(s);
            return ANET_ERR;
        }
    }
    if (connect(s,(struct sockaddr*)&sa,sizeof(sa)) == -1) {
        if (errno == EINPROGRESS &&
            flags & ANET_CONNECT_NONBLOCK)
            return s;

        anetSetError(err, "connect: %s", strerror(errno));
        close(s);
        return ANET_ERR;
    }
    return s;
}

/* bind() + listen() on an already-created socket. For Unix sockets a
 * non-zero 'perm' is chmod()ed onto the socket file. On failure the
 * socket is closed and ANET_ERR returned. */
static int anetListen(char *err, int s, struct sockaddr *sa, socklen_t len, int backlog, mode_t perm) {
    if (bind(s,sa,len) == -1) {
        anetSetError(err, "bind: %s", strerror(errno));
        close(s);
        return ANET_ERR;
    }

    if (sa->sa_family == AF_LOCAL && perm)
        chmod(((struct sockaddr_un *) sa)->sun_path, perm);

    if (listen(s, backlog) == -1) {
        anetSetError(err, "listen: %s", strerror(errno));
        close(s);
        return ANET_ERR;
    }
    return ANET_OK;
}

static int
anetV6Only(char *err, int s) {
    /* Restrict an AF_INET6 socket to IPv6 only (no v4-mapped addresses). */
    int yes = 1;
    if (setsockopt(s,IPPROTO_IPV6,IPV6_V6ONLY,&yes,sizeof(yes)) == -1) {
        anetSetError(err, "setsockopt: %s", strerror(errno));
        return ANET_ERR;
    }
    return ANET_OK;
}

/* Create a listening TCP socket on 'port' bound to 'bindaddr' (NULL or
 * "*" / "::*" = all interfaces) for address family 'af'. Returns the
 * listening fd or ANET_ERR. */
static int _anetTcpServer(char *err, int port, char *bindaddr, int af, int backlog)
{
    int s = -1, rv;
    char _port[6];  /* strlen("65535") */
    struct addrinfo hints, *servinfo, *p;

    snprintf(_port,6,"%d",port);
    memset(&hints,0,sizeof(hints));
    hints.ai_family = af;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_flags = AI_PASSIVE;    /* No effect if bindaddr != NULL */
    if (bindaddr && !strcmp("*", bindaddr))
        bindaddr = NULL;
    if (af == AF_INET6 && bindaddr && !strcmp("::*", bindaddr))
        bindaddr = NULL;

    if ((rv = getaddrinfo(bindaddr,_port,&hints,&servinfo)) != 0) {
        anetSetError(err, "%s", gai_strerror(rv));
        return ANET_ERR;
    }
    for (p = servinfo; p != NULL; p = p->ai_next) {
        if ((s = socket(p->ai_family,p->ai_socktype,p->ai_protocol)) == -1)
            continue;

        if (af == AF_INET6 && anetV6Only(err,s) == ANET_ERR) goto error;
        if (anetSetReuseAddr(err,s) == ANET_ERR) goto error;
        /* anetListen() closes the socket itself on failure, so on error we
         * only flag s and jump straight to 'end'. */
        if (anetListen(err,s,p->ai_addr,p->ai_addrlen,backlog,0) == ANET_ERR) s = ANET_ERR;
        goto end;
    }
    if (p == NULL) {
        anetSetError(err, "unable to bind socket, errno: %d", errno);
        goto error;
    }

error:
    if (s != -1) close(s);
    s = ANET_ERR;
end:
    freeaddrinfo(servinfo);
    return s;
}

int anetTcpServer(char *err, int port, char *bindaddr, int backlog)
{
    return _anetTcpServer(err, port, bindaddr, AF_INET, backlog);
}

int anetTcp6Server(char *err, int port, char *bindaddr, int backlog)
{
    return _anetTcpServer(err, port, bindaddr, AF_INET6, backlog);
}

/* Create a listening Unix domain socket at 'path' with file mode 'perm'
 * (0 = leave default). Returns the listening fd or ANET_ERR. */
int anetUnixServer(char *err, char *path, mode_t perm, int backlog)
{
    int s;
    struct sockaddr_un sa;

    if (strlen(path) > sizeof(sa.sun_path)-1) {
        anetSetError(err,"unix socket path too long (%zu), must be under %zu", strlen(path), sizeof(sa.sun_path));
        return ANET_ERR;
    }
    if ((s = anetCreateSocket(err,AF_LOCAL)) == ANET_ERR)
        return ANET_ERR;

    memset(&sa,0,sizeof(sa));
    sa.sun_family = AF_LOCAL;
    redis_strlcpy(sa.sun_path,path,sizeof(sa.sun_path));
    /* anetListen() closes the socket itself on failure. */
    if (anetListen(err,s,(struct sockaddr*)&sa,sizeof(sa),backlog,perm) == ANET_ERR)
        return ANET_ERR;
    return s;
}

/* Accept a connection and also make sure the socket is non-blocking, and CLOEXEC.
 * returns the new socket FD, or -1 on error. */
static int anetGenericAccept(char *err, int s, struct sockaddr *sa, socklen_t *len) {
    int fd;
    do {
        /* Use the accept4() call on linux to simultaneously accept and
         * set a socket as non-blocking. */
#ifdef HAVE_ACCEPT4
        fd = accept4(s, sa, len, SOCK_NONBLOCK | SOCK_CLOEXEC);
#else
        fd = accept(s,sa,len);
#endif
    } while(fd == -1 && errno == EINTR);
    if (fd == -1) {
        anetSetError(err, "accept: %s", strerror(errno));
        return ANET_ERR;
    }
#ifndef HAVE_ACCEPT4
    /* Without accept4() the two flags must be applied separately. */
    if (anetCloexec(fd) == -1) {
        anetSetError(err, "anetCloexec: %s", strerror(errno));
        close(fd);
        return ANET_ERR;
    }
    if (anetNonBlock(err, fd) != ANET_OK) {
        close(fd);
        return ANET_ERR;
    }
#endif
    return fd;
}

/* Accept a connection and also make sure the socket is non-blocking, and CLOEXEC.
 * returns the new socket FD, or -1 on error.
 */
/* On success, optionally reports the peer IP into 'ip' and port into
 * '*port' (either may be NULL). */
int anetTcpAccept(char *err, int serversock, char *ip, size_t ip_len, int *port) {
    int fd;
    struct sockaddr_storage sa;
    socklen_t salen = sizeof(sa);
    if ((fd = anetGenericAccept(err,serversock,(struct sockaddr*)&sa,&salen)) == ANET_ERR)
        return ANET_ERR;

    if (sa.ss_family == AF_INET) {
        struct sockaddr_in *s = (struct sockaddr_in *)&sa;
        if (ip) inet_ntop(AF_INET,(void*)&(s->sin_addr),ip,ip_len);
        if (port) *port = ntohs(s->sin_port);
    } else {
        struct sockaddr_in6 *s = (struct sockaddr_in6 *)&sa;
        if (ip) inet_ntop(AF_INET6,(void*)&(s->sin6_addr),ip,ip_len);
        if (port) *port = ntohs(s->sin6_port);
    }
    return fd;
}

/* Accept a connection and also make sure the socket is non-blocking, and CLOEXEC.
 * returns the new socket FD, or -1 on error. */
int anetUnixAccept(char *err, int s) {
    int fd;
    struct sockaddr_un sa;
    socklen_t salen = sizeof(sa);
    if ((fd = anetGenericAccept(err,s,(struct sockaddr*)&sa,&salen)) == ANET_ERR)
        return ANET_ERR;

    return fd;
}

/* Fill 'ip'/'port' with the string form of the local (remote==0) or peer
 * (remote==1) address of 'fd'. Unix sockets report "/unixsocket" and
 * port 0. Returns 0 on success, -1 on error (ip becomes "?", port 0). */
int anetFdToString(int fd, char *ip, size_t ip_len, int *port, int remote) {
    struct sockaddr_storage sa;
    socklen_t salen = sizeof(sa);

    if (remote) {
        if (getpeername(fd, (struct sockaddr *)&sa, &salen) == -1) goto error;
    } else {
        if (getsockname(fd, (struct sockaddr *)&sa, &salen) == -1) goto error;
    }

    if (sa.ss_family == AF_INET) {
        struct sockaddr_in *s = (struct sockaddr_in *)&sa;
        if (ip) {
            if (inet_ntop(AF_INET,(void*)&(s->sin_addr),ip,ip_len) == NULL)
                goto error;
        }
        if (port) *port = ntohs(s->sin_port);
    } else if (sa.ss_family == AF_INET6) {
        struct sockaddr_in6 *s = (struct sockaddr_in6 *)&sa;
        if (ip) {
            if (inet_ntop(AF_INET6,(void*)&(s->sin6_addr),ip,ip_len) == NULL)
                goto error;
        }
        if (port) *port = ntohs(s->sin6_port);
    } else if (sa.ss_family == AF_UNIX) {
        if (ip) {
            int res = snprintf(ip, ip_len, "/unixsocket");
            if (res < 0 || (unsigned int) res >= ip_len) goto error;
        }
        if (port) *port = 0;
    } else {
        goto error;
    }
    return 0;

error:
    if (ip) {
        if (ip_len >= 2) {
            ip[0] = '?';
            ip[1] = '\0';
        } else if (ip_len == 1) {
            ip[0] = '\0';
        }
    }
    if (port) *port = 0;
    return -1;
}

/* Create a pipe buffer with given flags for read end and write end.
 * Note that it supports the file flags defined by pipe2() and fcntl(F_SETFL),
 * and one of the use cases is O_CLOEXEC|O_NONBLOCK. */
int anetPipe(int fds[2], int read_flags, int write_flags) {
    int pipe_flags = 0;
#if defined(__linux__) || defined(__FreeBSD__)
    /* When possible, try to leverage pipe2() to apply flags that are common to both ends.
     * There is no harm to set O_CLOEXEC to prevent fd leaks. */
    pipe_flags = O_CLOEXEC | (read_flags & write_flags);
    if (pipe2(fds, pipe_flags)) {
        /* Fail on real failures, and fallback to simple pipe if pipe2 is unsupported. */
        if (errno != ENOSYS && errno != EINVAL)
            return -1;
        pipe_flags = 0;
    } else {
        /* If the flags on both ends are identical, no need to do anything else. */
        if ((O_CLOEXEC | read_flags) == (O_CLOEXEC | write_flags))
            return 0;
        /* Clear the flags which have already been set using pipe2. */
        read_flags &= ~pipe_flags;
        write_flags &= ~pipe_flags;
    }
#endif

    /* When we reach here with pipe_flags of 0, it means pipe2 failed (or was not attempted),
     * so we try to use pipe. Otherwise, we skip and proceed to set specific flags below. */
    if (pipe_flags == 0 && pipe(fds))
        return -1;

    /* File descriptor flags.
     * Currently, only one such flag is defined: FD_CLOEXEC, the close-on-exec flag. */
    if (read_flags & O_CLOEXEC)
        if (fcntl(fds[0], F_SETFD, FD_CLOEXEC))
            goto error;
    if (write_flags & O_CLOEXEC)
        if (fcntl(fds[1], F_SETFD, FD_CLOEXEC))
            goto error;

    /* File status flags after clearing the file descriptor flag O_CLOEXEC. */
    read_flags &= ~O_CLOEXEC;
    if (read_flags)
        if (fcntl(fds[0], F_SETFL, read_flags))
            goto error;
    write_flags &= ~O_CLOEXEC;
    if (write_flags)
        if (fcntl(fds[1], F_SETFL, write_flags))
            goto error;

    return 0;

error:
    close(fds[0]);
    close(fds[1]);
    return -1;
}

/* Tag the socket with a mark id (e.g. Linux SO_MARK) when the platform
 * supports it. */
int anetSetSockMarkId(char *err, int fd, uint32_t id) {
#ifdef HAVE_SOCKOPTMARKID
    if (setsockopt(fd, SOL_SOCKET, SOCKOPTMARKID, (void *)&id, sizeof(id)) == -1) {
        anetSetError(err, "setsockopt: %s", strerror(errno));
        return ANET_ERR;
    }
    return ANET_OK;
#else
    UNUSED(fd);
    UNUSED(id);
    /* NOTE(review): an error string is written yet ANET_OK is returned —
     * presumably the unsupported case is deliberately treated as a no-op;
     * confirm against callers before changing. */
    anetSetError(err,"anetSetSockMarkid unsupported on this platform");
    return ANET_OK;
#endif
}

/* Return 1 if 'filepath' exists and is a FIFO, 0 otherwise. */
int anetIsFifo(char *filepath) {
    struct stat sb;
    if (stat(filepath, &sb) == -1) return 0;
    return S_ISFIFO(sb.st_mode);
}

/* This function must be called after accept4() fails. It returns 1 if 'err'
 * indicates accepted connection faced an error, and it's okay to continue
 * accepting next connection by calling accept4() again. Other errors either
 * indicate programming errors, e.g. calling accept() on a closed fd or indicate
 * a resource limit has been reached, e.g. -EMFILE, open fd limit has been
 * reached. In the latter case, caller might wait until resources are available.
 * See accept4() documentation for details.
 */
int anetAcceptFailureNeedsRetry(int err) {
    /* A connection aborted by the peer before we accepted it is always
     * retryable. */
    if (err == ECONNABORTED)
        return 1;

#if defined(__linux__)
    /* For details, see 'Error Handling' section on
     * https://man7.org/linux/man-pages/man2/accept.2.html */
    if (err == ENETDOWN || err == EPROTO || err == ENOPROTOOPT ||
        err == EHOSTDOWN || err == ENONET || err == EHOSTUNREACH ||
        err == EOPNOTSUPP || err == ENETUNREACH)
    {
        return 1;
    }
#endif
    return 0;
}
diff --git a/examples/redis-unstable/src/anet.h b/examples/redis-unstable/src/anet.h
new file mode 100644
index 0000000..1d3aec9
--- /dev/null
+++ b/examples/redis-unstable/src/anet.h
@@ -0,0 +1,58 @@
/* anet.c -- Basic TCP socket stuff made a bit less boring
 *
 * Copyright (c) 2006-Present, Redis Ltd.
 * All rights reserved.
 *
 * Licensed under your choice of (a) the Redis Source Available License 2.0
 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
 * GNU Affero General Public License v3 (AGPLv3).
 */

#ifndef ANET_H
#define ANET_H

/* NOTE(review): the system header name was stripped during extraction. */
#include

#define ANET_OK 0
#define ANET_ERR -1
/* Size callers must provide for 'err' buffers passed to anet functions. */
#define ANET_ERR_LEN 256

/* Flags used with certain functions. */
#define ANET_NONE 0
#define ANET_IP_ONLY (1<<0)
#define ANET_PREFER_IPV4 (1<<1)
#define ANET_PREFER_IPV6 (1<<2)

#if defined(__sun) || defined(_AIX)
#define AF_LOCAL AF_UNIX
#endif

#ifdef _AIX
#undef ip_len
#endif

int anetTcpNonBlockConnect(char *err, const char *addr, int port);
int anetTcpNonBlockBestEffortBindConnect(char *err, const char *addr, int port, const char *source_addr);
int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len, int flags);
int anetTcpServer(char *err, int port, char *bindaddr, int backlog);
int anetTcp6Server(char *err, int port, char *bindaddr, int backlog);
int anetUnixServer(char *err, char *path, mode_t perm, int backlog);
int anetTcpAccept(char *err, int serversock, char *ip, size_t ip_len, int *port);
int anetUnixAccept(char *err, int serversock);
int anetNonBlock(char *err, int fd);
int anetBlock(char *err, int fd);
int anetCloexec(int fd);
int anetEnableTcpNoDelay(char *err, int fd);
int anetDisableTcpNoDelay(char *err, int fd);
int anetSendTimeout(char *err, int fd, long long ms);
int anetRecvTimeout(char *err, int fd, long long ms);
int anetFdToString(int fd, char *ip, size_t ip_len, int *port, int remote);
int anetKeepAlive(char *err, int fd, int interval);
int anetFormatAddr(char *fmt, size_t fmt_len, char *ip, int port);
int anetPipe(int fds[2], int read_flags, int write_flags);
int anetSetSockMarkId(char *err, int fd, uint32_t id);
int anetGetError(int fd);
int anetIsFifo(char *filepath);
int anetAcceptFailureNeedsRetry(int err);

#endif
diff --git a/examples/redis-unstable/src/aof.c b/examples/redis-unstable/src/aof.c
new file mode 100644
index 0000000..3ace670
--- /dev/null
+++ b/examples/redis-unstable/src/aof.c
@@ -0,0 +1,2921 @@
/*
 * Copyright (c) 2009-Present, Redis Ltd.
 * All rights reserved.
 *
 * Licensed under your choice of (a) the Redis Source Available License 2.0
 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
 * GNU Affero General Public License v3 (AGPLv3).
 */

#include "server.h"
#include "bio.h"
#include "rio.h"
#include "functions.h"
#include "cluster_asm.h"

/* NOTE(review): system header names were stripped during extraction; the
 * bare includes below are preserved as found. */
#include
#include
#include
#include
#include
#include
#include
#include

/* Forward declarations for helpers defined later in this file (or in
 * server.c for freeClientArgv). */
void freeClientArgv(client *c);
off_t getAppendOnlyFileSize(sds filename, int *status);
off_t getBaseAndIncrAppendOnlyFilesSize(aofManifest *am, int *status);
int getBaseAndIncrAppendOnlyFilesNum(aofManifest *am);
int aofFileExist(char *filename);
int rewriteAppendOnlyFile(char *filename);
aofManifest *aofLoadManifestFromFile(sds am_filepath);
void aofManifestFreeAndUpdate(aofManifest *am);
void aof_background_fsync_and_close(int fd);

/* When we call 'startAppendOnly', we will create a temp INCR AOF, and rename
 * it to the real INCR AOF name when the AOFRW is done, so if want to know the
 * accurate start offset of the INCR AOF, we need to record it when we create
 * the temp INCR AOF. This variable is used to record the start offset, and
 * set the start offset of the real INCR AOF when the AOFRW is done. */
static long long tempIncAofStartReplOffset = 0;

/* ----------------------------------------------------------------------------
 * AOF Manifest file implementation.
 *
 * The following code implements the read/write logic of AOF manifest file, which
 * is used to track and manage all AOF files.
 *
 * Append-only files consist of three types:
 *
 * BASE: Represents a Redis snapshot from the time of last AOF rewrite. The manifest
 * file contains at most a single BASE file, which will always be the first file in the
 * list.
 *
 * INCR: Represents all write commands executed by Redis following the last successful
 * AOF rewrite. In some cases it is possible to have several ordered INCR files. For
 * example:
 * - During an on-going AOF rewrite
 * - After an AOF rewrite was aborted/failed, and before the next one succeeded.
 *
 * HISTORY: After a successful rewrite, the previous BASE and INCR become HISTORY files.
 * They will be automatically removed unless garbage collection is disabled.
 *
 * The following is a possible AOF manifest file content:
 *
 * file appendonly.aof.2.base.rdb seq 2 type b
 * file appendonly.aof.1.incr.aof seq 1 type h
 * file appendonly.aof.2.incr.aof seq 2 type h
 * file appendonly.aof.3.incr.aof seq 3 type h
 * file appendonly.aof.4.incr.aof seq 4 type i
 * file appendonly.aof.5.incr.aof seq 5 type i
 * ------------------------------------------------------------------------- */

/* Naming rules. */
#define BASE_FILE_SUFFIX ".base"
#define INCR_FILE_SUFFIX ".incr"
#define RDB_FORMAT_SUFFIX ".rdb"
#define AOF_FORMAT_SUFFIX ".aof"
#define MANIFEST_NAME_SUFFIX ".manifest"
#define TEMP_FILE_NAME_PREFIX "temp-"

/* AOF manifest key. */
#define AOF_MANIFEST_KEY_FILE_NAME "file"
#define AOF_MANIFEST_KEY_FILE_SEQ "seq"
#define AOF_MANIFEST_KEY_FILE_TYPE "type"
#define AOF_MANIFEST_KEY_FILE_STARTOFFSET "startoffset"
#define AOF_MANIFEST_KEY_FILE_ENDOFFSET "endoffset"

/* Create an empty aofInfo. Offsets default to -1, meaning "unknown /
 * not recorded". */
aofInfo *aofInfoCreate(void) {
    aofInfo *ai = zcalloc(sizeof(aofInfo));
    ai->start_offset = -1;
    ai->end_offset = -1;
    return ai;
}

/* Free the aofInfo structure (pointed to by ai) and its embedded file_name. */
void aofInfoFree(aofInfo *ai) {
    serverAssert(ai != NULL);
    if (ai->file_name) sdsfree(ai->file_name);
    zfree(ai);
}

/* Deep copy an aofInfo.
 */
aofInfo *aofInfoDup(aofInfo *orig) {
    serverAssert(orig != NULL);
    aofInfo *ai = aofInfoCreate();
    ai->file_name = sdsdup(orig->file_name);
    ai->file_seq = orig->file_seq;
    ai->file_type = orig->file_type;
    ai->start_offset = orig->start_offset;
    ai->end_offset = orig->end_offset;
    return ai;
}

/* Format aofInfo as a string and it will be a line in the manifest.
 *
 * When update this format, make sure to update redis-check-aof as well. */
sds aofInfoFormat(sds buf, aofInfo *ai) {
    sds filename_repr = NULL;

    /* Quote/escape the filename only when it contains characters that
     * need escaping. */
    if (sdsneedsrepr(ai->file_name))
        filename_repr = sdscatrepr(sdsempty(), ai->file_name, sdslen(ai->file_name));

    sds ret = sdscatprintf(buf, "%s %s %s %lld %s %c",
        AOF_MANIFEST_KEY_FILE_NAME, filename_repr ? filename_repr : ai->file_name,
        AOF_MANIFEST_KEY_FILE_SEQ, ai->file_seq,
        AOF_MANIFEST_KEY_FILE_TYPE, ai->file_type);

    /* Offsets are optional; an end offset is only meaningful when a start
     * offset is present. */
    if (ai->start_offset != -1) {
        ret = sdscatprintf(ret, " %s %lld", AOF_MANIFEST_KEY_FILE_STARTOFFSET, ai->start_offset);
        if (ai->end_offset != -1) {
            ret = sdscatprintf(ret, " %s %lld", AOF_MANIFEST_KEY_FILE_ENDOFFSET, ai->end_offset);
        }
    }

    ret = sdscatlen(ret, "\n", 1);
    sdsfree(filename_repr);

    return ret;
}

/* Method to free AOF list elements. */
void aofListFree(void *item) {
    aofInfo *ai = (aofInfo *)item;
    aofInfoFree(ai);
}

/* Method to duplicate AOF list elements. */
void *aofListDup(void *item) {
    return aofInfoDup(item);
}

/* Create an empty aofManifest, which will be called in `aofLoadManifestFromDisk`.
*/ +aofManifest *aofManifestCreate(void) { + aofManifest *am = zcalloc(sizeof(aofManifest)); + am->incr_aof_list = listCreate(); + am->history_aof_list = listCreate(); + listSetFreeMethod(am->incr_aof_list, aofListFree); + listSetDupMethod(am->incr_aof_list, aofListDup); + listSetFreeMethod(am->history_aof_list, aofListFree); + listSetDupMethod(am->history_aof_list, aofListDup); + return am; +} + +/* Free the aofManifest structure (pointed to by am) and its embedded members. */ +void aofManifestFree(aofManifest *am) { + if (am->base_aof_info) aofInfoFree(am->base_aof_info); + if (am->incr_aof_list) listRelease(am->incr_aof_list); + if (am->history_aof_list) listRelease(am->history_aof_list); + zfree(am); +} + +sds getAofManifestFileName(void) { + return sdscatprintf(sdsempty(), "%s%s", server.aof_filename, + MANIFEST_NAME_SUFFIX); +} + +sds getTempAofManifestFileName(void) { + return sdscatprintf(sdsempty(), "%s%s%s", TEMP_FILE_NAME_PREFIX, + server.aof_filename, MANIFEST_NAME_SUFFIX); +} + +sds appendAofInfoFromList(sds buf, list *aofList) { + listNode *ln; + listIter li; + + listRewind(aofList, &li); + while ((ln = listNext(&li)) != NULL) { + aofInfo *ai = (aofInfo*)ln->value; + buf = aofInfoFormat(buf, ai); + } + + return buf; +} + +/* Returns the string representation of aofManifest pointed to by am. + * + * The string is multiple lines separated by '\n', and each line represents + * an AOF file. + * + * Each line is space delimited and contains 6 fields, as follows: + * "file" [filename] "seq" [sequence] "type" [type] + * + * Where "file", "seq" and "type" are keywords that describe the next value, + * [filename] and [sequence] describe file name and order, and [type] is one + * of 'b' (base), 'h' (history) or 'i' (incr). + * + * The base file, if exists, will always be first, followed by history files, + * and incremental files. + */ +sds getAofManifestAsString(aofManifest *am) { + serverAssert(am != NULL); + + sds buf = sdsempty(); + + /* 1. 
Add BASE File information, it is always at the beginning + * of the manifest file. */ + if (am->base_aof_info) { + buf = aofInfoFormat(buf, am->base_aof_info); + } + + /* 2. Add HISTORY type AOF information. */ + buf = appendAofInfoFromList(buf, am->history_aof_list); + + /* 3. Add INCR type AOF information. */ + buf = appendAofInfoFromList(buf, am->incr_aof_list); + + return buf; +} + +/* Load the manifest information from the disk to `server.aof_manifest` + * when the Redis server start. + * + * During loading, this function does strict error checking and will abort + * the entire Redis server process on error (I/O error, invalid format, etc.) + * + * If the AOF directory or manifest file do not exist, this will be ignored + * in order to support seamless upgrades from previous versions which did not + * use them. + */ +void aofLoadManifestFromDisk(void) { + server.aof_manifest = aofManifestCreate(); + if (!dirExists(server.aof_dirname)) { + serverLog(LL_DEBUG, "The AOF directory %s doesn't exist", server.aof_dirname); + return; + } + + sds am_name = getAofManifestFileName(); + sds am_filepath = makePath(server.aof_dirname, am_name); + if (!fileExist(am_filepath)) { + serverLog(LL_DEBUG, "The AOF manifest file %s doesn't exist", am_name); + sdsfree(am_name); + sdsfree(am_filepath); + return; + } + + aofManifest *am = aofLoadManifestFromFile(am_filepath); + if (am) aofManifestFreeAndUpdate(am); + sdsfree(am_name); + sdsfree(am_filepath); +} + +/* Generic manifest loading function, used in `aofLoadManifestFromDisk` and redis-check-aof tool. 
*/
+/* Maximum accepted length of one manifest line (a longer line is a format
+ * error, see the strchr(buf, '\n') check below). */
+#define MANIFEST_MAX_LINE 1024
+aofManifest *aofLoadManifestFromFile(sds am_filepath) {
+    const char *err = NULL;
+    long long maxseq = 0;
+
+    aofManifest *am = aofManifestCreate();
+    FILE *fp = fopen(am_filepath, "r");
+    if (fp == NULL) {
+        serverLog(LL_WARNING, "Fatal error: can't open the AOF manifest "
+            "file %s for reading: %s", am_filepath, strerror(errno));
+        exit(1);
+    }
+
+    char buf[MANIFEST_MAX_LINE+1];
+    sds *argv = NULL;
+    int argc;
+    aofInfo *ai = NULL;
+
+    sds line = NULL;
+    int linenum = 0;
+
+    while (1) {
+        if (fgets(buf, MANIFEST_MAX_LINE+1, fp) == NULL) {
+            if (feof(fp)) {
+                if (linenum == 0) {
+                    err = "Found an empty AOF manifest";
+                    goto loaderr;
+                } else {
+                    break;
+                }
+            } else {
+                err = "Read AOF manifest failed";
+                goto loaderr;
+            }
+        }
+
+        linenum++;
+
+        /* Skip comments lines */
+        if (buf[0] == '#') continue;
+
+        /* No newline means fgets() hit MANIFEST_MAX_LINE before the line
+         * ended: reject it instead of silently splitting the line. */
+        if (strchr(buf, '\n') == NULL) {
+            err = "The AOF manifest file contains too long line";
+            goto loaderr;
+        }
+
+        line = sdstrim(sdsnew(buf), " \t\r\n");
+        if (!sdslen(line)) {
+            err = "Invalid AOF manifest file format";
+            goto loaderr;
+        }
+
+        argv = sdssplitargs(line, &argc);
+        /* 'argc < 6' was done for forward compatibility. */
+        if (argv == NULL || argc < 6 || (argc % 2)) {
+            err = "Invalid AOF manifest file format";
+            goto loaderr;
+        }
+
+        /* Key/value pairs: unknown keys are ignored on purpose (forward
+         * compatibility with newer manifest fields). */
+        ai = aofInfoCreate();
+        for (int i = 0; i < argc; i += 2) {
+            if (!strcasecmp(argv[i], AOF_MANIFEST_KEY_FILE_NAME)) {
+                ai->file_name = sdsnew(argv[i+1]);
+                if (!pathIsBaseName(ai->file_name)) {
+                    err = "File can't be a path, just a filename";
+                    goto loaderr;
+                }
+            } else if (!strcasecmp(argv[i], AOF_MANIFEST_KEY_FILE_SEQ)) {
+                /* NOTE(review): atoll() doesn't report parse errors; a
+                 * garbage seq parses as 0 and is rejected by the
+                 * '!ai->file_seq' check below — confirm offsets need no
+                 * similar validation. */
+                ai->file_seq = atoll(argv[i+1]);
+            } else if (!strcasecmp(argv[i], AOF_MANIFEST_KEY_FILE_TYPE)) {
+                ai->file_type = (argv[i+1])[0];
+            } else if (!strcasecmp(argv[i], AOF_MANIFEST_KEY_FILE_STARTOFFSET)) {
+                ai->start_offset = atoll(argv[i+1]);
+            } else if (!strcasecmp(argv[i], AOF_MANIFEST_KEY_FILE_ENDOFFSET)) {
+                ai->end_offset = atoll(argv[i+1]);
+            }
+            /* else if (!strcasecmp(argv[i], AOF_MANIFEST_KEY_OTHER)) {} */
+        }
+
+        /* We have to make sure we load all the information. */
+        if (!ai->file_name || !ai->file_seq || !ai->file_type) {
+            err = "Invalid AOF manifest file format";
+            goto loaderr;
+        }
+
+        sdsfreesplitres(argv, argc);
+        argv = NULL;
+
+        if (ai->file_type == AOF_FILE_TYPE_BASE) {
+            if (am->base_aof_info) {
+                err = "Found duplicate base file information";
+                goto loaderr;
+            }
+            am->base_aof_info = ai;
+            am->curr_base_file_seq = ai->file_seq;
+        } else if (ai->file_type == AOF_FILE_TYPE_HIST) {
+            listAddNodeTail(am->history_aof_list, ai);
+        } else if (ai->file_type == AOF_FILE_TYPE_INCR) {
+            /* INCR sequence numbers must be strictly increasing in file
+             * order; 'maxseq' tracks the last one seen. */
+            if (ai->file_seq <= maxseq) {
+                err = "Found a non-monotonic sequence number";
+                goto loaderr;
+            }
+            listAddNodeTail(am->incr_aof_list, ai);
+            am->curr_incr_file_seq = ai->file_seq;
+            maxseq = ai->file_seq;
+        } else {
+            err = "Unknown AOF file type";
+            goto loaderr;
+        }
+
+        sdsfree(line);
+        line = NULL;
+        ai = NULL;
+    }
+
+    /* On success the returned manifest is owned by the caller. */
+    fclose(fp);
+    return am;
+
+loaderr:
+    /* Sanitizer suppression: may report a false positive if we goto loaderr
+     * and exit(1) without freeing these allocations. */
+    if (argv) sdsfreesplitres(argv, argc);
+    if (ai) aofInfoFree(ai);
+
+    serverLog(LL_WARNING, "\n*** FATAL AOF MANIFEST FILE ERROR ***\n");
+    if (line) {
+        serverLog(LL_WARNING, "Reading the manifest file, at line %d\n", linenum);
+        serverLog(LL_WARNING, ">>> '%s'\n", line);
+    }
+    serverLog(LL_WARNING, "%s\n", err);
+    exit(1);
+}
+
+/* Deep copy an aofManifest from orig.
+ *
+ * In `backgroundRewriteDoneHandler` and `openNewIncrAofForAppend`, we will
+ * first deep copy a temporary AOF manifest from the `server.aof_manifest` and
+ * try to modify it. Once everything is modified, we will atomically make the
+ * `server.aof_manifest` point to this temporary aof_manifest.
+ */
+aofManifest *aofManifestDup(aofManifest *orig) {
+    serverAssert(orig != NULL);
+    aofManifest *am = zcalloc(sizeof(aofManifest));
+
+    am->curr_base_file_seq = orig->curr_base_file_seq;
+    am->curr_incr_file_seq = orig->curr_incr_file_seq;
+    am->dirty = orig->dirty;
+
+    if (orig->base_aof_info) {
+        am->base_aof_info = aofInfoDup(orig->base_aof_info);
+    }
+
+    /* listDup() deep-copies elements through the aofListDup callback that
+     * aofManifestCreate() installed on these lists. */
+    am->incr_aof_list = listDup(orig->incr_aof_list);
+    am->history_aof_list = listDup(orig->history_aof_list);
+    serverAssert(am->incr_aof_list != NULL);
+    serverAssert(am->history_aof_list != NULL);
+    return am;
+}
+
+/* Change the `server.aof_manifest` pointer to 'am' and free the previous
+ * one if we have. */
+void aofManifestFreeAndUpdate(aofManifest *am) {
+    serverAssert(am != NULL);
+    if (server.aof_manifest) aofManifestFree(server.aof_manifest);
+    server.aof_manifest = am;
+}
+
+/* Called in `backgroundRewriteDoneHandler` to get a new BASE file
+ * name, and mark the previous (if we have) BASE file as HISTORY type.
+ *
+ * BASE file naming rules: `server.aof_filename`.seq.base.format
+ *
+ * for example:
+ * appendonly.aof.1.base.aof (server.aof_use_rdb_preamble is no)
+ * appendonly.aof.1.base.rdb (server.aof_use_rdb_preamble is yes)
+ *
+ * The returned sds is owned by the manifest (am->base_aof_info->file_name);
+ * the caller must not free it.
+ */
+sds getNewBaseFileNameAndMarkPreAsHistory(aofManifest *am) {
+    serverAssert(am != NULL);
+    if (am->base_aof_info) {
+        serverAssert(am->base_aof_info->file_type == AOF_FILE_TYPE_BASE);
+        am->base_aof_info->file_type = AOF_FILE_TYPE_HIST;
+        listAddNodeHead(am->history_aof_list, am->base_aof_info);
+    }
+
+    char *format_suffix = server.aof_use_rdb_preamble ?
+        RDB_FORMAT_SUFFIX:AOF_FORMAT_SUFFIX;
+
+    aofInfo *ai = aofInfoCreate();
+    ai->file_name = sdscatprintf(sdsempty(), "%s.%lld%s%s", server.aof_filename,
+        ++am->curr_base_file_seq, BASE_FILE_SUFFIX, format_suffix);
+    ai->file_seq = am->curr_base_file_seq;
+    ai->file_type = AOF_FILE_TYPE_BASE;
+    am->base_aof_info = ai;
+    am->dirty = 1;
+    return am->base_aof_info->file_name;
+}
+
+/* Get a new INCR type AOF name.
+ *
+ * INCR AOF naming rules: `server.aof_filename`.seq.incr.aof
+ *
+ * for example:
+ * appendonly.aof.1.incr.aof
+ *
+ * The new aofInfo is appended to am->incr_aof_list; the returned sds is
+ * owned by that entry. 'start_reploff' records the replication offset at
+ * which this INCR file begins. */
+sds getNewIncrAofName(aofManifest *am, long long start_reploff) {
+    aofInfo *ai = aofInfoCreate();
+    ai->file_type = AOF_FILE_TYPE_INCR;
+    ai->file_name = sdscatprintf(sdsempty(), "%s.%lld%s%s", server.aof_filename,
+        ++am->curr_incr_file_seq, INCR_FILE_SUFFIX, AOF_FORMAT_SUFFIX);
+    ai->file_seq = am->curr_incr_file_seq;
+    ai->start_offset = start_reploff;
+    listAddNodeTail(am->incr_aof_list, ai);
+    am->dirty = 1;
+    return ai->file_name;
+}
+
+/* Get temp INCR type AOF name. Unlike the helpers above, the caller owns
+ * (and must sdsfree) the returned name. */
+sds getTempIncrAofName(void) {
+    return sdscatprintf(sdsempty(), "%s%s%s", TEMP_FILE_NAME_PREFIX, server.aof_filename,
+        INCR_FILE_SUFFIX);
+}
+
+/* Get the last INCR AOF name or create a new one. */
+sds getLastIncrAofName(aofManifest *am) {
+    serverAssert(am != NULL);
+
+    /* If 'incr_aof_list' is empty, just create a new one. */
+    if (!listLength(am->incr_aof_list)) {
+        return getNewIncrAofName(am, server.master_repl_offset);
+    }
+
+    /* Or return the last one. */
+    listNode *lastnode = listIndex(am->incr_aof_list, -1);
+    aofInfo *ai = listNodeValue(lastnode);
+    return ai->file_name;
+}
+
+/* Called in `backgroundRewriteDoneHandler`. when AOFRW success, This
+ * function will change the AOF file type in 'incr_aof_list' from
+ * AOF_FILE_TYPE_INCR to AOF_FILE_TYPE_HIST, and move them to the
+ * 'history_aof_list'.
+ */
+void markRewrittenIncrAofAsHistory(aofManifest *am) {
+    serverAssert(am != NULL);
+    if (!listLength(am->incr_aof_list)) {
+        return;
+    }
+
+    listNode *ln;
+    listIter li;
+
+    /* Iterate from the tail so the newest INCR entry is visited first. */
+    listRewindTail(am->incr_aof_list, &li);
+
+    /* "server.aof_fd != -1" means AOF enabled, then we must skip the
+     * last AOF, because this file is our currently writing. */
+    if (server.aof_fd != -1) {
+        ln = listNext(&li);
+        serverAssert(ln != NULL);
+    }
+
+    /* Move aofInfo from 'incr_aof_list' to 'history_aof_list'. */
+    while ((ln = listNext(&li)) != NULL) {
+        aofInfo *ai = (aofInfo*)ln->value;
+        serverAssert(ai->file_type == AOF_FILE_TYPE_INCR);
+
+        aofInfo *hai = aofInfoDup(ai);
+        hai->file_type = AOF_FILE_TYPE_HIST;
+        listAddNodeHead(am->history_aof_list, hai);
+        listDelNode(am->incr_aof_list, ln);
+    }
+
+    am->dirty = 1;
+}
+
+/* Write the formatted manifest string to disk.
*/
+/* The manifest is written to a temporary file first and then rename()d over
+ * the real one, so readers never observe a partially written manifest.
+ * Returns C_OK on success, C_ERR on any I/O failure (temp file may be left
+ * behind; it is overwritten with O_TRUNC on the next attempt). */
+int writeAofManifestFile(sds buf) {
+    int ret = C_OK;
+    ssize_t nwritten;
+    int len;
+
+    sds am_name = getAofManifestFileName();
+    sds am_filepath = makePath(server.aof_dirname, am_name);
+    sds tmp_am_name = getTempAofManifestFileName();
+    sds tmp_am_filepath = makePath(server.aof_dirname, tmp_am_name);
+
+    int fd = open(tmp_am_filepath, O_WRONLY|O_TRUNC|O_CREAT, 0644);
+    if (fd == -1) {
+        serverLog(LL_WARNING, "Can't open the AOF manifest file %s: %s",
+            tmp_am_name, strerror(errno));
+
+        ret = C_ERR;
+        goto cleanup;
+    }
+
+    /* Short-write/EINTR retry loop. */
+    len = sdslen(buf);
+    while(len) {
+        nwritten = write(fd, buf, len);
+
+        if (nwritten < 0) {
+            if (errno == EINTR) continue;
+
+            serverLog(LL_WARNING, "Error trying to write the temporary AOF manifest file %s: %s",
+                tmp_am_name, strerror(errno));
+
+            ret = C_ERR;
+            goto cleanup;
+        }
+
+        len -= nwritten;
+        buf += nwritten;
+    }
+
+    /* Make the temp file durable before making it visible via rename(). */
+    if (redis_fsync(fd) == -1) {
+        serverLog(LL_WARNING, "Fail to fsync the temp AOF file %s: %s.",
+            tmp_am_name, strerror(errno));
+
+        ret = C_ERR;
+        goto cleanup;
+    }
+
+    if (rename(tmp_am_filepath, am_filepath) != 0) {
+        serverLog(LL_WARNING,
+            "Error trying to rename the temporary AOF manifest file %s into %s: %s",
+            tmp_am_name, am_name, strerror(errno));
+
+        ret = C_ERR;
+        goto cleanup;
+    }
+
+    /* Also sync the AOF directory as new AOF files may be added in the directory */
+    if (fsyncFileDir(am_filepath) == -1) {
+        serverLog(LL_WARNING, "Fail to fsync AOF directory %s: %s.",
+            am_filepath, strerror(errno));
+
+        ret = C_ERR;
+        goto cleanup;
+    }
+
+cleanup:
+    if (fd != -1) close(fd);
+    sdsfree(am_name);
+    sdsfree(am_filepath);
+    sdsfree(tmp_am_name);
+    sdsfree(tmp_am_filepath);
+    return ret;
+}
+
+/* Persist the aofManifest information pointed to by am to disk.
*/
+/* No-op when the manifest is not dirty; the dirty flag is cleared only
+ * after a successful write. Returns C_OK or C_ERR. */
+int persistAofManifest(aofManifest *am) {
+    if (am->dirty == 0) {
+        return C_OK;
+    }
+
+    sds amstr = getAofManifestAsString(am);
+    int ret = writeAofManifestFile(amstr);
+    sdsfree(amstr);
+    if (ret == C_OK) am->dirty = 0;
+    return ret;
+}
+
+/* Called in `loadAppendOnlyFiles` when we upgrade from a old version redis.
+ *
+ * 1) Create AOF directory use 'server.aof_dirname' as the name.
+ * 2) Use 'server.aof_filename' to construct a BASE type aofInfo and add it to
+ *    aofManifest, then persist the manifest file to AOF directory.
+ * 3) Move the old AOF file (server.aof_filename) to AOF directory.
+ *
+ * If any of the above steps fails or crash occurs, this will not cause any
+ * problems, and redis will retry the upgrade process when it restarts.
+ */
+void aofUpgradePrepare(aofManifest *am) {
+    serverAssert(!aofFileExist(server.aof_filename));
+
+    /* Create AOF directory use 'server.aof_dirname' as the name. */
+    if (dirCreateIfMissing(server.aof_dirname) == -1) {
+        serverLog(LL_WARNING, "Can't open or create append-only dir %s: %s",
+            server.aof_dirname, strerror(errno));
+        exit(1);
+    }
+
+    /* Manually construct a BASE type aofInfo and add it to aofManifest. */
+    if (am->base_aof_info) aofInfoFree(am->base_aof_info);
+    aofInfo *ai = aofInfoCreate();
+    ai->file_name = sdsnew(server.aof_filename);
+    ai->file_seq = 1;
+    ai->file_type = AOF_FILE_TYPE_BASE;
+    am->base_aof_info = ai;
+    am->curr_base_file_seq = 1;
+    am->dirty = 1;
+
+    /* Persist the manifest file to AOF directory. */
+    if (persistAofManifest(am) != C_OK) {
+        exit(1);
+    }
+
+    /* Move the old AOF file to AOF directory. */
+    sds aof_filepath = makePath(server.aof_dirname, server.aof_filename);
+    if (rename(server.aof_filename, aof_filepath) == -1) {
+        serverLog(LL_WARNING,
+            "Error trying to move the old AOF file %s into dir %s: %s",
+            server.aof_filename,
+            server.aof_dirname,
+            strerror(errno));
+        sdsfree(aof_filepath);
+        exit(1);
+    }
+    sdsfree(aof_filepath);
+
+    serverLog(LL_NOTICE, "Successfully migrated an old-style AOF file (%s) into the AOF directory (%s).",
+        server.aof_filename, server.aof_dirname);
+}
+
+/* When AOFRW success, the previous BASE and INCR AOFs will
+ * become HISTORY type and be moved into 'history_aof_list'.
+ *
+ * The function will traverse the 'history_aof_list' and submit
+ * the delete task to the bio thread.
+ */
+int aofDelHistoryFiles(void) {
+    if (server.aof_manifest == NULL ||
+        server.aof_disable_auto_gc == 1 ||
+        !listLength(server.aof_manifest->history_aof_list))
+    {
+        return C_OK;
+    }
+
+    listNode *ln;
+    listIter li;
+
+    listRewind(server.aof_manifest->history_aof_list, &li);
+    while ((ln = listNext(&li)) != NULL) {
+        aofInfo *ai = (aofInfo*)ln->value;
+        serverAssert(ai->file_type == AOF_FILE_TYPE_HIST);
+        serverLog(LL_NOTICE, "Removing the history file %s in the background", ai->file_name);
+        /* Actual unlink happens in a BIO background thread. */
+        sds aof_filepath = makePath(server.aof_dirname, ai->file_name);
+        bg_unlink(aof_filepath);
+        sdsfree(aof_filepath);
+        listDelNode(server.aof_manifest->history_aof_list, ln);
+    }
+
+    server.aof_manifest->dirty = 1;
+    return persistAofManifest(server.aof_manifest);
+}
+
+/* Used to clean up temp INCR AOF when AOFRW fails. */
+void aofDelTempIncrAofFile(void) {
+    sds aof_filename = getTempIncrAofName();
+    sds aof_filepath = makePath(server.aof_dirname, aof_filename);
+    serverLog(LL_NOTICE, "Removing the temp incr aof file %s in the background", aof_filename);
+    bg_unlink(aof_filepath);
+    sdsfree(aof_filepath);
+    sdsfree(aof_filename);
+    return;
+}
+
+/* Called after `loadDataFromDisk` when redis start.
If `server.aof_state` is
+ * 'AOF_ON', It will do three things:
+ * 1. Force create a BASE file when redis starts with an empty dataset
+ * 2. Open the last opened INCR type AOF for writing, If not, create a new one
+ * 3. Synchronously update the manifest file to the disk
+ *
+ * If any of the above steps fails, the redis process will exit.
+ */
+void aofOpenIfNeededOnServerStart(void) {
+    if (server.aof_state != AOF_ON) {
+        return;
+    }
+
+    serverAssert(server.aof_manifest != NULL);
+    serverAssert(server.aof_fd == -1);
+
+    if (dirCreateIfMissing(server.aof_dirname) == -1) {
+        serverLog(LL_WARNING, "Can't open or create append-only dir %s: %s",
+            server.aof_dirname, strerror(errno));
+        exit(1);
+    }
+
+    /* If we start with an empty dataset, we will force create a BASE file. */
+    size_t incr_aof_len = listLength(server.aof_manifest->incr_aof_list);
+    if (!server.aof_manifest->base_aof_info && !incr_aof_len) {
+        sds base_name = getNewBaseFileNameAndMarkPreAsHistory(server.aof_manifest);
+        sds base_filepath = makePath(server.aof_dirname, base_name);
+        if (rewriteAppendOnlyFile(base_filepath) != C_OK) {
+            exit(1);
+        }
+        sdsfree(base_filepath);
+        serverLog(LL_NOTICE, "Creating AOF base file %s on server start",
+            base_name);
+    }
+
+    /* Because we will 'exit(1)' if open AOF or persistent manifest fails, so
+     * we don't need atomic modification here. */
+    sds aof_name = getLastIncrAofName(server.aof_manifest);
+
+    /* Here we should use 'O_APPEND' flag, so a restart keeps appending to
+     * the existing last INCR file instead of truncating it. */
+    sds aof_filepath = makePath(server.aof_dirname, aof_name);
+    server.aof_fd = open(aof_filepath, O_WRONLY|O_APPEND|O_CREAT, 0644);
+    sdsfree(aof_filepath);
+    if (server.aof_fd == -1) {
+        serverLog(LL_WARNING, "Can't open the append-only file %s: %s",
+            aof_name, strerror(errno));
+        exit(1);
+    }
+
+    /* Persist our changes. */
+    int ret = persistAofManifest(server.aof_manifest);
+    if (ret != C_OK) {
+        exit(1);
+    }
+
+    server.aof_last_incr_size = getAppendOnlyFileSize(aof_name, NULL);
+    server.aof_last_incr_fsync_offset = server.aof_last_incr_size;
+
+    if (incr_aof_len) {
+        serverLog(LL_NOTICE, "Opening AOF incr file %s on server start", aof_name);
+    } else {
+        serverLog(LL_NOTICE, "Creating AOF incr file %s on server start", aof_name);
+    }
+}
+
+/* Return non-zero if 'filename' exists inside the AOF directory. */
+int aofFileExist(char *filename) {
+    sds file_path = makePath(server.aof_dirname, filename);
+    int ret = fileExist(file_path);
+    sdsfree(file_path);
+    return ret;
+}
+
+/* Called in `rewriteAppendOnlyFileBackground`. If `server.aof_state`
+ * is 'AOF_ON', It will do two things:
+ * 1. Open a new INCR type AOF for writing
+ * 2. Synchronously update the manifest file to the disk
+ *
+ * The above two steps of modification are atomic, that is, if
+ * any step fails, the entire operation will rollback and returns
+ * C_ERR, and if all succeeds, it returns C_OK.
+ *
+ * If `server.aof_state` is 'AOF_WAIT_REWRITE', It will open a temporary INCR AOF
+ * file to accumulate data during AOF_WAIT_REWRITE, and it will eventually be
+ * renamed in the `backgroundRewriteDoneHandler` and written to the manifest file.
+ * */
+int openNewIncrAofForAppend(void) {
+    serverAssert(server.aof_manifest != NULL);
+    int newfd = -1;
+    aofManifest *temp_am = NULL;
+    sds new_aof_name = NULL;
+
+    /* Only open new INCR AOF when AOF enabled. */
+    if (server.aof_state == AOF_OFF) return C_OK;
+
+    /* Open new AOF. */
+    if (server.aof_state == AOF_WAIT_REWRITE) {
+        /* Use a temporary INCR AOF file to accumulate data during AOF_WAIT_REWRITE.
+         * NOTE(review): tempIncAofStartReplOffset records where the temp file
+         * starts in the replication stream — presumably consumed when the file
+         * is renamed in backgroundRewriteDoneHandler; confirm there. */
+        new_aof_name = getTempIncrAofName();
+        tempIncAofStartReplOffset = server.master_repl_offset;
+    } else {
+        /* Dup a temp aof_manifest to modify. */
+        temp_am = aofManifestDup(server.aof_manifest);
+        new_aof_name = sdsdup(getNewIncrAofName(temp_am, server.master_repl_offset));
+    }
+    sds new_aof_filepath = makePath(server.aof_dirname, new_aof_name);
+    newfd = open(new_aof_filepath, O_WRONLY|O_TRUNC|O_CREAT, 0644);
+    sdsfree(new_aof_filepath);
+    if (newfd == -1) {
+        serverLog(LL_WARNING, "Can't open the append-only file %s: %s",
+            new_aof_name, strerror(errno));
+        goto cleanup;
+    }
+
+    if (temp_am) {
+        /* Persist AOF Manifest. */
+        if (persistAofManifest(temp_am) == C_ERR) {
+            goto cleanup;
+        }
+    }
+
+    serverLog(LL_NOTICE, "Creating AOF incr file %s on background rewrite",
+        new_aof_name);
+    sdsfree(new_aof_name);
+
+    /* If reaches here, we can safely modify the `server.aof_manifest`
+     * and `server.aof_fd`. */
+
+    /* fsync and close old aof_fd if needed. In fsync everysec it's ok to delay
+     * the fsync as long as we grantee it happens, and in fsync always the file
+     * is already synced at this point so fsync doesn't matter. */
+    if (server.aof_fd != -1) {
+        aof_background_fsync_and_close(server.aof_fd);
+        server.aof_last_fsync = server.mstime;
+    }
+    server.aof_fd = newfd;
+
+    /* Reset the aof_last_incr_size. */
+    server.aof_last_incr_size = 0;
+    /* Reset the aof_last_incr_fsync_offset. */
+    server.aof_last_incr_fsync_offset = 0;
+    /* Update `server.aof_manifest`. */
+    if (temp_am) aofManifestFreeAndUpdate(temp_am);
+    return C_OK;
+
+cleanup:
+    if (new_aof_name) sdsfree(new_aof_name);
+    if (newfd != -1) close(newfd);
+    if (temp_am) aofManifestFree(temp_am);
+    return C_ERR;
+}
+
+/* When we close gracefully the AOF file, we have the chance to persist the
+ * end replication offset of current INCR AOF.
*/ +void updateCurIncrAofEndOffset(void) { + if (server.aof_state != AOF_ON) return; + serverAssert(server.aof_manifest != NULL); + + if (listLength(server.aof_manifest->incr_aof_list) == 0) return; + aofInfo *ai = listNodeValue(listLast(server.aof_manifest->incr_aof_list)); + ai->end_offset = server.master_repl_offset; + server.aof_manifest->dirty = 1; + /* It doesn't matter if the persistence fails since this information is not + * critical, we can get an approximate value by start offset plus file size. */ + persistAofManifest(server.aof_manifest); +} + +/* After loading AOF data, we need to update the `server.master_repl_offset` + * based on the information of the last INCR AOF, to avoid the rollback of + * the start offset of new INCR AOF. */ +void updateReplOffsetAndResetEndOffset(void) { + if (server.aof_state != AOF_ON) return; + serverAssert(server.aof_manifest != NULL); + + /* If the INCR file has an end offset, we directly use it, and clear it + * to avoid the next time we load the manifest file, we will use the same + * offset, but the real offset may have advanced. */ + if (listLength(server.aof_manifest->incr_aof_list) == 0) return; + aofInfo *ai = listNodeValue(listLast(server.aof_manifest->incr_aof_list)); + if (ai->end_offset != -1) { + server.master_repl_offset = ai->end_offset; + ai->end_offset = -1; + server.aof_manifest->dirty = 1; + /* We must update the end offset of INCR file correctly, otherwise we + * may keep wrong information in the manifest file, since we continue + * to append data to the same INCR file. */ + if (persistAofManifest(server.aof_manifest) != AOF_OK) + exit(1); + } else { + /* If the INCR file doesn't have an end offset, we need to calculate + * the replication offset by the start offset plus the file size. */ + server.master_repl_offset = (ai->start_offset == -1 ? 0 : ai->start_offset) + + getAppendOnlyFileSize(ai->file_name, NULL); + } +} + +/* Whether to limit the execution of Background AOF rewrite. 
+ *
+ * At present, if AOFRW fails, redis will automatically retry. If it continues
+ * to fail, we may get a lot of very small INCR files. so we need an AOFRW
+ * limiting measure.
+ *
+ * We can't directly use `server.aof_current_size` and `server.aof_last_incr_size`,
+ * because there may be no new writes after AOFRW fails.
+ *
+ * So, we use time delay to achieve our goal. When AOFRW fails, we delay the execution
+ * of the next AOFRW by 1 minute. If the next AOFRW also fails, it will be delayed by 2
+ * minutes. The next is 4, 8, 16, the maximum delay is 60 minutes (1 hour).
+ *
+ * During the limit period, we can still use the 'bgrewriteaof' command to execute AOFRW
+ * immediately.
+ *
+ * Return 1 means that AOFRW is limited and cannot be executed. 0 means that we can execute
+ * AOFRW, which may be that we have reached the 'next_rewrite_time' or the number of INCR
+ * AOFs has not reached the limit threshold.
+ * */
+#define AOF_REWRITE_LIMITE_THRESHOLD 3
+#define AOF_REWRITE_LIMITE_MAX_MINUTES 60 /* 1 hour */
+int aofRewriteLimited(void) {
+    /* Static locals keep the exponential-backoff state across calls. */
+    static int next_delay_minutes = 0;
+    static time_t next_rewrite_time = 0;
+
+    if (server.stat_aofrw_consecutive_failures < AOF_REWRITE_LIMITE_THRESHOLD) {
+        /* We may be recovering from limited state, so reset all states. */
+        next_delay_minutes = 0;
+        next_rewrite_time = 0;
+        return 0;
+    }
+
+    /* if it is in the limiting state, then check if the next_rewrite_time is reached */
+    if (next_rewrite_time != 0) {
+        if (server.unixtime < next_rewrite_time) {
+            return 1;
+        } else {
+            next_rewrite_time = 0;
+            return 0;
+        }
+    }
+
+    /* Double the delay on each new limiting episode, capped at one hour. */
+    next_delay_minutes = (next_delay_minutes == 0) ? 1 : (next_delay_minutes * 2);
+    if (next_delay_minutes > AOF_REWRITE_LIMITE_MAX_MINUTES) {
+        next_delay_minutes = AOF_REWRITE_LIMITE_MAX_MINUTES;
+    }
+
+    next_rewrite_time = server.unixtime + next_delay_minutes * 60;
+    serverLog(LL_WARNING,
+        "Background AOF rewrite has repeatedly failed and triggered the limit, will retry in %d minutes", next_delay_minutes);
+    return 1;
+}
+
+/* ----------------------------------------------------------------------------
+ * AOF file implementation
+ * ------------------------------------------------------------------------- */
+
+/* Return true if an AOf fsync is currently already in progress in a
+ * BIO thread. */
+int aofFsyncInProgress(void) {
+    /* Note that we don't care about aof_background_fsync_and_close because
+     * server.aof_fd has been replaced by the new INCR AOF file fd,
+     * see openNewIncrAofForAppend. */
+    return bioPendingJobsOfType(BIO_AOF_FSYNC) != 0;
+}
+
+/* Starts a background task that performs fsync() against the specified
+ * file descriptor (the one of the AOF file) in another thread. */
+void aof_background_fsync(int fd) {
+    bioCreateFsyncJob(fd, server.master_repl_offset, 1);
+}
+
+/* Close the fd on the basis of aof_background_fsync. */
+void aof_background_fsync_and_close(int fd) {
+    bioCreateCloseAofJob(fd, server.master_repl_offset, 1);
+}
+
+/* Kills an AOFRW child process if exists */
+void killAppendOnlyChild(void) {
+    int statloc;
+    /* No AOFRW child? return. */
+    if (server.child_type != CHILD_TYPE_AOF) return;
+    /* Kill AOFRW child, wait for child exit. */
+    serverLog(LL_NOTICE,"Killing running AOF rewrite child: %ld",
+        (long) server.child_pid);
+    if (kill(server.child_pid,SIGUSR1) != -1) {
+        /* Reap until the AOFRW child itself is collected. */
+        while(waitpid(-1, &statloc, 0) != server.child_pid);
+    }
+    aofRemoveTempFile(server.child_pid);
+    resetChildState();
+    server.aof_rewrite_time_start = -1;
+}
+
+/* Called when the user switches from "appendonly yes" to "appendonly no"
+ * at runtime using the CONFIG command.
*/
+void stopAppendOnly(void) {
+    serverAssert(server.aof_state != AOF_OFF);
+    /* Flush with force=1 so pending buffer data reaches the file before we
+     * fsync and close it. */
+    flushAppendOnlyFile(1);
+    if (redis_fsync(server.aof_fd) == -1) {
+        serverLog(LL_WARNING,"Fail to fsync the AOF file: %s",strerror(errno));
+    } else {
+        server.aof_last_fsync = server.mstime;
+    }
+    close(server.aof_fd);
+    updateCurIncrAofEndOffset();
+
+    server.aof_fd = -1;
+    server.aof_selected_db = -1;
+    server.aof_state = AOF_OFF;
+    server.aof_rewrite_scheduled = 0;
+    server.aof_last_incr_size = 0;
+    server.aof_last_incr_fsync_offset = 0;
+    server.fsynced_reploff = -1;
+    atomicSet(server.fsynced_reploff_pending, 0);
+    killAppendOnlyChild();
+    sdsfree(server.aof_buf);
+    server.aof_buf = sdsempty();
+}
+
+/* Called when the user switches from "appendonly no" to "appendonly yes"
+ * at runtime using the CONFIG command. Returns C_OK, or C_ERR if the
+ * background rewrite could not be triggered (in which case the AOF state
+ * is rolled back to AOF_OFF). */
+int startAppendOnly(void) {
+    serverAssert(server.aof_state == AOF_OFF);
+
+    server.aof_state = AOF_WAIT_REWRITE;
+    if (hasActiveChildProcess() && server.child_type != CHILD_TYPE_AOF) {
+        server.aof_rewrite_scheduled = 1;
+        serverLog(LL_NOTICE,"AOF was enabled but there is already another background operation. An AOF background was scheduled to start when possible.");
+    } else if (server.in_exec){
+        server.aof_rewrite_scheduled = 1;
+        serverLog(LL_NOTICE,"AOF was enabled during a transaction. An AOF background was scheduled to start when possible.");
+    } else {
+        /* If there is a pending AOF rewrite, we need to switch it off and
+         * start a new one: the old one cannot be reused because it is not
+         * accumulating the AOF buffer. */
+        if (server.child_type == CHILD_TYPE_AOF) {
+            serverLog(LL_NOTICE,"AOF was enabled but there is already an AOF rewriting in background. Stopping background AOF and starting a rewrite now.");
+            killAppendOnlyChild();
+        }
+
+        if (rewriteAppendOnlyFileBackground() == C_ERR) {
+            server.aof_state = AOF_OFF;
+            serverLog(LL_WARNING,"Redis needs to enable the AOF but can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.");
+            return C_ERR;
+        }
+    }
+    server.aof_last_fsync = server.mstime;
+    /* If AOF fsync error in bio job, we just ignore it and log the event. */
+    int aof_bio_fsync_status;
+    atomicGet(server.aof_bio_fsync_status, aof_bio_fsync_status);
+    if (aof_bio_fsync_status == C_ERR) {
+        serverLog(LL_WARNING,
+            "AOF reopen, just ignore the AOF fsync error in bio job");
+        atomicSet(server.aof_bio_fsync_status,C_OK);
+    }
+
+    /* If AOF was in error state, we just ignore it and log the event. */
+    if (server.aof_last_write_status == C_ERR) {
+        serverLog(LL_WARNING,"AOF reopen, just ignore the last error.");
+        server.aof_last_write_status = C_OK;
+    }
+    return C_OK;
+}
+
+/* Try to enable AOF, retrying up to 10 times (one second apart); abort the
+ * process if every attempt fails. */
+void startAppendOnlyWithRetry(void) {
+    unsigned int tries, max_tries = 10;
+    for (tries = 0; tries < max_tries; ++tries) {
+        if (startAppendOnly() == C_OK)
+            break;
+        serverLog(LL_WARNING, "Failed to enable AOF! Trying it again in one second.");
+        sleep(1);
+    }
+    if (tries == max_tries) {
+        serverLog(LL_WARNING, "FATAL: AOF can't be turned on. Exiting now.");
+        exit(1);
+    }
+}
+
+/* Called after "appendonly" config is changed. */
+void applyAppendOnlyConfig(void) {
+    if (!server.aof_enabled && server.aof_state != AOF_OFF) {
+        stopAppendOnly();
+    } else if (server.aof_enabled && server.aof_state == AOF_OFF) {
+        startAppendOnlyWithRetry();
+    }
+}
+
+/* This is a wrapper to the write syscall in order to retry on short writes
+ * or if the syscall gets interrupted. It could look strange that we retry
+ * on short writes given that we are writing to a block device: normally if
+ * the first call is short, there is a end-of-space condition, so the next
+ * is likely to fail. However apparently in modern systems this is no longer
+ * true, and in general it looks just more resilient to retry the write. If
+ * there is an actual error condition we'll get it at the next try.
*/
+/* Returns the total number of bytes written. On a non-EINTR error it
+ * returns the bytes written so far, or -1 if nothing was written. */
+ssize_t aofWrite(int fd, const char *buf, size_t len) {
+    ssize_t nwritten = 0, totwritten = 0;
+
+    while(len) {
+        nwritten = write(fd, buf, len);
+
+        if (nwritten < 0) {
+            if (errno == EINTR) continue;
+            return totwritten ? totwritten : -1;
+        }
+
+        len -= nwritten;
+        buf += nwritten;
+        totwritten += nwritten;
+    }
+
+    return totwritten;
+}
+
+/* Write the append only file buffer on disk.
+ *
+ * Since we are required to write the AOF before replying to the client,
+ * and the only way the client socket can get a write is entering when
+ * the event loop, we accumulate all the AOF writes in a memory
+ * buffer and write it on disk using this function just before entering
+ * the event loop again.
+ *
+ * About the 'force' argument:
+ *
+ * When the fsync policy is set to 'everysec' we may delay the flush if there
+ * is still an fsync() going on in the background thread, since for instance
+ * on Linux write(2) will be blocked by the background fsync anyway.
+ * When this happens we remember that there is some aof buffer to be
+ * flushed ASAP, and will try to do that in the serverCron() function.
+ *
+ * However if force is set to 1 we'll write regardless of the background
+ * fsync. */
+#define AOF_WRITE_LOG_ERROR_RATE 30 /* Seconds between errors logging. */
+void flushAppendOnlyFile(int force) {
+    ssize_t nwritten;
+    int sync_in_progress = 0;
+    mstime_t latency;
+
+    if (sdslen(server.aof_buf) == 0) {
+        if (server.aof_last_incr_fsync_offset == server.aof_last_incr_size) {
+            /* All data is fsync'd already: Update fsynced_reploff_pending just in case.
+ * This is needed to avoid a WAITAOF hang in case a module used RM_Call + * with the NO_AOF flag, in which case master_repl_offset will increase but + * fsynced_reploff_pending won't be updated (because there's no reason, from + * the AOF POV, to call fsync) and then WAITAOF may wait on the higher offset + * (which contains data that was only propagated to replicas, and not to AOF) */ + if (!aofFsyncInProgress()) + atomicSet(server.fsynced_reploff_pending, server.master_repl_offset); + } else { + /* Check if we need to do fsync even the aof buffer is empty, + * because previously in AOF_FSYNC_EVERYSEC mode, fsync is + * called only when aof buffer is not empty, so if users + * stop write commands before fsync called in one second, + * the data in page cache cannot be flushed in time. */ + if (server.aof_fsync == AOF_FSYNC_EVERYSEC && + server.mstime - server.aof_last_fsync >= 1000 && + !(sync_in_progress = aofFsyncInProgress())) + goto try_fsync; + + /* Check if we need to do fsync even the aof buffer is empty, + * the reason is described in the previous AOF_FSYNC_EVERYSEC block, + * and AOF_FSYNC_ALWAYS is also checked here to handle a case where + * aof_fsync is changed from everysec to always. */ + if (server.aof_fsync == AOF_FSYNC_ALWAYS) + goto try_fsync; + } + return; + } + + if (server.aof_fsync == AOF_FSYNC_EVERYSEC) + sync_in_progress = aofFsyncInProgress(); + + if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) { + /* With this append fsync policy we do background fsyncing. + * If the fsync is still in progress we can try to delay + * the write for a couple of seconds. */ + if (sync_in_progress) { + if (server.aof_flush_postponed_start == 0) { + /* No previous write postponing, remember that we are + * postponing the flush and return. 
*/ + server.aof_flush_postponed_start = server.mstime; + return; + } else if (server.mstime - server.aof_flush_postponed_start < 2000) { + /* We were already waiting for fsync to finish, but for less + * than two seconds this is still ok. Postpone again. */ + return; + } + /* Otherwise fall through, and go write since we can't wait + * over two seconds. */ + server.aof_delayed_fsync++; + serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis."); + } + } + /* We want to perform a single write. This should be guaranteed atomic + * at least if the filesystem we are writing is a real physical one. + * While this will save us against the server being killed I don't think + * there is much to do about the whole server stopping for power problems + * or alike */ + + if (server.aof_flush_sleep && sdslen(server.aof_buf)) { + usleep(server.aof_flush_sleep); + } + + latencyStartMonitor(latency); + nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf)); + latencyEndMonitor(latency); + /* We want to capture different events for delayed writes: + * when the delay happens with a pending fsync, or with a saving child + * active, and when the above two conditions are missing. + * We also use an additional event name to save all samples which is + * useful for graphing / monitoring purposes. */ + if (sync_in_progress) { + latencyAddSampleIfNeeded("aof-write-pending-fsync",latency); + } else if (hasActiveChildProcess()) { + latencyAddSampleIfNeeded("aof-write-active-child",latency); + } else { + latencyAddSampleIfNeeded("aof-write-alone",latency); + } + latencyAddSampleIfNeeded("aof-write",latency); + + /* We performed the write so reset the postponed flush sentinel to zero. 
*/ + server.aof_flush_postponed_start = 0; + + if (nwritten != (ssize_t)sdslen(server.aof_buf)) { + static time_t last_write_error_log = 0; + int can_log = 0; + + /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */ + if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) { + can_log = 1; + last_write_error_log = server.unixtime; + } + + /* Log the AOF write error and record the error code. */ + if (nwritten == -1) { + if (can_log) { + serverLog(LL_WARNING,"Error writing to the AOF file: %s", + strerror(errno)); + } + server.aof_last_write_errno = errno; + } else { + if (can_log) { + serverLog(LL_WARNING,"Short write while writing to " + "the AOF file: (nwritten=%lld, " + "expected=%lld)", + (long long)nwritten, + (long long)sdslen(server.aof_buf)); + } + + if (ftruncate(server.aof_fd, server.aof_last_incr_size) == -1) { + if (can_log) { + serverLog(LL_WARNING, "Could not remove short write " + "from the append-only file. Redis may refuse " + "to load the AOF the next time it starts. " + "ftruncate: %s", strerror(errno)); + } + } else { + /* If the ftruncate() succeeded we can set nwritten to + * -1 since there is no longer partial data into the AOF. */ + nwritten = -1; + } + server.aof_last_write_errno = ENOSPC; + } + + /* Handle the AOF write error. */ + if (server.aof_fsync == AOF_FSYNC_ALWAYS) { + /* We can't recover when the fsync policy is ALWAYS since the reply + * for the client is already in the output buffers (both writes and + * reads), and the changes to the db can't be rolled back. Since we + * have a contract with the user that on acknowledged or observed + * writes are is synced on disk, we must exit. */ + serverLog(LL_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting..."); + exit(1); + } else { + /* Recover from failed write leaving data into the buffer. However + * set an error to stop accepting writes as long as the error + * condition is not cleared. 
*/ + server.aof_last_write_status = C_ERR; + + /* Trim the sds buffer if there was a partial write, and there + * was no way to undo it with ftruncate(2). */ + if (nwritten > 0) { + server.aof_current_size += nwritten; + server.aof_last_incr_size += nwritten; + sdsrange(server.aof_buf,nwritten,-1); + } + return; /* We'll try again on the next call... */ + } + } else { + /* Successful write(2). If AOF was in error state, restore the + * OK state and log the event. */ + if (server.aof_last_write_status == C_ERR) { + serverLog(LL_NOTICE, + "AOF write error looks solved, Redis can write again."); + server.aof_last_write_status = C_OK; + } + } + server.aof_current_size += nwritten; + server.aof_last_incr_size += nwritten; + + /* Re-use AOF buffer when it is small enough. The maximum comes from the + * arena size of 4k minus some overhead (but is otherwise arbitrary). */ + if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) { + sdsclear(server.aof_buf); + } else { + sdsfree(server.aof_buf); + server.aof_buf = sdsempty(); + } + +try_fsync: + /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are + * children doing I/O in the background. */ + if (server.aof_no_fsync_on_rewrite && hasActiveChildProcess()) + return; + + /* Perform the fsync if needed. */ + if (server.aof_fsync == AOF_FSYNC_ALWAYS) { + /* redis_fsync is defined as fdatasync() for Linux in order to avoid + * flushing metadata. */ + latencyStartMonitor(latency); + /* Let's try to get this data on the disk. To guarantee data safe when + * the AOF fsync policy is 'always', we should exit if failed to fsync + * AOF (see comment next to the exit(1) after write error above). */ + if (redis_fsync(server.aof_fd) == -1) { + serverLog(LL_WARNING,"Can't persist AOF for fsync error when the " + "AOF fsync policy is 'always': %s. 
Exiting...", strerror(errno)); + exit(1); + } + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("aof-fsync-always",latency); + server.aof_last_incr_fsync_offset = server.aof_last_incr_size; + server.aof_last_fsync = server.mstime; + atomicSet(server.fsynced_reploff_pending, server.master_repl_offset); + } else if (server.aof_fsync == AOF_FSYNC_EVERYSEC && + server.mstime - server.aof_last_fsync >= 1000) { + if (!sync_in_progress) { + aof_background_fsync(server.aof_fd); + server.aof_last_incr_fsync_offset = server.aof_last_incr_size; + } + server.aof_last_fsync = server.mstime; + } +} + +sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) { + char buf[32]; + int len, j; + robj *o; + + buf[0] = '*'; + len = 1+ll2string(buf+1,sizeof(buf)-1,argc); + buf[len++] = '\r'; + buf[len++] = '\n'; + dst = sdscatlen(dst,buf,len); + + for (j = 0; j < argc; j++) { + o = getDecodedObject(argv[j]); + buf[0] = '$'; + len = 1+ll2string(buf+1,sizeof(buf)-1,sdslen(o->ptr)); + buf[len++] = '\r'; + buf[len++] = '\n'; + dst = sdscatlen(dst,buf,len); + dst = sdscatlen(dst,o->ptr,sdslen(o->ptr)); + dst = sdscatlen(dst,"\r\n",2); + decrRefCount(o); + } + return dst; +} + +/* Generate a piece of timestamp annotation for AOF if current record timestamp + * in AOF is not equal server unix time. If we specify 'force' argument to 1, + * we would generate one without check, currently, it is useful in AOF rewriting + * child process which always needs to record one timestamp at the beginning of + * rewriting AOF. + * + * Timestamp annotation format is "#TS:${timestamp}\r\n". "TS" is short of + * timestamp and this method could save extra bytes in AOF. */ +sds genAofTimestampAnnotationIfNeeded(int force) { + sds ts = NULL; + + if (force || server.aof_cur_timestamp < server.unixtime) { + server.aof_cur_timestamp = force ? 
time(NULL) : server.unixtime; + ts = sdscatfmt(sdsempty(), "#TS:%I\r\n", server.aof_cur_timestamp); + serverAssert(sdslen(ts) <= AOF_ANNOTATION_LINE_MAX_LEN); + } + return ts; +} + +/* Write the given command to the aof file. + * dictid - dictionary id the command should be applied to, + * this is used in order to decide if a `select` command + * should also be written to the aof. Value of -1 means + * to avoid writing `select` command in any case. + * argv - The command to write to the aof. + * argc - Number of values in argv + */ +void feedAppendOnlyFile(int dictid, robj **argv, int argc) { + sds buf = sdsempty(); + + serverAssert(dictid == -1 || (dictid >= 0 && dictid < server.dbnum)); + + /* Feed timestamp if needed */ + if (server.aof_timestamp_enabled) { + sds ts = genAofTimestampAnnotationIfNeeded(0); + if (ts != NULL) { + buf = sdscatsds(buf, ts); + sdsfree(ts); + } + } + + /* The DB this command was targeting is not the same as the last command + * we appended. To issue a SELECT command is needed. */ + if (dictid != -1 && dictid != server.aof_selected_db) { + char seldb[64]; + + snprintf(seldb,sizeof(seldb),"%d",dictid); + buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n", + (unsigned long)strlen(seldb),seldb); + server.aof_selected_db = dictid; + } + + /* All commands should be propagated the same way in AOF as in replication. + * No need for AOF-specific translation. */ + buf = catAppendOnlyGenericCommand(buf,argc,argv); + + /* Append to the AOF buffer. This will be flushed on disk just before + * of re-entering the event loop, so before the client will get a + * positive reply about the operation performed. 
*/ + if (server.aof_state == AOF_ON || + (server.aof_state == AOF_WAIT_REWRITE && server.child_type == CHILD_TYPE_AOF)) + { + server.aof_buf = sdscatlen(server.aof_buf, buf, sdslen(buf)); + } + + sdsfree(buf); +} + +/* ---------------------------------------------------------------------------- + * AOF loading + * ------------------------------------------------------------------------- */ + +/* In Redis commands are always executed in the context of a client, so in + * order to load the append only file we need to create a fake client. */ +struct client *createAOFClient(void) { + struct client *c = createClient(NULL); + + c->id = CLIENT_ID_AOF; /* So modules can identify it's the AOF client. */ + + /* + * The AOF client should never be blocked (unlike master + * replication connection). + * This is because blocking the AOF client might cause + * deadlock (because potentially no one will unblock it). + * Also, if the AOF client will be blocked just for + * background processing there is a chance that the + * command execution order will be violated. + */ + c->flags = CLIENT_DENY_BLOCKING; + + /* We set the fake client as a slave waiting for the synchronization + * so that Redis will not try to send replies to this client. */ + c->replstate = SLAVE_STATE_WAIT_BGSAVE_START; + return c; +} + +static int truncateAppendOnlyFile(char *filename, off_t valid_up_to) { + if (valid_up_to == -1) { + serverLog(LL_WARNING,"Last valid command offset is invalid"); + return 0; + } + + if (truncate(filename, valid_up_to) == -1) { + serverLog(LL_WARNING,"Error truncating the AOF file %s: %s", + filename, strerror(errno)); + return 0; + } + + /* Make sure the AOF file descriptor points to the end of the + * file after the truncate call. */ + if (server.aof_fd != -1 && lseek(server.aof_fd, 0, SEEK_END) == -1) { + serverLog(LL_WARNING,"Can't seek the end of the AOF file %s: %s", + filename, strerror(errno)); + return 0; + } + + return 1; /* Success */ +} + +/* Replay an append log file. 
On success AOF_OK or AOF_TRUNCATED is returned,
 * otherwise, one of the following is returned:
 * AOF_OPEN_ERR: Failed to open the AOF file.
 * AOF_NOT_EXIST: AOF file doesn't exist.
 * AOF_EMPTY: The AOF file is empty (nothing to load).
 * AOF_FAILED: Failed to load the AOF file. */
int loadSingleAppendOnlyFile(char *filename) {
    struct client *fakeClient;
    struct redis_stat sb;
    int old_aof_state = server.aof_state;
    long loops = 0;
    off_t valid_up_to = 0; /* Offset of latest well-formed command loaded. */
    off_t valid_before_multi = 0; /* Offset before MULTI command loaded. */
    off_t last_progress_report_size = 0;
    int ret = AOF_OK;

    sds aof_filepath = makePath(server.aof_dirname, filename);
    FILE *fp = fopen(aof_filepath, "r");
    if (fp == NULL) {
        /* Save fopen's errno: redis_stat below may overwrite errno. */
        int en = errno;
        if (redis_stat(aof_filepath, &sb) == 0 || errno != ENOENT) {
            serverLog(LL_WARNING,"Fatal error: can't open the append log file %s for reading: %s", filename, strerror(en));
            sdsfree(aof_filepath);
            return AOF_OPEN_ERR;
        } else {
            serverLog(LL_WARNING,"The append log file %s doesn't exist: %s", filename, strerror(errno));
            sdsfree(aof_filepath);
            return AOF_NOT_EXIST;
        }
    }

    if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
        fclose(fp);
        sdsfree(aof_filepath);
        return AOF_EMPTY;
    }

    /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
     * to the same file we're about to read. */
    server.aof_state = AOF_OFF;

    client *old_cur_client = server.current_client;
    client *old_exec_client = server.executing_client;
    fakeClient = createAOFClient();
    server.current_client = server.executing_client = fakeClient;

    /* Check if the AOF file is in RDB format (it may be RDB encoded base AOF
     * or old style RDB-preamble AOF). In that case we need to load the RDB file
     * and later continue loading the AOF tail if it is an old style RDB-preamble AOF. */
    char sig[5]; /* "REDIS" */
    if (fread(sig,1,5,fp) != 5 || memcmp(sig,"REDIS",5) != 0) {
        /* Not in RDB format, seek back at 0 offset. */
        if (fseek(fp,0,SEEK_SET) == -1) goto readerr;
    } else {
        /* RDB format. Pass loading the RDB functions. */
        rio rdb;
        /* "Old style" means the legacy single-file AOF with an RDB preamble,
         * recognized by the file having the configured aof_filename. */
        int old_style = !strcmp(filename, server.aof_filename);
        if (old_style)
            serverLog(LL_NOTICE, "Reading RDB preamble from AOF file...");
        else
            serverLog(LL_NOTICE, "Reading RDB base file on AOF loading...");

        if (fseek(fp,0,SEEK_SET) == -1) goto readerr;
        rioInitWithFile(&rdb,fp);
        if (rdbLoadRio(&rdb,RDBFLAGS_AOF_PREAMBLE,NULL) != C_OK) {
            if (old_style)
                serverLog(LL_WARNING, "Error reading the RDB preamble of the AOF file %s, AOF loading aborted", filename);
            else
                serverLog(LL_WARNING, "Error reading the RDB base file %s, AOF loading aborted", filename);

            ret = AOF_FAILED;
            goto cleanup;
        } else {
            loadingAbsProgress(ftello(fp));
            last_progress_report_size = ftello(fp);
            if (old_style) serverLog(LL_NOTICE, "Reading the remaining AOF tail...");
        }
    }

    /* Read the actual AOF file, in REPL format, command by command. */
    while(1) {
        int argc, j;
        unsigned long len;
        robj **argv;
        char buf[AOF_ANNOTATION_LINE_MAX_LEN];
        sds argsds;
        struct redisCommand *cmd;

        /* Serve the clients from time to time */
        if (!(loops++ % 1024)) {
            off_t progress_delta = ftello(fp) - last_progress_report_size;
            loadingIncrProgress(progress_delta);
            last_progress_report_size += progress_delta;
            processEventsWhileBlocked();
            processModuleLoadingProgressEvent(1);
        }
        if (fgets(buf,sizeof(buf),fp) == NULL) {
            if (feof(fp)) {
                break;
            } else {
                goto readerr;
            }
        }
        if (buf[0] == '#') continue; /* Skip annotations */
        if (buf[0] != '*') goto fmterr;
        if (buf[1] == '\0') goto readerr;
        argc = atoi(buf+1);
        if (argc < 1) goto fmterr;
        /* Guard the zmalloc below against multiplication overflow. */
        if ((size_t)argc > SIZE_MAX / sizeof(robj*)) goto fmterr;

        /* Load the next command in the AOF as our fake client
         * argv. */
        argv = zmalloc(sizeof(robj*)*argc);
        fakeClient->argc = argc;
        fakeClient->argv = argv;
        fakeClient->argv_len = argc;

        for (j = 0; j < argc; j++) {
            /* Parse the argument len. */
            char *readres = fgets(buf,sizeof(buf),fp);
            if (readres == NULL || buf[0] != '$') {
                fakeClient->argc = j; /* Free up to j-1. */
                freeClientArgv(fakeClient);
                if (readres == NULL)
                    goto readerr;
                else
                    goto fmterr;
            }
            len = strtol(buf+1,NULL,10);

            /* Read it into a string object. */
            argsds = sdsnewlen(SDS_NOINIT,len);
            if (len && fread(argsds,len,1,fp) == 0) {
                sdsfree(argsds);
                fakeClient->argc = j; /* Free up to j-1. */
                freeClientArgv(fakeClient);
                goto readerr;
            }
            argv[j] = createObject(OBJ_STRING,argsds);

            /* Discard CRLF. */
            if (fread(buf,2,1,fp) == 0) {
                fakeClient->argc = j+1; /* Free up to j. */
                freeClientArgv(fakeClient);
                goto readerr;
            }
        }

        /* Command lookup */
        cmd = lookupCommand(argv,argc);
        if (!cmd) {
            serverLog(LL_WARNING,
                      "Unknown command '%s' reading the append only file %s",
                      (char*)argv[0]->ptr, filename);
            freeClientArgv(fakeClient);
            ret = AOF_FAILED;
            goto cleanup;
        }

        /* Remember the offset of the last complete command before MULTI so
         * an unterminated transaction at EOF can be reverted. */
        if (cmd->proc == multiCommand) valid_before_multi = valid_up_to;

        /* Run the command in the context of a fake client */
        fakeClient->cmd = fakeClient->lastcmd = cmd;
        if (fakeClient->flags & CLIENT_MULTI &&
            fakeClient->cmd->proc != execCommand)
        {
            /* queueMultiCommand requires a pendingCommand, so we create a "fake" one here
             * for it to consume */
            pendingCommand *pcmd = zmalloc(sizeof(pendingCommand));
            initPendingCommand(pcmd);
            addPendingCommand(&fakeClient->pending_cmds, pcmd);

            pcmd->argc = argc;
            pcmd->argv_len = argc;
            pcmd->argv = argv;
            pcmd->cmd = cmd;

            /* Note: we don't have to attempt calling evalGetCommandFlags,
             * since this is AOF, the checks in processCommand are not made
             * anyway.*/
            queueMultiCommand(fakeClient, cmd->flags);
        } else {
            cmd->proc(fakeClient);
            fakeClient->all_argv_len_sum = 0; /* Otherwise no one cleans this up and we reach cleanup with it non-zero */
        }

        /* The fake client should not have a reply */
        serverAssert(fakeClient->bufpos == 0 &&
                     listLength(fakeClient->reply) == 0);

        /* The fake client should never get blocked */
        serverAssert((fakeClient->flags & CLIENT_BLOCKED) == 0);

        /* Clean up. Command code may have changed argv/argc so we use the
         * argv/argc of the client instead of the local variables. */
        freeClientArgv(fakeClient);
        if (server.aof_load_truncated || server.aof_load_corrupt_tail_max_size) valid_up_to = ftello(fp);
        if (server.key_load_delay)
            debugDelay(server.key_load_delay);
    }

    /* This point can only be reached when EOF is reached without errors.
     * If the client is in the middle of a MULTI/EXEC, handle it as it was
     * a short read, even if technically the protocol is correct: we want
     * to remove the unprocessed tail and continue. */
    if (fakeClient->flags & CLIENT_MULTI) {
        serverLog(LL_WARNING,
            "Revert incomplete MULTI/EXEC transaction in AOF file %s", filename);
        valid_up_to = valid_before_multi;
        goto uxeof;
    }

loaded_ok: /* DB loaded, cleanup and return success (AOF_OK or AOF_TRUNCATED). */
    loadingIncrProgress(ftello(fp) - last_progress_report_size);
    server.aof_state = old_aof_state;
    goto cleanup;

readerr: /* Read error. If feof(fp) is true, fall through to unexpected EOF. */
    if (!feof(fp)) {
        serverLog(LL_WARNING,"Unrecoverable error reading the append only file %s: %s", filename, strerror(errno));
        ret = AOF_FAILED;
        goto cleanup;
    }

uxeof: /* Unexpected AOF end of file. */
    if (server.aof_load_truncated) {
        serverLog(LL_WARNING,"!!! Warning: short read while loading the AOF file %s!!!", filename);
        serverLog(LL_WARNING,"!!! Truncating the AOF %s at offset %llu !!!",
            filename, (unsigned long long) valid_up_to);
        if (truncateAppendOnlyFile(aof_filepath, valid_up_to)) {
            serverLog(LL_WARNING, "AOF %s loaded anyway because aof-load-truncated is enabled", aof_filepath);
            ret = AOF_TRUNCATED;
            goto loaded_ok;
        }
    }
    serverLog(LL_WARNING, "Unexpected end of file reading the append only file %s. You can: "
        "1) Make a backup of your AOF file, then use ./redis-check-aof --fix . "
        "2) Alternatively you can set the 'aof-load-truncated' configuration option to yes and restart the server.", filename);
    ret = AOF_FAILED;
    goto cleanup;

fmterr: /* Format error. */
    /* fmterr may be caused by accidentally machine shutdown, so if the broken tail
     * is less than a specified size, try to recover it automatically */
    if (server.aof_load_corrupt_tail_max_size && sb.st_size - valid_up_to < server.aof_load_corrupt_tail_max_size) {
        serverLog(LL_WARNING,"!!! Warning: corrupt AOF file tail!!!");
        serverLog(LL_WARNING,"!!! Truncating the AOF %s at offset %llu (remaining %llu) !!!",
            aof_filepath, (unsigned long long) valid_up_to, (unsigned long long) sb.st_size - valid_up_to);
        if (truncateAppendOnlyFile(aof_filepath, valid_up_to)) {
            serverLog(LL_WARNING, "AOF %s loaded anyway because aof-load-corrupt-tail-max-size is enabled", aof_filepath);
            ret = AOF_BROKEN_RECOVERED;
            goto loaded_ok;
        }
    }
    serverLog(LL_WARNING, "Bad file format reading the append only file %s at offset %llu. \
        make a backup of your AOF file, then use ./redis-check-aof --fix . \
        Alternatively you can set the 'aof-load-corrupt-tail-max-size' configuration option to %llu and restart the server.",
        aof_filepath, (unsigned long long)valid_up_to, (unsigned long long) sb.st_size - valid_up_to);
    ret = AOF_FAILED;
    /* fall through to cleanup. */

cleanup:
    if (fakeClient) freeClient(fakeClient);
    server.current_client = old_cur_client;
    server.executing_client = old_exec_client;
    /* Duplicate the descriptor so the bio job below can drop the page cache
     * after fclose() has closed the original one. */
    int fd = dup(fileno(fp));
    fclose(fp);
    /* Reclaim page cache memory used by the AOF file in background. */
    if (fd >= 0) bioCreateCloseJob(fd, 0, 1);
    sdsfree(aof_filepath);
    return ret;
}

/* Load the AOF files according the aofManifest pointed by am. */
int loadAppendOnlyFiles(aofManifest *am) {
    serverAssert(am != NULL);
    int status, ret = AOF_OK;
    long long start;
    off_t total_size = 0, base_size = 0;
    sds aof_name;
    int total_num, aof_num = 0, last_file;

    /* If the 'server.aof_filename' file exists in dir, we may be starting
     * from an old redis version. We will use enter upgrade mode in three situations.
     *
     * 1. If the 'server.aof_dirname' directory not exist
     * 2. If the 'server.aof_dirname' directory exists but the manifest file is missing
     * 3. If the 'server.aof_dirname' directory exists and the manifest file it contains
     *    has only one base AOF record, and the file name of this base AOF is 'server.aof_filename',
     *    and the 'server.aof_filename' file not exist in 'server.aof_dirname' directory
     * */
    if (fileExist(server.aof_filename)) {
        if (!dirExists(server.aof_dirname) ||
            (am->base_aof_info == NULL && listLength(am->incr_aof_list) == 0) ||
            (am->base_aof_info != NULL && listLength(am->incr_aof_list) == 0 &&
             !strcmp(am->base_aof_info->file_name, server.aof_filename) && !aofFileExist(server.aof_filename)))
        {
            aofUpgradePrepare(am);
        }
    }

    if (am->base_aof_info == NULL && listLength(am->incr_aof_list) == 0) {
        return AOF_NOT_EXIST;
    }

    total_num = getBaseAndIncrAppendOnlyFilesNum(am);
    serverAssert(total_num > 0);

    /* Here we calculate the total size of all BASE and INCR files in
     * advance, it will be set to `server.loading_total_bytes`. */
    total_size = getBaseAndIncrAppendOnlyFilesSize(am, &status);
    if (status != AOF_OK) {
        /* If an AOF exists in the manifest but not on the disk, we consider this to be a fatal error. */
        if (status == AOF_NOT_EXIST) status = AOF_FAILED;

        return status;
    } else if (total_size == 0) {
        return AOF_EMPTY;
    }

    startLoading(total_size, RDBFLAGS_AOF_PREAMBLE, 0);

    /* Load BASE AOF if needed. */
    if (am->base_aof_info) {
        serverAssert(am->base_aof_info->file_type == AOF_FILE_TYPE_BASE);
        aof_name = (char*)am->base_aof_info->file_name;
        updateLoadingFileName(aof_name);
        base_size = getAppendOnlyFileSize(aof_name, NULL);
        last_file = ++aof_num == total_num;
        start = ustime();
        ret = loadSingleAppendOnlyFile(aof_name);
        if (ret == AOF_OK || ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && last_file)) {
            serverLog(LL_NOTICE, "DB loaded from base file %s: %.3f seconds",
                aof_name, (float)(ustime()-start)/1000000);
        }

        /* If the truncated file is not the last file, we consider this to be a fatal error. */
        if ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && !last_file) {
            ret = AOF_FAILED;
            serverLog(LL_WARNING, "Fatal error: the truncated file is not the last file");
        }

        if (ret == AOF_OPEN_ERR || ret == AOF_FAILED) {
            goto cleanup;
        }
    }

    /* Load INCR AOFs if needed. */
    if (listLength(am->incr_aof_list)) {
        listNode *ln;
        listIter li;

        listRewind(am->incr_aof_list, &li);
        while ((ln = listNext(&li)) != NULL) {
            aofInfo *ai = (aofInfo*)ln->value;
            serverAssert(ai->file_type == AOF_FILE_TYPE_INCR);
            aof_name = (char*)ai->file_name;
            updateLoadingFileName(aof_name);
            last_file = ++aof_num == total_num;
            start = ustime();
            ret = loadSingleAppendOnlyFile(aof_name);
            if (ret == AOF_OK || ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && last_file)) {
                serverLog(LL_NOTICE, "DB loaded from incr file %s: %.3f seconds",
                    aof_name, (float)(ustime()-start)/1000000);
            }

            /* We know that (at least) one of the AOF files has data (total_size > 0),
             * so empty incr AOF file doesn't count as a AOF_EMPTY result */
            if (ret == AOF_EMPTY) ret = AOF_OK;

            /* If the truncated file is not the last file, we consider this to be a fatal error. */
            if ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && !last_file) {
                ret = AOF_FAILED;
                serverLog(LL_WARNING, "Fatal error: the truncated file is not the last file");
            }

            if (ret == AOF_OPEN_ERR || ret == AOF_FAILED) {
                goto cleanup;
            }
        }
    }

    server.aof_current_size = total_size;
    /* Ideally, the aof_rewrite_base_size variable should hold the size of the
     * AOF when the last rewrite ended, this should include the size of the
     * incremental file that was created during the rewrite since otherwise we
     * risk the next automatic rewrite to happen too soon (or immediately if
     * auto-aof-rewrite-percentage is low). However, since we do not persist
     * aof_rewrite_base_size information anywhere, we initialize it on restart
     * to the size of BASE AOF file. This might cause the first AOFRW to be
     * executed early, but that shouldn't be a problem since everything will be
     * fine after the first AOFRW. */
    server.aof_rewrite_base_size = base_size;

cleanup:
    stopLoading(ret == AOF_OK || ret == AOF_TRUNCATED);
    return ret;
}

/* ----------------------------------------------------------------------------
 * AOF rewrite
 * ------------------------------------------------------------------------- */

/* Delegate writing an object to writing a bulk string or bulk long long.
 * This is not placed in rio.c since that adds the server.h dependency. */
int rioWriteBulkObject(rio *r, robj *obj) {
    /* Avoid using getDecodedObject to help copy-on-write (we are often
     * in a child process when this function is called). */
    if (obj->encoding == OBJ_ENCODING_INT) {
        return rioWriteBulkLongLong(r,(long)obj->ptr);
    } else if (sdsEncodedObject(obj)) {
        return rioWriteBulkString(r,obj->ptr,sdslen(obj->ptr));
    } else {
        serverPanic("Unknown string encoding");
    }
}

/* Emit the commands needed to rebuild a list object.
 * The list is emitted as a sequence of RPUSH commands of at most
 * AOF_REWRITE_ITEMS_PER_CMD elements each.
 * The function returns 0 on error, 1 on success. */
int rewriteListObject(rio *r, robj *key, robj *o) {
    long long count = 0, items = listTypeLength(o);

    listTypeIterator li;
    listTypeEntry entry;
    listTypeInitIterator(&li, o, 0, LIST_TAIL);
    while (listTypeNext(&li, &entry)) {
        if (count == 0) {
            /* Start a new RPUSH covering the next batch of elements. */
            int cmd_items = (items > AOF_REWRITE_ITEMS_PER_CMD) ?
                AOF_REWRITE_ITEMS_PER_CMD : items;
            if (!rioWriteBulkCount(r,'*',2+cmd_items) ||
                !rioWriteBulkString(r,"RPUSH",5) ||
                !rioWriteBulkObject(r,key))
            {
                listTypeResetIterator(&li);
                return 0;
            }
        }

        unsigned char *vstr;
        size_t vlen;
        long long lval;
        vstr = listTypeGetValue(&entry,&vlen,&lval);
        if (vstr) {
            if (!rioWriteBulkString(r,(char*)vstr,vlen)) {
                listTypeResetIterator(&li);
                return 0;
            }
        } else {
            if (!rioWriteBulkLongLong(r,lval)) {
                listTypeResetIterator(&li);
                return 0;
            }
        }
        if (++count == AOF_REWRITE_ITEMS_PER_CMD) count = 0;
        items--;
    }
    listTypeResetIterator(&li);
    return 1;
}

/* Emit the commands needed to rebuild a set object.
 * The set is emitted as a sequence of SADD commands of at most
 * AOF_REWRITE_ITEMS_PER_CMD members each.
 * The function returns 0 on error, 1 on success. */
int rewriteSetObject(rio *r, robj *key, robj *o) {
    long long count = 0, items = setTypeSize(o);
    setTypeIterator si;
    char *str;
    size_t len;
    int64_t llval;
    setTypeInitIterator(&si, o);
    while (setTypeNext(&si, &str, &len, &llval) != -1) {
        if (count == 0) {
            /* Start a new SADD covering the next batch of members. */
            int cmd_items = (items > AOF_REWRITE_ITEMS_PER_CMD) ?
                AOF_REWRITE_ITEMS_PER_CMD : items;
            if (!rioWriteBulkCount(r,'*',2+cmd_items) ||
                !rioWriteBulkString(r,"SADD",4) ||
                !rioWriteBulkObject(r,key))
            {
                setTypeResetIterator(&si);
                return 0;
            }
        }
        /* 'str' is NULL when the iterator yields an integer member. */
        size_t written = str ?
            rioWriteBulkString(r, str, len) : rioWriteBulkLongLong(r, llval);
        if (!written) {
            setTypeResetIterator(&si);
            return 0;
        }
        if (++count == AOF_REWRITE_ITEMS_PER_CMD) count = 0;
        items--;
    }
    setTypeResetIterator(&si);
    return 1;
}

/* Emit the commands needed to rebuild a sorted set object.
 * The sorted set is emitted as a sequence of ZADD commands of at most
 * AOF_REWRITE_ITEMS_PER_CMD score/member pairs each, handling both the
 * listpack and the skiplist encodings.
 * The function returns 0 on error, 1 on success. */
int rewriteSortedSetObject(rio *r, robj *key, robj *o) {
    long long count = 0, items = zsetLength(o);

    if (o->encoding == OBJ_ENCODING_LISTPACK) {
        unsigned char *zl = o->ptr;
        unsigned char *eptr, *sptr;
        unsigned char *vstr;
        unsigned int vlen;
        long long vll;
        double score;

        /* eptr walks the member entries, sptr the score entries. */
        eptr = lpSeek(zl,0);
        serverAssert(eptr != NULL);
        sptr = lpNext(zl,eptr);
        serverAssert(sptr != NULL);

        while (eptr != NULL) {
            vstr = lpGetValue(eptr,&vlen,&vll);
            score = zzlGetScore(sptr);

            if (count == 0) {
                /* Start a new ZADD covering the next batch of pairs. */
                int cmd_items = (items > AOF_REWRITE_ITEMS_PER_CMD) ?
                    AOF_REWRITE_ITEMS_PER_CMD : items;

                if (!rioWriteBulkCount(r,'*',2+cmd_items*2) ||
                    !rioWriteBulkString(r,"ZADD",4) ||
                    !rioWriteBulkObject(r,key))
                {
                    return 0;
                }
            }
            if (!rioWriteBulkDouble(r,score)) return 0;
            if (vstr != NULL) {
                if (!rioWriteBulkString(r,(char*)vstr,vlen)) return 0;
            } else {
                if (!rioWriteBulkLongLong(r,vll)) return 0;
            }
            zzlNext(zl,&eptr,&sptr);
            if (++count == AOF_REWRITE_ITEMS_PER_CMD) count = 0;
            items--;
        }
    } else if (o->encoding == OBJ_ENCODING_SKIPLIST) {
        zset *zs = o->ptr;
        dictIterator di;
        dictEntry *de;

        dictInitIterator(&di, zs->dict);
        while((de = dictNext(&di)) != NULL) {
            zskiplistNode *znode = dictGetKey(de);
            sds ele = zslGetNodeElement(znode);
            double score = znode->score;

            if (count == 0) {
                /* Start a new ZADD covering the next batch of pairs. */
                int cmd_items = (items > AOF_REWRITE_ITEMS_PER_CMD) ?
                    AOF_REWRITE_ITEMS_PER_CMD : items;

                if (!rioWriteBulkCount(r,'*',2+cmd_items*2) ||
                    !rioWriteBulkString(r,"ZADD",4) ||
                    !rioWriteBulkObject(r,key))
                {
                    dictResetIterator(&di);
                    return 0;
                }
            }
            if (!rioWriteBulkDouble(r,score) ||
                !rioWriteBulkString(r,ele,sdslen(ele)))
            {
                dictResetIterator(&di);
                return 0;
            }
            if (++count == AOF_REWRITE_ITEMS_PER_CMD) count = 0;
            items--;
        }
        dictResetIterator(&di);
    } else {
        serverPanic("Unknown sorted zset encoding");
    }
    return 1;
}

/* Write either the key or the value of the currently selected item of a hash.
 * The 'hi' argument passes a valid Redis hash iterator.
 * The 'what' field specifies if to write a key or a value and can be
 * either OBJ_HASH_KEY or OBJ_HASH_VALUE.
 *
 * The function returns 0 on error, non-zero on success.
 */
+static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) {
+    if ((hi->encoding == OBJ_ENCODING_LISTPACK) || (hi->encoding == OBJ_ENCODING_LISTPACK_EX)) {
+        unsigned char *vstr = NULL;
+        unsigned int vlen = UINT_MAX;
+        long long vll = LLONG_MAX;
+
+        hashTypeCurrentFromListpack(hi, what, &vstr, &vlen, &vll, NULL);
+        /* 'vstr' set means a string element, otherwise it is the integer in 'vll'. */
+        if (vstr)
+            return rioWriteBulkString(r, (char*)vstr, vlen);
+        else
+            return rioWriteBulkLongLong(r, vll);
+    } else if (hi->encoding == OBJ_ENCODING_HT) {
+        char *str;
+        size_t len;
+        hashTypeCurrentFromHashTable(hi, what, &str, &len, NULL);
+        return rioWriteBulkString(r, str, len);
+    }
+
+    serverPanic("Unknown hash encoding");
+    return 0;
+}
+
+/* Emit the commands needed to rebuild a hash object.
+ * The function returns 0 on error, 1 on success. */
+int rewriteHashObject(rio *r, robj *key, robj *o) {
+    int res = 0; /*fail*/
+
+    hashTypeIterator hi;
+    long long count = 0, items = hashTypeLength(o, 0);
+
+    /* If any field has an expiration (HFE), fields must be emitted one by
+     * one so each can be followed by its own HPEXPIREAT command. */
+    int isHFE = hashTypeGetMinExpire(o, 0) != EB_EXPIRE_TIME_INVALID;
+    hashTypeInitIterator(&hi, o);
+
+    if (!isHFE) {
+        /* No TTLs anywhere: batch fields into variadic HMSET commands,
+         * at most AOF_REWRITE_ITEMS_PER_CMD field/value pairs each. */
+        while (hashTypeNext(&hi, 0) != C_ERR) {
+            if (count == 0) {
+                int cmd_items = (items > AOF_REWRITE_ITEMS_PER_CMD) ?
+                    AOF_REWRITE_ITEMS_PER_CMD : items;
+                if (!rioWriteBulkCount(r, '*', 2 + cmd_items * 2) ||
+                    !rioWriteBulkString(r, "HMSET", 5) ||
+                    !rioWriteBulkObject(r, key))
+                    goto reHashEnd;
+            }
+
+            if (!rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_KEY) ||
+                !rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_VALUE))
+                goto reHashEnd;
+
+            if (++count == AOF_REWRITE_ITEMS_PER_CMD) count = 0;
+            items--;
+        }
+    } else {
+        while (hashTypeNext(&hi, 0) != C_ERR) {
+
+            /* One HMSET per field: HMSET <key> <field> <value> (4 args). */
+            char hmsetCmd[] = "*4\r\n$5\r\nHMSET\r\n";
+            if ( (!rioWrite(r, hmsetCmd, sizeof(hmsetCmd) - 1)) ||
+                 (!rioWriteBulkObject(r, key)) ||
+                 (!rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_KEY)) ||
+                 (!rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_VALUE)) )
+                goto reHashEnd;
+
+            if (hi.expire_time != EB_EXPIRE_TIME_INVALID) {
+                /* HPEXPIREAT <key> <time> FIELDS 1 <field> (6 args). */
+                char cmd[] = "*6\r\n$10\r\nHPEXPIREAT\r\n";
+                if ( (!rioWrite(r, cmd, sizeof(cmd) - 1)) ||
+                     (!rioWriteBulkObject(r, key)) ||
+                     (!rioWriteBulkLongLong(r, hi.expire_time)) ||
+                     (!rioWriteBulkString(r, "FIELDS", 6)) ||
+                     (!rioWriteBulkString(r, "1", 1)) ||
+                     (!rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_KEY)) )
+                    goto reHashEnd;
+            }
+        }
+    }
+
+    res = 1; /* success */
+
+reHashEnd:
+    hashTypeResetIterator(&hi);
+    return res;
+}
+
+/* Helper for rewriteStreamObject() that generates a bulk string into the
+ * AOF representing the ID 'id'. */
+int rioWriteBulkStreamID(rio *r,streamID *id) {
+    int retval;
+
+    sds replyid = sdscatfmt(sdsempty(),"%U-%U",id->ms,id->seq);
+    retval = rioWriteBulkString(r,replyid,sdslen(replyid));
+    sdsfree(replyid);
+    return retval;
+}
+
+/* Helper for rewriteStreamObject(): emit the XCLAIM needed in order to
+ * add the message described by 'nack' having the id 'rawid', into the pending
+ * list of the specified consumer. All this in the context of the specified
+ * key and group.
*/ +int rioWriteStreamPendingEntry(rio *r, robj *key, const char *groupname, size_t groupname_len, streamConsumer *consumer, unsigned char *rawid, streamNACK *nack) { + /* XCLAIM 0 TIME + RETRYCOUNT JUSTID FORCE. */ + streamID id; + streamDecodeID(rawid,&id); + if (rioWriteBulkCount(r,'*',12) == 0) return 0; + if (rioWriteBulkString(r,"XCLAIM",6) == 0) return 0; + if (rioWriteBulkObject(r,key) == 0) return 0; + if (rioWriteBulkString(r,groupname,groupname_len) == 0) return 0; + if (rioWriteBulkString(r,consumer->name,sdslen(consumer->name)) == 0) return 0; + if (rioWriteBulkString(r,"0",1) == 0) return 0; + if (rioWriteBulkStreamID(r,&id) == 0) return 0; + if (rioWriteBulkString(r,"TIME",4) == 0) return 0; + if (rioWriteBulkLongLong(r,nack->delivery_time) == 0) return 0; + if (rioWriteBulkString(r,"RETRYCOUNT",10) == 0) return 0; + if (rioWriteBulkLongLong(r,nack->delivery_count) == 0) return 0; + if (rioWriteBulkString(r,"JUSTID",6) == 0) return 0; + if (rioWriteBulkString(r,"FORCE",5) == 0) return 0; + return 1; +} + +/* Helper for rewriteStreamObject(): emit the XGROUP CREATECONSUMER is + * needed in order to create consumers that do not have any pending entries. + * All this in the context of the specified key and group. */ +int rioWriteStreamEmptyConsumer(rio *r, robj *key, const char *groupname, size_t groupname_len, streamConsumer *consumer) { + /* XGROUP CREATECONSUMER */ + if (rioWriteBulkCount(r,'*',5) == 0) return 0; + if (rioWriteBulkString(r,"XGROUP",6) == 0) return 0; + if (rioWriteBulkString(r,"CREATECONSUMER",14) == 0) return 0; + if (rioWriteBulkObject(r,key) == 0) return 0; + if (rioWriteBulkString(r,groupname,groupname_len) == 0) return 0; + if (rioWriteBulkString(r,consumer->name,sdslen(consumer->name)) == 0) return 0; + return 1; +} + +/* Emit the commands needed to rebuild a stream object. + * The function returns 0 on error, 1 on success. 
 */
+int rewriteStreamObject(rio *r, robj *key, robj *o) {
+    stream *s = o->ptr;
+    streamIterator si;
+    streamIteratorStart(&si,s,NULL,NULL,0);
+    streamID id;
+    int64_t numfields;
+
+    if (s->length) {
+        /* Reconstruct the stream data using XADD commands. */
+        while(streamIteratorGetID(&si,&id,&numfields)) {
+            /* Emit a two elements array for each item. The first is
+             * the ID, the second is an array of field-value pairs. */
+
+            /* Emit the XADD <key> <id> ...fields... command. */
+            if (!rioWriteBulkCount(r,'*',3+numfields*2) ||
+                !rioWriteBulkString(r,"XADD",4) ||
+                !rioWriteBulkObject(r,key) ||
+                !rioWriteBulkStreamID(r,&id))
+            {
+                streamIteratorStop(&si);
+                return 0;
+            }
+            while(numfields--) {
+                unsigned char *field, *value;
+                int64_t field_len, value_len;
+                streamIteratorGetField(&si,&field,&value,&field_len,&value_len);
+                if (!rioWriteBulkString(r,(char*)field,field_len) ||
+                    !rioWriteBulkString(r,(char*)value,value_len))
+                {
+                    streamIteratorStop(&si);
+                    return 0;
+                }
+            }
+        }
+    } else {
+        /* Use the XADD MAXLEN 0 trick to generate an empty stream if
+         * the key we are serializing is an empty string, which is possible
+         * for the Stream type. The dummy x/y entry is trimmed away
+         * immediately by MAXLEN 0. */
+        id.ms = 0; id.seq = 1;
+        if (!rioWriteBulkCount(r,'*',7) ||
+            !rioWriteBulkString(r,"XADD",4) ||
+            !rioWriteBulkObject(r,key) ||
+            !rioWriteBulkString(r,"MAXLEN",6) ||
+            !rioWriteBulkString(r,"0",1) ||
+            !rioWriteBulkStreamID(r,&id) ||
+            !rioWriteBulkString(r,"x",1) ||
+            !rioWriteBulkString(r,"y",1))
+        {
+            streamIteratorStop(&si);
+            return 0;
+        }
+    }
+
+    /* Append XSETID after XADD, make sure lastid is correct,
+     * in case of XDEL lastid.
+     */
+    if (!rioWriteBulkCount(r,'*',7) ||
+        !rioWriteBulkString(r,"XSETID",6) ||
+        !rioWriteBulkObject(r,key) ||
+        !rioWriteBulkStreamID(r,&s->last_id) ||
+        !rioWriteBulkString(r,"ENTRIESADDED",12) ||
+        !rioWriteBulkLongLong(r,s->entries_added) ||
+        !rioWriteBulkString(r,"MAXDELETEDID",12) ||
+        !rioWriteBulkStreamID(r,&s->max_deleted_entry_id))
+    {
+        streamIteratorStop(&si);
+        return 0;
+    }
+
+
+    /* Create all the stream consumer groups. */
+    if (s->cgroups) {
+        raxIterator ri;
+        raxStart(&ri,s->cgroups);
+        raxSeek(&ri,"^",NULL,0);
+        while(raxNext(&ri)) {
+            streamCG *group = ri.data;
+            /* Emit the XGROUP CREATE in order to create the group. */
+            if (!rioWriteBulkCount(r,'*',7) ||
+                !rioWriteBulkString(r,"XGROUP",6) ||
+                !rioWriteBulkString(r,"CREATE",6) ||
+                !rioWriteBulkObject(r,key) ||
+                !rioWriteBulkString(r,(char*)ri.key,ri.key_len) ||
+                !rioWriteBulkStreamID(r,&group->last_id) ||
+                !rioWriteBulkString(r,"ENTRIESREAD",11) ||
+                !rioWriteBulkLongLong(r,group->entries_read))
+            {
+                raxStop(&ri);
+                streamIteratorStop(&si);
+                return 0;
+            }
+
+            /* Generate XCLAIMs for each consumer that happens to
+             * have pending entries. Empty consumers would be generated with
+             * XGROUP CREATECONSUMER. */
+            raxIterator ri_cons;
+            raxStart(&ri_cons,group->consumers);
+            raxSeek(&ri_cons,"^",NULL,0);
+            while(raxNext(&ri_cons)) {
+                streamConsumer *consumer = ri_cons.data;
+                /* If there are no pending entries, just emit XGROUP CREATECONSUMER */
+                if (raxSize(consumer->pel) == 0) {
+                    if (rioWriteStreamEmptyConsumer(r,key,(char*)ri.key,
+                                                    ri.key_len,consumer) == 0)
+                    {
+                        raxStop(&ri_cons);
+                        raxStop(&ri);
+                        streamIteratorStop(&si);
+                        return 0;
+                    }
+                    continue;
+                }
+                /* For the current consumer, iterate all the PEL entries
+                 * to emit the XCLAIM protocol.
+                 */
+                raxIterator ri_pel;
+                raxStart(&ri_pel,consumer->pel);
+                raxSeek(&ri_pel,"^",NULL,0);
+                while(raxNext(&ri_pel)) {
+                    streamNACK *nack = ri_pel.data;
+                    if (rioWriteStreamPendingEntry(r,key,(char*)ri.key,
+                                                   ri.key_len,consumer,
+                                                   ri_pel.key,nack) == 0)
+                    {
+                        raxStop(&ri_pel);
+                        raxStop(&ri_cons);
+                        raxStop(&ri);
+                        streamIteratorStop(&si);
+                        return 0;
+                    }
+                }
+                raxStop(&ri_pel);
+            }
+            raxStop(&ri_cons);
+        }
+        raxStop(&ri);
+    }
+
+    streamIteratorStop(&si);
+    return 1;
+}
+
+/* Call the module type callback in order to rewrite a data type
+ * that is exported by a module and is not handled by Redis itself.
+ * The function returns 0 on error, 1 on success. */
+int rewriteModuleObject(rio *r, robj *key, robj *o, int dbid) {
+    RedisModuleIO io;
+    moduleValue *mv = o->ptr;
+    moduleType *mt = mv->type;
+    moduleInitIOContext(&io, &mt->entity, r, key, dbid);
+    /* The module callback does the actual serialization into 'r'. */
+    mt->aof_rewrite(&io,key,mv->value);
+    if (io.ctx) {
+        moduleFreeContext(io.ctx);
+        zfree(io.ctx);
+    }
+    return io.error ? 0 : 1;
+}
+
+/* Emit FUNCTION LOAD commands that restore every function library.
+ * Returns 1 on success, 0 on write error. */
+static int rewriteFunctions(rio *aof) {
+    dict *functions = functionsLibGet();
+    dictIterator iter;
+    dictEntry *entry = NULL;
+    dictInitIterator(&iter, functions);
+    while ((entry = dictNext(&iter))) {
+        functionLibInfo *li = dictGetVal(entry);
+        /* FUNCTION LOAD <code> (3 args). */
+        if (rioWrite(aof, "*3\r\n", 4) == 0) goto werr;
+        char function_load[] = "$8\r\nFUNCTION\r\n$4\r\nLOAD\r\n";
+        if (rioWrite(aof, function_load, sizeof(function_load) - 1) == 0) goto werr;
+        if (rioWriteBulkString(aof, li->code, sdslen(li->code)) == 0) goto werr;
+    }
+    dictResetIterator(&iter);
+    return 1;
+
+werr:
+    dictResetIterator(&iter);
+    return 0;
+}
+
+/* Rewrite one key/value pair (plus its expire and module metadata, if any)
+ * as commands into 'r'. Returns C_OK on success, C_ERR on write error. */
+int rewriteObject(rio *r, robj *key, robj *o, int dbid, long long expiretime) {
+    /* Save the key and associated value */
+    if (o->type == OBJ_STRING) {
+        /* Emit a SET command */
+        static const char cmd[]="*3\r\n$3\r\nSET\r\n";
+        if (rioWrite(r,cmd,sizeof(cmd)-1) == 0) return C_ERR;
+        /* Key and value */
+        if (rioWriteBulkObject(r,key) == 0) return C_ERR;
+        if (rioWriteBulkObject(r,o) == 0) return C_ERR;
+    } else if (o->type == OBJ_LIST) {
+        if (rewriteListObject(r,key,o) == 0) return C_ERR;
+    } else if (o->type == OBJ_SET) {
+        if (rewriteSetObject(r,key,o) == 0) return C_ERR;
+    } else if (o->type == OBJ_ZSET) {
+        if (rewriteSortedSetObject(r,key,o) == 0) return C_ERR;
+    } else if (o->type == OBJ_HASH) {
+        if (rewriteHashObject(r,key,o) == 0) return C_ERR;
+    } else if (o->type == OBJ_STREAM) {
+        if (rewriteStreamObject(r,key,o) == 0) return C_ERR;
+    } else if (o->type == OBJ_MODULE) {
+        if (rewriteModuleObject(r,key,o,dbid) == 0) return C_ERR;
+    } else {
+        serverPanic("Unknown object type");
+    }
+
+    /* Save the expire time */
+    if (expiretime != -1) {
+        static const char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";
+        if (rioWrite(r,cmd,sizeof(cmd)-1) == 0) return C_ERR;
+        if (rioWriteBulkObject(r,key) == 0) return C_ERR;
+        if (rioWriteBulkLongLong(r,expiretime) == 0) return C_ERR;
+    }
+
+    /* If modules metadata is available */
+    if ((getModuleMetaBits(o->metabits)) && (keyMetaOnAof(r, key, o, dbid) == 0))
+        return C_ERR;
+
+    return C_OK;
+}
+
+/* Write the whole dataset as commands into 'aof'.
+ * Returns C_OK on success, C_ERR on write error. */
+int rewriteAppendOnlyFileRio(rio *aof) {
+    dictEntry *de;
+    int j;
+    long key_count = 0;
+    long long updated_time = 0;
+    unsigned long long skipped = 0;
+    kvstoreIterator kvs_it;
+
+    /* Record timestamp at the beginning of rewriting AOF.
+     */
+    if (server.aof_timestamp_enabled) {
+        sds ts = genAofTimestampAnnotationIfNeeded(1);
+        if (rioWrite(aof,ts,sdslen(ts)) == 0) { sdsfree(ts); goto werr; }
+        sdsfree(ts);
+    }
+
+    if (rewriteFunctions(aof) == 0) goto werr;
+
+    for (j = 0; j < server.dbnum; j++) {
+        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
+        redisDb *db = server.db + j;
+        if (kvstoreSize(db->keys) == 0) continue;
+
+        /* SELECT the new DB */
+        if (rioWrite(aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
+        if (rioWriteBulkLongLong(aof,j) == 0) goto werr;
+
+        kvstoreIteratorInit(&kvs_it, db->keys);
+        /* Iterate this DB writing every entry */
+        while((de = kvstoreIteratorNext(&kvs_it)) != NULL) {
+            long long expiretime;
+            size_t aof_bytes_before_key = aof->processed_bytes;
+
+            /* Get the value object (of type kvobj) */
+            kvobj *o = dictGetKV(de);
+
+            /* Get the expire time */
+            expiretime = kvobjGetExpire(o);
+
+            /* Skip keys that are being trimmed */
+            if (server.cluster_enabled) {
+                int curr_slot = kvstoreIteratorGetCurrentDictIndex(&kvs_it);
+                if (isSlotInTrimJob(curr_slot)) {
+                    skipped++;
+                    continue;
+                }
+            }
+
+            /* Set on stack string object for key */
+            robj key;
+            initStaticStringObject(key, kvobjGetKey(o));
+
+            if (rewriteObject(aof, &key, o, j, expiretime) == C_ERR) goto werr2;
+
+            /* In fork child process, we can try to release memory back to the
+             * OS and possibly avoid or decrease COW. We give the dismiss
+             * mechanism a hint about an estimated size of the object we stored. */
+            size_t dump_size = aof->processed_bytes - aof_bytes_before_key;
+            if (server.in_fork_child) dismissObject(o, dump_size);
+
+            /* Update info every 1 second (approximately).
+             * in order to avoid calling mstime() on each iteration, we will
+             * check the diff every 1024 keys */
+            if ((key_count++ & 1023) == 0) {
+                long long now = mstime();
+                if (now - updated_time >= 1000) {
+                    sendChildInfo(CHILD_INFO_TYPE_CURRENT_INFO, key_count, "AOF rewrite");
+                    updated_time = now;
+                }
+            }
+
+            /* Delay before next key if required (for testing) */
+            if (server.rdb_key_save_delay)
+                debugDelay(server.rdb_key_save_delay);
+        }
+        kvstoreIteratorReset(&kvs_it);
+    }
+    serverLog(LL_NOTICE, "AOF rewrite done, %ld keys saved, %llu keys skipped.", key_count, skipped);
+    return C_OK;
+
+werr2:
+    kvstoreIteratorReset(&kvs_it);
+werr:
+    return C_ERR;
+}
+
+/* Write a sequence of commands able to fully rebuild the dataset into
+ * "filename". Used both by REWRITEAOF and BGREWRITEAOF.
+ *
+ * In order to minimize the number of commands needed in the rewritten
+ * log Redis uses variadic commands when possible, such as RPUSH, SADD
+ * and ZADD. However at max AOF_REWRITE_ITEMS_PER_CMD items per time
+ * are inserted using a single command. */
+int rewriteAppendOnlyFile(char *filename) {
+    rio aof;
+    FILE *fp = NULL;
+    char tmpfile[256];
+
+    /* Note that we have to use a different temp name here compared to the
+     * one used by rewriteAppendOnlyFileBackground() function.
+     */
+    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
+    fp = fopen(tmpfile,"w");
+    if (!fp) {
+        serverLog(LL_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
+        return C_ERR;
+    }
+
+    rioInitWithFile(&aof,fp);
+
+    if (server.aof_rewrite_incremental_fsync) {
+        rioSetAutoSync(&aof,REDIS_AUTOSYNC_BYTES);
+        rioSetReclaimCache(&aof,1);
+    }
+
+    startSaving(RDBFLAGS_AOF_PREAMBLE);
+
+    if (server.aof_use_rdb_preamble) {
+        int error;
+        if (rdbSaveRio(SLAVE_REQ_NONE,&aof,&error,RDBFLAGS_AOF_PREAMBLE,NULL) == C_ERR) {
+            errno = error;
+            goto werr;
+        }
+    } else {
+        if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;
+    }
+
+    /* Make sure data will not remain on the OS's output buffers */
+    if (fflush(fp)) goto werr;
+    if (fsync(fileno(fp))) goto werr;
+    if (reclaimFilePageCache(fileno(fp), 0, 0) == -1) {
+        /* A minor error. Just log to know what happens */
+        serverLog(LL_NOTICE,"Unable to reclaim page cache: %s", strerror(errno));
+    }
+    if (fclose(fp)) { fp = NULL; goto werr; }
+    fp = NULL;
+
+    /* Use RENAME to make sure the DB file is changed atomically only
+     * if the generated DB file is ok. */
+    if (rename(tmpfile,filename) == -1) {
+        serverLog(LL_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
+        unlink(tmpfile);
+        stopSaving(0);
+        return C_ERR;
+    }
+    stopSaving(1);
+
+    return C_OK;
+
+werr:
+    serverLog(LL_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
+    if (fp) fclose(fp);
+    unlink(tmpfile);
+    stopSaving(0);
+    return C_ERR;
+}
+/* ----------------------------------------------------------------------------
+ * AOF background rewrite
+ * ------------------------------------------------------------------------- */
+
+/* This is how rewriting of the append only file in background works:
+ *
+ * 1) The user calls BGREWRITEAOF
+ * 2) Redis calls this function, that forks():
+ *    2a) the child rewrites the append only file in a temp file.
+ * 2b) the parent opens a new INCR AOF file to continue writing.
+ * 3) When the child finishes '2a', it exits.
+ * 4) The parent will trap the exit code, if it's OK, it will:
+ *    4a) get a new BASE file name and mark the previous (if we have) as the HISTORY type
+ *    4b) rename(2) the temp file in new BASE file name
+ *    4c) mark the rewritten INCR AOFs as history type
+ *    4d) persist AOF manifest file
+ *    4e) Delete the history files using bio
+ */
+int rewriteAppendOnlyFileBackground(void) {
+    pid_t childpid;
+
+    if (hasActiveChildProcess()) return C_ERR;
+
+    if (dirCreateIfMissing(server.aof_dirname) == -1) {
+        serverLog(LL_WARNING, "Can't open or create append-only dir %s: %s",
+            server.aof_dirname, strerror(errno));
+        server.aof_lastbgrewrite_status = C_ERR;
+        return C_ERR;
+    }
+
+    /* We set aof_selected_db to -1 in order to force the next call to the
+     * feedAppendOnlyFile() to issue a SELECT command. */
+    server.aof_selected_db = -1;
+    flushAppendOnlyFile(1);
+    if (openNewIncrAofForAppend() != C_OK) {
+        server.aof_lastbgrewrite_status = C_ERR;
+        return C_ERR;
+    }
+
+    if (server.aof_state == AOF_WAIT_REWRITE) {
+        /* Wait for all bio jobs related to AOF to drain. This prevents a race
+         * between updates to `fsynced_reploff_pending` of the worker thread, belonging
+         * to the previous AOF, and the new one. This concern is specific for a full
+         * sync scenario where we don't wanna risk the ACKed replication offset
+         * jumping backwards or forward when switching to a different master.
+         */
+        bioDrainWorker(BIO_AOF_FSYNC);
+
+        /* Set the initial repl_offset, which will be applied to fsynced_reploff
+         * when AOFRW finishes (after possibly being updated by a bio thread) */
+        atomicSet(server.fsynced_reploff_pending, server.master_repl_offset);
+        server.fsynced_reploff = 0;
+    }
+
+    server.stat_aof_rewrites++;
+
+    if ((childpid = redisFork(CHILD_TYPE_AOF)) == 0) {
+        char tmpfile[256];
+
+        /* Child */
+        redisSetProcTitle("redis-aof-rewrite");
+        redisSetCpuAffinity(server.aof_rewrite_cpulist);
+        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
+        if (rewriteAppendOnlyFile(tmpfile) == C_OK) {
+            serverLog(LL_NOTICE,
+                "Successfully created the temporary AOF base file %s", tmpfile);
+            sendChildCowInfo(CHILD_INFO_TYPE_AOF_COW_SIZE, "AOF rewrite");
+            exitFromChild(0, 0);
+        } else {
+            exitFromChild(1, 0);
+        }
+    } else {
+        /* Parent */
+        if (childpid == -1) {
+            server.aof_lastbgrewrite_status = C_ERR;
+            serverLog(LL_WARNING,
+                "Can't rewrite append only file in background: fork: %s",
+                strerror(errno));
+            return C_ERR;
+        }
+        serverLog(LL_NOTICE,
+            "Background append only file rewriting started by pid %ld",(long) childpid);
+        server.aof_rewrite_scheduled = 0;
+        server.aof_rewrite_time_start = time(NULL);
+        return C_OK;
+    }
+    return C_OK; /* unreached */
+}
+
+/* BGREWRITEAOF command implementation: start, or schedule, a background
+ * AOF rewrite depending on the current child/transaction state. */
+void bgrewriteaofCommand(client *c) {
+    if (server.child_type == CHILD_TYPE_AOF) {
+        addReplyError(c,"Background append only file rewriting already in progress");
+    } else if (hasActiveChildProcess() || server.in_exec) {
+        server.aof_rewrite_scheduled = 1;
+        /* When manually triggering AOFRW we reset the count
+         * so that it can be executed immediately. */
+        server.stat_aofrw_consecutive_failures = 0;
+        addReplyStatus(c,"Background append only file rewriting scheduled");
+    } else if (rewriteAppendOnlyFileBackground() == C_OK) {
+        addReplyStatus(c,"Background append only file rewriting started");
+    } else {
+        addReplyError(c,"Can't execute an AOF background rewriting. "
+                        "Please check the server logs for more information.");
+    }
+}
+
+/* Remove the temporary AOF files a (possibly killed) rewrite child with the
+ * given pid may have left around. */
+void aofRemoveTempFile(pid_t childpid) {
+    char tmpfile[256];
+
+    snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) childpid);
+    bg_unlink(tmpfile);
+
+    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) childpid);
+    bg_unlink(tmpfile);
+}
+
+/* Get size of an AOF file.
+ * The status argument is an optional output argument to be filled with
+ * one of the AOF_ status values. */
+off_t getAppendOnlyFileSize(sds filename, int *status) {
+    struct redis_stat sb;
+    off_t size;
+    mstime_t latency;
+
+    sds aof_filepath = makePath(server.aof_dirname, filename);
+    latencyStartMonitor(latency);
+    if (redis_stat(aof_filepath, &sb) == -1) {
+        if (status) *status = errno == ENOENT ? AOF_NOT_EXIST : AOF_OPEN_ERR;
+        serverLog(LL_WARNING, "Unable to obtain the AOF file %s length. stat: %s",
+            filename, strerror(errno));
+        size = 0;
+    } else {
+        if (status) *status = AOF_OK;
+        size = sb.st_size;
+    }
+    latencyEndMonitor(latency);
+    latencyAddSampleIfNeeded("aof-fstat", latency);
+    sdsfree(aof_filepath);
+    return size;
+}
+
+/* Get size of all AOF files referred by the manifest (excluding history).
+ * The status argument is an output argument to be filled with
+ * one of the AOF_ status values.
 */
+off_t getBaseAndIncrAppendOnlyFilesSize(aofManifest *am, int *status) {
+    off_t size = 0;
+    listNode *ln;
+    listIter li;
+
+    if (am->base_aof_info) {
+        serverAssert(am->base_aof_info->file_type == AOF_FILE_TYPE_BASE);
+
+        size += getAppendOnlyFileSize(am->base_aof_info->file_name, status);
+        if (*status != AOF_OK) return 0;
+    }
+
+    listRewind(am->incr_aof_list, &li);
+    while ((ln = listNext(&li)) != NULL) {
+        aofInfo *ai = (aofInfo*)ln->value;
+        serverAssert(ai->file_type == AOF_FILE_TYPE_INCR);
+        size += getAppendOnlyFileSize(ai->file_name, status);
+        if (*status != AOF_OK) return 0;
+    }
+
+    return size;
+}
+
+/* Return the number of AOF files (BASE + INCR) referenced by the manifest. */
+int getBaseAndIncrAppendOnlyFilesNum(aofManifest *am) {
+    int num = 0;
+    if (am->base_aof_info) num++;
+    if (am->incr_aof_list) num += listLength(am->incr_aof_list);
+    return num;
+}
+
+/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
+ * Handle this. */
+void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
+    if (!bysignal && exitcode == 0) {
+        char tmpfile[256];
+        long long now = ustime();
+        sds new_base_filepath = NULL;
+        sds new_incr_filepath = NULL;
+        aofManifest *temp_am;
+        mstime_t latency;
+
+        serverLog(LL_NOTICE,
+            "Background AOF rewrite terminated with success");
+
+        snprintf(tmpfile, 256, "temp-rewriteaof-bg-%d.aof",
+            (int)server.child_pid);
+
+        serverAssert(server.aof_manifest != NULL);
+
+        /* Dup a temporary aof_manifest for subsequent modifications. */
+        temp_am = aofManifestDup(server.aof_manifest);
+
+        /* Get a new BASE file name and mark the previous (if we have)
+         * as the HISTORY type. */
+        sds new_base_filename = getNewBaseFileNameAndMarkPreAsHistory(temp_am);
+        serverAssert(new_base_filename != NULL);
+        new_base_filepath = makePath(server.aof_dirname, new_base_filename);
+
+        /* Rename the temporary aof file to 'new_base_filename'.
+         */
+        latencyStartMonitor(latency);
+        if (rename(tmpfile, new_base_filepath) == -1) {
+            serverLog(LL_WARNING,
+                "Error trying to rename the temporary AOF base file %s into %s: %s",
+                tmpfile,
+                new_base_filepath,
+                strerror(errno));
+            aofManifestFree(temp_am);
+            sdsfree(new_base_filepath);
+            server.aof_lastbgrewrite_status = C_ERR;
+            server.stat_aofrw_consecutive_failures++;
+            goto cleanup;
+        }
+        latencyEndMonitor(latency);
+        latencyAddSampleIfNeeded("aof-rename", latency);
+        serverLog(LL_NOTICE,
+            "Successfully renamed the temporary AOF base file %s into %s", tmpfile, new_base_filename);
+
+        /* Rename the temporary incr aof file to 'new_incr_filename'. */
+        if (server.aof_state == AOF_WAIT_REWRITE) {
+            /* Get temporary incr aof name. */
+            sds temp_incr_aof_name = getTempIncrAofName();
+            sds temp_incr_filepath = makePath(server.aof_dirname, temp_incr_aof_name);
+            /* Get next new incr aof name. */
+            sds new_incr_filename = getNewIncrAofName(temp_am, tempIncAofStartReplOffset);
+            new_incr_filepath = makePath(server.aof_dirname, new_incr_filename);
+            latencyStartMonitor(latency);
+            if (rename(temp_incr_filepath, new_incr_filepath) == -1) {
+                serverLog(LL_WARNING,
+                    "Error trying to rename the temporary AOF incr file %s into %s: %s",
+                    temp_incr_filepath,
+                    new_incr_filepath,
+                    strerror(errno));
+                bg_unlink(new_base_filepath);
+                sdsfree(new_base_filepath);
+                aofManifestFree(temp_am);
+                sdsfree(temp_incr_filepath);
+                sdsfree(new_incr_filepath);
+                sdsfree(temp_incr_aof_name);
+                server.aof_lastbgrewrite_status = C_ERR;
+                server.stat_aofrw_consecutive_failures++;
+                goto cleanup;
+            }
+            latencyEndMonitor(latency);
+            latencyAddSampleIfNeeded("aof-rename", latency);
+            serverLog(LL_NOTICE,
+                "Successfully renamed the temporary AOF incr file %s into %s", temp_incr_aof_name, new_incr_filename);
+            sdsfree(temp_incr_filepath);
+            sdsfree(temp_incr_aof_name);
+        }
+
+        /* Change the AOF file type in 'incr_aof_list' from AOF_FILE_TYPE_INCR
+         * to AOF_FILE_TYPE_HIST, and move them to the 'history_aof_list'. */
+        markRewrittenIncrAofAsHistory(temp_am);
+
+        /* Persist our modifications. */
+        if (persistAofManifest(temp_am) == C_ERR) {
+            bg_unlink(new_base_filepath);
+            aofManifestFree(temp_am);
+            sdsfree(new_base_filepath);
+            if (new_incr_filepath) {
+                bg_unlink(new_incr_filepath);
+                sdsfree(new_incr_filepath);
+            }
+            server.aof_lastbgrewrite_status = C_ERR;
+            server.stat_aofrw_consecutive_failures++;
+            goto cleanup;
+        }
+        sdsfree(new_base_filepath);
+        if (new_incr_filepath) sdsfree(new_incr_filepath);
+
+        /* We can safely let `server.aof_manifest` point to 'temp_am' and free the previous one. */
+        aofManifestFreeAndUpdate(temp_am);
+
+        if (server.aof_state != AOF_OFF) {
+            /* AOF enabled. */
+            server.aof_current_size = getAppendOnlyFileSize(new_base_filename, NULL) + server.aof_last_incr_size;
+            server.aof_rewrite_base_size = server.aof_current_size;
+        }
+
+        /* We don't care about the return value of `aofDelHistoryFiles`, because the history
+         * deletion failure will not cause any problems. */
+        aofDelHistoryFiles();
+
+        server.aof_lastbgrewrite_status = C_OK;
+        server.stat_aofrw_consecutive_failures = 0;
+
+        serverLog(LL_NOTICE, "Background AOF rewrite finished successfully");
+        /* Change state from WAIT_REWRITE to ON if needed */
+        if (server.aof_state == AOF_WAIT_REWRITE) {
+            server.aof_state = AOF_ON;
+
+            /* Update the fsynced replication offset that just now become valid.
+             * This could either be the one we took in startAppendOnly, or a
+             * newer one set by the bio thread.
+             */
+            long long fsynced_reploff_pending;
+            atomicGet(server.fsynced_reploff_pending, fsynced_reploff_pending);
+            server.fsynced_reploff = fsynced_reploff_pending;
+        }
+
+        serverLog(LL_VERBOSE,
+            "Background AOF rewrite signal handler took %lldus", ustime()-now);
+    } else if (!bysignal && exitcode != 0) {
+        server.aof_lastbgrewrite_status = C_ERR;
+        server.stat_aofrw_consecutive_failures++;
+
+        serverLog(LL_WARNING,
+            "Background AOF rewrite terminated with error");
+    } else {
+        /* SIGUSR1 is whitelisted, so we have a way to kill a child without
+         * triggering an error condition. */
+        if (bysignal != SIGUSR1) {
+            server.aof_lastbgrewrite_status = C_ERR;
+            server.stat_aofrw_consecutive_failures++;
+        }
+
+        serverLog(LL_WARNING,
+            "Background AOF rewrite terminated by signal %d", bysignal);
+    }
+
+cleanup:
+    aofRemoveTempFile(server.child_pid);
+    /* Clear AOF buffer and delete temp incr aof for next rewrite. */
+    if (server.aof_state == AOF_WAIT_REWRITE) {
+        sdsfree(server.aof_buf);
+        server.aof_buf = sdsempty();
+        aofDelTempIncrAofFile();
+    }
+    server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start;
+    server.aof_rewrite_time_start = -1;
+    /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */
+    if (server.aof_state == AOF_WAIT_REWRITE)
+        server.aof_rewrite_scheduled = 1;
+}
diff --git a/examples/redis-unstable/src/asciilogo.h b/examples/redis-unstable/src/asciilogo.h
new file mode 100644
index 0000000..73a9977
--- /dev/null
+++ b/examples/redis-unstable/src/asciilogo.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2009-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+
+const char *ascii_logo =
+"                _._                                                  \n"
+"           _.-``__ ''-._                                             \n"
+"      _.-``    `.  `_.  ''-._           Redis Open Source            \n"
+"  .-`` .-```.  
```\\/    _.,_ ''-._      %s (%s/%d) %s bit\n"
+" (    '      ,       .-`  | `,    )     Running in %s mode\n"
+" |`-._`-...-` __...-.``-._|'` _.-'|     Port: %d\n"
+" |    `-._   `._    /     _.-'    |     PID: %ld\n"
+"  `-._    `-._  `-./  _.-'    _.-'                                   \n"
+" |`-._`-._    `-.__.-'    _.-'_.-'|                                  \n"
+" |    `-._`-._        _.-'_.-'    |     https://redis.io             \n"
+"  `-._    `-._`-.__.-'_.-'    _.-'                                   \n"
+" |`-._`-._    `-.__.-'    _.-'_.-'|                                  \n"
+" |    `-._`-._        _.-'_.-'    |                                  \n"
+"  `-._    `-._`-.__.-'_.-'    _.-'                                   \n"
+"      `-._    `-.__.-'    _.-'                                       \n"
+"          `-._        _.-'                                           \n"
+"              `-.__.-'                                               \n\n";
diff --git a/examples/redis-unstable/src/atomicvar.h b/examples/redis-unstable/src/atomicvar.h
new file mode 100644
index 0000000..3c332ee
--- /dev/null
+++ b/examples/redis-unstable/src/atomicvar.h
@@ -0,0 +1,186 @@
+/* This file implements atomic counters using c11 _Atomic, __atomic or __sync
+ * macros if available, otherwise we will throw an error when compile.
+ *
+ * The exported interface is composed of the following macros:
+ *
+ * atomicIncr(var,count) -- Increment the atomic counter
+ * atomicGetIncr(var,oldvalue_var,count) -- Get and increment the atomic counter
+ * atomicIncrGet(var,newvalue_var,count) -- Increment and get the atomic counter new value
+ * atomicDecr(var,count) -- Decrement the atomic counter
+ * atomicGet(var,dstvar) -- Fetch the atomic counter value
+ * atomicSet(var,value) -- Set the atomic counter value
+ * atomicGetWithSync(var,value) -- 'atomicGet' with inter-thread synchronization
+ * atomicSetWithSync(var,value) -- 'atomicSet' with inter-thread synchronization
+ * atomicCompareExchange(type,var,expected_var,desired) -- Compare and exchange (CAS) operation
+ *
+ * Atomic operations on flags.
+ * Flag type can be int, long, long long or their unsigned counterparts.
+ * The value of the flag can be 1 or 0.
+ *
+ * atomicFlagGetSet(var,oldvalue_var) -- Get and set the atomic counter value
+ *
+ * NOTE1: __atomic* and _Atomic implementations can be actually elaborated to support any value by changing the
+ * hardcoded new value passed to __atomic_exchange* from 1 to @param count
+ * i.e oldvalue_var = atomic_exchange_explicit(&var, count).
+ * However, in order to be compatible with the __sync functions family, we can use only 0 and 1.
+ * The only exchange alternative suggested by __sync is __sync_lock_test_and_set,
+ * But as described by the gnu manual for __sync_lock_test_and_set():
+ * https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html
+ * "A target may support reduced functionality here by which the only valid value to store is the immediate constant 1. The exact value
+ * actually stored in *ptr is implementation defined."
+ * Hence, we can't rely on it for any value other than 1.
+ * We eventually chose to implement this method with __sync_val_compare_and_swap since it satisfies functionality needed for atomicFlagGetSet
+ * (if the flag was 0 -> set to 1, if it's already 1 -> do nothing, but the final result is that the flag is set),
+ * and also it has a full barrier (__sync_lock_test_and_set has acquire barrier).
+ *
+ * NOTE2: Unlike other atomic types, which aren't guaranteed to be lock free, c11 atomic_flag is.
+ * To check whether a type is lock free, atomic_is_lock_free() can be used.
+ * It can be considered to limit the flag type to atomic_flag to improve performance.
+ *
+ * Never use the return value from the macros, instead use the atomicGetIncr()
+ * if you need to get the current value and increment it atomically, like
+ * in the following example:
+ *
+ * long oldvalue;
+ * atomicGetIncr(myvar,oldvalue,1);
+ * doSomethingWith(oldvalue);
+ *
+ * ----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2015-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+
+#include <pthread.h>
+#include "config.h"
+
+#ifndef __ATOMIC_VAR_H
+#define __ATOMIC_VAR_H
+
+/* Define redisAtomic for atomic variable. */
+#define redisAtomic
+
+/* To test Redis with Helgrind (a Valgrind tool) it is useful to define
+ * the following macro, so that __sync macros are used: those can be detected
+ * by Helgrind (even if they are less efficient) so that no false positive
+ * is reported. */
+// #define __ATOMIC_VAR_FORCE_SYNC_MACROS
+
+/* There will be many false positives if we test Redis with Helgrind, since
+ * Helgrind can't understand we have imposed ordering on the program, so
+ * we use macros in helgrind.h to tell Helgrind inter-thread happens-before
+ * relationship explicitly for avoiding false positives.
+ *
+ * For more details, please see: valgrind/helgrind.h and
+ * https://www.valgrind.org/docs/manual/hg-manual.html#hg-manual.effective-use
+ *
+ * These macros take effect only when 'make helgrind', and you must first
+ * install Valgrind in the default path configuration. */
+#ifdef __ATOMIC_VAR_FORCE_SYNC_MACROS
+#include <valgrind/helgrind.h>
+#else
+#define ANNOTATE_HAPPENS_BEFORE(v) ((void) v)
+#define ANNOTATE_HAPPENS_AFTER(v) ((void) v)
+#endif
+
+#if !defined(__ATOMIC_VAR_FORCE_SYNC_MACROS) && defined(__STDC_VERSION__) && \
+    (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
+/* Use '_Atomic' keyword if the compiler supports. */
+#undef redisAtomic
+#define redisAtomic _Atomic
+/* Implementation using _Atomic in C11.
+ */
+
+#include <stdatomic.h>
+#define atomicIncr(var,count) atomic_fetch_add_explicit(&var,(count),memory_order_relaxed)
+#define atomicGetIncr(var,oldvalue_var,count) do { \
+    oldvalue_var = atomic_fetch_add_explicit(&var,(count),memory_order_relaxed); \
+} while(0)
+#define atomicIncrGet(var, newvalue_var, count) \
+    newvalue_var = atomicIncr(var,count) + count
+#define atomicDecr(var,count) atomic_fetch_sub_explicit(&var,(count),memory_order_relaxed)
+#define atomicGet(var,dstvar) do { \
+    dstvar = atomic_load_explicit(&var,memory_order_relaxed); \
+} while(0)
+#define atomicSet(var,value) atomic_store_explicit(&var,value,memory_order_relaxed)
+#define atomicGetWithSync(var,dstvar) do { \
+    dstvar = atomic_load_explicit(&var,memory_order_seq_cst); \
+} while(0)
+#define atomicSetWithSync(var,value) \
+    atomic_store_explicit(&var,value,memory_order_seq_cst)
+#define atomicCompareExchange(type,var,expected_var,desired) \
+    atomic_compare_exchange_weak_explicit(&var,&expected_var,desired,memory_order_relaxed,memory_order_relaxed)
+#define atomicFlagGetSet(var,oldvalue_var) \
+    oldvalue_var = atomic_exchange_explicit(&var,1,memory_order_relaxed)
+#define REDIS_ATOMIC_API "c11-builtin"
+
+#elif !defined(__ATOMIC_VAR_FORCE_SYNC_MACROS) && \
+    (!defined(__clang__) || !defined(__APPLE__) || __apple_build_version__ > 4210057) && \
+    defined(__ATOMIC_RELAXED) && defined(__ATOMIC_SEQ_CST)
+/* Implementation using __atomic macros.
*/ + +#define atomicIncr(var,count) __atomic_add_fetch(&var,(count),__ATOMIC_RELAXED) +#define atomicIncrGet(var, newvalue_var, count) \ + newvalue_var = __atomic_add_fetch(&var,(count),__ATOMIC_RELAXED) +#define atomicGetIncr(var,oldvalue_var,count) do { \ + oldvalue_var = __atomic_fetch_add(&var,(count),__ATOMIC_RELAXED); \ +} while(0) +#define atomicDecr(var,count) __atomic_sub_fetch(&var,(count),__ATOMIC_RELAXED) +#define atomicGet(var,dstvar) do { \ + dstvar = __atomic_load_n(&var,__ATOMIC_RELAXED); \ +} while(0) +#define atomicSet(var,value) __atomic_store_n(&var,value,__ATOMIC_RELAXED) +#define atomicGetWithSync(var,dstvar) do { \ + dstvar = __atomic_load_n(&var,__ATOMIC_SEQ_CST); \ +} while(0) +#define atomicSetWithSync(var,value) \ + __atomic_store_n(&var,value,__ATOMIC_SEQ_CST) +#define atomicCompareExchange(type,var,expected_var,desired) \ + __atomic_compare_exchange_n(&var,&expected_var,desired,1,__ATOMIC_RELAXED,__ATOMIC_RELAXED) +#define atomicFlagGetSet(var,oldvalue_var) \ + oldvalue_var = __atomic_exchange_n(&var,1,__ATOMIC_RELAXED) +#define REDIS_ATOMIC_API "atomic-builtin" + +#elif defined(HAVE_ATOMIC) +/* Implementation using __sync macros. */ + +#define atomicIncr(var,count) __sync_add_and_fetch(&var,(count)) +#define atomicIncrGet(var, newvalue_var, count) \ + newvalue_var = __sync_add_and_fetch(&var,(count)) +#define atomicGetIncr(var,oldvalue_var,count) do { \ + oldvalue_var = __sync_fetch_and_add(&var,(count)); \ +} while(0) +#define atomicDecr(var,count) __sync_sub_and_fetch(&var,(count)) +#define atomicGet(var,dstvar) do { \ + dstvar = __sync_sub_and_fetch(&var,0); \ +} while(0) +#define atomicSet(var,value) do { \ + while(!__sync_bool_compare_and_swap(&var,var,value)); \ +} while(0) +/* Actually the builtin issues a full memory barrier by default. 
*/ +#define atomicGetWithSync(var,dstvar) do { \ + dstvar = __sync_sub_and_fetch(&var,0,__sync_synchronize); \ + ANNOTATE_HAPPENS_AFTER(&var); \ +} while(0) +#define atomicSetWithSync(var,value) do { \ + ANNOTATE_HAPPENS_BEFORE(&var); \ + while(!__sync_bool_compare_and_swap(&var,var,value,__sync_synchronize)); \ +} while(0) +#define atomicCompareExchange(type,var,expected_var,desired) ({ \ + type _old = __sync_val_compare_and_swap(&var,expected_var,desired); \ + int _success = (_old == expected_var); \ + if (!_success) expected_var = _old; \ + _success; \ +}) +#define atomicFlagGetSet(var,oldvalue_var) \ + oldvalue_var = __sync_val_compare_and_swap(&var,0,1) +#define REDIS_ATOMIC_API "sync-builtin" + +#else +#error "Unable to determine atomic operations for your platform" + +#endif +#endif /* __ATOMIC_VAR_H */ diff --git a/examples/redis-unstable/src/bio.c b/examples/redis-unstable/src/bio.c new file mode 100644 index 0000000..4d36e3e --- /dev/null +++ b/examples/redis-unstable/src/bio.c @@ -0,0 +1,445 @@ +/* Background I/O service for Redis. + * + * This file implements operations that we need to perform in the background. + * Currently there are 3 operations: + * 1) a background close(2) system call. This is needed when the process is + * the last owner of a reference to a file closing it means unlinking it, and + * the deletion of the file is slow, blocking the server. + * 2) AOF fsync + * 3) lazyfree of memory + * + * In the future we'll either continue implementing new things we need or + * we'll switch to libeio. However there are probably long term uses for this + * file as we may want to put Redis specific background tasks here. + * + * DESIGN + * ------ + * + * The design is simple: We have a structure representing a job to perform, + * and several worker threads and job queues. Every job type is assigned to + * a specific worker thread, and a single worker may handle several different + * job types. 
+ * Every thread waits for new jobs in its queue, and processes every job + * sequentially. + * + * Jobs handled by the same worker are guaranteed to be processed from the + * least-recently-inserted to the most-recently-inserted (older jobs processed + * first). + * + * To let the creator of the job to be notified about the completion of the + * operation, it will need to submit additional dummy job, coined as + * completion job request that will be written back eventually, by the + * background thread, into completion job response queue. This notification + * layout can simplify flows that might submit more than one job, such as + * in case of FLUSHALL which for a single command submits multiple jobs. It + * is also correct because jobs are processed in FIFO fashion. + * + * ---------------------------------------------------------------------------- + * + * Copyright (c) 2009-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ + +#include "server.h" +#include "bio.h" +#include + +static char* bio_worker_title[] = { + "bio_close_file", + "bio_aof", + "bio_lazy_free", +}; + +#define BIO_WORKER_NUM (sizeof(bio_worker_title) / sizeof(*bio_worker_title)) + +static unsigned int bio_job_to_worker[] = { + [BIO_CLOSE_FILE] = 0, + [BIO_AOF_FSYNC] = 1, + [BIO_CLOSE_AOF] = 1, + [BIO_LAZY_FREE] = 2, + [BIO_COMP_RQ_CLOSE_FILE] = 0, + [BIO_COMP_RQ_AOF_FSYNC] = 1, + [BIO_COMP_RQ_LAZY_FREE] = 2 +}; + +static pthread_t bio_threads[BIO_WORKER_NUM]; +static pthread_mutex_t bio_mutex[BIO_WORKER_NUM]; +static pthread_cond_t bio_newjob_cond[BIO_WORKER_NUM]; +static list *bio_jobs[BIO_WORKER_NUM]; +static unsigned long bio_jobs_counter[BIO_NUM_OPS] = {0}; + +/* The bio_comp_list is used to hold completion job responses and to handover + * to main thread to callback as notification for job completion. Main + * thread will be triggered to read the list by signaling via writing to a pipe */ +static list *bio_comp_list; +static pthread_mutex_t bio_mutex_comp; +static int job_comp_pipe[2]; /* Pipe used to awake the event loop */ + +typedef struct bio_comp_item { + comp_fn *func; /* callback after completion job will be processed */ + uint64_t arg; /* user data to be passed to the function */ + void *ptr; /* user pointer to be passed to the function */ +} bio_comp_item; + +/* This structure represents a background Job. It is only used locally to this + * file as the API does not expose the internals at all. */ +typedef union bio_job { + struct { + int type; /* Job-type tag. This needs to appear as the first element in all union members. */ + } header; + + /* Job specific arguments.*/ + struct { + int type; + int fd; /* Fd for file based background jobs */ + long long offset; /* A job-specific offset, if applicable */ + unsigned need_fsync:1; /* A flag to indicate that a fsync is required before + * the file is closed. 
*/ + unsigned need_reclaim_cache:1; /* A flag to indicate that reclaim cache is required before + * the file is closed. */ + } fd_args; + + struct { + int type; + lazy_free_fn *free_fn; /* Function that will free the provided arguments */ + void *free_args[]; /* List of arguments to be passed to the free function */ + } free_args; + struct { + int type; /* header */ + comp_fn *fn; /* callback. Handover to main thread to cb as notify for job completion */ + uint64_t arg; /* callback arguments */ + void *ptr; /* callback pointer */ + } comp_rq; +} bio_job; + +void *bioProcessBackgroundJobs(void *arg); +void bioPipeReadJobCompList(aeEventLoop *el, int fd, void *privdata, int mask); + +/* Make sure we have enough stack to perform all the things we do in the + * main thread. */ +#define REDIS_THREAD_STACK_SIZE (1024*1024*4) + +/* Initialize the background system, spawning the thread. */ +void bioInit(void) { + pthread_attr_t attr; + pthread_t thread; + size_t stacksize; + unsigned long j; + + /* Initialization of state vars and objects */ + for (j = 0; j < BIO_WORKER_NUM; j++) { + pthread_mutex_init(&bio_mutex[j],NULL); + pthread_cond_init(&bio_newjob_cond[j],NULL); + bio_jobs[j] = listCreate(); + } + + /* init jobs comp responses */ + bio_comp_list = listCreate(); + pthread_mutex_init(&bio_mutex_comp, NULL); + + /* Create a pipe for background thread to be able to wake up the redis main thread. + * Make the pipe non blocking. This is just a best effort aware mechanism + * and we do not want to block not in the read nor in the write half. + * Enable close-on-exec flag on pipes in case of the fork-exec system calls in + * sentinels or redis servers. 
*/ + if (anetPipe(job_comp_pipe, O_CLOEXEC|O_NONBLOCK, O_CLOEXEC|O_NONBLOCK) == -1) { + serverLog(LL_WARNING, + "Can't create the pipe for bio thread: %s", strerror(errno)); + exit(1); + } + + /* Register a readable event for the pipe used to awake the event loop on job completion */ + if (aeCreateFileEvent(server.el, job_comp_pipe[0], AE_READABLE, + bioPipeReadJobCompList, NULL) == AE_ERR) { + serverPanic("Error registering the readable event for the bio pipe."); + } + + /* Set the stack size as by default it may be small in some system */ + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr,&stacksize); + if (!stacksize) stacksize = 1; /* The world is full of Solaris Fixes */ + while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2; + pthread_attr_setstacksize(&attr, stacksize); + + /* Ready to spawn our threads. We use the single argument the thread + * function accepts in order to pass the job ID the thread is + * responsible for. */ + for (j = 0; j < BIO_WORKER_NUM; j++) { + int err = pthread_create(&thread,&attr,bioProcessBackgroundJobs, (void*) j); + if (err) { + serverLog(LL_WARNING, "Fatal: Can't initialize Background Jobs. Error message: %s", strerror(err)); + exit(1); + } + bio_threads[j] = thread; + } +} + +void bioSubmitJob(int type, bio_job *job) { + job->header.type = type; + unsigned long worker = bio_job_to_worker[type]; + pthread_mutex_lock(&bio_mutex[worker]); + listAddNodeTail(bio_jobs[worker],job); + bio_jobs_counter[type]++; + pthread_cond_signal(&bio_newjob_cond[worker]); + pthread_mutex_unlock(&bio_mutex[worker]); +} + +void bioCreateLazyFreeJob(lazy_free_fn free_fn, int arg_count, ...) 
{ + va_list valist; + /* Allocate memory for the job structure and all required + * arguments */ + bio_job *job = zmalloc(sizeof(*job) + sizeof(void *) * (arg_count)); + job->free_args.free_fn = free_fn; + + va_start(valist, arg_count); + for (int i = 0; i < arg_count; i++) { + job->free_args.free_args[i] = va_arg(valist, void *); + } + va_end(valist); + bioSubmitJob(BIO_LAZY_FREE, job); +} + +void bioCreateCompRq(bio_worker_t assigned_worker, comp_fn *func, uint64_t user_data, void *user_ptr) { + int type; + switch (assigned_worker) { + case BIO_WORKER_CLOSE_FILE: + type = BIO_COMP_RQ_CLOSE_FILE; + break; + case BIO_WORKER_AOF_FSYNC: + type = BIO_COMP_RQ_AOF_FSYNC; + break; + case BIO_WORKER_LAZY_FREE: + type = BIO_COMP_RQ_LAZY_FREE; + break; + default: + serverPanic("Invalid worker type in bioCreateCompRq()."); + } + + bio_job *job = zmalloc(sizeof(*job)); + job->comp_rq.fn = func; + job->comp_rq.arg = user_data; + job->comp_rq.ptr = user_ptr; + bioSubmitJob(type, job); +} + +void bioCreateCloseJob(int fd, int need_fsync, int need_reclaim_cache) { + bio_job *job = zmalloc(sizeof(*job)); + job->fd_args.fd = fd; + job->fd_args.need_fsync = need_fsync; + job->fd_args.need_reclaim_cache = need_reclaim_cache; + + bioSubmitJob(BIO_CLOSE_FILE, job); +} + +void bioCreateCloseAofJob(int fd, long long offset, int need_reclaim_cache) { + bio_job *job = zmalloc(sizeof(*job)); + job->fd_args.fd = fd; + job->fd_args.offset = offset; + job->fd_args.need_fsync = 1; + job->fd_args.need_reclaim_cache = need_reclaim_cache; + + bioSubmitJob(BIO_CLOSE_AOF, job); +} + +void bioCreateFsyncJob(int fd, long long offset, int need_reclaim_cache) { + bio_job *job = zmalloc(sizeof(*job)); + job->fd_args.fd = fd; + job->fd_args.offset = offset; + job->fd_args.need_reclaim_cache = need_reclaim_cache; + + bioSubmitJob(BIO_AOF_FSYNC, job); +} + +void *bioProcessBackgroundJobs(void *arg) { + bio_job *job; + unsigned long worker = (unsigned long) arg; + sigset_t sigset; + + /* Check that the 
worker is within the right interval. */ + serverAssert(worker < BIO_WORKER_NUM); + + redis_set_thread_title(bio_worker_title[worker]); + + redisSetCpuAffinity(server.bio_cpulist); + + makeThreadKillable(); + + pthread_mutex_lock(&bio_mutex[worker]); + /* Block SIGALRM so we are sure that only the main thread will + * receive the watchdog signal. */ + sigemptyset(&sigset); + sigaddset(&sigset, SIGALRM); + int err = pthread_sigmask(SIG_BLOCK, &sigset, NULL); + if (err) + serverLog(LL_WARNING, + "Warning: can't mask SIGALRM in bio.c thread: %s", strerror(err)); + + while(1) { + listNode *ln; + + /* The loop always starts with the lock hold. */ + if (listLength(bio_jobs[worker]) == 0) { + pthread_cond_wait(&bio_newjob_cond[worker], &bio_mutex[worker]); + continue; + } + /* Get the job from the queue. */ + ln = listFirst(bio_jobs[worker]); + job = ln->value; + /* It is now possible to unlock the background system as we know have + * a stand alone job structure to process.*/ + pthread_mutex_unlock(&bio_mutex[worker]); + + /* Process the job accordingly to its type. */ + int job_type = job->header.type; + + if (job_type == BIO_CLOSE_FILE) { + if (job->fd_args.need_fsync && + redis_fsync(job->fd_args.fd) == -1 && + errno != EBADF && errno != EINVAL) + { + serverLog(LL_WARNING, "Fail to fsync the AOF file: %s",strerror(errno)); + } + if (job->fd_args.need_reclaim_cache) { + if (reclaimFilePageCache(job->fd_args.fd, 0, 0) == -1) { + serverLog(LL_NOTICE,"Unable to reclaim page cache: %s", strerror(errno)); + } + } + close(job->fd_args.fd); + } else if (job_type == BIO_AOF_FSYNC || job_type == BIO_CLOSE_AOF) { + /* The fd may be closed by main thread and reused for another + * socket, pipe, or file. We just ignore these errno because + * aof fsync did not really fail. 
*/ + if (redis_fsync(job->fd_args.fd) == -1 && + errno != EBADF && errno != EINVAL) + { + int last_status; + atomicGet(server.aof_bio_fsync_status,last_status); + atomicSet(server.aof_bio_fsync_status,C_ERR); + atomicSet(server.aof_bio_fsync_errno,errno); + if (last_status == C_OK) { + serverLog(LL_WARNING, + "Fail to fsync the AOF file: %s",strerror(errno)); + } + } else { + atomicSet(server.aof_bio_fsync_status,C_OK); + atomicSet(server.fsynced_reploff_pending, job->fd_args.offset); + } + + if (job->fd_args.need_reclaim_cache) { + if (reclaimFilePageCache(job->fd_args.fd, 0, 0) == -1) { + serverLog(LL_NOTICE,"Unable to reclaim page cache: %s", strerror(errno)); + } + } + if (job_type == BIO_CLOSE_AOF) + close(job->fd_args.fd); + } else if (job_type == BIO_LAZY_FREE) { + job->free_args.free_fn(job->free_args.free_args); + } else if ((job_type == BIO_COMP_RQ_CLOSE_FILE) || + (job_type == BIO_COMP_RQ_AOF_FSYNC) || + (job_type == BIO_COMP_RQ_LAZY_FREE)) { + bio_comp_item *comp_rsp = zmalloc(sizeof(bio_comp_item)); + comp_rsp->func = job->comp_rq.fn; + comp_rsp->arg = job->comp_rq.arg; + comp_rsp->ptr = job->comp_rq.ptr; + + /* just write it to completion job responses */ + pthread_mutex_lock(&bio_mutex_comp); + listAddNodeTail(bio_comp_list, comp_rsp); + pthread_mutex_unlock(&bio_mutex_comp); + + if (write(job_comp_pipe[1],"A",1) != 1) { + /* Pipe is non-blocking, write() may fail if it's full. */ + } + } else { + serverPanic("Wrong job type in bioProcessBackgroundJobs()."); + } + zfree(job); + + /* Lock again before reiterating the loop, if there are no longer + * jobs to process we'll block again in pthread_cond_wait(). */ + pthread_mutex_lock(&bio_mutex[worker]); + listDelNode(bio_jobs[worker], ln); + bio_jobs_counter[job_type]--; + pthread_cond_signal(&bio_newjob_cond[worker]); + } +} + +/* Return the number of pending jobs of the specified type. 
*/ +unsigned long bioPendingJobsOfType(int type) { + unsigned int worker = bio_job_to_worker[type]; + + pthread_mutex_lock(&bio_mutex[worker]); + unsigned long val = bio_jobs_counter[type]; + pthread_mutex_unlock(&bio_mutex[worker]); + + return val; +} + +/* Wait for the job queue of the worker for jobs of specified type to become empty. */ +void bioDrainWorker(int job_type) { + unsigned long worker = bio_job_to_worker[job_type]; + + pthread_mutex_lock(&bio_mutex[worker]); + while (listLength(bio_jobs[worker]) > 0) { + pthread_cond_wait(&bio_newjob_cond[worker], &bio_mutex[worker]); + } + pthread_mutex_unlock(&bio_mutex[worker]); +} + +/* Kill the running bio threads in an unclean way. This function should be + * used only when it's critical to stop the threads for some reason. + * Currently Redis does this only on crash (for instance on SIGSEGV) in order + * to perform a fast memory check without other threads messing with memory. */ +void bioKillThreads(void) { + int err; + unsigned long j; + + for (j = 0; j < BIO_WORKER_NUM; j++) { + if (bio_threads[j] == pthread_self()) continue; + if (bio_threads[j] && pthread_cancel(bio_threads[j]) == 0) { + if ((err = pthread_join(bio_threads[j],NULL)) != 0) { + serverLog(LL_WARNING, + "Bio worker thread #%lu can not be joined: %s", + j, strerror(err)); + } else { + serverLog(LL_WARNING, + "Bio worker thread #%lu terminated",j); + } + } + } +} + +void bioPipeReadJobCompList(aeEventLoop *el, int fd, void *privdata, int mask) { + UNUSED(el); + UNUSED(mask); + UNUSED(privdata); + + char buf[128]; + list *tmp_list = NULL; + + while (read(fd, buf, sizeof(buf)) == sizeof(buf)); + + /* Handle event loop events if pipe was written from event loop API */ + pthread_mutex_lock(&bio_mutex_comp); + if (listLength(bio_comp_list)) { + tmp_list = bio_comp_list; + bio_comp_list = listCreate(); + } + pthread_mutex_unlock(&bio_mutex_comp); + + if (!tmp_list) return; + + /* callback to all job completions */ + while (listLength(tmp_list)) { + 
listNode *ln = listFirst(tmp_list); + bio_comp_item *rsp = ln->value; + listDelNode(tmp_list, ln); + rsp->func(rsp->arg, rsp->ptr); + zfree(rsp); + } + listRelease(tmp_list); +} diff --git a/examples/redis-unstable/src/bio.h b/examples/redis-unstable/src/bio.h new file mode 100644 index 0000000..615cf45 --- /dev/null +++ b/examples/redis-unstable/src/bio.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2009-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + +#ifndef __BIO_H +#define __BIO_H + +typedef void lazy_free_fn(void *args[]); +typedef void comp_fn(uint64_t user_data, void *user_ptr); + +typedef enum bio_worker_t { + BIO_WORKER_CLOSE_FILE = 0, + BIO_WORKER_AOF_FSYNC, + BIO_WORKER_LAZY_FREE, + BIO_WORKER_NUM +} bio_worker_t; + +/* Background job opcodes */ +typedef enum bio_job_type_t { + BIO_CLOSE_FILE = 0, /* Deferred close(2) syscall. */ + BIO_AOF_FSYNC, /* Deferred AOF fsync. */ + BIO_LAZY_FREE, /* Deferred objects freeing. 
*/ + BIO_CLOSE_AOF, + BIO_COMP_RQ_CLOSE_FILE, /* Job completion request, registered on close-file worker's queue */ + BIO_COMP_RQ_AOF_FSYNC, /* Job completion request, registered on aof-fsync worker's queue */ + BIO_COMP_RQ_LAZY_FREE, /* Job completion request, registered on lazy-free worker's queue */ + BIO_NUM_OPS +} bio_job_type_t; + +/* Exported API */ +void bioInit(void); +unsigned long bioPendingJobsOfType(int type); +void bioDrainWorker(int job_type); +void bioKillThreads(void); +void bioCreateCloseJob(int fd, int need_fsync, int need_reclaim_cache); +void bioCreateCloseAofJob(int fd, long long offset, int need_reclaim_cache); +void bioCreateFsyncJob(int fd, long long offset, int need_reclaim_cache); +void bioCreateLazyFreeJob(lazy_free_fn free_fn, int arg_count, ...); +void bioCreateCompRq(bio_worker_t assigned_worker, comp_fn *func, uint64_t user_data, void *user_ptr); + + +#endif diff --git a/examples/redis-unstable/src/bitops.c b/examples/redis-unstable/src/bitops.c new file mode 100644 index 0000000..7a3d9f9 --- /dev/null +++ b/examples/redis-unstable/src/bitops.c @@ -0,0 +1,2037 @@ +/* Bit operations. + * + * Copyright (c) 2009-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + +#include "server.h" +#include "ctype.h" + +#ifdef HAVE_AVX2 +/* Define __MM_MALLOC_H to prevent importing the memory aligned + * allocation functions, which we don't use. */ +#define __MM_MALLOC_H +#include +#endif + +#ifdef HAVE_AVX512 +/* Define __MM_MALLOC_H to prevent importing the memory aligned + * allocation functions, which we don't use. 
*/ +#define __MM_MALLOC_H +#include +#endif + +#ifdef HAVE_AARCH64_NEON +#include +#endif + +#ifdef HAVE_AVX2 +#define BITOP_USE_AVX2 (__builtin_cpu_supports("avx2")) +#else +#define BITOP_USE_AVX2 0 +#endif + +/* AArch64 NEON support is determined at compile time via HAVE_AARCH64_NEON */ +#ifdef HAVE_AVX512 +#define BITOP_USE_AVX512 (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512vpopcntdq")) +#else +#define BITOP_USE_AVX512 0 +#endif + + +/* ----------------------------------------------------------------------------- + * Helpers and low level bit functions. + * -------------------------------------------------------------------------- */ + + /* Shared lookup table for bit counting - maps each byte value to its popcount */ +static const uint8_t bitsinbyte[256] = { + #define B2(n) n, n+1, n+1, n+2 + #define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2) + #define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2) + B6(0), B6(1), B6(1), B6(2) + #undef B6 + #undef B4 + #undef B2 +}; + +/* Count number of bits set in the binary array pointed by 's' and long + * 'count' bytes. The implementation of this function is required to + * work with an input string length up to 512 MB or more (server.proto_max_bulk_len) */ +ATTRIBUTE_TARGET_POPCNT +long long redisPopcount(void *s, long count) { + long long bits = 0; + unsigned char *p = s; + uint32_t *p4; +#if defined(HAVE_POPCNT) + int use_popcnt = __builtin_cpu_supports("popcnt"); /* Check if CPU supports POPCNT instruction. */ +#else + int use_popcnt = 0; /* Assume CPU does not support POPCNT if + * __builtin_cpu_supports() is not available. */ +#endif + /* Count initial bytes not aligned to 64-bit when using the POPCNT instruction, + * otherwise align to 32-bit. */ + int align = use_popcnt ? 
7 : 3; + while ((unsigned long)p & align && count) { + bits += bitsinbyte[*p++]; + count--; + } + + if (likely(use_popcnt)) { + /* Use separate counters to make the CPU think there are no + * dependencies between these popcnt operations. */ + uint64_t cnt[4]; + memset(cnt, 0, sizeof(cnt)); + + /* Count bits 32 bytes at a time by using popcnt. + * Unroll the loop to avoid the overhead of a single popcnt per iteration, + * allowing the CPU to extract more instruction-level parallelism. + * Reference: https://danluu.com/assembly-intrinsics/ */ + while (count >= 32) { + cnt[0] += __builtin_popcountll(*(uint64_t*)(p)); + cnt[1] += __builtin_popcountll(*(uint64_t*)(p + 8)); + cnt[2] += __builtin_popcountll(*(uint64_t*)(p + 16)); + cnt[3] += __builtin_popcountll(*(uint64_t*)(p + 24)); + count -= 32; + p += 32; + /* Prefetch with 2K stride is just enough to overlap L3 miss latency effectively + * without causing pressure on lower memory hierarchy or polluting L1/L2 */ + redis_prefetch_read(p + 2048); + } + bits += cnt[0] + cnt[1] + cnt[2] + cnt[3]; + goto remain; + } + + /* Count bits 28 bytes at a time */ + p4 = (uint32_t*)p; + while(count>=28) { + uint32_t aux1, aux2, aux3, aux4, aux5, aux6, aux7; + + aux1 = *p4++; + aux2 = *p4++; + aux3 = *p4++; + aux4 = *p4++; + aux5 = *p4++; + aux6 = *p4++; + aux7 = *p4++; + count -= 28; + + aux1 = aux1 - ((aux1 >> 1) & 0x55555555); + aux1 = (aux1 & 0x33333333) + ((aux1 >> 2) & 0x33333333); + aux2 = aux2 - ((aux2 >> 1) & 0x55555555); + aux2 = (aux2 & 0x33333333) + ((aux2 >> 2) & 0x33333333); + aux3 = aux3 - ((aux3 >> 1) & 0x55555555); + aux3 = (aux3 & 0x33333333) + ((aux3 >> 2) & 0x33333333); + aux4 = aux4 - ((aux4 >> 1) & 0x55555555); + aux4 = (aux4 & 0x33333333) + ((aux4 >> 2) & 0x33333333); + aux5 = aux5 - ((aux5 >> 1) & 0x55555555); + aux5 = (aux5 & 0x33333333) + ((aux5 >> 2) & 0x33333333); + aux6 = aux6 - ((aux6 >> 1) & 0x55555555); + aux6 = (aux6 & 0x33333333) + ((aux6 >> 2) & 0x33333333); + aux7 = aux7 - ((aux7 >> 1) & 
0x55555555); + aux7 = (aux7 & 0x33333333) + ((aux7 >> 2) & 0x33333333); + bits += ((((aux1 + (aux1 >> 4)) & 0x0F0F0F0F) + + ((aux2 + (aux2 >> 4)) & 0x0F0F0F0F) + + ((aux3 + (aux3 >> 4)) & 0x0F0F0F0F) + + ((aux4 + (aux4 >> 4)) & 0x0F0F0F0F) + + ((aux5 + (aux5 >> 4)) & 0x0F0F0F0F) + + ((aux6 + (aux6 >> 4)) & 0x0F0F0F0F) + + ((aux7 + (aux7 >> 4)) & 0x0F0F0F0F))* 0x01010101) >> 24; + } + p = (unsigned char*)p4; + +remain: + /* Count the remaining bytes. */ + while(count--) bits += bitsinbyte[*p++]; + return bits; +} + +#ifdef HAVE_AARCH64_NEON +/* AArch64 optimized popcount implementation. + * Processes the input bitmap using four NEON vector accumulators in parallel + * to improve instruction-level parallelism and reduce the frequency of + * scalar reductions. Each accumulator holds 16-bit partial sums that are + * combined only once per large block (128 bytes), minimizing data movement. + * + * Benchmark results show this approach outperforms 2-lane implementations + * and matches or exceeds 8-lane versions in throughput, while avoiding + * register pressure and keeping the backend pipeline fully utilized. + * + * This function is now memory bound on large bitmaps, as confirmed by perf + * profiling, with backend stalls dominated by L1/L2 data cache refills. + */ +long long redisPopCountAarch64(void *s, long count) { + long long bits = 0; + const uint8_t *p = (const uint8_t*)s; + + /* Align */ + while (((uintptr_t)p & 15) && count) { + bits += bitsinbyte[*p++]; + count--; + } + + /* Four vector accumulators of u16 (pairwise-accumulated byte counts). */ + uint16x8_t acc0 = vdupq_n_u16(0); + uint16x8_t acc1 = vdupq_n_u16(0); + uint16x8_t acc2 = vdupq_n_u16(0); + uint16x8_t acc3 = vdupq_n_u16(0); + + /* Process 128B per loop to amortize reductions. 
*/ + while (count >= 128) { + uint8x16_t d0 = vld1q_u8(p + 0); + uint8x16_t d1 = vld1q_u8(p + 16); + uint8x16_t d2 = vld1q_u8(p + 32); + uint8x16_t d3 = vld1q_u8(p + 48); + uint8x16_t d4 = vld1q_u8(p + 64); + uint8x16_t d5 = vld1q_u8(p + 80); + uint8x16_t d6 = vld1q_u8(p + 96); + uint8x16_t d7 = vld1q_u8(p +112); + + /* Per-byte popcount */ + uint8x16_t c0 = vcntq_u8(d0); + uint8x16_t c1 = vcntq_u8(d1); + uint8x16_t c2 = vcntq_u8(d2); + uint8x16_t c3 = vcntq_u8(d3); + uint8x16_t c4 = vcntq_u8(d4); + uint8x16_t c5 = vcntq_u8(d5); + uint8x16_t c6 = vcntq_u8(d6); + uint8x16_t c7 = vcntq_u8(d7); + + /* Pairwise widen-add with accumulation: u8 -> u16, stay in vectors */ + acc0 = vpadalq_u8(acc0, c0); + acc1 = vpadalq_u8(acc1, c1); + acc2 = vpadalq_u8(acc2, c2); + acc3 = vpadalq_u8(acc3, c3); + + acc0 = vpadalq_u8(acc0, c4); + acc1 = vpadalq_u8(acc1, c5); + acc2 = vpadalq_u8(acc2, c6); + acc3 = vpadalq_u8(acc3, c7); + + p += 128; + count -= 128; + } + + /* Reduce vector accumulators to scalar once. 
*/ + uint32x4_t s0 = vpaddlq_u16(acc0); + uint32x4_t s1 = vpaddlq_u16(acc1); + uint32x4_t s2 = vpaddlq_u16(acc2); + uint32x4_t s3 = vpaddlq_u16(acc3); + uint32x4_t s01 = vaddq_u32(s0, s1); + uint32x4_t s23 = vaddq_u32(s2, s3); + uint32x4_t st = vaddq_u32(s01, s23); + uint64x2_t s64 = vpaddlq_u32(st); + bits += (long long)(vgetq_lane_u64(s64, 0) + vgetq_lane_u64(s64, 1)); + + /* Remaining 64B blocks (keep vector domain) */ + while (count >= 64) { + uint8x16_t d0 = vld1q_u8(p + 0); + uint8x16_t d1 = vld1q_u8(p + 16); + uint8x16_t d2 = vld1q_u8(p + 32); + uint8x16_t d3 = vld1q_u8(p + 48); + + uint8x16_t c0 = vcntq_u8(d0); + uint8x16_t c1 = vcntq_u8(d1); + uint8x16_t c2 = vcntq_u8(d2); + uint8x16_t c3 = vcntq_u8(d3); + + uint64x2_t t0 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(c0))); + uint64x2_t t1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(c1))); + uint64x2_t t2 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(c2))); + uint64x2_t t3 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(c3))); + + uint64x2_t s = vaddq_u64(vaddq_u64(t0, t1), vaddq_u64(t2, t3)); + bits += (long long)(vgetq_lane_u64(s, 0) + vgetq_lane_u64(s, 1)); + + p += 64; + count -= 64; + } + + /* 16B chunks */ + while (count >= 16) { + uint8x16_t d = vld1q_u8(p); + uint64x2_t s = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(d)))); + bits += (long long)(vgetq_lane_u64(s, 0) + vgetq_lane_u64(s, 1)); + p += 16; + count -= 16; + } + + /* Tail */ + while (count--) bits += bitsinbyte[*p++]; + + return bits; +} +#endif + +#ifdef HAVE_AVX512 +/* AVX512 optimized version of redisPopcount using VPOPCNTDQ instruction. + * This function requires AVX512F and AVX512VPOPCNTDQ support. 
*/ +ATTRIBUTE_TARGET_AVX512_POPCOUNT +long long redisPopCountAvx512(void *s, long count) { + long long bits = 0; + unsigned char *p = s; + + /* Align to 64-byte boundary for optimal AVX512 performance */ + while ((unsigned long)p & 63 && count) { + bits += bitsinbyte[*p++]; + count--; + } + + /* Process 64 bytes at a time using AVX512 */ + while (count >= 64) { + __m512i data = _mm512_loadu_si512((__m512i*)p); + __m512i popcnt = _mm512_popcnt_epi64(data); + + /* Sum all 8 64-bit popcount results */ + bits += _mm512_reduce_add_epi64(popcnt); + + p += 64; + count -= 64; + + /* Prefetch next cache line */ + redis_prefetch_read(p + 2048); + } + + /* Handle remaining bytes with scalar popcount */ + while (count >= 8) { + bits += __builtin_popcountll(*(uint64_t*)p); + p += 8; + count -= 8; + } + + /* Handle final bytes */ + while (count--) { + bits += bitsinbyte[*p++]; + } + + return bits; +} +#endif + +#ifdef HAVE_AVX2 +/* AVX2 optimized version of redisPopcount. + * This function requires AVX2 and POPCNT support. 
 */
ATTRIBUTE_TARGET_AVX2_POPCOUNT
long long redisPopCountAvx2(void *s, long count) {
    long long bits = 0;
    unsigned char *p = s;

    /* Align to 8-byte boundary for 64-bit operations. */
    while ((unsigned long)p & 7 && count) {
        bits += bitsinbyte[*p++];
        count--;
    }

    /* Use separate counters to avoid dependencies, similar to regular
     * redisPopcount: four independent accumulators let the CPU pipeline
     * the POPCNT instructions. */
    uint64_t cnt[4];
    memset(cnt, 0, sizeof(cnt));

    /* Process 32 bytes at a time using POPCNT on 64-bit chunks. */
    while (count >= 32) {
        cnt[0] += __builtin_popcountll(*(uint64_t*)(p));
        cnt[1] += __builtin_popcountll(*(uint64_t*)(p + 8));
        cnt[2] += __builtin_popcountll(*(uint64_t*)(p + 16));
        cnt[3] += __builtin_popcountll(*(uint64_t*)(p + 24));

        p += 32;
        count -= 32;

        /* Prefetch next cache line (2KB ahead of the read cursor). */
        redis_prefetch_read(p + 2048);
    }

    bits += cnt[0] + cnt[1] + cnt[2] + cnt[3];

    /* Handle remaining bytes with scalar popcount. */
    while (count >= 8) {
        bits += __builtin_popcountll(*(uint64_t*)p);
        p += 8;
        count -= 8;
    }

    /* Handle final bytes. */
    while (count--) {
        bits += bitsinbyte[*p++];
    }

    return bits;
}
#endif

/* Automatically select the best available popcount implementation.
 * Preference order: AVX512 (runtime-gated), AVX2 (runtime-gated),
 * NEON (compile-time), then the portable redisPopcount(). */
static inline long long redisPopcountAuto(const unsigned char *p, long count) {
#ifdef HAVE_AVX512
    if (BITOP_USE_AVX512) {
        return redisPopCountAvx512((void*)p, count);
    }
#endif
#ifdef HAVE_AVX2
    if (BITOP_USE_AVX2) {
        return redisPopCountAvx2((void*)p, count);
    }
#endif
#ifdef HAVE_AARCH64_NEON
    return redisPopCountAarch64((void*)p, count);
#else
    return redisPopcount((void*)p, count);
#endif
}

/* Return the position of the first bit set to one (if 'bit' is 1) or
 * zero (if 'bit' is 0) in the bitmap starting at 's' and long 'count' bytes.
 *
 * The function is guaranteed to return a value >= 0 if 'bit' is 0 since if
 * no zero bit is found, it returns count*8 assuming the string is zero
 * padded on the right.
 However if 'bit' is 1 it is possible that there is
 * not a single set bit in the bitmap. In this special case -1 is returned. */
long long redisBitpos(void *s, unsigned long count, int bit) {
    unsigned long *l;
    unsigned char *c;
    unsigned long skipval, word = 0, one;
    long long pos = 0; /* Position of bit, to return to the caller. */
    unsigned long j;
    int found;

    /* Process whole words first, seeking for first word that is not
     * all ones or all zeros respectively if we are looking for zeros
     * or ones. This is much faster with large strings having contiguous
     * blocks of 1 or 0 bits compared to the vanilla bit per bit processing.
     *
     * Note that if we start from an address that is not aligned
     * to sizeof(unsigned long) we consume it byte by byte until it is
     * aligned. */

    /* Skip initial bits not aligned to sizeof(unsigned long) byte by byte. */
    skipval = bit ? 0 : UCHAR_MAX;
    c = (unsigned char*) s;
    found = 0;
    while((unsigned long)c & (sizeof(*l)-1) && count) {
        if (*c != skipval) {
            found = 1;
            break;
        }
        c++;
        count--;
        pos += 8;
    }

    /* Skip bits with full word step. Skipped only if the byte-wise scan
     * above did not already find an interesting byte. */
    l = (unsigned long*) c;
    if (!found) {
        skipval = bit ? 0 : ULONG_MAX;
        while (count >= sizeof(*l)) {
            if (*l != skipval) break;
            l++;
            count -= sizeof(*l);
            pos += sizeof(*l)*8;
        }
    }

    /* Load bytes into "word" considering the first byte as the most significant
     * (we basically consider it as written in big endian, since we consider the
     * string as a set of bits from left to right, with the first bit at position
     * zero.
     *
     * Note that the loading is designed to work even when the bytes left
     * (count) are less than a full word. We pad it with zero on the right. */
    c = (unsigned char*)l;
    for (j = 0; j < sizeof(*l); j++) {
        word <<= 8;
        if (count) {
            word |= *c;
            c++;
            count--;
        }
    }

    /* Special case:
     * If bits in the string are all zero and we are looking for one,
     * return -1 to signal that there is not a single "1" in the whole
     * string. This can't happen when we are looking for "0" as we assume
     * that the right of the string is zero padded. */
    if (bit == 1 && word == 0) return -1;

    /* Last word left, scan bit by bit. The first thing we need is to
     * have a single "1" set in the most significant position in an
     * unsigned long. We don't know the size of the long so we use a
     * simple trick. */
    one = ULONG_MAX; /* All bits set to 1.*/
    one >>= 1;       /* All bits set to 1 but the MSB. */
    one = ~one;      /* All bits set to 0 but the MSB. */

    /* Walk the mask from MSB to LSB; the first position whose bit value
     * matches 'bit' is the answer. */
    while(one) {
        if (((one & word) != 0) == bit) return pos;
        pos++;
        one >>= 1;
    }

    /* If we reached this point, there is a bug in the algorithm, since
     * the case of no match is handled as a special case before. */
    serverPanic("End of redisBitpos() reached.");
    return 0; /* Just to avoid warnings. */
}

/* The following set.*Bitfield and get.*Bitfield functions implement setting
 * and getting arbitrary size (up to 64 bits) signed and unsigned integers
 * at arbitrary positions into a bitmap.
 *
 * The representation considers the bitmap as having the bit number 0 to be
 * the most significant bit of the first byte, and so forth, so for example
 * setting a 5 bits unsigned integer to value 23 at offset 7 into a bitmap
 * previously set to all zeroes, will produce the following representation:
 *
 * +--------+--------+
 * |00000001|01110000|
 * +--------+--------+
 *
 * When offsets and integer sizes are aligned to bytes boundaries, this is the
 * same as big endian, however when such alignment does not exist, its important
 * to also understand how the bits inside a byte are ordered.
+ * + * Note that this format follows the same convention as SETBIT and related + * commands. + */ + +void setUnsignedBitfield(unsigned char *p, uint64_t offset, uint64_t bits, uint64_t value) { + uint64_t byte, bit, byteval, bitval, j; + + for (j = 0; j < bits; j++) { + bitval = (value & ((uint64_t)1<<(bits-1-j))) != 0; + byte = offset >> 3; + bit = 7 - (offset & 0x7); + byteval = p[byte]; + byteval &= ~(1 << bit); + byteval |= bitval << bit; + p[byte] = byteval & 0xff; + offset++; + } +} + +void setSignedBitfield(unsigned char *p, uint64_t offset, uint64_t bits, int64_t value) { + uint64_t uv = value; /* Casting will add UINT64_MAX + 1 if v is negative. */ + setUnsignedBitfield(p,offset,bits,uv); +} + +uint64_t getUnsignedBitfield(unsigned char *p, uint64_t offset, uint64_t bits) { + uint64_t byte, bit, byteval, bitval, j, value = 0; + + for (j = 0; j < bits; j++) { + byte = offset >> 3; + bit = 7 - (offset & 0x7); + byteval = p[byte]; + bitval = (byteval >> bit) & 1; + value = (value<<1) | bitval; + offset++; + } + return value; +} + +int64_t getSignedBitfield(unsigned char *p, uint64_t offset, uint64_t bits) { + int64_t value; + union {uint64_t u; int64_t i;} conv; + + /* Converting from unsigned to signed is undefined when the value does + * not fit, however here we assume two's complement and the original value + * was obtained from signed -> unsigned conversion, so we'll find the + * most significant bit set if the original value was negative. + * + * Note that two's complement is mandatory for exact-width types + * according to the C99 standard. */ + conv.u = getUnsignedBitfield(p,offset,bits); + value = conv.i; + + /* If the top significant bit is 1, propagate it to all the + * higher bits for two's complement representation of signed + * integers. 
*/ + if (bits < 64 && (value & ((uint64_t)1 << (bits-1)))) + value |= ((uint64_t)-1) << bits; + return value; +} + +/* The following two functions detect overflow of a value in the context + * of storing it as an unsigned or signed integer with the specified + * number of bits. The functions both take the value and a possible increment. + * If no overflow could happen and the value+increment fit inside the limits, + * then zero is returned, otherwise in case of overflow, 1 is returned, + * otherwise in case of underflow, -1 is returned. + * + * When non-zero is returned (overflow or underflow), if not NULL, *limit is + * set to the value the operation should result when an overflow happens, + * depending on the specified overflow semantics: + * + * For BFOVERFLOW_SAT if 1 is returned, *limit it is set maximum value that + * you can store in that integer. when -1 is returned, *limit is set to the + * minimum value that an integer of that size can represent. + * + * For BFOVERFLOW_WRAP *limit is set by performing the operation in order to + * "wrap" around towards zero for unsigned integers, or towards the most + * negative number that is possible to represent for signed integers. */ + +#define BFOVERFLOW_WRAP 0 +#define BFOVERFLOW_SAT 1 +#define BFOVERFLOW_FAIL 2 /* Used by the BITFIELD command implementation. */ + +int checkUnsignedBitfieldOverflow(uint64_t value, int64_t incr, uint64_t bits, int owtype, uint64_t *limit) { + uint64_t max = (bits == 64) ? 
UINT64_MAX : (((uint64_t)1< max || (incr > 0 && incr > maxincr)) { + if (limit) { + if (owtype == BFOVERFLOW_WRAP) { + goto handle_wrap; + } else if (owtype == BFOVERFLOW_SAT) { + *limit = max; + } + } + return 1; + } else if (incr < 0 && incr < minincr) { + if (limit) { + if (owtype == BFOVERFLOW_WRAP) { + goto handle_wrap; + } else if (owtype == BFOVERFLOW_SAT) { + *limit = 0; + } + } + return -1; + } + return 0; + +handle_wrap: + { + uint64_t mask = ((uint64_t)-1) << bits; + uint64_t res = value+incr; + + res &= ~mask; + *limit = res; + } + return 1; +} + +int checkSignedBitfieldOverflow(int64_t value, int64_t incr, uint64_t bits, int owtype, int64_t *limit) { + int64_t max = (bits == 64) ? INT64_MAX : (((int64_t)1<<(bits-1))-1); + int64_t min = (-max)-1; + + /* Note that maxincr and minincr could overflow, but we use the values + * only after checking 'value' range, so when we use it no overflow + * happens. 'uint64_t' cast is there just to prevent undefined behavior on + * overflow */ + int64_t maxincr = (uint64_t)max-value; + int64_t minincr = min-value; + + if (value > max || (bits != 64 && incr > maxincr) || (value >= 0 && incr > 0 && incr > maxincr)) + { + if (limit) { + if (owtype == BFOVERFLOW_WRAP) { + goto handle_wrap; + } else if (owtype == BFOVERFLOW_SAT) { + *limit = max; + } + } + return 1; + } else if (value < min || (bits != 64 && incr < minincr) || (value < 0 && incr < 0 && incr < minincr)) { + if (limit) { + if (owtype == BFOVERFLOW_WRAP) { + goto handle_wrap; + } else if (owtype == BFOVERFLOW_SAT) { + *limit = min; + } + } + return -1; + } + return 0; + +handle_wrap: + { + uint64_t msb = (uint64_t)1 << (bits-1); + uint64_t a = value, b = incr, c; + c = a+b; /* Perform addition as unsigned so that's defined. */ + + /* If the sign bit is set, propagate to all the higher order + * bits, to cap the negative value. If it's clear, mask to + * the positive integer limit. 
 */
        if (bits < 64) {
            uint64_t mask = ((uint64_t)-1) << bits;
            if (c & msb) {
                c |= mask;  /* Negative: sign-extend above 'bits'. */
            } else {
                c &= ~mask; /* Positive: clear everything above 'bits'. */
            }
        }
        *limit = c;
    }
    return 1;
}

/* Debugging function. Just show bits in the specified bitmap. Not used
 * but here for not having to rewrite it when debugging is needed. */
void printBits(unsigned char *p, unsigned long count) {
    unsigned long j, i, byte;

    for (j = 0; j < count; j++) {
        byte = p[j];
        for (i = 0x80; i > 0; i /= 2)
            printf("%c", (byte & i) ? '1' : '0');
        printf("|");
    }
    printf("\n");
}

/* -----------------------------------------------------------------------------
 * Bits related string commands: GETBIT, SETBIT, BITCOUNT, BITOP.
 * -------------------------------------------------------------------------- */

#define BITOP_AND 0
#define BITOP_OR 1
#define BITOP_XOR 2
#define BITOP_NOT 3
#define BITOP_DIFF 4 /* DIFF(X, A1, A2, ..., An) = X & !(A1 | A2 | ... | An) */
#define BITOP_DIFF1 5 /* DIFF1(X, A1, A2, ..., An) = !X & (A1 | A2 | ... | An) */
#define BITOP_ANDOR 6 /* ANDOR(X, A1, A2, ..., An) = X & (A1 | A2 | ... | An) */

/* ONE(A1, A2, ..., An) = X.
 * If X[i] is the i-th bit of X then:
 * X[i] == 1 if and only if there is m such that:
 * Am[i] == 1 and Al[i] == 0 for all l != m. */
#define BITOP_ONE 7

#define BITFIELDOP_GET 0
#define BITFIELDOP_SET 1
#define BITFIELDOP_INCRBY 2

/* This helper function used by GETBIT / SETBIT parses the bit offset argument
 * making sure an error is returned if it is negative or if it overflows
 * Redis 512 MB limit for the string value or more (server.proto_max_bulk_len).
 *
 * If the 'hash' argument is true, and 'bits is positive, then the command
 * will also parse bit offsets prefixed by "#". In such a case the offset
 * is multiplied by 'bits'. This is useful for the BITFIELD command.
 */
int getBitOffsetFromArgument(client *c, robj *o, uint64_t *offset, int hash, int bits) {
    long long loffset;
    char *err = "bit offset is not an integer or out of range";
    char *p = o->ptr;
    size_t plen = sdslen(p);
    int usehash = 0;

    /* Handle # form. */
    if (p[0] == '#' && hash && bits > 0) usehash = 1;

    if (string2ll(p+usehash,plen-usehash,&loffset) == 0) {
        addReplyError(c,err);
        return C_ERR;
    }

    /* Adjust the offset by 'bits' for # form. */
    if (usehash) loffset *= bits;

    /* Limit offset to server.proto_max_bulk_len (512MB in bytes by default).
     * Clients that must be obeyed (e.g. replication) bypass the limit. */
    if (loffset < 0 || (!mustObeyClient(c) && (loffset >> 3) >= server.proto_max_bulk_len))
    {
        addReplyError(c,err);
        return C_ERR;
    }

    *offset = loffset;
    return C_OK;
}

/* This helper function for BITFIELD parses a bitfield type in the form
 * <sign><bits> where sign is 'u' or 'i' for unsigned and signed, and
 * the bits is a value between 1 and 64. However 64 bits unsigned integers
 * are reported as an error because of current limitations of Redis protocol
 * to return unsigned integer values greater than INT64_MAX.
 *
 * On error C_ERR is returned and an error is sent to the client. */
int getBitfieldTypeFromArgument(client *c, robj *o, int *sign, int *bits) {
    char *p = o->ptr;
    char *err = "Invalid bitfield type. Use something like i16 u8. Note that u64 is not supported but i64 is.";
    long long llbits;

    if (p[0] == 'i') {
        *sign = 1;
    } else if (p[0] == 'u') {
        *sign = 0;
    } else {
        addReplyError(c,err);
        return C_ERR;
    }

    /* Width limits: signed up to 64 bits, unsigned up to 63 bits (u64
     * would not be representable in a RESP integer reply). */
    if ((string2ll(p+1,strlen(p+1),&llbits)) == 0 ||
        llbits < 1 ||
        (*sign == 1 && llbits > 64) ||
        (*sign == 0 && llbits > 63))
    {
        addReplyError(c,err);
        return C_ERR;
    }
    *bits = llbits;
    return C_OK;
}

/* This is a helper function for commands implementations that need to write
 * bits to a string object. The command creates or pad with zeroes the string
 * so that the 'maxbit' bit can be addressed.
 The object is finally
 * returned. Otherwise if the key holds a wrong type NULL is returned and
 * an error is sent to the client.
 *
 * (Must provide all the arguments to the function)
 */
static kvobj *lookupStringForBitCommand(client *c, uint64_t maxbit,
                                        size_t *strOldSize, size_t *strGrowSize)
{
    dictEntryLink link;
    size_t byte = maxbit >> 3;
    size_t oldAllocSize = 0;
    kvobj *o = lookupKeyWriteWithLink(c->db,c->argv[1],&link);
    if (checkType(c,o,OBJ_STRING)) return NULL;

    if (o == NULL) {
        /* Missing key: create a zeroed string just large enough for 'maxbit'. */
        o = createObject(OBJ_STRING,sdsnewlen(NULL, byte+1));
        dbAddByLink(c->db,c->argv[1],&o,&link);
        *strGrowSize = byte + 1;
        *strOldSize = 0;
    } else {
        /* Existing key: unshare (copy-on-write for refcounted values) and
         * zero-pad so the target byte is addressable. */
        o = dbUnshareStringValue(c->db,c->argv[1],o);
        *strOldSize = sdslen(o->ptr);
        if (server.memory_tracking_per_slot)
            oldAllocSize = stringObjectAllocSize(o);
        o->ptr = sdsgrowzero(o->ptr,byte+1);
        if (server.memory_tracking_per_slot)
            updateSlotAllocSize(c->db, getKeySlot(c->argv[1]->ptr), oldAllocSize, stringObjectAllocSize(o));
        *strGrowSize = sdslen(o->ptr) - *strOldSize;
    }
    return o;
}

/* Return a pointer to the string object content, and stores its length
 * in 'len'. The user is required to pass (likely stack allocated) buffer
 * 'llbuf' of at least LONG_STR_SIZE bytes. Such a buffer is used in the case
 * the object is integer encoded in order to provide the representation
 * without using heap allocation.
 *
 * The function returns the pointer to the object array of bytes representing
 * the string it contains, that may be a pointer to 'llbuf' or to the
 * internal object representation. As a side effect 'len' is filled with
 * the length of such buffer.
 *
 * If the source object is NULL the function is guaranteed to return NULL
 * and set 'len' to 0.
 */
unsigned char *getObjectReadOnlyString(robj *o, long *len, char *llbuf) {
    serverAssert(!o || o->type == OBJ_STRING);
    unsigned char *p = NULL;

    /* Set the 'p' pointer to the string, that can be just a stack allocated
     * array if our string was integer encoded. */
    if (o && o->encoding == OBJ_ENCODING_INT) {
        p = (unsigned char*) llbuf;
        if (len) *len = ll2string(llbuf,LONG_STR_SIZE,(long)o->ptr);
    } else if (o) {
        p = (unsigned char*) o->ptr;
        if (len) *len = sdslen(o->ptr);
    } else {
        if (len) *len = 0;
    }
    return p;
}

/* SETBIT key offset bitvalue */
void setbitCommand(client *c) {
    char *err = "bit is not an integer or out of range";
    uint64_t bitoffset;
    ssize_t byte, bit;
    int byteval, bitval;
    long on;

    if (getBitOffsetFromArgument(c,c->argv[2],&bitoffset,0,0) != C_OK)
        return;

    if (getLongFromObjectOrReply(c,c->argv[3],&on,err) != C_OK)
        return;

    /* Bits can only be set or cleared... */
    if (on & ~1) {
        addReplyError(c,err);
        return;
    }

    size_t strOldSize, strGrowSize;
    kvobj *o = lookupStringForBitCommand(c, bitoffset, &strOldSize, &strGrowSize);
    if (o == NULL) return;

    /* Get current values */
    byte = bitoffset >> 3;
    byteval = ((uint8_t*)o->ptr)[byte];
    bit = 7 - (bitoffset & 0x7);
    bitval = byteval & (1 << bit);

    /* Either it is newly created, changed length, or the bit changes before and after.
     * Note that the bitval here is actually a decimal number.
     * So we need to use `!!` to convert it to 0 or 1 for comparison. */
    if (strGrowSize || (!!bitval != on)) {
        /* Update byte with new bit value. */
        byteval &= ~(1 << bit);
        byteval |= ((on & 0x1) << bit);
        ((uint8_t*)o->ptr)[byte] = byteval;
        keyModified(c,c->db,c->argv[1],o,1);
        notifyKeyspaceEvent(NOTIFY_STRING,"setbit",c->argv[1],c->db->id);
        server.dirty++;

        /* If this is not a new key (old size not 0) and size changed, then
         * update the keysizes histogram.
 Otherwise, the histogram already
         * updated in lookupStringForBitCommand() by calling dbAdd(). */
        if ((strOldSize > 0) && (strGrowSize != 0))
            updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_STRING,
                               strOldSize, strOldSize + strGrowSize);
    }

    /* Return original value. */
    addReply(c, bitval ? shared.cone : shared.czero);
}

/* GETBIT key offset */
void getbitCommand(client *c) {
    char llbuf[32];
    uint64_t bitoffset;
    size_t byte, bit;
    size_t bitval = 0;

    if (getBitOffsetFromArgument(c,c->argv[2],&bitoffset,0,0) != C_OK)
        return;

    kvobj *kv = lookupKeyReadOrReply(c, c->argv[1], shared.czero);
    if (kv == NULL || checkType(c,kv,OBJ_STRING)) return;

    byte = bitoffset >> 3;
    bit = 7 - (bitoffset & 0x7);
    /* Bits past the end of the value read as 0 (bitval stays 0). */
    if (sdsEncodedObject(kv)) {
        if (byte < sdslen(kv->ptr))
            bitval = ((uint8_t*)kv->ptr)[byte] & (1 << bit);
    } else {
        if (byte < (size_t)ll2string(llbuf,sizeof(llbuf),(long)kv->ptr))
            bitval = llbuf[byte] & (1 << bit);
    }

    addReply(c, bitval ? shared.cone : shared.czero);
}

#ifdef HAVE_AVX2
/* Compute the given bitop operation using AVX2 intrinsics.
 * Return how many bytes were successfully processed, as AVX2 operates on
 * 256-bit registers so if `minlen` is not a multiple of 32 some of the bytes
 * will be skipped. They will be taken care for in the unoptimized loop in the
 * main bitopCommand function. */
ATTRIBUTE_TARGET_AVX2_POPCOUNT
unsigned long bitopCommandAVX(unsigned char **keys, unsigned char *res,
                              unsigned long op, unsigned long numkeys,
                              unsigned long minlen)
{
    const unsigned long step = sizeof(__m256i);

    unsigned long i;
    unsigned long processed = 0;
    unsigned char *res_start = res;
    unsigned char *fst_key = keys[0];

    if (minlen < step) {
        return 0;
    }

    /* Unlike other operations that do the same with all source keys
     * DIFF, DIFF1 and ANDOR all compute the disjunction of all the source keys
     * but the first one. We first store that disjunction in `lres` and later
     * compute the final operation using the first source key.
     * (For those three ops 'res' is left as-is here; it was allocated zeroed,
     * so OR-accumulating into it below is correct.) */
    if (op != BITOP_DIFF && op != BITOP_DIFF1 && op != BITOP_ANDOR) {
        memcpy(res, keys[0], minlen);
    }

    const __m256i max256 = _mm256_set1_epi64x(-1);
    const __m256i zero256 = _mm256_set1_epi64x(0);

    switch (op) {
    case BITOP_AND:
        while (minlen >= step) {
            __m256i lres = _mm256_lddqu_si256((__m256i*)res);

            for (i = 1; i < numkeys; i++) {
                __m256i lkey = _mm256_lddqu_si256((__m256i*)(keys[i]+processed));
                lres = _mm256_and_si256(lres, lkey);
            }
            _mm256_storeu_si256((__m256i*)res, lres);
            res += step;
            processed += step;
            minlen -= step;
        }
        break;
    case BITOP_DIFF:
    case BITOP_DIFF1:
    case BITOP_ANDOR:
    case BITOP_OR:
        /* DIFF/DIFF1/ANDOR share the OR loop: this pass only accumulates
         * the disjunction of keys[1..n-1]; the finalization happens in the
         * second switch below. */
        while (minlen >= step) {
            __m256i lres = _mm256_lddqu_si256((__m256i*)res);

            for (i = 1; i < numkeys; i++) {
                __m256i lkey = _mm256_lddqu_si256((__m256i*)(keys[i]+processed));
                lres = _mm256_or_si256(lres, lkey);
            }
            _mm256_storeu_si256((__m256i*)res, lres);
            res += step;
            processed += step;
            minlen -= step;
        }
        break;
    case BITOP_XOR:
        while (minlen >= step) {
            __m256i lres = _mm256_lddqu_si256((__m256i*)res);

            for (i = 1; i < numkeys; i++) {
                __m256i lkey = _mm256_lddqu_si256((__m256i*)(keys[i]+processed));
                lres = _mm256_xor_si256(lres, lkey);
            }
            _mm256_storeu_si256((__m256i*)res, lres);
            res += step;
            processed += step;
            minlen -= step;
        }
        break;
    case BITOP_NOT:
        /* NOT has a single source key (already copied into res): XOR with
         * all-ones inverts every bit. */
        while (minlen >= step) {
            __m256i lres = _mm256_lddqu_si256((__m256i*)res);
            lres = _mm256_xor_si256(lres, max256);
            _mm256_storeu_si256((__m256i*)res, lres);
            res += step;
            processed += step;
            minlen -= step;
        }
        break;
    case BITOP_ONE:
        while (minlen >= step) {
            __m256i lres = _mm256_lddqu_si256((__m256i*)res);
            __m256i common_bits = zero256;

            for (i = 1; i < numkeys; i++) {
                __m256i lkey = _mm256_lddqu_si256((__m256i*)(keys[i]+processed));
                __m256i common = _mm256_and_si256(lres, lkey);

                common_bits = _mm256_or_si256(common_bits, common);

                lres = _mm256_xor_si256(lres, lkey);
            }
            /* Clear the bits that were set in more than one key. */
            lres = _mm256_andnot_si256(common_bits, lres);
            _mm256_storeu_si256((__m256i*)res, lres);
            res += step;
            processed += step;
            minlen -= step;
        }
        break;
    default:
        break;
    }

    /* Second pass for DIFF/DIFF1/ANDOR: combine the accumulated disjunction
     * with the first source key. */
    res = res_start;
    switch (op) {
    case BITOP_DIFF:
        for (i = 0; i < processed; i += step) {
            __m256i lres = _mm256_lddqu_si256((__m256i*)res);
            __m256i fkey = _mm256_lddqu_si256((__m256i*)fst_key);

            lres = _mm256_andnot_si256(lres, fkey);
            _mm256_storeu_si256((__m256i*)res, lres);

            res += step;
            fst_key += step;
        }
        break;
    case BITOP_DIFF1:
        for (i = 0; i < processed; i += step) {
            __m256i lres = _mm256_lddqu_si256((__m256i*)res);
            __m256i fkey = _mm256_lddqu_si256((__m256i*)fst_key);

            lres = _mm256_andnot_si256(fkey, lres);
            _mm256_storeu_si256((__m256i*)res, lres);

            res += step;
            fst_key += step;
        }
        break;
    case BITOP_ANDOR:
        for (i = 0; i < processed; i += step) {
            __m256i lres = _mm256_lddqu_si256((__m256i*)res);
            __m256i fkey = _mm256_lddqu_si256((__m256i*)fst_key);

            lres = _mm256_and_si256(fkey, lres);
            _mm256_storeu_si256((__m256i*)res, lres);

            res += step;
            fst_key += step;
        }
        break;
    default:
        break;
    }

    return processed;
}
#endif /* HAVE_AVX2 */

/* BITOP op_name target_key src_key1 src_key2 src_key3 ... src_keyN */
REDIS_NO_SANITIZE("alignment")
void bitopCommand(client *c) {
    char *opname = c->argv[1]->ptr;
    robj *targetkey = c->argv[2];
    unsigned long op, j, numkeys;
    robj **objects;      /* Array of source objects. */
    unsigned char **src; /* Array of source strings pointers. */
    unsigned long *len, maxlen = 0; /* Array of length of src strings,
                                       and max len. */
    unsigned long minlen = 0;    /* Min len among the input keys. */
    unsigned char *res = NULL; /* Resulting string. */

    /* Parse the operation name.
 */
    if ((opname[0] == 'a' || opname[0] == 'A') && !strcasecmp(opname,"and"))
        op = BITOP_AND;
    else if((opname[0] == 'o' || opname[0] == 'O') && !strcasecmp(opname,"or"))
        op = BITOP_OR;
    else if((opname[0] == 'x' || opname[0] == 'X') && !strcasecmp(opname,"xor"))
        op = BITOP_XOR;
    else if((opname[0] == 'n' || opname[0] == 'N') && !strcasecmp(opname,"not"))
        op = BITOP_NOT;
    else if ((opname[0] == 'd' || opname[0] == 'D') && !strcasecmp(opname,"diff"))
        op = BITOP_DIFF;
    else if ((opname[0] == 'd' || opname[0] == 'D') && !strcasecmp(opname,"diff1"))
        op = BITOP_DIFF1;
    else if ((opname[0] == 'a' || opname[0] == 'A') && !strcasecmp(opname,"andor"))
        op = BITOP_ANDOR;
    else if ((opname[0] == 'o' || opname[0] == 'O') && !strcasecmp(opname,"one"))
        op = BITOP_ONE;
    else {
        addReplyErrorObject(c,shared.syntaxerr);
        return;
    }

    /* Sanity check: NOT accepts only a single key argument. */
    if (op == BITOP_NOT && c->argc != 4) {
        addReplyError(c,"BITOP NOT must be called with a single source key.");
        return;
    }

    /* DIFF/DIFF1/ANDOR require one "X" key plus at least one other source. */
    if ((op == BITOP_DIFF || op == BITOP_DIFF1 || op == BITOP_ANDOR) && c->argc < 5) {
        sds opname_upper = sdsnew(opname);
        sdstoupper(opname_upper);
        addReplyErrorFormat(c,"BITOP %s must be called with at least two source keys.", opname_upper);
        sdsfree(opname_upper);
        return;
    }

    /* Lookup keys, and store pointers to the string objects into an array. */
    numkeys = c->argc - 3;
    src = zmalloc(sizeof(unsigned char*) * numkeys);
    len = zmalloc(sizeof(long) * numkeys);
    objects = zmalloc(sizeof(robj*) * numkeys);
    for (j = 0; j < numkeys; j++) {
        kvobj *kv = lookupKeyRead(c->db, c->argv[j + 3]);
        /* Handle non-existing keys as empty strings. */
        if (kv == NULL) {
            objects[j] = NULL;
            src[j] = NULL;
            len[j] = 0;
            minlen = 0;
            continue;
        }
        /* Return an error if one of the keys is not a string. */
        if (checkType(c, kv, OBJ_STRING)) {
            unsigned long i;
            /* Release the objects decoded so far before bailing out. */
            for (i = 0; i < j; i++) {
                if (objects[i])
                    decrRefCount(objects[i]);
            }
            zfree(src);
            zfree(len);
            zfree(objects);
            return;
        }
        objects[j] = getDecodedObject(kv);
        src[j] = objects[j]->ptr;
        len[j] = sdslen(objects[j]->ptr);
        if (len[j] > maxlen) maxlen = len[j];
        if (j == 0 || len[j] < minlen) minlen = len[j];
    }

    /* Compute the bit operation, if at least one string is not empty. */
    if (maxlen) {
        res = (unsigned char*) sdsnewlen(NULL,maxlen);
        unsigned char output, byte, disjunction, common_bits;
        unsigned long i;
        int useAVX2 = 0;

        /* Number of bytes processed from each source key */
        j = 0;

#if defined(HAVE_AVX2)
        if (BITOP_USE_AVX2) {
            j = bitopCommandAVX(src, res, op, numkeys, minlen);

            serverAssert(minlen >= j);
            minlen -= j;

            useAVX2 = 1;
        }
#endif

#if !defined(USE_ALIGNED_ACCESS)
        /* We don't have AVX2 but we still have fast path:
         * as far as we have data for all the input bitmaps we
         * can take a fast path that performs much better than the
         * vanilla algorithm. On ARM we skip the fast path since it will
         * result in GCC compiling the code using multiple-words load/store
         * operations that are not supported even in ARM >= v6. */
        if (minlen >= sizeof(unsigned long)*4) {
            /* We can't have entered the AVX2 path since minlen >= sizeof(unsigned long)*4
             * AVX2 path operates on steps of sizeof(__m256i) which for 64-bit
             * machines (the only ones supporting AVX2) is equal to
             * sizeof(unsigned long)*4. That means after the AVX2
             * path minlen will necessarily be < sizeof(unsigned long)*4.
 */
            serverAssert(!useAVX2);

            unsigned long **lp = (unsigned long**)src;
            unsigned long *lres = (unsigned long*) res;

            /* Index over the unsigned long version of the source keys */
            size_t k = 0;

            /* Unlike other operations that do the same with all source keys
             * DIFF, DIFF1 and ANDOR all compute the disjunction of all the
             * source keys but the first one. We first store that disjunction
             * in `lres` and later compute the final operation using the first
             * source key. */
            if (op != BITOP_DIFF && op != BITOP_DIFF1 && op != BITOP_ANDOR)
                memcpy(lres,src[0],minlen);

            /* Different branches per different operations for speed (sorry).
             * Each loop works on 4 words at a time to aid pipelining. */
            if (op == BITOP_AND) {
                while(minlen >= sizeof(unsigned long)*4) {
                    for (i = 1; i < numkeys; i++) {
                        lres[0] &= lp[i][k+0];
                        lres[1] &= lp[i][k+1];
                        lres[2] &= lp[i][k+2];
                        lres[3] &= lp[i][k+3];
                    }
                    k+=4;
                    lres+=4;
                    j += sizeof(unsigned long)*4;
                    minlen -= sizeof(unsigned long)*4;
                }
            } else if (op == BITOP_OR) {
                while(minlen >= sizeof(unsigned long)*4) {
                    for (i = 1; i < numkeys; i++) {
                        lres[0] |= lp[i][k+0];
                        lres[1] |= lp[i][k+1];
                        lres[2] |= lp[i][k+2];
                        lres[3] |= lp[i][k+3];
                    }
                    k+=4;
                    lres+=4;
                    j += sizeof(unsigned long)*4;
                    minlen -= sizeof(unsigned long)*4;
                }
            } else if (op == BITOP_XOR) {
                while(minlen >= sizeof(unsigned long)*4) {
                    for (i = 1; i < numkeys; i++) {
                        lres[0] ^= lp[i][k+0];
                        lres[1] ^= lp[i][k+1];
                        lres[2] ^= lp[i][k+2];
                        lres[3] ^= lp[i][k+3];
                    }
                    k+=4;
                    lres+=4;
                    j += sizeof(unsigned long)*4;
                    minlen -= sizeof(unsigned long)*4;
                }
            } else if (op == BITOP_NOT) {
                while(minlen >= sizeof(unsigned long)*4) {
                    lres[0] = ~lres[0];
                    lres[1] = ~lres[1];
                    lres[2] = ~lres[2];
                    lres[3] = ~lres[3];
                    lres+=4;
                    j += sizeof(unsigned long)*4;
                    minlen -= sizeof(unsigned long)*4;
                }
            } else if (op == BITOP_DIFF || op == BITOP_DIFF1 || op == BITOP_ANDOR) {
                /* First pass: OR keys[1..n-1] into lres (res starts zeroed). */
                size_t processed = 0;
                while(minlen >= sizeof(unsigned long)*4) {
                    for (i = 1; i < numkeys; i++)
                    {
                        lres[0] |= lp[i][k+0];
                        lres[1] |= lp[i][k+1];
                        lres[2] |= lp[i][k+2];
                        lres[3] |= lp[i][k+3];
                    }
                    k+=4;
                    lres+=4;
                    j += sizeof(unsigned long)*4;
                    minlen -= sizeof(unsigned long)*4;
                    processed += sizeof(unsigned long)*4;
                }

                /* Second pass: combine the disjunction with the first key. */
                lres = (unsigned long*) res;
                unsigned long *first_key = (unsigned long*)src[0];
                switch (op) {
                case BITOP_DIFF:
                    for (i = 0; i < processed; i += sizeof(unsigned long)*4) {
                        lres[0] = (first_key[0] & ~lres[0]);
                        lres[1] = (first_key[1] & ~lres[1]);
                        lres[2] = (first_key[2] & ~lres[2]);
                        lres[3] = (first_key[3] & ~lres[3]);
                        lres+=4;
                        first_key += 4;
                    }
                    break;
                case BITOP_DIFF1:
                    for (i = 0; i < processed; i += sizeof(unsigned long)*4) {
                        lres[0] = (~first_key[0] & lres[0]);
                        lres[1] = (~first_key[1] & lres[1]);
                        lres[2] = (~first_key[2] & lres[2]);
                        lres[3] = (~first_key[3] & lres[3]);
                        lres+=4;
                        first_key += 4;
                    }
                    break;
                case BITOP_ANDOR:
                    for (i = 0; i < processed; i += sizeof(unsigned long)*4) {
                        lres[0] = (first_key[0] & lres[0]);
                        lres[1] = (first_key[1] & lres[1]);
                        lres[2] = (first_key[2] & lres[2]);
                        lres[3] = (first_key[3] & lres[3]);
                        lres+=4;
                        first_key += 4;
                    }
                    break;
                }
            } else if (op == BITOP_ONE) {
                unsigned long lcommon_bits[4];

                while(minlen >= sizeof(unsigned long)*4) {
                    memset(lcommon_bits, 0, sizeof(lcommon_bits));

                    for (i = 1; i < numkeys; i++) {
                        /* Track bits seen in more than one key, then XOR. */
                        lcommon_bits[0] |= (lres[0] & lp[i][k+0]);
                        lcommon_bits[1] |= (lres[1] & lp[i][k+1]);
                        lcommon_bits[2] |= (lres[2] & lp[i][k+2]);
                        lcommon_bits[3] |= (lres[3] & lp[i][k+3]);

                        lres[0] ^= lp[i][k+0];
                        lres[1] ^= lp[i][k+1];
                        lres[2] ^= lp[i][k+2];
                        lres[3] ^= lp[i][k+3];
                    }

                    lres[0] &= ~lcommon_bits[0];
                    lres[1] &= ~lcommon_bits[1];
                    lres[2] &= ~lcommon_bits[2];
                    lres[3] &= ~lcommon_bits[3];

                    k+=4;
                    lres+=4;
                    j += sizeof(unsigned long)*4;
                    minlen -= sizeof(unsigned long)*4;
                }
            }
        }
#endif /* !defined(USE_ALIGNED_ACCESS) */

        /* j is set to the next byte to process by the previous loop.
*/ + for (; j < maxlen; j++) { + output = (len[0] <= j) ? 0 : src[0][j]; + if (op == BITOP_NOT) output = ~output; + disjunction = 0; + common_bits = 0; + + for (i = 1; i < numkeys; i++) { + int skip = 0; + byte = (len[i] <= j) ? 0 : src[i][j]; + switch(op) { + case BITOP_AND: + output &= byte; + skip = (output == 0); + break; + case BITOP_OR: + output |= byte; + skip = (output == 0xff); + break; + case BITOP_XOR: output ^= byte; break; + + /* For DIFF, DIFF1 and ANDOR we compute the disjunction of all + * key arguments except the first one. After that we do their + * respective bit op on said first arg and that disjunction. + * */ + case BITOP_DIFF: + case BITOP_DIFF1: + case BITOP_ANDOR: + disjunction |= byte; + skip = (disjunction == 0xff); + break; + + /* BITOP ONE dest key_1 [key_2...] + * If dest[i] is the i-th bit of dest then: + * dest[i] == 1 if and only if there is j such that key_j[i] == 1 + * and key_n[i] == 0 for all n != j. + * + * In order to compute that on each step we track which bits + * were seen in more than one key and store that in a helper + * variable. Then the operation is just XOR but on each step we + * nullify the bits that are set in the helper. + * Logically, this operation is the same as nullifying the + * helper bits only once at the end, but performance-wise it had + * no significant benefit and makes the code only more unclear. 
                 *
                 * e.g:
                 * 0001 0111 # key1
                 * 0010 0110 # key2
                 *
                 * 0011 0001 # intermediate1
                 * 0000 0110 # helper
                 * 0011 0001 # intermediate1 & ~helper
                 *
                 * 0100 1101 # key3
                 *
                 * 0111 1100 # intermediate2
                 * 0000 0111 # helper
                 * 0111 1000 # intermediate2 & ~helper
                 * ---------
                 * 0111 1000 # result
                 * */
                case BITOP_ONE:
                    common_bits |= (output & byte);
                    output ^= byte;
                    output &= ~common_bits;
                    /* Once every bit has been seen in more than one key no bit
                     * can ever make it into the result: stop scanning keys. */
                    skip = (common_bits == 0xff);
                    break;
                default:
                    break;
                }

                /* 'skip' means the accumulator reached a fixed point for this
                 * byte; the remaining keys cannot change the outcome. */
                if (skip) {
                    break;
                }
            }

            /* For DIFF/DIFF1/ANDOR combine the first key byte with the
             * disjunction of all the other keys computed above. */
            switch(op) {
            case BITOP_DIFF:
                res[j] = (output & ~disjunction);
                break;
            case BITOP_DIFF1:
                res[j] = (~output & disjunction);
                break;
            case BITOP_ANDOR:
                res[j] = (output & disjunction);
                break;
            default:
                res[j] = output;
                break;
            }
        }
    }
    /* Release the references taken on the input key objects. */
    for (j = 0; j < numkeys; j++) {
        if (objects[j])
            decrRefCount(objects[j]);
    }
    zfree(src);
    zfree(len);
    zfree(objects);

    /* Store the computed value into the target key */
    if (maxlen) {
        robj *o = createObject(OBJ_STRING, res);
        setKey(c, c->db, targetkey, &o, 0);
        notifyKeyspaceEvent(NOTIFY_STRING,"set",targetkey,c->db->id);
        server.dirty++;
    } else if (dbDelete(c->db,targetkey)) {
        /* An empty result deletes the destination key: an empty string and a
         * missing key are equivalent for bit operations. */
        keyModified(c,c->db,targetkey,NULL,1);
        notifyKeyspaceEvent(NOTIFY_GENERIC,"del",targetkey,c->db->id);
        server.dirty++;
    }
    addReplyLongLong(c,maxlen); /* Return the output string length in bytes. */
}

/* BITCOUNT key [start end [BIT|BYTE]] */
void bitcountCommand(client *c) {
    kvobj *o;
    long long start, end;
    long strlen;
    unsigned char *p;
    char llbuf[LONG_STR_SIZE];
    int isbit = 0;
    unsigned char first_byte_neg_mask = 0, last_byte_neg_mask = 0;

    /* Parse start/end range if any. */
    if (c->argc == 4 || c->argc == 5) {
        if (getLongLongFromObjectOrReply(c,c->argv[2],&start,NULL) != C_OK)
            return;
        if (getLongLongFromObjectOrReply(c,c->argv[3],&end,NULL) != C_OK)
            return;
        if (c->argc == 5) {
            if (!strcasecmp(c->argv[4]->ptr,"bit")) isbit = 1;
            else if (!strcasecmp(c->argv[4]->ptr,"byte")) isbit = 0;
            else {
                addReplyErrorObject(c,shared.syntaxerr);
                return;
            }
        }
        /* Lookup, check for type. */
        o = lookupKeyRead(c->db, c->argv[1]);
        if (checkType(c, o, OBJ_STRING)) return;
        p = getObjectReadOnlyString(o,&strlen,llbuf);
        long long totlen = strlen;

        /* Make sure we will not overflow */
        serverAssert(totlen <= LLONG_MAX >> 3);

        /* Convert negative indexes */
        if (start < 0 && end < 0 && start > end) {
            addReply(c,shared.czero);
            return;
        }
        if (isbit) totlen <<= 3;
        if (start < 0) start = totlen+start;
        if (end < 0) end = totlen+end;
        if (start < 0) start = 0;
        if (end < 0) end = 0;
        if (end >= totlen) end = totlen-1;
        if (isbit && start <= end) {
            /* Before converting bit offset to byte offset, create negative masks
             * for the edges. */
            first_byte_neg_mask = ~((1<<(8-(start&7)))-1) & 0xFF;
            last_byte_neg_mask = (1<<(7-(end&7)))-1;
            start >>= 3;
            end >>= 3;
        }
    } else if (c->argc == 2) {
        /* Lookup, check for type. */
        o = lookupKeyRead(c->db, c->argv[1]);
        if (checkType(c, o, OBJ_STRING)) return;
        p = getObjectReadOnlyString(o,&strlen,llbuf);
        /* The whole string. */
        start = 0;
        end = strlen-1;
    } else {
        /* Syntax error. */
        addReplyErrorObject(c,shared.syntaxerr);
        return;
    }

    /* Return 0 for non existing keys. */
    if (o == NULL) {
        addReply(c, shared.czero);
        return;
    }

    /* Precondition: end >= 0 && end < strlen, so the only condition where
     * zero can be returned is: start > end. */
    if (start > end) {
        addReply(c,shared.czero);
    } else {
        long bytes = (long)(end-start+1);
        long long count;

        /* Use the best available popcount implementation */
        count = redisPopcountAuto(p+start, bytes);

        if (first_byte_neg_mask != 0 || last_byte_neg_mask != 0) {
            unsigned char firstlast[2] = {0, 0};
            /* We may count bits of first byte and last byte which are out of
             * range. So we need to subtract them. Here we use a trick. We set
             * bits in the range to zero. So these bits will not be excluded. */
            if (first_byte_neg_mask != 0) firstlast[0] = p[start] & first_byte_neg_mask;
            if (last_byte_neg_mask != 0) firstlast[1] = p[end] & last_byte_neg_mask;

            /* Use the same popcount implementation for consistency */
            count -= redisPopcountAuto(firstlast, 2);
        }
        addReplyLongLong(c,count);
    }
}

/* BITPOS key bit [start [end [BIT|BYTE]]] */
void bitposCommand(client *c) {
    kvobj *o;
    long long start, end;
    long bit, strlen;
    unsigned char *p;
    char llbuf[LONG_STR_SIZE];
    int isbit = 0, end_given = 0;
    unsigned char first_byte_neg_mask = 0, last_byte_neg_mask = 0;

    /* Parse the bit argument to understand what we are looking for, set
     * or clear bits. */
    if (getLongFromObjectOrReply(c,c->argv[2],&bit,NULL) != C_OK)
        return;
    if (bit != 0 && bit != 1) {
        addReplyError(c, "The bit argument must be 1 or 0.");
        return;
    }

    /* Parse start/end range if any. */
    if (c->argc == 4 || c->argc == 5 || c->argc == 6) {
        if (getLongLongFromObjectOrReply(c,c->argv[3],&start,NULL) != C_OK)
            return;
        if (c->argc == 6) {
            if (!strcasecmp(c->argv[5]->ptr,"bit")) isbit = 1;
            else if (!strcasecmp(c->argv[5]->ptr,"byte")) isbit = 0;
            else {
                addReplyErrorObject(c,shared.syntaxerr);
                return;
            }
        }
        if (c->argc >= 5) {
            if (getLongLongFromObjectOrReply(c,c->argv[4],&end,NULL) != C_OK)
                return;
            end_given = 1;
        }

        /* Lookup, check for type. */
        o = lookupKeyRead(c->db, c->argv[1]);
        if (checkType(c, o, OBJ_STRING)) return;
        p = getObjectReadOnlyString(o, &strlen, llbuf);

        /* Make sure we will not overflow */
        long long totlen = strlen;
        serverAssert(totlen <= LLONG_MAX >> 3);

        /* No explicit end: default to the last bit/byte of the string. */
        if (c->argc < 5) {
            if (isbit) end = (totlen<<3) + 7;
            else end = totlen-1;
        }

        if (isbit) totlen <<= 3;
        /* Convert negative indexes */
        if (start < 0) start = totlen+start;
        if (end < 0) end = totlen+end;
        if (start < 0) start = 0;
        if (end < 0) end = 0;
        if (end >= totlen) end = totlen-1;
        if (isbit && start <= end) {
            /* Before converting bit offset to byte offset, create negative masks
             * for the edges. */
            first_byte_neg_mask = ~((1<<(8-(start&7)))-1) & 0xFF;
            last_byte_neg_mask = (1<<(7-(end&7)))-1;
            start >>= 3;
            end >>= 3;
        }
    } else if (c->argc == 3) {
        /* Lookup, check for type. */
        o = lookupKeyRead(c->db, c->argv[1]);
        if (checkType(c,o,OBJ_STRING)) return;
        p = getObjectReadOnlyString(o,&strlen,llbuf);

        /* The whole string. */
        start = 0;
        end = strlen-1;
    } else {
        /* Syntax error. */
        addReplyErrorObject(c,shared.syntaxerr);
        return;
    }

    /* If the key does not exist, from our point of view it is an infinite
     * array of 0 bits. If the user is looking for the first clear bit return 0,
     * If the user is looking for the first set bit, return -1. */
    if (o == NULL) {
        addReplyLongLong(c, bit ? -1 : 0);
        return;
    }

    /* For empty ranges (start > end) we return -1 as an empty range does
     * not contain a 0 nor a 1. */
    if (start > end) {
        addReplyLongLong(c, -1);
    } else {
        long bytes = end-start+1;
        long long pos;
        unsigned char tmpchar;
        if (first_byte_neg_mask) {
            /* Mask away the bits of the first byte that fall outside the
             * requested bit range before scanning it. */
            if (bit) tmpchar = p[start] & ~first_byte_neg_mask;
            else tmpchar = p[start] | first_byte_neg_mask;
            /* Special case, there is only one byte */
            if (last_byte_neg_mask && bytes == 1) {
                if (bit) tmpchar = tmpchar & ~last_byte_neg_mask;
                else tmpchar = tmpchar | last_byte_neg_mask;
            }
            pos = redisBitpos(&tmpchar,1,bit);
            /* If there are no more bytes or we get valid pos, we can exit early */
            if (bytes == 1 || (pos != -1 && pos != 8)) goto result;
            start++;
            bytes--;
        }
        /* If the last byte has no bits in the range, we should exclude it */
        long curbytes = bytes - (last_byte_neg_mask ? 1 : 0);
        if (curbytes > 0) {
            pos = redisBitpos(p+start,curbytes,bit);
            /* If there are no more bytes or we get valid pos, we can exit early */
            if (bytes == curbytes || (pos != -1 && pos != (long long)curbytes<<3)) goto result;
            start += curbytes;
            bytes -= curbytes;
        }
        if (bit) tmpchar = p[end] & ~last_byte_neg_mask;
        else tmpchar = p[end] | last_byte_neg_mask;
        pos = redisBitpos(&tmpchar,1,bit);

    result:
        /* If we are looking for clear bits, and the user specified an exact
         * range with start-end, we can't consider the right of the range as
         * zero padded (as we do when no explicit end is given).
         *
         * So if redisBitpos() returns the first bit outside the range,
         * we return -1 to the caller, to mean, in the specified range there
         * is not a single "0" bit. */
        if (end_given && bit == 0 && pos == (long long)bytes<<3) {
            addReplyLongLong(c,-1);
            return;
        }
        if (pos != -1) pos += (long long)start<<3; /* Adjust for the bytes we skipped. */
        addReplyLongLong(c,pos);
    }
}

/* BITFIELD key subcommand-1 arg ... subcommand-2 arg ... subcommand-N ...
 *
 * Supported subcommands:
 *
 * GET <encoding> <offset>
 * SET <encoding> <offset> <value>
 * INCRBY <encoding> <offset> <increment>
 * OVERFLOW [WRAP|SAT|FAIL]
 */

#define BITFIELD_FLAG_NONE      0
#define BITFIELD_FLAG_READONLY  (1<<0)

struct bitfieldOp {
    uint64_t offset;    /* Bitfield offset. */
    int64_t i64;        /* Increment amount (INCRBY) or SET value */
    int opcode;         /* Operation id. */
    int owtype;         /* Overflow type to use. */
    int bits;           /* Integer bitfield bits width. */
    int sign;           /* True if signed, otherwise unsigned op. */
};

/* This implements both the BITFIELD command and the BITFIELD_RO command
 * when flags is set to BITFIELD_FLAG_READONLY: in this case only the
 * GET subcommand is allowed, other subcommands will return an error. */
void bitfieldGeneric(client *c, int flags) {
    kvobj *o;
    uint64_t bitoffset;
    int j, numops = 0, changes = 0;
    size_t strOldSize, strGrowSize = 0;
    struct bitfieldOp *ops = NULL;  /* Array of ops to execute at end. */
    int owtype = BFOVERFLOW_WRAP;   /* Overflow type. */
    int readonly = 1;
    uint64_t highest_write_offset = 0;

    /* First pass: parse and validate every subcommand, accumulating the
     * operations to perform so nothing is executed on a malformed command. */
    for (j = 2; j < c->argc; j++) {
        int remargs = c->argc-j-1;      /* Remaining args other than current. */
        char *subcmd = c->argv[j]->ptr; /* Current command name. */
        int opcode;                     /* Current operation code. */
        long long i64 = 0;              /* Signed SET value. */
        int sign = 0;                   /* Signed or unsigned type? */
        int bits = 0;                   /* Bitfield width in bits. */

        if (!strcasecmp(subcmd,"get") && remargs >= 2)
            opcode = BITFIELDOP_GET;
        else if (!strcasecmp(subcmd,"set") && remargs >= 3)
            opcode = BITFIELDOP_SET;
        else if (!strcasecmp(subcmd,"incrby") && remargs >= 3)
            opcode = BITFIELDOP_INCRBY;
        else if (!strcasecmp(subcmd,"overflow") && remargs >= 1) {
            /* OVERFLOW is sticky: it applies to all the following operations
             * until the next OVERFLOW subcommand. */
            char *owtypename = c->argv[j+1]->ptr;
            j++;
            if (!strcasecmp(owtypename,"wrap"))
                owtype = BFOVERFLOW_WRAP;
            else if (!strcasecmp(owtypename,"sat"))
                owtype = BFOVERFLOW_SAT;
            else if (!strcasecmp(owtypename,"fail"))
                owtype = BFOVERFLOW_FAIL;
            else {
                addReplyError(c,"Invalid OVERFLOW type specified");
                zfree(ops);
                return;
            }
            continue;
        } else {
            addReplyErrorObject(c,shared.syntaxerr);
            zfree(ops);
            return;
        }

        /* Get the type and offset arguments, common to all the ops. */
        if (getBitfieldTypeFromArgument(c,c->argv[j+1],&sign,&bits) != C_OK) {
            zfree(ops);
            return;
        }

        if (getBitOffsetFromArgument(c,c->argv[j+2],&bitoffset,1,bits) != C_OK){
            zfree(ops);
            return;
        }

        if (opcode != BITFIELDOP_GET) {
            readonly = 0;
            if (highest_write_offset < bitoffset + bits - 1)
                highest_write_offset = bitoffset + bits - 1;
            /* INCRBY and SET require another argument. */
            if (getLongLongFromObjectOrReply(c,c->argv[j+3],&i64,NULL) != C_OK){
                zfree(ops);
                return;
            }
        }

        /* Populate the array of operations we'll process. */
        ops = zrealloc(ops,sizeof(*ops)*(numops+1));
        ops[numops].offset = bitoffset;
        ops[numops].i64 = i64;
        ops[numops].opcode = opcode;
        ops[numops].owtype = owtype;
        ops[numops].bits = bits;
        ops[numops].sign = sign;
        numops++;

        j += 3 - (opcode == BITFIELDOP_GET);
    }

    if (readonly) {
        /* Lookup for read is ok if key doesn't exist, but errors
         * if it's not a string. */
        o = lookupKeyRead(c->db,c->argv[1]);
        if (o != NULL && checkType(c,o,OBJ_STRING)) {
            zfree(ops);
            return;
        }
    } else {
        if (flags & BITFIELD_FLAG_READONLY) {
            zfree(ops);
            addReplyError(c, "BITFIELD_RO only supports the GET subcommand");
            return;
        }

        /* Lookup by making room up to the farthest bit reached by
         * this operation. */
        if ((o = lookupStringForBitCommand(c,
            highest_write_offset,&strOldSize,&strGrowSize)) == NULL) {
            zfree(ops);
            return;
        }
    }

    addReplyArrayLen(c,numops);

    /* Actually process the operations. */
    for (j = 0; j < numops; j++) {
        struct bitfieldOp *thisop = ops+j;

        /* Execute the operation. */
        if (thisop->opcode == BITFIELDOP_SET ||
            thisop->opcode == BITFIELDOP_INCRBY)
        {
            /* SET and INCRBY: We handle both with the same code path
             * for simplicity. SET return value is the previous value so
             * we need fetch & store as well. */

            /* We need two different but very similar code paths for signed
             * and unsigned operations, since the set of functions to get/set
             * the integers and the used variables types are different. */
            if (thisop->sign) {
                int64_t oldval, newval, wrapped, retval;
                int overflow;

                oldval = getSignedBitfield(o->ptr,thisop->offset,
                        thisop->bits);

                if (thisop->opcode == BITFIELDOP_INCRBY) {
                    overflow = checkSignedBitfieldOverflow(oldval,
                            thisop->i64,thisop->bits,thisop->owtype,&wrapped);
                    newval = overflow ? wrapped : oldval + thisop->i64;
                    retval = newval;
                } else {
                    newval = thisop->i64;
                    overflow = checkSignedBitfieldOverflow(newval,
                            0,thisop->bits,thisop->owtype,&wrapped);
                    if (overflow) newval = wrapped;
                    retval = oldval;
                }

                /* If the overflow type is "FAIL", don't write and return
                 * NULL to signal the condition. */
                if (!(overflow && thisop->owtype == BFOVERFLOW_FAIL)) {
                    addReplyLongLong(c,retval);
                    setSignedBitfield(o->ptr,thisop->offset,
                            thisop->bits,newval);

                    if (strGrowSize || (oldval != newval))
                        changes++;
                } else {
                    addReplyNull(c);
                }
            } else {
                /* Initialization of 'wrapped' is required to avoid
                 * false-positive warning "-Wmaybe-uninitialized" */
                uint64_t oldval, newval, retval, wrapped = 0;
                int overflow;

                oldval = getUnsignedBitfield(o->ptr,thisop->offset,
                        thisop->bits);

                if (thisop->opcode == BITFIELDOP_INCRBY) {
                    newval = oldval + thisop->i64;
                    overflow = checkUnsignedBitfieldOverflow(oldval,
                            thisop->i64,thisop->bits,thisop->owtype,&wrapped);
                    if (overflow) newval = wrapped;
                    retval = newval;
                } else {
                    newval = thisop->i64;
                    overflow = checkUnsignedBitfieldOverflow(newval,
                            0,thisop->bits,thisop->owtype,&wrapped);
                    if (overflow) newval = wrapped;
                    retval = oldval;
                }
                /* If the overflow type is "FAIL", don't write and return
                 * NULL to signal the condition. */
                if (!(overflow && thisop->owtype == BFOVERFLOW_FAIL)) {
                    addReplyLongLong(c,retval);
                    setUnsignedBitfield(o->ptr,thisop->offset,
                            thisop->bits,newval);

                    if (strGrowSize || (oldval != newval))
                        changes++;
                } else {
                    addReplyNull(c);
                }
            }
        } else {
            /* GET */
            unsigned char buf[9];
            long strlen = 0;
            unsigned char *src = NULL;
            char llbuf[LONG_STR_SIZE];

            if (o != NULL)
                src = getObjectReadOnlyString(o,&strlen,llbuf);

            /* For GET we use a trick: before executing the operation
             * copy up to 9 bytes to a local buffer, so that we can easily
             * execute up to 64 bit operations that are at actual string
             * object boundaries. */
            memset(buf,0,9);
            int i;
            uint64_t byte = thisop->offset >> 3;
            for (i = 0; i < 9; i++) {
                if (src == NULL || i+byte >= (uint64_t)strlen) break;
                buf[i] = src[i+byte];
            }

            /* Now operate on the copied buffer which is guaranteed
             * to be zero-padded.
 */
            if (thisop->sign) {
                int64_t val = getSignedBitfield(buf,thisop->offset-(byte*8),
                        thisop->bits);
                addReplyLongLong(c,val);
            } else {
                uint64_t val = getUnsignedBitfield(buf,thisop->offset-(byte*8),
                        thisop->bits);
                addReplyLongLong(c,val);
            }
        }
    }

    if (changes) {

        /* If this is not a new key (old size not 0) and size changed, then
         * update the keysizes histogram. Otherwise, the histogram already
         * updated in lookupStringForBitCommand() by calling dbAdd(). */
        if ((strOldSize > 0) && (strGrowSize != 0))
            updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_STRING,
                               strOldSize, strOldSize + strGrowSize);

        keyModified(c,c->db,c->argv[1],o,1);
        notifyKeyspaceEvent(NOTIFY_STRING,"setbit",c->argv[1],c->db->id);
        server.dirty += changes;
    }
    zfree(ops);
}

/* BITFIELD key subcommand [args...] — read/write entry point. */
void bitfieldCommand(client *c) {
    bitfieldGeneric(c, BITFIELD_FLAG_NONE);
}

/* BITFIELD_RO key GET <encoding> <offset> — read-only variant. */
void bitfieldroCommand(client *c) {
    bitfieldGeneric(c, BITFIELD_FLAG_READONLY);
}

#ifdef REDIS_TEST
/* Test function to verify popcount implementations */
int bitopsTest(int argc, char **argv, int flags) {
    UNUSED(argc);
    UNUSED(argv);
    UNUSED(flags);

    /* Test data with known popcount values */
    unsigned char test_data[] = {0xFF, 0x00, 0xAA, 0x55, 0xF0, 0x0F, 0x33, 0xCC};
    int expected_bits = 8 + 0 + 4 + 4 + 4 + 4 + 4 + 4; /* = 32 bits */

    long long result_regular = redisPopcount(test_data, sizeof(test_data));

    printf("Regular popcount: %lld (expected: %d)\n", result_regular, expected_bits);

    if (result_regular != expected_bits) {
        printf("FAIL: Regular popcount mismatch\n");
        return 1;
    }

#ifdef HAVE_AVX2
    if (BITOP_USE_AVX2) {
        long long result_avx2 = redisPopCountAvx2(test_data, sizeof(test_data));
        printf("AVX2 popcount: %lld (expected: %d)\n", result_avx2, expected_bits);

        if (result_avx2 != expected_bits) {
            printf("FAIL: AVX2 popcount mismatch\n");
            return 1;
        }
    } else {
        printf("AVX2 not supported on this CPU\n");
    }
#else
    printf("AVX2 not compiled in\n");
#endif

#ifdef HAVE_AVX512
    if (BITOP_USE_AVX512) {
        long long result_avx512 = redisPopCountAvx512(test_data, sizeof(test_data));
        printf("AVX512 popcount: %lld (expected: %d)\n", result_avx512, expected_bits);

        if (result_avx512 != expected_bits) {
            printf("FAIL: AVX512 popcount mismatch\n");
            return 1;
        }
    } else {
        printf("AVX512 not supported on this CPU\n");
    }
#else
    printf("AVX512 not compiled in\n");
#endif

#ifdef HAVE_AARCH64_NEON
    {
        long long result_aarch64 = redisPopCountAarch64(test_data, sizeof(test_data));
        printf("AArch64 NEON popcount: %lld (expected: %d)\n", result_aarch64, expected_bits);

        if (result_aarch64 != expected_bits) {
            printf("FAIL: AArch64 NEON popcount mismatch\n");
            return 1;
        }
    }
#else
    printf("AArch64 NEON not available\n");
#endif
    printf("All popcount tests passed!\n");
    return 0;
}
#endif
diff --git a/examples/redis-unstable/src/blocked.c b/examples/redis-unstable/src/blocked.c
new file mode 100644
index 0000000..4f518c9
--- /dev/null
+++ b/examples/redis-unstable/src/blocked.c
@@ -0,0 +1,787 @@
/* blocked.c - generic support for blocking operations like BLPOP & WAIT.
 *
 * Copyright (c) 2009-Present, Redis Ltd.
 * All rights reserved.
 *
 * Copyright (c) 2024-present, Valkey contributors.
 * All rights reserved.
 *
 * Licensed under your choice of (a) the Redis Source Available License 2.0
 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
 * GNU Affero General Public License v3 (AGPLv3).
 *
 * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information.
 *
 * ---------------------------------------------------------------------------
 *
 * API:
 *
 * blockClient() set the CLIENT_BLOCKED flag in the client, and set the
 * specified block type 'btype' field to one of BLOCKED_* macros.
 *
 * unblockClient() unblocks the client doing the following:
 * 1) It calls the btype-specific function to cleanup the state.
 * 2) It unblocks the client by unsetting the CLIENT_BLOCKED flag.
 * 3) It puts the client into a list of just unblocked clients that are
 *    processed ASAP in the beforeSleep() event loop callback, so that
 *    if there is some query buffer to process, we do it. This is also
 *    required because otherwise there is no 'readable' event fired, we
 *    already read the pending commands. We also set the CLIENT_UNBLOCKED
 *    flag to remember the client is in the unblocked_clients list.
 *
 * processUnblockedClients() is called inside the beforeSleep() function
 * to process the query buffer from unblocked clients and remove the clients
 * from the blocked_clients queue.
 *
 * replyToBlockedClientTimedOut() is called by the cron function when
 * a client blocked reaches the specified timeout (if the timeout is set
 * to 0, no timeout is processed).
 * It usually just needs to send a reply to the client.
 *
 * When implementing a new type of blocking operation, the implementation
 * should modify unblockClient() and replyToBlockedClientTimedOut() in order
 * to handle the btype-specific behavior of these two functions.
 * If the blocking operation waits for certain keys to change state, the
 * clusterRedirectBlockedClientIfNeeded() function should also be updated.
 */

#include "server.h"
#include "slowlog.h"
#include "latency.h"
#include "monotonic.h"
#include "cluster_slot_stats.h"

/* forward declarations */
static void unblockClientWaitingData(client *c);
static void handleClientsBlockedOnKey(readyList *rl);
static void unblockClientOnKey(client *c, robj *key);
static void moduleUnblockClientOnKey(client *c, robj *key);
static void releaseBlockedEntry(client *c, dictEntry *de, int remove_key);

/* Reset the blocking state of a client to "not blocked", allocating the
 * dictionary that will hold the keys the client may later block on. */
void initClientBlockingState(client *c) {
    c->bstate.btype = BLOCKED_NONE;
    c->bstate.timeout = 0;
    c->bstate.keys = dictCreate(&objectKeyHeapPointerValueDictType);
    c->bstate.numreplicas = 0;
    c->bstate.reploffset = 0;
    c->bstate.unblock_on_nokey = 0;
    c->bstate.async_rm_call_handle = NULL;
}

/* Block a client for the specific operation type. Once the CLIENT_BLOCKED
 * flag is set client query buffer is no longer processed, but accumulated,
 * and will be processed when the client is unblocked. */
void blockClient(client *c, int btype) {
    /* Master client should never be blocked unless pause or module */
    serverAssert(!(c->flags & CLIENT_MASTER &&
                   btype != BLOCKED_MODULE &&
                   btype != BLOCKED_LAZYFREE &&
                   btype != BLOCKED_POSTPONE &&
                   btype != BLOCKED_POSTPONE_TRIM));

    c->flags |= CLIENT_BLOCKED;
    c->bstate.btype = btype;
    if (!(c->flags & CLIENT_MODULE)) server.blocked_clients++; /* We count blocked client stats on regular clients and not on module clients */
    server.blocked_clients_by_type[btype]++;
    addClientToTimeoutTable(c);
}

/* Usually when a client is unblocked due to being blocked while processing some command
 * he will attempt to reprocess the command which will update the statistics.
 * However in case the client was timed out or in case of module blocked client is being unblocked
 * the command will not be reprocessed and we need to make stats update.
 * This function will make updates to the commandstats, slowlog and monitors.*/
void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int had_errors){
    /* Total duration is command processing time + time spent blocked +
     * time spent producing the reply. */
    const ustime_t total_cmd_duration = c->duration + blocked_us + reply_us;
    clusterSlotStatsAddCpuDuration(c, total_cmd_duration);
    c->lastcmd->microseconds += total_cmd_duration;
    c->lastcmd->calls++;
    c->commands_processed++;
    server.stat_numcommands++;
    if (had_errors)
        c->lastcmd->failed_calls++;
    if (server.latency_tracking_enabled)
        updateCommandLatencyHistogram(&(c->lastcmd->latency_histogram), total_cmd_duration*1000);
    /* Log the command into the Slow log if needed. */
    slowlogPushCurrentCommand(c, c->lastcmd, total_cmd_duration);
    c->duration = 0;
    /* Log the reply duration event. */
    latencyAddSampleIfNeeded("command-unblocking",reply_us/1000);
}

/* This function is called in the beforeSleep() function of the event loop
 * in order to process the pending input buffer of clients that were
 * unblocked after a blocking operation. */
void processUnblockedClients(void) {
    listNode *ln;
    client *c;

    while (listLength(server.unblocked_clients)) {
        ln = listFirst(server.unblocked_clients);
        serverAssert(ln != NULL);
        c = ln->value;
        listDelNode(server.unblocked_clients,ln);
        c->flags &= ~CLIENT_UNBLOCKED;

        /* Reset the client for a new query, unless the client has pending command to process. */
        if (!(c->flags & CLIENT_PENDING_COMMAND)) {
            freeClientOriginalArgv(c);
            /* Clients that are not blocked on keys are not reprocessed so we must
             * call reqresAppendResponse here (for clients blocked on key,
             * unblockClientOnKey is called, which eventually calls processCommand,
             * which calls reqresAppendResponse) */
            prepareForNextCommand(c, 0);
        }

        if (c->flags & CLIENT_MODULE) {
            if (!(c->flags & CLIENT_BLOCKED)) {
                moduleCallCommandUnblockedHandler(c);
            }
            continue;
        }

        /* Process remaining data in the input buffer, unless the client
         * is blocked again. Actually processInputBuffer() checks that the
         * client is not blocked before to proceed, but things may change and
         * the code is conceptually more correct this way. */
        if (!(c->flags & CLIENT_BLOCKED)) {
            /* If we have a queued command, execute it now. */
            if (processPendingCommandAndInputBuffer(c) == C_ERR) {
                /* Client was freed during command processing. */
                c = NULL;
            }
        }
        beforeNextClient(c);
    }
}

/* This function will schedule the client for reprocessing at a safe time.
 *
 * This is useful when a client was blocked for some reason (blocking operation,
 * CLIENT PAUSE, or whatever), because it may end with some accumulated query
 * buffer that needs to be processed ASAP:
 *
 * 1. When a client is blocked, its readable handler is still active.
 * 2. However in this case it only gets data into the query buffer, but the
 *    query is not parsed or executed once there is enough to proceed as
 *    usually (because the client is blocked... so we can't execute commands).
 * 3. When the client is unblocked, without this function, the client would
 *    have to write some query in order for the readable handler to finally
 *    call processQueryBuffer*() on it.
 * 4. With this function instead we can put the client in a queue that will
 *    process it for queries ready to be executed at a safe time.
 */
void queueClientForReprocessing(client *c) {
    /* The client may already be in the unblocked list because of a previous
     * blocking operation, don't add back it into the list multiple times. */
    if (!(c->flags & CLIENT_UNBLOCKED)) {
        c->flags |= CLIENT_UNBLOCKED;
        listAddNodeTail(server.unblocked_clients,c);
    }
}

/* Unblock a client calling the right function depending on the kind
 * of operation the client is blocking for. */
void unblockClient(client *c, int queue_for_reprocessing) {
    if (c->bstate.btype == BLOCKED_LIST ||
        c->bstate.btype == BLOCKED_ZSET ||
        c->bstate.btype == BLOCKED_STREAM) {
        unblockClientWaitingData(c);
    } else if (c->bstate.btype == BLOCKED_WAIT || c->bstate.btype == BLOCKED_WAITAOF) {
        unblockClientWaitingReplicas(c);
    } else if (c->bstate.btype == BLOCKED_MODULE) {
        /* A module client may additionally be blocked on keys. */
        if (moduleClientIsBlockedOnKeys(c)) unblockClientWaitingData(c);
        unblockClientFromModule(c);
    } else if (c->bstate.btype == BLOCKED_POSTPONE || c->bstate.btype == BLOCKED_POSTPONE_TRIM) {
        listDelNode(server.postponed_clients,c->postponed_list_node);
        c->postponed_list_node = NULL;
    } else if (c->bstate.btype == BLOCKED_SHUTDOWN) {
        /* No special cleanup. */
    } else if (c->bstate.btype == BLOCKED_LAZYFREE) {
        /* No special cleanup. */
    } else {
        serverPanic("Unknown btype in unblockClient().");
    }


    /* Clear the flags, and put the client in the unblocked list so that
     * we'll process new commands in its query buffer ASAP. */
    if (!(c->flags & CLIENT_MODULE)) server.blocked_clients--; /* We count blocked client stats on regular clients and not on module clients */
    server.blocked_clients_by_type[c->bstate.btype]--;
    c->flags &= ~CLIENT_BLOCKED;
    c->bstate.btype = BLOCKED_NONE;
    c->bstate.unblock_on_nokey = 0;
    removeClientFromTimeoutTable(c);
    if (queue_for_reprocessing) queueClientForReprocessing(c);
}

/* Check if the specified client can be safely timed out using
 * unblockClientOnTimeout().
 */
int blockedClientMayTimeout(client *c) {
    /* Module blocked clients decide for themselves. */
    if (c->bstate.btype == BLOCKED_MODULE) {
        return moduleBlockedClientMayTimeout(c);
    }

    if (c->bstate.btype == BLOCKED_LIST ||
        c->bstate.btype == BLOCKED_ZSET ||
        c->bstate.btype == BLOCKED_STREAM ||
        c->bstate.btype == BLOCKED_WAIT ||
        c->bstate.btype == BLOCKED_WAITAOF)
    {
        return 1;
    }
    return 0;
}

/* This function gets called when a blocked client timed out in order to
 * send it a reply of some kind. After this function is called,
 * unblockClient() will be called with the same client as argument. */
void replyToBlockedClientTimedOut(client *c) {
    if (c->bstate.btype == BLOCKED_LAZYFREE) {
        addReply(c, shared.ok); /* No reason lazy-free to fail */
    } else if (c->bstate.btype == BLOCKED_LIST ||
               c->bstate.btype == BLOCKED_ZSET ||
               c->bstate.btype == BLOCKED_STREAM) {
        addReplyNullArray(c);
        updateStatsOnUnblock(c, 0, 0, 0);
    } else if (c->bstate.btype == BLOCKED_WAIT) {
        addReplyLongLong(c,replicationCountAcksByOffset(c->bstate.reploffset));
    } else if (c->bstate.btype == BLOCKED_WAITAOF) {
        /* WAITAOF replies [local_fsynced, numreplicas_acked]. */
        addReplyArrayLen(c,2);
        addReplyLongLong(c,server.fsynced_reploff >= c->bstate.reploffset);
        addReplyLongLong(c,replicationCountAOFAcksByOffset(c->bstate.reploffset));
    } else if (c->bstate.btype == BLOCKED_MODULE) {
        moduleBlockedClientTimedOut(c);
    } else {
        serverPanic("Unknown btype in replyToBlockedClientTimedOut().");
    }
}

/* If one or more clients are blocked on the SHUTDOWN command, this function
 * sends them an error reply and unblocks them. */
void replyToClientsBlockedOnShutdown(void) {
    if (server.blocked_clients_by_type[BLOCKED_SHUTDOWN] == 0) return;
    listNode *ln;
    listIter li;
    listRewind(server.clients, &li);
    while((ln = listNext(&li))) {
        client *c = listNodeValue(ln);
        if (c->flags & CLIENT_BLOCKED && c->bstate.btype == BLOCKED_SHUTDOWN) {
            c->duration = 0;
            addReplyError(c, "Errors trying to SHUTDOWN. Check logs.");
            unblockClient(c, 1);
        }
    }
}

/* Mass-unblock clients because something changed in the instance that makes
 * blocking no longer safe. For example clients blocked in list operations
 * in an instance which turns from master to slave is unsafe, so this function
 * is called when a master turns into a slave.
 *
 * The semantics is to send an -UNBLOCKED error to the client, disconnecting
 * it at the same time. */
void disconnectAllBlockedClients(void) {
    listNode *ln;
    listIter li;

    listRewind(server.clients,&li);
    while((ln = listNext(&li))) {
        client *c = listNodeValue(ln);

        if (c->flags & CLIENT_BLOCKED) {
            /* POSTPONEd clients are an exception, when they'll be unblocked, the
             * command processing will start from scratch, and the command will
             * be either executed or rejected. (unlike LIST blocked clients for
             * which the command is already in progress in a way.) */
            if (c->bstate.btype == BLOCKED_POSTPONE || c->bstate.btype == BLOCKED_POSTPONE_TRIM)
                continue;

            if (c->bstate.btype == BLOCKED_LAZYFREE) {
                addReply(c, shared.ok); /* No reason lazy-free to fail */
                updateStatsOnUnblock(c, 0, 0, 0);
                c->flags &= ~CLIENT_PENDING_COMMAND;
                unblockClient(c, 1);
            } else {

                unblockClientOnError(c,
                    "-UNBLOCKED force unblock from blocking operation, "
                    "instance state changed (master -> replica?)");
            }
            c->flags |= CLIENT_CLOSE_AFTER_REPLY;
        }
    }
}

/* This function should be called by Redis every time a single command,
 * a MULTI/EXEC block, or a Lua script, terminated its execution after
 * being called by a client. It handles serving clients blocked in all scenarios
 * where a specific key access requires to block until that key is available.
 *
 * All the keys with at least one client blocked that are signaled as ready
 * are accumulated into the server.ready_keys list. This function will run
 * the list and will serve clients accordingly.
 * Note that the function will iterate again and again (for example as a result of serving BLMOVE
 * we can have new blocking clients to serve because of the PUSH side of BLMOVE.)
 *
 * This function is normally "fair", that is, it will serve clients
 * using a FIFO behavior. However this fairness is violated in certain
 * edge cases, that is, when we have clients blocked at the same time
 * in a sorted set and in a list, for the same key (a very odd thing to
 * do client side, indeed!). Because mismatching clients (blocking for
 * a different type compared to the current key type) are moved in the
 * other side of the linked list. However as long as the key starts to
 * be used only for a single type, like virtually any Redis application will
 * do, the function is already fair. */
void handleClientsBlockedOnKeys(void) {

    /* In case we are already in the process of unblocking clients we should
     * not make a recursive call, in order to prevent breaking fairness. */
    static int in_handling_blocked_clients = 0;
    if (in_handling_blocked_clients)
        return;
    in_handling_blocked_clients = 1;

    /* This function is called only when also_propagate is in its basic state
     * (i.e. not from call(), module context, etc.) */
    serverAssert(server.also_propagate.numops == 0);

    /* If a command being unblocked causes another command to get unblocked,
     * like a BLMOVE would do, then the new unblocked command will get processed
     * right away rather than wait for later. */
    while(listLength(server.ready_keys) != 0) {
        list *l;

        /* Point server.ready_keys to a fresh list and save the current one
         * locally. This way as we run the old list we are free to call
         * signalKeyAsReady() that may push new elements in server.ready_keys
         * when handling clients blocked into BLMOVE. */
        l = server.ready_keys;
        server.ready_keys = listCreate();

        while(listLength(l) != 0) {
            listNode *ln = listFirst(l);
            readyList *rl = ln->value;

            /* First of all remove this key from db->ready_keys so that
             * we can safely call signalKeyAsReady() against this key. */
            dictDelete(rl->db->ready_keys,rl->key);

            handleClientsBlockedOnKey(rl);

            /* Free this item. */
            decrRefCount(rl->key);
            zfree(rl);
            listDelNode(l,ln);
        }
        listRelease(l); /* We have the new list on place at this point. */
    }
    in_handling_blocked_clients = 0;
}

/* Set a client in blocking mode for the specified key, with the specified timeout.
 * The 'type' argument is BLOCKED_LIST,BLOCKED_ZSET or BLOCKED_STREAM depending on the kind of operation we are
 * waiting for an empty key in order to awake the client. The client is blocked
 * for all the 'numkeys' keys as in the 'keys' argument.
 * The client will unblocked as soon as one of the keys in 'keys' value was updated.
 * the parameter unblock_on_nokey can be used to force client to be unblocked even in the case the key
 * is updated to become unavailable, either by type change (override), deletion or swapdb */
void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeout, int unblock_on_nokey) {
    dictEntry *db_blocked_entry, *db_blocked_existing_entry, *client_blocked_entry;
    list *l;
    int j;

    if (!(c->flags & CLIENT_REEXECUTING_COMMAND)) {
        /* If the client is re-processing the command, we do not set the timeout
         * because we need to retain the client's original timeout. */
        c->bstate.timeout = timeout;
    }

    for (j = 0; j < numkeys; j++) {
        /* If the key already exists in the dictionary ignore it. */
        if (!(client_blocked_entry = dictAddRaw(c->bstate.keys,keys[j],NULL))) {
            continue;
        }
        /* c->bstate.keys now holds a reference on the key object. */
        incrRefCount(keys[j]);

        /* And in the other "side", to map keys -> clients */
        db_blocked_entry = dictAddRaw(c->db->blocking_keys,keys[j], &db_blocked_existing_entry);

        /* In case key[j] did not have blocking clients yet, we need to create a new list */
        if (db_blocked_entry != NULL) {
            l = listCreate();
            dictSetVal(c->db->blocking_keys, db_blocked_entry, l);
            incrRefCount(keys[j]);
        } else {
            l = dictGetVal(db_blocked_existing_entry);
        }
        listAddNodeTail(l,c);
        /* Remember our own node in the per-key clients list so it can be
         * unlinked in O(1) later (see releaseBlockedEntry). */
        dictSetVal(c->bstate.keys,client_blocked_entry,listLast(l));

        /* We need to add the key to blocking_keys_unblock_on_nokey, if the client
         * wants to be awakened if key is deleted (like XREADGROUP) */
        if (unblock_on_nokey) {
            db_blocked_entry = dictAddRaw(c->db->blocking_keys_unblock_on_nokey, keys[j], &db_blocked_existing_entry);
            if (db_blocked_entry) {
                incrRefCount(keys[j]);
                dictSetUnsignedIntegerVal(db_blocked_entry, 1);
            } else {
                /* Entry already exists: it holds a count of blocked clients
                 * that want to be woken on key deletion. */
                dictIncrUnsignedIntegerVal(db_blocked_existing_entry, 1);
            }
        }
    }
    c->bstate.unblock_on_nokey = unblock_on_nokey;
    /* Currently we assume key blocking will require reprocessing the command.
     * However in case of modules, they have a different way to handle the reprocessing
     * which does not require setting the pending command flag */
    if (btype != BLOCKED_MODULE)
        c->flags |= CLIENT_PENDING_COMMAND;
    blockClient(c,btype);
}

/* Helper function to unblock a client that's waiting in a blocking operation such as BLPOP.
 * Internal function for unblockClient() */
static void unblockClientWaitingData(client *c) {
    dictEntry *de;
    dictIterator di;

    if (dictSize(c->bstate.keys) == 0)
        return;

    dictInitIterator(&di, c->bstate.keys);
    /* The client may wait for multiple keys, so unblock it for every key.
     */
    while((de = dictNext(&di)) != NULL) {
        releaseBlockedEntry(c, de, 0);
    }
    dictResetIterator(&di);
    /* Clear all the client-side entries in one shot instead of deleting
     * them one by one inside releaseBlockedEntry (remove_key == 0 above). */
    dictEmpty(c->bstate.keys, NULL);
}

/* Map an object type (OBJ_*) to the corresponding blocking type (BLOCKED_*),
 * or BLOCKED_NONE for types that can never block a client. */
static blocking_type getBlockedTypeByType(int type) {
    switch (type) {
    case OBJ_LIST: return BLOCKED_LIST;
    case OBJ_ZSET: return BLOCKED_ZSET;
    case OBJ_MODULE: return BLOCKED_MODULE;
    case OBJ_STREAM: return BLOCKED_STREAM;
    default: return BLOCKED_NONE;
    }
}

/* If the specified key has clients blocked waiting for list pushes, this
 * function will put the key reference into the server.ready_keys list.
 * Note that db->ready_keys is a hash table that allows us to avoid putting
 * the same key again and again in the list in case of multiple pushes
 * made by a script or in the context of MULTI/EXEC.
 *
 * The list will be finally processed by handleClientsBlockedOnKeys() */
static void signalKeyAsReadyLogic(redisDb *db, robj *key, int type, int deleted) {
    readyList *rl;

    /* Quick returns. */
    int btype = getBlockedTypeByType(type);
    if (btype == BLOCKED_NONE) {
        /* The type can never block. */
        return;
    }
    if (!server.blocked_clients_by_type[btype] &&
        !server.blocked_clients_by_type[BLOCKED_MODULE]) {
        /* No clients block on this type. Note: Blocked modules are represented
         * by BLOCKED_MODULE, even if the intention is to wake up by normal
         * types (list, zset, stream), so we need to check that there are no
         * blocked modules before we do a quick return here. */
        return;
    }

    if (deleted) {
        /* Key deleted and no clients blocking for this key? No need to queue it. */
        if (dictFind(db->blocking_keys_unblock_on_nokey,key) == NULL)
            return;
        /* Note: if we made it here it means the key is also present in db->blocking_keys */
    } else {
        /* No clients blocking for this key? No need to queue it. */
        if (dictFind(db->blocking_keys,key) == NULL)
            return;
    }

    dictEntry *de, *existing;
    de = dictAddRaw(db->ready_keys, key, &existing);
    if (de) {
        /* We add the key in the db->ready_keys dictionary in order
         * to avoid adding it multiple times into a list with a simple O(1)
         * check. */
        incrRefCount(key);
    } else {
        /* Key was already signaled? No need to queue it again. */
        return;
    }

    /* Ok, we need to queue this key into server.ready_keys.
     * The readyList takes its own reference on the key; both references are
     * released by handleClientsBlockedOnKeys(). */
    rl = zmalloc(sizeof(*rl));
    rl->key = key;
    rl->db = db;
    incrRefCount(key);
    listAddNodeTail(server.ready_keys,rl);
}

/* Helper function to wrap the logic of removing a client blocked key entry
 * In this case we would like to do the following:
 * 1. unlink the client from the global DB locked client list
 * 2. remove the entry from the global db blocking list in case the list is empty
 * 3. in case the global list is empty, also remove the key from the global dict of keys
 *    which should trigger unblock on key deletion
 * 4. remove key from the client blocking keys list - NOTE, since client can be blocked on lots of keys,
 *    but unblocked when only one of them is triggered, we would like to avoid deleting each key separately
 *    and instead clear the dictionary in one-shot. this is why the remove_key argument is provided
 *    to support this logic in unblockClientWaitingData
 */
static void releaseBlockedEntry(client *c, dictEntry *de, int remove_key) {
    list *l;
    listNode *pos;
    void *key;
    dictEntry *unblock_on_nokey_entry;

    /* 'de' is the client-side entry: key -> our node in the per-key list. */
    key = dictGetKey(de);
    pos = dictGetVal(de);
    /* Remove this client from the list of clients waiting for this key. */
    l = dictFetchValue(c->db->blocking_keys, key);
    serverAssertWithInfo(c,key,l != NULL);
    listUnlinkNode(l,pos);
    /* If the list is empty we need to remove it to avoid wasting memory
     * We will also remove the key (if exists) from the blocking_keys_unblock_on_nokey dict.
     * However, in case the list is not empty, we will have to still perform reference accounting
     * on the blocking_keys_unblock_on_nokey and delete the entry in case of zero reference.
     * Why? because it is possible that some more clients are blocked on the same key but without
     * require to be triggered on key deletion, we do not want these to be later triggered by the
     * signalDeletedKeyAsReady. */
    if (listLength(l) == 0) {
        dictDelete(c->db->blocking_keys, key);
        dictDelete(c->db->blocking_keys_unblock_on_nokey,key);
    } else if (c->bstate.unblock_on_nokey) {
        unblock_on_nokey_entry = dictFind(c->db->blocking_keys_unblock_on_nokey,key);
        /* it is not possible to have a client blocked on nokey with no matching entry */
        serverAssertWithInfo(c,key,unblock_on_nokey_entry != NULL);
        if (!dictIncrUnsignedIntegerVal(unblock_on_nokey_entry, -1)) {
            /* in case the count is zero, we can delete the entry */
            dictDelete(c->db->blocking_keys_unblock_on_nokey,key);
        }
    }
    if (remove_key)
        dictDelete(c->bstate.keys, key);
}

/* Signal that 'key' (still existing) may now satisfy blocked clients. */
void signalKeyAsReady(redisDb *db, robj *key, int type) {
    signalKeyAsReadyLogic(db, key, type, 0);
}

/* Signal that 'key' was deleted, waking clients that asked to be unblocked
 * on key deletion (unblock_on_nokey). */
void signalDeletedKeyAsReady(redisDb *db, robj *key, int type) {
    signalKeyAsReadyLogic(db, key, type, 1);
}

/* Helper function for handleClientsBlockedOnKeys(). This function is called
 * whenever a key is ready. we iterate over all the clients blocked on this key
 * and try to re-execute the command (in case the key is still available). */
static void handleClientsBlockedOnKey(readyList *rl) {

    /* We serve clients in the same order they blocked for
     * this key, from the first blocked to the last. */
    dictEntry *de = dictFind(rl->db->blocking_keys,rl->key);

    if (de) {
        list *clients = dictGetVal(de);
        listNode *ln;
        listIter li;
        listRewind(clients,&li);

        /* Avoid processing more than the initial count so that we're not stuck
         * in an endless loop in case the reprocessing of the command blocks again.
         */
        long count = listLength(clients);
        while ((ln = listNext(&li)) && count--) {
            client *receiver = listNodeValue(ln);
            kvobj *o = lookupKeyReadWithFlags(rl->db, rl->key, LOOKUP_NOEFFECTS);
            /* 1. In case new key was added/touched we need to verify it satisfy the
             *    blocked type, since we might process the wrong key type.
             * 2. We want to serve clients blocked on module keys
             *    regardless of the object type: we don't know what the
             *    module is trying to accomplish right now.
             * 3. In case of XREADGROUP call we will want to unblock on any change in object type
             *    or in case the key was deleted, since the group is no longer valid. */
            if ((o != NULL && (receiver->bstate.btype == getBlockedTypeByType(o->type))) ||
                (o != NULL && (receiver->bstate.btype == BLOCKED_MODULE)) ||
                (receiver->bstate.unblock_on_nokey))
            {
                if (receiver->bstate.btype != BLOCKED_MODULE)
                    unblockClientOnKey(receiver, rl->key);
                else
                    moduleUnblockClientOnKey(receiver, rl->key);
            }
        }
    }
}

/* block a client due to wait command */
void blockForReplication(client *c, mstime_t timeout, long long offset, long numreplicas) {
    c->bstate.timeout = timeout;
    c->bstate.reploffset = offset;
    c->bstate.numreplicas = numreplicas;
    listAddNodeHead(server.clients_waiting_acks,c);
    blockClient(c,BLOCKED_WAIT);
}

/* block a client due to waitaof command */
void blockForAofFsync(client *c, mstime_t timeout, long long offset, int numlocal, long numreplicas) {
    c->bstate.timeout = timeout;
    c->bstate.reploffset = offset;
    c->bstate.numreplicas = numreplicas;
    c->bstate.numlocal = numlocal;
    listAddNodeHead(server.clients_waiting_acks,c);
    blockClient(c,BLOCKED_WAITAOF);
}

/* Postpone client from executing a command. For example the server might be busy
 * requesting to avoid processing clients commands which will be processed later
 * when it is ready to accept them. */
void blockPostponeClientWithType(client *c, int btype) {
    serverAssert(btype == BLOCKED_POSTPONE || btype == BLOCKED_POSTPONE_TRIM);
    c->bstate.timeout = 0; /* Postponed clients never time out. */
    blockClient(c, btype);
    listAddNodeTail(server.postponed_clients, c);
    c->postponed_list_node = listLast(server.postponed_clients);
    /* Mark this client to execute its command */
    c->flags |= CLIENT_PENDING_COMMAND;
}

/* Postpone client from executing a command. */
void blockPostponeClient(client *c) {
    blockPostponeClientWithType(c, BLOCKED_POSTPONE);
}

/* Block client due to shutdown command */
void blockClientShutdown(client *c) {
    blockClient(c, BLOCKED_SHUTDOWN);
}

/* Unblock a client once a specific key became available for it.
 * This function will remove the client from the list of clients blocked on this key
 * and also remove the key from the dictionary of keys this client is blocked on.
 * in case the client has a command pending it will process it immediately. */
static void unblockClientOnKey(client *c, robj *key) {
    dictEntry *de;

    de = dictFind(c->bstate.keys, key);
    releaseBlockedEntry(c, de, 1);

    /* Only in case of blocking API calls, we might be blocked on several keys.
       however we should force unblock the entire blocking keys */
    serverAssert(c->bstate.btype == BLOCKED_STREAM ||
                 c->bstate.btype == BLOCKED_LIST ||
                 c->bstate.btype == BLOCKED_ZSET);

    /* We need to unblock the client before calling processCommandAndResetClient
     * because it checks the CLIENT_BLOCKED flag */
    unblockClient(c, 0);
    /* In case this client was blocked on keys during command
     * we need to re process the command again */
    if (c->flags & CLIENT_PENDING_COMMAND) {
        c->flags &= ~CLIENT_PENDING_COMMAND;
        c->flags |= CLIENT_REEXECUTING_COMMAND;
        /* We want the command processing and the unblock handler (see RM_Call 'K' option)
         * to run atomically, this is why we must enter the execution unit here before
         * running the command, and exit the execution unit after calling the unblock handler (if exists).
         * Notice that we also must set the current client so it will be available
         * when we will try to send the client side caching notification (done on 'afterCommand'). */
        client *old_client = server.current_client;
        server.current_client = c;
        enterExecutionUnit(1, 0);
        processCommandAndResetClient(c);
        if (!(c->flags & CLIENT_BLOCKED)) {
            if (c->flags & CLIENT_MODULE) {
                moduleCallCommandUnblockedHandler(c);
            } else {
                queueClientForReprocessing(c);
            }
        }
        exitExecutionUnit();
        afterCommand(c);
        /* Clear the CLIENT_REEXECUTING_COMMAND flag after the proc is executed. */
        c->flags &= ~CLIENT_REEXECUTING_COMMAND;
        server.current_client = old_client;
    }
}

/* Unblock a client blocked on the specific key from module context.
 * This function will try to serve the module call, and in case it succeeds,
 * it will add the client to the list of module unblocked clients which will
 * be processed in moduleHandleBlockedClients. */
static void moduleUnblockClientOnKey(client *c, robj *key) {
    long long prev_error_replies = server.stat_total_error_replies;
    client *old_client = server.current_client;
    server.current_client = c;
    monotime replyTimer;
    elapsedStart(&replyTimer);

    if (moduleTryServeClientBlockedOnKey(c, key)) {
        updateStatsOnUnblock(c, 0, elapsedUs(replyTimer), server.stat_total_error_replies != prev_error_replies);
        moduleUnblockClient(c);
    }
    /* We need to call afterCommand even if the client was not unblocked
     * in order to propagate any changes that could have been done inside
     * moduleTryServeClientBlockedOnKey */
    afterCommand(c);
    server.current_client = old_client;
}

/* Unblock a client which is currently Blocked on and provided a timeout.
 * The implementation will first reply to the blocked client with null response
 * or, in case of module blocked client the timeout callback will be used.
 * In this case since we might have a command pending
 * we want to remove the pending flag to indicate we already responded to the
 * command with timeout reply. */
void unblockClientOnTimeout(client *c) {
    /* The client has been unlocked (in the moduleUnblocked list), return ASAP. */
    if (c->bstate.btype == BLOCKED_MODULE && isModuleClientUnblocked(c)) return;

    replyToBlockedClientTimedOut(c);
    if (c->flags & CLIENT_PENDING_COMMAND)
        c->flags &= ~CLIENT_PENDING_COMMAND;
    unblockClient(c, 1);
}

/* Unblock a client which is currently Blocked with error.
 * If err_str is provided it will be used to reply to the blocked client */
void unblockClientOnError(client *c, const char *err_str) {
    if (err_str)
        addReplyError(c, err_str);
    updateStatsOnUnblock(c, 0, 0, 1);
    if (c->flags & CLIENT_PENDING_COMMAND)
        c->flags &= ~CLIENT_PENDING_COMMAND;
    unblockClient(c, 1);
}

/* Run before the event loop sleeps: drive every mechanism that may unblock
 * clients (timeouts, stream claims, replication acks, ready keys, modules). */
void blockedBeforeSleep(void) {
    /* Handle precise timeouts of blocked clients. */
    handleBlockedClientsTimeout();

    /* Handle for expired pending entries.
     */
    handleClaimableStreamEntries();

    /* Unblock all the clients blocked for synchronous replication
     * in WAIT or WAITAOF. */
    if (listLength(server.clients_waiting_acks))
        processClientsWaitingReplicas();

    /* Try to process blocked clients every once in while.
     *
     * Example: A module calls RM_SignalKeyAsReady from within a timer callback
     * (So we don't visit processCommand() at all).
     *
     * This may unblock clients, so must be done before processUnblockedClients */
    handleClientsBlockedOnKeys();

    /* Check if there are clients unblocked by modules that implement
     * blocking commands. */
    if (moduleCount())
        moduleHandleBlockedClients();

    /* Try to process pending commands for clients that were just unblocked. */
    if (listLength(server.unblocked_clients))
        processUnblockedClients();
}
diff --git a/examples/redis-unstable/src/call_reply.c b/examples/redis-unstable/src/call_reply.c
new file mode 100644
index 0000000..2a4f710
--- /dev/null
+++ b/examples/redis-unstable/src/call_reply.c
@@ -0,0 +1,540 @@
/*
 * Copyright (c) 2009-Present, Redis Ltd.
 * All rights reserved.
 *
 * Licensed under your choice of (a) the Redis Source Available License 2.0
 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
 * GNU Affero General Public License v3 (AGPLv3).
 */

#include "server.h"
#include "call_reply.h"

#define REPLY_FLAG_ROOT (1<<0)
#define REPLY_FLAG_PARSED (1<<1)
#define REPLY_FLAG_RESP3 (1<<2)

/* --------------------------------------------------------
 * An opaque struct used to parse a RESP protocol reply and
 * represent it. Used when parsing replies such as in RM_Call
 * or Lua scripts.
 * -------------------------------------------------------- */
struct CallReply {
    void *private_data;
    sds original_proto; /* Available only for root reply. */
    const char *proto;  /* Raw RESP bytes this reply was parsed from. */
    size_t proto_len;
    int type;  /* REPLY_... */
    int flags; /* REPLY_FLAG... */
    size_t len; /* Length of a string, or the number elements in an array. */
    union {
        const char *str; /* String pointer for string and error replies. This
                          * does not need to be freed, always points inside
                          * a reply->proto buffer of the reply object or, in
                          * case of array elements, of parent reply objects. */
        struct {
            const char *str;
            const char *format;
        } verbatim_str;              /* Reply value for verbatim string */
        long long ll;                /* Reply value for integer reply. */
        double d;                    /* Reply value for double reply. */
        struct CallReply *array;     /* Array of sub-reply elements. used for set, array, map, and attribute */
    } val;
    list *deferred_error_list;   /* list of errors in sds form or NULL */
    struct CallReply *attribute; /* attribute reply, NULL if not exists */
};

/* Fill the fields shared by all reply kinds: type, raw protocol slice, and
 * any extra flags (e.g. REPLY_FLAG_RESP3). */
static void callReplySetSharedData(CallReply *rep, int type, const char *proto, size_t proto_len, int extra_flags) {
    rep->type = type;
    rep->proto = proto;
    rep->proto_len = proto_len;
    rep->flags |= extra_flags;
}

/* Parser callback: RESP3 null ("_"). */
static void callReplyNull(void *ctx, const char *proto, size_t proto_len) {
    CallReply *rep = ctx;
    callReplySetSharedData(rep, REDISMODULE_REPLY_NULL, proto, proto_len, REPLY_FLAG_RESP3);
}

/* Parser callback: RESP2 null bulk string ("$-1"). */
static void callReplyNullBulkString(void *ctx, const char *proto, size_t proto_len) {
    CallReply *rep = ctx;
    callReplySetSharedData(rep, REDISMODULE_REPLY_NULL, proto, proto_len, 0);
}

/* Parser callback: RESP2 null array ("*-1"). */
static void callReplyNullArray(void *ctx, const char *proto, size_t proto_len) {
    CallReply *rep = ctx;
    callReplySetSharedData(rep, REDISMODULE_REPLY_NULL, proto, proto_len, 0);
}

/* Parser callback: bulk string. 'str' points into the protocol buffer. */
static void callReplyBulkString(void *ctx, const char *str, size_t len, const char *proto, size_t proto_len) {
    CallReply *rep = ctx;
    callReplySetSharedData(rep, REDISMODULE_REPLY_STRING, proto, proto_len, 0);
    rep->len = len;
    rep->val.str = str;
}

/* Parser callback: error reply. 'str' points into the protocol buffer. */
static void callReplyError(void *ctx, const char *str, size_t len, const char *proto, size_t proto_len) {
    CallReply *rep = ctx;
    callReplySetSharedData(rep, REDISMODULE_REPLY_ERROR, proto, proto_len, 0);
    rep->len = len;
    rep->val.str = str;
}

/* Parser callback: simple string ("+OK"). */
static void callReplySimpleStr(void *ctx, const char *str, size_t len, const char *proto, size_t proto_len) {
    CallReply *rep = ctx;
    callReplySetSharedData(rep, REDISMODULE_REPLY_STRING, proto, proto_len, 0);
    rep->len = len;
    rep->val.str = str;
}

/* Parser callback: integer reply. */
static void callReplyLong(void *ctx, long long val, const char *proto, size_t proto_len) {
    CallReply *rep = ctx;
    callReplySetSharedData(rep, REDISMODULE_REPLY_INTEGER, proto, proto_len, 0);
    rep->val.ll = val;
}

/* Parser callback: RESP3 double reply. */
static void callReplyDouble(void *ctx, double val, const char *proto, size_t proto_len) {
    CallReply *rep = ctx;
    callReplySetSharedData(rep, REDISMODULE_REPLY_DOUBLE, proto, proto_len, REPLY_FLAG_RESP3);
    rep->val.d = val;
}

/* Parser callback: RESP3 verbatim string (payload plus 3-char format). */
static void callReplyVerbatimString(void *ctx, const char *format, const char *str, size_t len, const char *proto, size_t proto_len) {
    CallReply *rep = ctx;
    callReplySetSharedData(rep, REDISMODULE_REPLY_VERBATIM_STRING, proto, proto_len, REPLY_FLAG_RESP3);
    rep->len = len;
    rep->val.verbatim_str.str = str;
    rep->val.verbatim_str.format = format;
}

/* Parser callback: RESP3 big number, kept as a string slice. */
static void callReplyBigNumber(void *ctx, const char *str, size_t len, const char *proto, size_t proto_len) {
    CallReply *rep = ctx;
    callReplySetSharedData(rep, REDISMODULE_REPLY_BIG_NUMBER, proto, proto_len, REPLY_FLAG_RESP3);
    rep->len = len;
    rep->val.str = str;
}

/* Parser callback: RESP3 boolean, stored in the integer slot. */
static void callReplyBool(void *ctx, int val, const char *proto, size_t proto_len) {
    CallReply *rep = ctx;
    callReplySetSharedData(rep, REDISMODULE_REPLY_BOOL, proto, proto_len, REPLY_FLAG_RESP3);
    rep->val.ll = val;
}

/* Recursively parse 'len' collection entries of 'elements_per_entry' elements
 * each (1 for array/set, 2 for map/attribute key-value pairs) into a flat
 * sub-reply array. Propagates the RESP3 flag upward from any sub-reply. */
static void callReplyParseCollection(ReplyParser *parser, CallReply *rep, size_t len, const char *proto, size_t elements_per_entry) {
    rep->len = len;
    rep->val.array = zcalloc(elements_per_entry * len * sizeof(CallReply));
    for (size_t i = 0; i < len * elements_per_entry; i += elements_per_entry)
    {
        for (size_t j = 0 ; j < elements_per_entry ; ++j) {
            rep->val.array[i + j].private_data = rep->private_data;
            parseReply(parser, rep->val.array + i + j);
            rep->val.array[i + j].flags |= REPLY_FLAG_PARSED;
            if (rep->val.array[i + j].flags & REPLY_FLAG_RESP3) {
                /* If one of the sub-replies is RESP3, then the current reply is also RESP3. */
                rep->flags |= REPLY_FLAG_RESP3;
            }
        }
    }
    rep->proto = proto;
    rep->proto_len = parser->curr_location - proto;
}

/* Parser callback: RESP3 attribute. The attribute is parsed into its own
 * CallReply hung off rep->attribute, then parsing continues with the reply
 * the attribute is attached to. */
static void callReplyAttribute(ReplyParser *parser, void *ctx, size_t len, const char *proto) {
    CallReply *rep = ctx;
    rep->attribute = zcalloc(sizeof(CallReply));

    /* Continue parsing the attribute reply */
    rep->attribute->len = len;
    rep->attribute->type = REDISMODULE_REPLY_ATTRIBUTE;
    callReplyParseCollection(parser, rep->attribute, len, proto, 2);
    rep->attribute->flags |= REPLY_FLAG_PARSED | REPLY_FLAG_RESP3;
    rep->attribute->private_data = rep->private_data;

    /* Continue parsing the reply */
    parseReply(parser, rep);

    /* In this case we need to fix the proto address and len, it should start from the attribute */
    rep->proto = proto;
    rep->proto_len = parser->curr_location - proto;
    rep->flags |= REPLY_FLAG_RESP3;
}

/* Parser callback: array of 'len' sub-replies. */
static void callReplyArray(ReplyParser *parser, void *ctx, size_t len, const char *proto) {
    CallReply *rep = ctx;
    rep->type = REDISMODULE_REPLY_ARRAY;
    callReplyParseCollection(parser, rep, len, proto, 1);
}

/* Parser callback: RESP3 set of 'len' sub-replies. */
static void callReplySet(ReplyParser *parser, void *ctx, size_t len, const char *proto) {
    CallReply *rep = ctx;
    rep->type = REDISMODULE_REPLY_SET;
    callReplyParseCollection(parser, rep, len, proto, 1);
    rep->flags |= REPLY_FLAG_RESP3;
}

/* Parser callback: RESP3 map of 'len' key-value pairs. */
static void callReplyMap(ReplyParser *parser, void *ctx, size_t len, const char *proto) {
    CallReply *rep = ctx;
    rep->type = REDISMODULE_REPLY_MAP;
    callReplyParseCollection(parser, rep, len, proto, 2);
    rep->flags |= REPLY_FLAG_RESP3;
}

/* Parser callback: protocol error, mark the reply type as unknown. */
static void callReplyParseError(void *ctx) {
    CallReply
*rep = ctx; + rep->type = REDISMODULE_REPLY_UNKNOWN; +} + +/* Recursively free the current call reply and its sub-replies. */ +static void freeCallReplyInternal(CallReply *rep) { + if (rep->type == REDISMODULE_REPLY_ARRAY || rep->type == REDISMODULE_REPLY_SET) { + for (size_t i = 0 ; i < rep->len ; ++i) { + freeCallReplyInternal(rep->val.array + i); + } + zfree(rep->val.array); + } + + if (rep->type == REDISMODULE_REPLY_MAP || rep->type == REDISMODULE_REPLY_ATTRIBUTE) { + for (size_t i = 0 ; i < rep->len ; ++i) { + freeCallReplyInternal(rep->val.array + i * 2); + freeCallReplyInternal(rep->val.array + i * 2 + 1); + } + zfree(rep->val.array); + } + + if (rep->attribute) { + freeCallReplyInternal(rep->attribute); + zfree(rep->attribute); + } +} + +/* Free the given call reply and its children (in case of nested reply) recursively. + * If private data was set when the CallReply was created it will not be freed, as it's + * the caller's responsibility to free it before calling freeCallReply(). */ +void freeCallReply(CallReply *rep) { + if (!(rep->flags & REPLY_FLAG_ROOT)) { + return; + } + if (rep->flags & REPLY_FLAG_PARSED) { + if (rep->type == REDISMODULE_REPLY_PROMISE) { + zfree(rep); + return; + } + freeCallReplyInternal(rep); + } + sdsfree(rep->original_proto); + if (rep->deferred_error_list) + listRelease(rep->deferred_error_list); + zfree(rep); +} + +CallReply *callReplyCreatePromise(void *private_data) { + CallReply *res = zmalloc(sizeof(*res)); + res->type = REDISMODULE_REPLY_PROMISE; + /* Mark the reply as parsed so there will be not attempt to parse + * it when calling reply API such as freeCallReply. + * Also mark the reply as root so freeCallReply will not ignore it. 
*/ + res->flags |= REPLY_FLAG_PARSED | REPLY_FLAG_ROOT; + res->private_data = private_data; + return res; +} + +static const ReplyParserCallbacks DefaultParserCallbacks = { + .null_callback = callReplyNull, + .bulk_string_callback = callReplyBulkString, + .null_bulk_string_callback = callReplyNullBulkString, + .null_array_callback = callReplyNullArray, + .error_callback = callReplyError, + .simple_str_callback = callReplySimpleStr, + .long_callback = callReplyLong, + .array_callback = callReplyArray, + .set_callback = callReplySet, + .map_callback = callReplyMap, + .double_callback = callReplyDouble, + .bool_callback = callReplyBool, + .big_number_callback = callReplyBigNumber, + .verbatim_string_callback = callReplyVerbatimString, + .attribute_callback = callReplyAttribute, + .error = callReplyParseError, +}; + +/* Parse the buffer located in rep->original_proto and update the CallReply + * structure to represent its contents. */ +static void callReplyParse(CallReply *rep) { + if (rep->flags & REPLY_FLAG_PARSED) { + return; + } + + ReplyParser parser = {.curr_location = rep->proto, .callbacks = DefaultParserCallbacks}; + + parseReply(&parser, rep); + rep->flags |= REPLY_FLAG_PARSED; +} + +/* Return the call reply type (REDISMODULE_REPLY_...). */ +int callReplyType(CallReply *rep) { + if (!rep) return REDISMODULE_REPLY_UNKNOWN; + callReplyParse(rep); + return rep->type; +} + +/* Return reply string as buffer and len. Applicable to: + * - REDISMODULE_REPLY_STRING + * - REDISMODULE_REPLY_ERROR + * + * The return value is borrowed from CallReply, so it must not be freed + * explicitly or used after CallReply itself is freed. + * + * The returned value is not NULL terminated and its length is returned by + * reference through len, which must not be NULL. 
 */
const char *callReplyGetString(CallReply *rep, size_t *len) {
    callReplyParse(rep);
    if (rep->type != REDISMODULE_REPLY_STRING &&
        rep->type != REDISMODULE_REPLY_ERROR) return NULL;
    if (len) *len = rep->len;
    return rep->val.str;
}

/* Return a long long reply value. Applicable to:
 * - REDISMODULE_REPLY_INTEGER
 * Returns LLONG_MIN on type mismatch. */
long long callReplyGetLongLong(CallReply *rep) {
    callReplyParse(rep);
    if (rep->type != REDISMODULE_REPLY_INTEGER) return LLONG_MIN;
    return rep->val.ll;
}

/* Return a double reply value. Applicable to:
 * - REDISMODULE_REPLY_DOUBLE
 * Returns LLONG_MIN (converted to double) on type mismatch. */
double callReplyGetDouble(CallReply *rep) {
    callReplyParse(rep);
    if (rep->type != REDISMODULE_REPLY_DOUBLE) return LLONG_MIN;
    return rep->val.d;
}

/* Return a reply Boolean value. Applicable to:
 * - REDISMODULE_REPLY_BOOL
 * Returns INT_MIN on type mismatch. */
int callReplyGetBool(CallReply *rep) {
    callReplyParse(rep);
    if (rep->type != REDISMODULE_REPLY_BOOL) return INT_MIN;
    return rep->val.ll;
}

/* Return reply length. Applicable to:
 * - REDISMODULE_REPLY_STRING
 * - REDISMODULE_REPLY_ERROR
 * - REDISMODULE_REPLY_ARRAY
 * - REDISMODULE_REPLY_SET
 * - REDISMODULE_REPLY_MAP
 * - REDISMODULE_REPLY_ATTRIBUTE
 * Returns 0 for any other reply type. */
size_t callReplyGetLen(CallReply *rep) {
    callReplyParse(rep);
    switch(rep->type) {
        case REDISMODULE_REPLY_STRING:
        case REDISMODULE_REPLY_ERROR:
        case REDISMODULE_REPLY_ARRAY:
        case REDISMODULE_REPLY_SET:
        case REDISMODULE_REPLY_MAP:
        case REDISMODULE_REPLY_ATTRIBUTE:
            return rep->len;
        default:
            return 0;
    }
}

/* Bounds-checked access into the flat sub-reply array; see
 * callReplyParseCollection for the elements_per_entry layout. */
static CallReply *callReplyGetCollectionElement(CallReply *rep, size_t idx, int elements_per_entry) {
    if (idx >= rep->len * elements_per_entry) return NULL; // real len is rep->len * elements_per_entry
    return rep->val.array+idx;
}

/* Return a reply array element at a given index. Applicable to:
 * - REDISMODULE_REPLY_ARRAY
 *
 * The return value is borrowed from CallReply, so it must not be freed
 * explicitly or used after CallReply itself is freed.
 */
CallReply *callReplyGetArrayElement(CallReply *rep, size_t idx) {
    callReplyParse(rep);
    if (rep->type != REDISMODULE_REPLY_ARRAY) return NULL;
    return callReplyGetCollectionElement(rep, idx, 1);
}

/* Return a reply set element at a given index. Applicable to:
 * - REDISMODULE_REPLY_SET
 *
 * The return value is borrowed from CallReply, so it must not be freed
 * explicitly or used after CallReply itself is freed.
 */
CallReply *callReplyGetSetElement(CallReply *rep, size_t idx) {
    callReplyParse(rep);
    if (rep->type != REDISMODULE_REPLY_SET) return NULL;
    return callReplyGetCollectionElement(rep, idx, 1);
}

/* Shared key/value pair accessor for map-like replies; 'type' selects which
 * reply type (map or attribute) the caller expects. */
static int callReplyGetMapElementInternal(CallReply *rep, size_t idx, CallReply **key, CallReply **val, int type) {
    callReplyParse(rep);
    if (rep->type != type) return C_ERR;
    if (idx >= rep->len) return C_ERR;
    if (key) *key = callReplyGetCollectionElement(rep, idx * 2, 2);
    if (val) *val = callReplyGetCollectionElement(rep, idx * 2 + 1, 2);
    return C_OK;
}

/* Retrieve a map reply key and value at a given index. Applicable to:
 * - REDISMODULE_REPLY_MAP
 *
 * The key and value are returned by reference through key and val,
 * which may also be NULL if not needed.
 *
 * Returns C_OK on success or C_ERR if reply type mismatches, or if idx is out
 * of range.
 *
 * The returned values are borrowed from CallReply, so they must not be freed
 * explicitly or used after CallReply itself is freed.
 */
int callReplyGetMapElement(CallReply *rep, size_t idx, CallReply **key, CallReply **val) {
    return callReplyGetMapElementInternal(rep, idx, key, val, REDISMODULE_REPLY_MAP);
}

/* Return reply attribute, or NULL if it does not exist. Applicable to all replies.
 *
 * The returned values are borrowed from CallReply, so they must not be freed
 * explicitly or used after CallReply itself is freed.
+ */
+CallReply *callReplyGetAttribute(CallReply *rep) {
+    return rep->attribute;
+}
+
+/* Retrieve attribute reply key and value at a given index. Applicable to:
+ * - REDISMODULE_REPLY_ATTRIBUTE
+ *
+ * The key and value are returned by reference through key and val,
+ * which may also be NULL if not needed.
+ *
+ * Returns C_OK on success or C_ERR if reply type mismatches, or if idx is out
+ * of range.
+ *
+ * The returned values are borrowed from CallReply, so they must not be freed
+ * explicitly or used after CallReply itself is freed.
+ */
+int callReplyGetAttributeElement(CallReply *rep, size_t idx, CallReply **key, CallReply **val) {
+    return callReplyGetMapElementInternal(rep, idx, key, val, REDISMODULE_REPLY_ATTRIBUTE);
+}
+
+/* Return a big number reply value. Applicable to:
+ * - REDISMODULE_REPLY_BIG_NUMBER
+ *
+ * The returned values are borrowed from CallReply, so they must not be freed
+ * explicitly or used after CallReply itself is freed.
+ *
+ * The return value is guaranteed to be a big number, as described in the RESP3
+ * protocol specifications.
+ *
+ * The returned value is not NULL terminated and its length is returned by
+ * reference through len, which must not be NULL.
+ */
+const char *callReplyGetBigNumber(CallReply *rep, size_t *len) {
+    callReplyParse(rep);
+    if (rep->type != REDISMODULE_REPLY_BIG_NUMBER) return NULL;
+    *len = rep->len;
+    return rep->val.str;
+}
+
+/* Return a verbatim string reply value. Applicable to:
+ * - REDISMODULE_REPLY_VERBATIM_STRING
+ *
+ * If format is non-NULL, the verbatim reply format is also returned by value.
+ *
+ * The optional output argument can be given to get a verbatim reply
+ * format, or can be set NULL if not needed.
+ *
+ * The return value is borrowed from CallReply, so it must not be freed
+ * explicitly or used after CallReply itself is freed.
+ *
+ * The returned value is not NULL terminated and its length is returned by
+ * reference through len, which must not be NULL.
+ */ +const char *callReplyGetVerbatim(CallReply *rep, size_t *len, const char **format){ + callReplyParse(rep); + if (rep->type != REDISMODULE_REPLY_VERBATIM_STRING) return NULL; + *len = rep->len; + if (format) *format = rep->val.verbatim_str.format; + return rep->val.verbatim_str.str; +} + +/* Return the current reply blob. + * + * The return value is borrowed from CallReply, so it must not be freed + * explicitly or used after CallReply itself is freed. + */ +const char *callReplyGetProto(CallReply *rep, size_t *proto_len) { + *proto_len = rep->proto_len; + return rep->proto; +} + +/* Return CallReply private data, as set by the caller on callReplyCreate(). + */ +void *callReplyGetPrivateData(CallReply *rep) { + return rep->private_data; +} + +/* Return true if the reply or one of it sub-replies is RESP3 formatted. */ +int callReplyIsResp3(CallReply *rep) { + return rep->flags & REPLY_FLAG_RESP3; +} + +/* Returns a list of errors in sds form, or NULL. */ +list *callReplyDeferredErrorList(CallReply *rep) { + return rep->deferred_error_list; +} + +/* Create a new CallReply struct from the reply blob. + * + * The function will own the reply blob, so it must not be used or freed by + * the caller after passing it to this function. + * + * The reply blob will be freed when the returned CallReply struct is later + * freed using freeCallReply(). + * + * The deferred_error_list is an optional list of errors that are present + * in the reply blob, if given, this function will take ownership on it. + * + * The private_data is optional and can later be accessed using + * callReplyGetPrivateData(). + * + * NOTE: The parser used for parsing the reply and producing CallReply is + * designed to handle valid replies created by Redis itself. IT IS NOT + * DESIGNED TO HANDLE USER INPUT and using it to parse invalid replies is + * unsafe. 
+ */ +CallReply *callReplyCreate(sds reply, list *deferred_error_list, void *private_data) { + CallReply *res = zmalloc(sizeof(*res)); + res->flags = REPLY_FLAG_ROOT; + res->original_proto = reply; + res->proto = reply; + res->proto_len = sdslen(reply); + res->private_data = private_data; + res->attribute = NULL; + res->deferred_error_list = deferred_error_list; + return res; +} + +/* Create a new CallReply struct from the reply blob representing an error message. + * Automatically creating deferred_error_list and set a copy of the reply in it. + * Refer to callReplyCreate for detailed explanation. + * Reply string can come in one of two forms: + * 1. A protocol reply starting with "-CODE" and ending with "\r\n" + * 2. A plain string, in which case this function adds the protocol header and footer. */ +CallReply *callReplyCreateError(sds reply, void *private_data) { + sds err_buff = reply; + if (err_buff[0] != '-') { + err_buff = sdscatfmt(sdsempty(), "-ERR %S\r\n", reply); + sdsfree(reply); + } + list *deferred_error_list = listCreate(); + listSetFreeMethod(deferred_error_list, sdsfreegeneric); + listAddNodeTail(deferred_error_list, sdsnew(err_buff)); + return callReplyCreate(err_buff, deferred_error_list, private_data); +} diff --git a/examples/redis-unstable/src/call_reply.h b/examples/redis-unstable/src/call_reply.h new file mode 100644 index 0000000..4ae7f3c --- /dev/null +++ b/examples/redis-unstable/src/call_reply.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2009-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ + +#ifndef SRC_CALL_REPLY_H_ +#define SRC_CALL_REPLY_H_ + +#include "resp_parser.h" + +typedef struct CallReply CallReply; +typedef void (*RedisModuleOnUnblocked)(void *ctx, CallReply *reply, void *private_data); + +CallReply *callReplyCreate(sds reply, list *deferred_error_list, void *private_data); +CallReply *callReplyCreateError(sds reply, void *private_data); +int callReplyType(CallReply *rep); +const char *callReplyGetString(CallReply *rep, size_t *len); +long long callReplyGetLongLong(CallReply *rep); +double callReplyGetDouble(CallReply *rep); +int callReplyGetBool(CallReply *rep); +size_t callReplyGetLen(CallReply *rep); +CallReply *callReplyGetArrayElement(CallReply *rep, size_t idx); +CallReply *callReplyGetSetElement(CallReply *rep, size_t idx); +int callReplyGetMapElement(CallReply *rep, size_t idx, CallReply **key, CallReply **val); +CallReply *callReplyGetAttribute(CallReply *rep); +int callReplyGetAttributeElement(CallReply *rep, size_t idx, CallReply **key, CallReply **val); +const char *callReplyGetBigNumber(CallReply *rep, size_t *len); +const char *callReplyGetVerbatim(CallReply *rep, size_t *len, const char **format); +const char *callReplyGetProto(CallReply *rep, size_t *len); +void *callReplyGetPrivateData(CallReply *rep); +int callReplyIsResp3(CallReply *rep); +list *callReplyDeferredErrorList(CallReply *rep); +void freeCallReply(CallReply *rep); +CallReply *callReplyCreatePromise(void *private_data); + +#endif /* SRC_CALL_REPLY_H_ */ diff --git a/examples/redis-unstable/src/childinfo.c b/examples/redis-unstable/src/childinfo.c new file mode 100644 index 0000000..95cbbbb --- /dev/null +++ b/examples/redis-unstable/src/childinfo.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2016-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */
+
+#include "server.h"
+#include <unistd.h>
+#include <fcntl.h>
+
+typedef struct {
+    size_t keys;
+    size_t cow;
+    monotime cow_updated;
+    double progress;
+    childInfoType information_type; /* Type of information */
+} child_info_data;
+
+/* Open a child-parent channel used in order to move information about the
+ * RDB / AOF saving process from the child to the parent (for instance
+ * the amount of copy on write memory used) */
+void openChildInfoPipe(void) {
+    if (anetPipe(server.child_info_pipe, O_NONBLOCK, 0) == -1) {
+        /* On error our two file descriptors should be still set to -1,
+         * but we call anyway closeChildInfoPipe() since can't hurt. */
+        closeChildInfoPipe();
+    } else {
+        server.child_info_nread = 0;
+    }
+}
+
+/* Close the pipes opened with openChildInfoPipe(). */
+void closeChildInfoPipe(void) {
+    if (server.child_info_pipe[0] != -1 ||
+        server.child_info_pipe[1] != -1)
+    {
+        close(server.child_info_pipe[0]);
+        close(server.child_info_pipe[1]);
+        server.child_info_pipe[0] = -1;
+        server.child_info_pipe[1] = -1;
+        server.child_info_nread = 0;
+    }
+}
+
+/* Send save data to parent. */
+void sendChildInfoGeneric(childInfoType info_type, size_t keys, double progress, char *pname) {
+    if (server.child_info_pipe[1] == -1) return;
+
+    static monotime cow_updated = 0;
+    static uint64_t cow_update_cost = 0;
+    static size_t cow = 0;
+    static size_t peak_cow = 0;
+    static size_t update_count = 0;
+    static unsigned long long sum_cow = 0;
+
+    child_info_data data = {0}; /* zero everything, including padding to satisfy valgrind */
+
+    /* When called to report current info, we need to throttle down CoW updates as they
+     * can be very expensive. To do that, we measure the time it takes to get a reading
+     * and schedule the next reading to happen not before time*CHILD_COW_COST_FACTOR
+     * passes. */
+
+    monotime now = getMonotonicUs();
+    if (info_type != CHILD_INFO_TYPE_CURRENT_INFO ||
+        !cow_updated ||
+        now - cow_updated > cow_update_cost * CHILD_COW_DUTY_CYCLE)
+    {
+        cow = zmalloc_get_private_dirty(-1);
+        cow_updated = getMonotonicUs();
+        cow_update_cost = cow_updated - now;
+        if (cow > peak_cow) peak_cow = cow;
+        sum_cow += cow;
+        update_count++;
+
+        int cow_info = (info_type != CHILD_INFO_TYPE_CURRENT_INFO);
+        if (cow || cow_info) {
+            serverLog(cow_info ? LL_NOTICE : LL_VERBOSE,
+                      "Fork CoW for %s: current %zu MB, peak %zu MB, average %llu MB",
+                      pname, cow>>20, peak_cow>>20, (sum_cow/update_count)>>20);
+        }
+    }
+
+    data.information_type = info_type;
+    data.keys = keys;
+    data.cow = cow;
+    data.cow_updated = cow_updated;
+    data.progress = progress;
+
+    ssize_t wlen = sizeof(data);
+
+    if (write(server.child_info_pipe[1], &data, wlen) != wlen) {
+        /* Failed writing to parent, it could have been killed, exit. */
+        serverLog(LL_WARNING,"Child failed reporting info to parent, exiting. %s", strerror(errno));
+        exitFromChild(1, 0);
+    }
+}
+
+/* Update Child info. */
+void updateChildInfo(childInfoType information_type, size_t cow, monotime cow_updated, size_t keys, double progress) {
+    if (cow > server.stat_current_cow_peak) server.stat_current_cow_peak = cow;
+
+    if (information_type == CHILD_INFO_TYPE_CURRENT_INFO) {
+        server.stat_current_cow_bytes = cow;
+        server.stat_current_cow_updated = cow_updated;
+        server.stat_current_save_keys_processed = keys;
+        if (progress != -1) server.stat_module_progress = progress;
+    } else if (information_type == CHILD_INFO_TYPE_AOF_COW_SIZE) {
+        server.stat_aof_cow_bytes = server.stat_current_cow_peak;
+    } else if (information_type == CHILD_INFO_TYPE_RDB_COW_SIZE) {
+        server.stat_rdb_cow_bytes = server.stat_current_cow_peak;
+    } else if (information_type == CHILD_INFO_TYPE_MODULE_COW_SIZE) {
+        server.stat_module_cow_bytes = server.stat_current_cow_peak;
+    }
+}
+
+/* Read child info data from the pipe.
+ * if complete data read into the buffer, + * data is stored into *buffer, and returns 1. + * otherwise, the partial data is left in the buffer, waiting for the next read, and returns 0. */ +int readChildInfo(childInfoType *information_type, size_t *cow, monotime *cow_updated, size_t *keys, double* progress) { + /* We are using here a static buffer in combination with the server.child_info_nread to handle short reads */ + static child_info_data buffer; + ssize_t wlen = sizeof(buffer); + + /* Do not overlap */ + if (server.child_info_nread == wlen) server.child_info_nread = 0; + + int nread = read(server.child_info_pipe[0], (char *)&buffer + server.child_info_nread, wlen - server.child_info_nread); + if (nread > 0) { + server.child_info_nread += nread; + } + + /* We have complete child info */ + if (server.child_info_nread == wlen) { + *information_type = buffer.information_type; + *cow = buffer.cow; + *cow_updated = buffer.cow_updated; + *keys = buffer.keys; + *progress = buffer.progress; + return 1; + } else { + return 0; + } +} + +/* Receive info data from child. */ +void receiveChildInfo(void) { + if (server.child_info_pipe[0] == -1) return; + + size_t cow; + monotime cow_updated; + size_t keys; + double progress; + childInfoType information_type; + + /* Drain the pipe and update child info so that we get the final message. */ + while (readChildInfo(&information_type, &cow, &cow_updated, &keys, &progress)) { + updateChildInfo(information_type, cow, cow_updated, keys, progress); + } +} diff --git a/examples/redis-unstable/src/chk.c b/examples/redis-unstable/src/chk.c new file mode 100644 index 0000000..f15cfb1 --- /dev/null +++ b/examples/redis-unstable/src/chk.c @@ -0,0 +1,822 @@ +/* Implementation of a topK structure using CuckooHeavyKeeper algorithm + * + * Implementation is based on the paper "Cuckoo Heavy Keeper and the balancing + * act of maintaining heavy hitters in stream processing" by Vinh Quang Ngo and + * Marina Papatriantafilou. 
Also, the accompanying C++ implementation was used + * as a reference point: https://github.com/vinhqngo5/Cuckoo_Heavy_Keeper + * Main changes are addition of a min-heap so we can keep names of the top K + * elements - idea comes from RedisBloom's TopK structure. + * + * Copyright (c) 2026-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + +#include "chk.h" +#include "redisassert.h" +#include "zmalloc.h" +#include "xxhash.h" + +#include +#include +#include + +/* Lobby to heavy item promotion threshold */ +#define LOBBY_PROMOTION_THRESHOLD 16 + +#ifndef static_assert +#define static_assert(expr, lit) extern char __static_assert_failure[(expr) ? 1:-1] +#endif + +static_assert(LOBBY_PROMOTION_THRESHOLD < CHK_LUT_SIZE, + "Lobby promotion threshold should be less then the LUT size to " + "ensure constant operations during decayCounter!"); + +/* After a heavy item is demoted is starts recursively kicking out other heavy + * items in the case it should stay heavy (defined by isHeavyHitter). In + * principle this process could go over all the items in the chkTopK's tables + * so it's artificially limited by this constant. */ +#define MAX_KICKS 16 + +/* An item is defined as heavy hitter if its count is more or equal to x * N + * where x is a threshold constant (HEAVY_RATIO) and N is the total count the + * chkTopK structure has accumulated. See the paper for more info. */ +#define HEAVY_RATIO 0.008 + +/* A unique seed for the items when storing them in the heap so it's not related + * to the cuckoo's hashes. Also, we don't need the less-bit hash here as the + * heap does not take much memory so we avoid needless possible collisions. 
*/ +#define HEAP_SEED 1919 + +typedef struct { + size_t idx[CHK_NUM_TABLES]; + fingerprint_t fp; +} fpAndIdx; + +#define min(a, b) ((a) < (b) ? (a) : (b)) + +/* Heap operations */ +static chkHeapBucket *chkCheckExistInHeap(chkTopK *topk, const char *item, int itemlen, uint64_t fp) { + for (int32_t i = topk->k - 1; i >= 0; --i) { + chkHeapBucket *bucket = topk->heap + i; + if (bucket->fp == fp && bucket->item && + sdslen(bucket->item) == (size_t)itemlen && + memcmp(bucket->item, item, itemlen) == 0) + { + return bucket; + } + } + return NULL; +} + +void chkHeapifyDown(chkHeapBucket *array, size_t len, size_t start) { + size_t child = start; + + if (len < 2 || (len - 2) / 2 < child) { + return; + } + child = 2 * child + 1; + if ((child + 1) < len && (array[child].count > array[child + 1].count)) { + ++child; + } + if (array[child].count > array[start].count) { + return; + } + + chkHeapBucket top = {0}; + top = array[start]; + do { + memcpy(&array[start], &array[child], sizeof(chkHeapBucket)); + start = child; + + if ((len - 2) / 2 < child) { + break; + } + child = 2 * child + 1; + + if ((child + 1) < len && (array[child].count > array[child + 1].count)) { + ++child; + } + } while (array[child].count < top.count); + memcpy(&array[start], &top, sizeof(chkHeapBucket)); +} + +/*----------------------------------------------------------------------------- + * chkTopK operations + *----------------------------------------------------------------------------*/ + +/* Create the chkTopK structure. Note, CHK paper recommends decay=1.08. + * numbuckets must be a power of 2. Recommended size for numbuckets is at least + * 7 or 8 times k. */ +chkTopK *chkTopKCreate(int k, int numbuckets, double decay) { + /* Number of buckets need to be a power of 2 for better performance - we + * have better cache locality of the tables and faster table indices + * calculations. 
*/ + assert(k > 0 && (numbuckets & (numbuckets - 1)) == 0); + + size_t usable = 0; + chkTopK *topk = zcalloc_usable(sizeof(chkTopK), &usable); + topk->alloc_size += usable; + + for (int i = 0; i < CHK_NUM_TABLES; ++i) { + topk->tables[i] = zcalloc_usable(sizeof(chkBucket) * numbuckets, &usable); + topk->alloc_size += usable; + } + + topk->heap = zcalloc_usable(sizeof(chkHeapBucket) * k, &usable); + topk->alloc_size += usable; + + topk->decay = decay; + topk->inv_decay = 1. / decay; + topk->k = k; + topk->numbuckets = numbuckets; + + topk->lut_decay_exp[0] = 0; + topk->lut_min_decay[0] = 0; + topk->lut_decay_prob[0] = 0; + for (int i = 1; i < CHK_LUT_SIZE + 1; ++i) { + topk->lut_decay_exp[i] = topk->lut_decay_exp[i - 1] + pow(topk->decay, i - 1); + topk->lut_min_decay[i] = topk->lut_decay_exp[i] - topk->lut_decay_exp[i - 1]; + topk->lut_decay_prob[i] = pow(topk->inv_decay, i); + } + + return topk; +} + +/* Release chkTopK resources */ +void chkTopKRelease(chkTopK *topk) { + size_t usable; + for (int i = 0; i < CHK_NUM_TABLES; ++i) { + zfree_usable(topk->tables[i], &usable); + topk->alloc_size -= usable; + } + for (int i = 0; i < topk->k; ++i) { + if (topk->heap[i].item) { + topk->alloc_size -= sdsAllocSize(topk->heap[i].item); + sdsfree(topk->heap[i].item); + } + } + zfree_usable(topk->heap, &usable); + topk->alloc_size -= usable; + debugAssert(topk->alloc_size == zmalloc_usable_size(topk)); + + zfree(topk); +} + +static inline int generateAltIdx(fingerprint_t fp, int idx, int numbuckets) { + return (idx ^ (0x5bd1e995 * (size_t)fp)) & (numbuckets - 1); +} + +fpAndIdx generateItemFpAndIdxs(chkTopK *topk, char *item, int itemlen) { + uint64_t hash = XXH3_64bits_withSeed(item, itemlen, 0); + + fpAndIdx res; + res.fp = (hash & 0xFFFF); /* Only use 16 bits for fingerprint */ + + /* Note numbuckets are a power of 2 so we don't use modulo for index calc */ + res.idx[0] = (hash >> 32) & (topk->numbuckets - 1); + for (int i = 1; i < CHK_NUM_TABLES; ++i) { + res.idx[i] = 
generateAltIdx(res.fp, res.idx[i-1], topk->numbuckets); + } + + return res; +} + +typedef struct { + int table_idx; + int pos; +} checkEntryRes; + +/* Check if `item` is a heavy entry. If so we bump its count. If not - we make + * it a heavy entry immediately if there is an empty spot, thus skipping the + * lobby as an optimization. */ +checkEntryRes checkHeavyEntries(chkTopK *topk, fpAndIdx item, counter_t weight) { + int empty_table_idx = -1; + int empty_pos = -1; + + for (int i = 0; i < CHK_NUM_TABLES; ++i) { + int idx = item.idx[i]; + + chkBucket *bucket = &topk->tables[i][idx]; + for (int j = 0; j < CHK_HEAVY_ENTRIES_PER_BUCKET; ++j) { + chkHeavyEntry *e = &bucket->heavy_entries[j]; + if (e->count > 0) { + if (e->fp == item.fp) { + e->count += weight; + + checkEntryRes res = { i, j }; + return res; + } + } else if (empty_table_idx == -1) { + empty_table_idx = i; + empty_pos = j; + } + } + } + + if (empty_table_idx == -1) { + checkEntryRes res = { -1, -1 }; + return res; + } + + /* If there is an empty slot in the heavy entries just put the item there + * instead of going through the lobby first (optimization as per the paper) */ + int idx = item.idx[empty_table_idx]; + chkHeavyEntry *e = &topk->tables[empty_table_idx][idx].heavy_entries[empty_pos]; + e->fp = item.fp; + e->count = weight; + + checkEntryRes res = {empty_table_idx, empty_pos}; + return res; +} + +/* A heavy hitter is defined by the paper as an item with counter more or equal + * to phi * N, where phi is a constant and N is the total count the structure + * has recorded up to that point */ +int isHeavyHitter(chkTopK *topk, counter_t cnt) { + return cnt >= (topk->total * HEAVY_RATIO); +} + +/* After a lobby item is promoted it may be placed on a heavy item's spot. The + * latter is kicked out, but it may recursively kick out another heavy item. 
+ * The process is limited by MAX_KICKS and also by the fact that during updates + * one of the kicked out items may have its counter decayed so much - it's not + * passing the heavy item threshold (see isHeavyHitter). */ +void kickout(chkTopK *topk, chkHeavyEntry entry, int idx, int table_idx) { + for (int i = 0; i < MAX_KICKS; ++i) { + /* Do not try to swap with any entries if we don't reach the heavy + * hitter threshold */ + if (!isHeavyHitter(topk, entry.count)) return; + + /* Find the heavy entry in the alt bucket in the other table with + * minimum count. If there is empty entry there just occupy it, else + * recursively kick the minimal one out. + * To find the alt bucket we need to compute the alt index from the + * fingerprint of the kicked-out entry. */ + table_idx = 1 - table_idx; + idx = generateAltIdx(entry.fp, idx, topk->numbuckets); + + chkBucket *bucket = &topk->tables[table_idx][idx]; + counter_t min = (counter_t)-1; + int min_pos = -1; + for (int j = 0; j < CHK_HEAVY_ENTRIES_PER_BUCKET; ++j) { + chkHeavyEntry *e = &bucket->heavy_entries[j]; + if (e->count == 0) { + *e = entry; + return; + } + if (e->count < min) { + min = e->count; + min_pos = j; + } + } + + chkHeavyEntry old_entry = bucket->heavy_entries[min_pos]; + bucket->heavy_entries[min_pos] = entry; + entry = old_entry; + } +} + +/* When a lobby entry's counter passes the promotion threshold we try to promote + * it with some probability. See the paper for more details. If promotion is + * successful the lobby entry may kick out a heavy one - see kickout() */ +int tryPromoteAndKickout(chkTopK *topk, fpAndIdx item, counter_t new_count, + int table_idx) +{ + int idx = item.idx[table_idx]; + chkBucket *bucket = &topk->tables[table_idx][idx]; + counter_t min = (counter_t)-1; /* counter_t is unsigned */ + int min_idx = -1; + + /* We search for heavy item bucket of the promoted lobby entry. We may have + * an empty space which we immediately occupy. 
Otherwise we choose the + * bucket with lowest counter */ + for (int i = 0; i < CHK_HEAVY_ENTRIES_PER_BUCKET; ++i) { + if (bucket->heavy_entries[i].count == 0) { + bucket->heavy_entries[i].fp = item.fp; + bucket->heavy_entries[i].count = new_count; + return i; + } + if (bucket->heavy_entries[i].count < min) { + min = bucket->heavy_entries[i].count; + min_idx = i; + } + } + + /* If the heavy entry that is going to be kicked out has a counter lower + * than the lobby's one we always kick it out */ + if (min > new_count) { + double prob = (new_count - LOBBY_PROMOTION_THRESHOLD) / + (double)(min - LOBBY_PROMOTION_THRESHOLD); + + if ((rand() / (double)RAND_MAX) >= prob) return -1; + } + + chkHeavyEntry to_kickout = bucket->heavy_entries[min_idx]; + /* Note, that here the promoted item keeps the old count as per the paper */ + bucket->heavy_entries[min_idx].fp = bucket->lobby_entry.fp; + + bucket->lobby_entry.count = 0; + bucket->lobby_entry.fp = 0; + + kickout(topk, to_kickout, idx, table_idx); + + return min_idx; +} + +/* Check if an item is a lobby entry */ +checkEntryRes checkLobbyEntries(chkTopK *topk, fpAndIdx item, counter_t weight) { + for (int i = 0; i < CHK_NUM_TABLES; ++i) { + int idx = item.idx[i]; + + chkBucket *bucket = &topk->tables[i][idx]; + chkLobbyEntry *e = &bucket->lobby_entry; + + /* No match or empty lobby entry */ + if (e->fp != item.fp || e->count == 0) continue; + + /* If we don't cross the threshold just update the counter */ + uint64_t new_count = (uint64_t)e->count + weight; + if (new_count < LOBBY_PROMOTION_THRESHOLD) { + e->count = (uint16_t)new_count; + + checkEntryRes res = { i, -1 }; + return res; + } + + /* Try to promote the entry to heavy entry if we crossed the threshold. 
+ * Else just set the counter to the value of the threshold */ + int kickout_pos = tryPromoteAndKickout(topk, item, new_count, i); + if (kickout_pos != -1) { + checkEntryRes res = {i, kickout_pos}; + return res; + } + + e->count = LOBBY_PROMOTION_THRESHOLD; + checkEntryRes res = { i, -1 }; + return res; + } + + checkEntryRes res = { -1, -1 }; + return res; +} + +/* Probability to decay cnt with 1. + * Equal to pow(decay, -cnt) */ +static inline double getDecayProb(chkTopK *topk, counter_t cnt) { + if (cnt < CHK_LUT_SIZE) { + return topk->lut_decay_prob[cnt]; + } + + return pow(topk->lut_decay_prob[CHK_LUT_SIZE], + ((double)cnt / (CHK_LUT_SIZE))) * + topk->lut_decay_prob[cnt % (CHK_LUT_SIZE)]; +} + +/* Expected decay steps to decay cnt to 0. + * Equal to sum(pow(decay, i)) for i in [0; cnt] */ +static inline double getExpDecayCount(chkTopK *topk, lobby_counter_t cnt) { + return topk->lut_decay_exp[cnt]; +} + +/* Expected minimum decay steps to decay cnt with 1. Since probability is + * pow(decay, -cnt) it's equal to pow(decay, cnt) */ +static inline double getMinDecayCount(chkTopK *topk, counter_t cnt) { + if (cnt < CHK_LUT_SIZE) { + return topk->lut_min_decay[cnt]; + } + + return pow(topk->lut_min_decay[CHK_LUT_SIZE], + ((double)cnt / (CHK_LUT_SIZE))) * + topk->lut_min_decay[cnt % (CHK_LUT_SIZE)]; +} + +/* When there is a hash-collission between lobby entries we decay the existing + * lobby entry with the weight of the new one. Return the counter after decaying. 
*/ +lobby_counter_t chkDecayCounter(chkTopK *topk, lobby_counter_t cnt, counter_t weight) { + if (weight == 0) return cnt; + + /* Unweighted update - just decay with probability pow(decay, -cnt) */ + if (weight == 1) { + double prob = getDecayProb(topk, (counter_t)cnt); + if ((rand() / (double)RAND_MAX) < prob) { + return cnt - 1; + } + return cnt; + } + + /* For weighted updates we simulate multiple unweighted ones */ + + /* Weight is smaller than the minimum amount of decay steps required to + * decay the counter with probability of 100% so again we roll the dice */ + double min_decay = getMinDecayCount(topk, cnt); + if (weight < (counter_t)min_decay) { + double prob = weight / min_decay; + if ((rand() / (double)RAND_MAX) < prob) { + return cnt - 1; + } + return cnt; + } + + /* Weight is more than the expected amount of decay steps to decay the + * counter to 0. */ + double exp_decays = getExpDecayCount(topk, cnt); + if (weight >= (counter_t)exp_decays) + return 0; + + /* Weight is large enough to decay the counter to cnt - X where 0 < X < cnt. + * We binary search for the largest value `C` such that: + * + * (expected decay ops for `C`) >= (expected decay ops for `cnt`) - `weight` + * i.e lut_decay_exp[C] + weight >= lut_decay_exp[cnt] + * + * Note that since cnt is a lobby counter it will necessarily be less or + * equal than LOBBY_PROMOTION_THRESHOLD, so although we binary search this + * is a O(1) operation */ + int left = 0; + int right = cnt; + while (left < right) { + int mid = left + (right - left) / 2; + + if (topk->lut_decay_exp[mid] + weight >= topk->lut_decay_exp[cnt]) { + right = mid; + } else { + left = mid + 1; + } + } + + return left; +} + +/* Update weighted item. If another one was expelled from the topK list - + * return it. 
Caller is responsible for releasing it */ +sds chkTopKUpdate(chkTopK *topk, char *item, int itemlen, counter_t weight) +{ + if (weight == 0) return NULL; + + topk->total += weight; + + /* Generate a fingerprint and indices for both cuckoo tables. */ + fpAndIdx itemFpIdx = generateItemFpAndIdxs(topk, item, itemlen); + + /* Check if the item is amongst the heavy entries. If so we just update its + * counter. */ + checkEntryRes res = checkHeavyEntries(topk, itemFpIdx, weight); + if (res.table_idx != -1) { + goto update_heap; + } + + /* If the item is not already heavy it may be in the lobby. If so we'll + * increase its counter and promote it to a heavy entry if it passes the + * threshold */ + res = checkLobbyEntries(topk, itemFpIdx, weight); + if (res.table_idx != -1) { + goto update_heap; + } + + /* Item is not tracked at all. Check for empty lobby entries - if there is + * any - place the item there. The weight may be higher than the promotional + * threshold in which case we'll try to promote it. 
*/ + for (int i = 0; i < CHK_NUM_TABLES; ++i) { + int idx = itemFpIdx.idx[i]; + chkBucket *bucket = &topk->tables[i][idx]; + if (bucket->lobby_entry.count == 0) { + bucket->lobby_entry.fp = itemFpIdx.fp; + + res.table_idx = i; + res.pos = -1; + + if (weight < LOBBY_PROMOTION_THRESHOLD) { + bucket->lobby_entry.count = weight; + } else { + int kickout_pos = tryPromoteAndKickout(topk, itemFpIdx, weight, i); + if (kickout_pos != -1) { + res.pos = kickout_pos; + } else { + bucket->lobby_entry.count = LOBBY_PROMOTION_THRESHOLD; + } + } + + goto update_heap; + } + } + + /* If there are no empty lobby entries choose a table deterministically, + * decay its lobby counter and update */ + int table_idx = itemFpIdx.fp & 1; + int idx = itemFpIdx.idx[table_idx]; + + chkLobbyEntry *e = &topk->tables[table_idx][idx].lobby_entry; + + /* new_count is the count of `e` after decaying it with weight */ + lobby_counter_t new_count = chkDecayCounter(topk, e->count, weight); + + /* if the chosen lobby entry has decayed its counter to 0, it's replaced by + * the new entry. Note, in that case the new entry has it's weight + * decreased by the approximate amount of decay operations needed to decay + * the old entry. */ + if (new_count == 0) { + e->fp = itemFpIdx.fp; + counter_t exp_decay_cnt = getExpDecayCount(topk, e->count); + e->count = exp_decay_cnt >= weight ? + 1 : (lobby_counter_t)min(255, weight - exp_decay_cnt); + } else { + e->count = new_count; + } + + if (e->count >= LOBBY_PROMOTION_THRESHOLD) { + int kickout_pos = tryPromoteAndKickout(topk, itemFpIdx, e->count, table_idx); + if (kickout_pos != -1) { + res.table_idx = table_idx; + res.pos = kickout_pos; + } + } + + /* After a change in the structure has occurred we check if we also need to + * update the heap - i.e bump a new item in it, or reorder an old item if + * it's counter went up. 
*/ +update_heap: + if (res.table_idx == -1 || res.pos == -1) + return NULL; + + table_idx = res.table_idx; + idx = itemFpIdx.idx[table_idx]; + + counter_t heap_min = topk->heap[0].count; + chkHeavyEntry *entry = &topk->tables[table_idx][idx].heavy_entries[res.pos]; + + if (entry->count < heap_min) + return NULL; + + /* Heap uses different hash than the cuckoo tables */ + uint64_t fp = XXH3_64bits_withSeed(item, itemlen, HEAP_SEED); + chkHeapBucket *itemHeapPtr = chkCheckExistInHeap(topk, item, itemlen, fp); + if (itemHeapPtr != NULL) { + itemHeapPtr->count = entry->count; + chkHeapifyDown(topk->heap, topk->k, itemHeapPtr - topk->heap); + } else { + /* We know the new entry has bigger count than the min-element so it's + * safe to expel it. */ + sds expelled = topk->heap[0].item; + if (expelled) topk->alloc_size -= sdsAllocSize(expelled); + + topk->heap[0].count = entry->count; + topk->heap[0].fp = fp; + topk->heap[0].item = sdsnewlen(item, itemlen); + topk->alloc_size += sdsAllocSize(topk->heap[0].item); + + chkHeapifyDown(topk->heap, topk->k, 0); + return expelled; + } + + return NULL; +} + +int cmpchkHeapBucket(const void *tmp1, const void *tmp2) { + const chkHeapBucket *res1 = tmp1; + const chkHeapBucket *res2 = tmp2; + return res1->count < res2->count ? 1 : res1->count > res2->count ? -1 : 0; +} + +/* Get an ordered by count list of topk->k elements inside the topk object. + * + * NOTE, the returned array is a copy of the internal heap stored by `topk`. The + * caller is responsible for releasing it after use. The elements of the array + * share their `item` pointers with the internal topk->heap buckets so one must + * not use it after `topk` is released. 
*/
+chkHeapBucket *chkTopKList(chkTopK *topk) {
+    /* Copy the heap and sort the copy descending by count (see
+     * cmpchkHeapBucket); the internal heap order is left untouched. */
+    chkHeapBucket *list = zmalloc(sizeof(chkHeapBucket) * topk->k);
+    memcpy(list, topk->heap, sizeof(chkHeapBucket) * topk->k);
+    qsort(list, topk->k, sizeof(*list), cmpchkHeapBucket);
+    return list;
+}
+
+/* Return the tracked memory footprint of `topk`, or 0 for NULL. */
+size_t chkTopKGetMemoryUsage(chkTopK *topk) {
+    if (!topk) return 0;
+
+    return topk->alloc_size;
+}
+
+#ifdef REDIS_TEST
+
+#include
+#include "testhelp.h"
+
+#define UNUSED(x) (void)(x)
+
+/* Return the index of `item` in `list` (length k), or -1 if absent. */
+static int findItemInList(chkHeapBucket *list, int k, const char *item, int itemlen) {
+    for (int i = 0; i < k; i++) {
+        if (list[i].item != NULL &&
+            sdslen(list[i].item) == (size_t)itemlen &&
+            memcmp(list[i].item, item, itemlen) == 0) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+/* Return 1 if `list` is sorted by non-increasing count (NULL items are
+ * skipped), 0 otherwise. */
+static int verifyListSorted(chkHeapBucket *list, int k) {
+    for (int i = 0; i < k - 1; i++) {
+        if (list[i].item == NULL) continue;
+        if (list[i + 1].item == NULL) continue;
+        if (list[i].count < list[i + 1].count) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+/* Test helper: run an update and release the expelled item (if any), since
+ * chkTopKUpdate transfers ownership of the expelled sds to the caller. */
+static void chkTopKUpdateAndFreeExpelled(chkTopK *topk, const char *item, int itemlen, counter_t weight) {
+    sds expelled = chkTopKUpdate(topk, (char *)item, itemlen, weight);
+    if (expelled) sdsfree(expelled);
+}
+
+/* Basic sanity: k=5 top-K over six weighted items; the heaviest must be
+ * tracked and the list must come back sorted. */
+static void testBasicTopK(void) {
+    int k = 5;
+    int numbuckets = 64;
+    double decay = 0.9;
+
+    chkTopK *topk = chkTopKCreate(k, numbuckets, decay);
+    test_cond("Create topk structure", topk != NULL);
+
+    if (topk == NULL) return;
+
+    chkTopKUpdateAndFreeExpelled(topk, "item1", 5, 100);
+    chkTopKUpdateAndFreeExpelled(topk, "item2", 5, 200);
+    chkTopKUpdateAndFreeExpelled(topk, "item3", 5, 150);
+    chkTopKUpdateAndFreeExpelled(topk, "item4", 5, 50);
+    chkTopKUpdateAndFreeExpelled(topk, "item5", 5, 300);
+    chkTopKUpdateAndFreeExpelled(topk, "item6", 5, 75);
+
+    chkHeapBucket *list = chkTopKList(topk);
+    test_cond("chkTopKList returns non-NULL", list != NULL);
+
+    if (list == NULL) {
+        chkTopKRelease(topk);
+        return;
+    }
+
+    test_cond("TopK list is sorted in descending order",
verifyListSorted(list, k)); + + int idx1 = findItemInList(list, k, "item5", 5); + int idx2 = findItemInList(list, k, "item2", 5); + int idx3 = findItemInList(list, k, "item3", 5); + + test_cond("Heaviest items are in the list", idx1 != -1 && idx2 != -1 && idx3 != -1); + + test_cond("item5 has the highest count", idx1 == 0); + + zfree(list); + chkTopKRelease(topk); +} + +static void testHeavierElementsReplaceLighter(void) { + int k = 5; + int numbuckets = 64; + double decay = 0.9; + + chkTopK *topk = chkTopKCreate(k, numbuckets, decay); + test_cond("Create topk structure for replacement test", topk != NULL); + + if (topk == NULL) return; + + chkTopKUpdateAndFreeExpelled(topk, "light1", 6, 50); + chkTopKUpdateAndFreeExpelled(topk, "light2", 6, 60); + chkTopKUpdateAndFreeExpelled(topk, "light3", 6, 70); + chkTopKUpdateAndFreeExpelled(topk, "light4", 6, 80); + chkTopKUpdateAndFreeExpelled(topk, "light5", 6, 90); + + chkHeapBucket *list1 = chkTopKList(topk); + test_cond("Initial topk list is not NULL", list1 != NULL); + + if (list1 == NULL) { + chkTopKRelease(topk); + return; + } + + int light1_idx = findItemInList(list1, k, "light1", 6); + int light2_idx = findItemInList(list1, k, "light2", 6); + int light3_idx = findItemInList(list1, k, "light3", 6); + int light4_idx = findItemInList(list1, k, "light4", 6); + int light5_idx = findItemInList(list1, k, "light5", 6); + + test_cond("light1 is in initial topk list", light1_idx != -1); + test_cond("light2 is in initial topk list", light2_idx != -1); + test_cond("light3 is in initial topk list", light3_idx != -1); + test_cond("light4 is in initial topk list", light4_idx != -1); + test_cond("light5 is in initial topk list", light5_idx != -1); + + zfree(list1); + + chkTopKUpdateAndFreeExpelled(topk, "heavy1", 6, 500); + chkTopKUpdateAndFreeExpelled(topk, "heavy2", 6, 600); + + chkHeapBucket *list2 = chkTopKList(topk); + test_cond("Updated topk list is not NULL", list2 != NULL); + + if (list2 == NULL) { + chkTopKRelease(topk); 
+ return; + } + + int heavy1_idx = findItemInList(list2, k, "heavy1", 6); + int heavy2_idx = findItemInList(list2, k, "heavy2", 6); + + test_cond("heavy1 is in updated topk list", heavy1_idx != -1); + test_cond("heavy2 is in updated topk list", heavy2_idx != -1); + + light1_idx = findItemInList(list2, k, "light1", 6); + light2_idx = findItemInList(list2, k, "light2", 6); + light3_idx = findItemInList(list2, k, "light3", 6); + light4_idx = findItemInList(list2, k, "light4", 6); + light5_idx = findItemInList(list2, k, "light5", 6); + + int light_items_remaining = (light1_idx != -1 ? 1 : 0) + + (light2_idx != -1 ? 1 : 0) + + (light3_idx != -1 ? 1 : 0) + + (light4_idx != -1 ? 1 : 0) + + (light5_idx != -1 ? 1 : 0); + + test_cond("Some lighter items remain in the list after adding heavier ones", + light_items_remaining > 0); + + zfree(list2); + chkTopKRelease(topk); +} + +static void testManySmallWeightUpdates(void) { + int k = 2; + int numbuckets = 64; + double decay = 0.9; + + chkTopK *topk = chkTopKCreate(k, numbuckets, decay); + test_cond("Create topk structure for small weight updates test", topk != NULL); + + if (topk == NULL) return; + + chkTopKUpdateAndFreeExpelled(topk, "item0", 5, 50); + chkTopKUpdateAndFreeExpelled(topk, "item1", 5, 100); + + chkHeapBucket *list1 = chkTopKList(topk); + test_cond("Topk list after adding item0 and item1 is not NULL", list1 != NULL); + + if (list1 == NULL) { + chkTopKRelease(topk); + return; + } + + int item0_idx1 = findItemInList(list1, k, "item0", 5); + int item1_idx1 = findItemInList(list1, k, "item1", 5); + + test_cond("item0 and item1 are in topk after initial updates", + item0_idx1 != -1 && item1_idx1 != -1); + + zfree(list1); + + for (int i = 0; i < 100; i++) { + chkTopKUpdateAndFreeExpelled(topk, "item2", 5, 1); + } + + chkHeapBucket *list2 = chkTopKList(topk); + test_cond("Topk list after many small updates is not NULL", list2 != NULL); + + if (list2 == NULL) { + chkTopKRelease(topk); + return; + } + + int item0_idx2 = 
findItemInList(list2, k, "item0", 5); + int item1_idx2 = findItemInList(list2, k, "item1", 5); + int item2_idx2 = findItemInList(list2, k, "item2", 5); + + test_cond("item1 and item2 are in topk, item0 is not", + item1_idx2 != -1 && item2_idx2 != -1 && item0_idx2 == -1); + + counter_t item1_count = 0; + counter_t item2_count = 0; + if (item1_idx2 != -1) item1_count = list2[item1_idx2].count; + if (item2_idx2 != -1) item2_count = list2[item2_idx2].count; + + test_cond("item1 and item2 have similar weights", item1_count > 0 && item2_count > 0 && + (item1_count > item2_count ? item1_count - item2_count : item2_count - item1_count) < 5); + + zfree(list2); + chkTopKRelease(topk); +} + +int chkTopKTest(int argc, char *argv[], int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + testBasicTopK(); + testHeavierElementsReplaceLighter(); + testManySmallWeightUpdates(); + + return 0; +} + +#endif /* REDIS_TEST */ diff --git a/examples/redis-unstable/src/chk.h b/examples/redis-unstable/src/chk.h new file mode 100644 index 0000000..a974fd6 --- /dev/null +++ b/examples/redis-unstable/src/chk.h @@ -0,0 +1,89 @@ +/* Implementation of a topK structure using CuckooHeavyKeeper algorithm + * + * Implementation is based on the paper "Cuckoo Heavy Keeper and the balancing + * act of maintaining heavy hitters in stream processing" by Vinh Quang Ngo and + * Marina Papatriantafilou. Also, the accompanying C++ implementation was used + * as a reference point: https://github.com/vinhqngo5/Cuckoo_Heavy_Keeper + * Main changes are addition of a min-heap so we can keep names of the top K + * elements - idea comes from RedisBloom's TopK structure. + * + * Copyright (c) 2026-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ + +#pragma once + +#include "sds.h" + +#include +#include + +#define CHK_LUT_SIZE 256 +#define CHK_HEAVY_ENTRIES_PER_BUCKET 2 +#define CHK_NUM_TABLES 2 + +typedef uint64_t counter_t; +typedef uint16_t fingerprint_t; +typedef uint8_t lobby_counter_t; + +typedef struct { + counter_t count; + fingerprint_t fp; +} chkHeavyEntry; + +typedef struct { + fingerprint_t fp; + lobby_counter_t count; +} chkLobbyEntry; + +typedef struct { + chkHeavyEntry heavy_entries[CHK_HEAVY_ENTRIES_PER_BUCKET]; + chkLobbyEntry lobby_entry; +} chkBucket; + +typedef struct { + counter_t count; + sds item; + uint64_t fp; /* Fingerprint used to identify the item. Internal use only */ +} chkHeapBucket; + +typedef struct chkTopK { + chkBucket *tables[CHK_NUM_TABLES]; /* Cuckoo tables */ + chkHeapBucket *heap; /* Min-heap for storing top-K item's names */ + + size_t alloc_size; /* Used for memory tracking only */ + + /* Expected number of operations to decay count i to 0 */ + double lut_decay_exp[CHK_LUT_SIZE + 1]; + + /* Minimum number of decay operations to decay count i with 1 */ + double lut_min_decay[CHK_LUT_SIZE + 1]; + + /* Probability of decaying i with 1. As per paper probability is decay^-i + * but we actually store (1/decay)^i for faster computation. 
*/ + double lut_decay_prob[CHK_LUT_SIZE + 1]; + + double decay; /* Decay constant */ + double inv_decay; /* Cache 1/decay for faster computations */ + + counter_t total; /* Total recorded count for all updates */ + + int k; + int numbuckets; +} chkTopK; + +chkTopK *chkTopKCreate(int k, int numbuckets, double decay); +void chkTopKRelease(chkTopK *topk); +sds chkTopKUpdate(chkTopK *topk, char *item, int itemlen, counter_t weight); +chkHeapBucket *chkTopKList(chkTopK *topk); +size_t chkTopKGetMemoryUsage(chkTopK *topk); + +#ifdef REDIS_TEST + +int chkTopKTest(int argc, char *argv[], int flags); + +#endif /* REDIS_TEST */ diff --git a/examples/redis-unstable/src/cli_commands.c b/examples/redis-unstable/src/cli_commands.c new file mode 100644 index 0000000..e56d48c --- /dev/null +++ b/examples/redis-unstable/src/cli_commands.c @@ -0,0 +1,13 @@ +#include +#include "cli_commands.h" + +/* Definitions to configure commands.c to generate the above structs. */ +#define MAKE_CMD(name,summary,complexity,since,doc_flags,replaced,deprecated,group,group_enum,history,num_history,tips,num_tips,function,arity,flags,acl,key_specs,key_specs_num,get_keys,numargs) name,summary,group,since,numargs +#define MAKE_ARG(name,type,key_spec_index,token,summary,since,flags,numsubargs,deprecated_since) name,type,token,since,flags,numsubargs +#define COMMAND_ARG cliCommandArg +#define COMMAND_STRUCT commandDocs +#define SKIP_CMD_HISTORY_TABLE +#define SKIP_CMD_TIPS_TABLE +#define SKIP_CMD_KEY_SPECS_TABLE + +#include "commands.def" diff --git a/examples/redis-unstable/src/cli_commands.h b/examples/redis-unstable/src/cli_commands.h new file mode 100644 index 0000000..eb5a476 --- /dev/null +++ b/examples/redis-unstable/src/cli_commands.h @@ -0,0 +1,46 @@ +/* This file is used by redis-cli in place of server.h when including commands.c + * It contains alternative structs which omit the parts of the commands table + * that are not suitable for redis-cli, e.g. the command proc. 
*/ + +#ifndef __REDIS_CLI_COMMANDS_H +#define __REDIS_CLI_COMMANDS_H + +#include +#include "commands.h" + +/* Syntax specifications for a command argument. */ +typedef struct cliCommandArg { + char *name; + redisCommandArgType type; + char *token; + char *since; + int flags; + int numsubargs; + struct cliCommandArg *subargs; + const char *display_text; + + /* + * For use at runtime. + * Fields used to keep track of input word matches for command-line hinting. + */ + int matched; /* How many input words have been matched by this argument? */ + int matched_token; /* Has the token been matched? */ + int matched_name; /* Has the name been matched? */ + int matched_all; /* Has the whole argument been consumed (no hint needed)? */ +} cliCommandArg; + +/* Command documentation info used for help output */ +struct commandDocs { + char *name; + char *summary; + char *group; + char *since; + int numargs; + cliCommandArg *args; /* An array of the command arguments. */ + struct commandDocs *subcommands; + char *params; /* A string describing the syntax of the command arguments. */ +}; + +extern struct commandDocs redisCommandTable[]; + +#endif diff --git a/examples/redis-unstable/src/cli_common.c b/examples/redis-unstable/src/cli_common.c new file mode 100644 index 0000000..0c269de --- /dev/null +++ b/examples/redis-unstable/src/cli_common.c @@ -0,0 +1,424 @@ +/* CLI (command line interface) common methods + * + * Copyright (c) 2020-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ + +#include "fmacros.h" +#include "cli_common.h" +#include "version.h" + +#include +#include +#include +#include +#include +#include /* Use hiredis' sds compat header that maps sds calls to their hi_ variants */ +#include /* use sds.h from hiredis, so that only one set of sds functions will be present in the binary */ +#include +#include +#include +#ifdef USE_OPENSSL +#include +#include +#include +#endif + +#define UNUSED(V) ((void) V) + +char *redisGitSHA1(void); +char *redisGitDirty(void); + +/* Wrapper around redisSecureConnection to avoid hiredis_ssl dependencies if + * not building with TLS support. + */ +int cliSecureConnection(redisContext *c, cliSSLconfig config, const char **err) { +#ifdef USE_OPENSSL + static SSL_CTX *ssl_ctx = NULL; + + if (!ssl_ctx) { + ssl_ctx = SSL_CTX_new(SSLv23_client_method()); + if (!ssl_ctx) { + *err = "Failed to create SSL_CTX"; + goto error; + } + SSL_CTX_set_options(ssl_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3); + SSL_CTX_set_verify(ssl_ctx, config.skip_cert_verify ? 
SSL_VERIFY_NONE : SSL_VERIFY_PEER, NULL); + + if (config.cacert || config.cacertdir) { + if (!SSL_CTX_load_verify_locations(ssl_ctx, config.cacert, config.cacertdir)) { + *err = "Invalid CA Certificate File/Directory"; + goto error; + } + } else { + if (!SSL_CTX_set_default_verify_paths(ssl_ctx)) { + *err = "Failed to use default CA paths"; + goto error; + } + } + + if (config.cert && !SSL_CTX_use_certificate_chain_file(ssl_ctx, config.cert)) { + *err = "Invalid client certificate"; + goto error; + } + + if (config.key && !SSL_CTX_use_PrivateKey_file(ssl_ctx, config.key, SSL_FILETYPE_PEM)) { + *err = "Invalid private key"; + goto error; + } + if (config.ciphers && !SSL_CTX_set_cipher_list(ssl_ctx, config.ciphers)) { + *err = "Error while configuring ciphers"; + goto error; + } +#ifdef TLS1_3_VERSION + if (config.ciphersuites && !SSL_CTX_set_ciphersuites(ssl_ctx, config.ciphersuites)) { + *err = "Error while setting cypher suites"; + goto error; + } +#endif + } + + SSL *ssl = SSL_new(ssl_ctx); + if (!ssl) { + *err = "Failed to create SSL object"; + return REDIS_ERR; + } + + if (config.sni && !SSL_set_tlsext_host_name(ssl, config.sni)) { + *err = "Failed to configure SNI"; + SSL_free(ssl); + return REDIS_ERR; + } + + return redisInitiateSSL(c, ssl); + +error: + SSL_CTX_free(ssl_ctx); + ssl_ctx = NULL; + return REDIS_ERR; +#else + (void) config; + (void) c; + (void) err; + return REDIS_OK; +#endif +} + +/* Wrapper around hiredis to allow arbitrary reads and writes. + * + * We piggybacks on top of hiredis to achieve transparent TLS support, + * and use its internal buffers so it can co-exist with commands + * previously/later issued on the connection. + * + * Interface is close to enough to read()/write() so things should mostly + * work transparently. + */ + +/* Write a raw buffer through a redisContext. If we already have something + * in the buffer (leftovers from hiredis operations) it will be written + * as well. 
+ */
+ssize_t cliWriteConn(redisContext *c, const char *buf, size_t buf_len)
+{
+    int done = 0;
+
+    /* Append data to buffer which is *usually* expected to be empty
+     * but we don't assume that, and write.
+     */
+    c->obuf = sdscatlen(c->obuf, buf, buf_len);
+    if (redisBufferWrite(c, &done) == REDIS_ERR) {
+        if (!(c->flags & REDIS_BLOCK))
+            errno = EAGAIN;
+
+        /* On error, we assume nothing was written and we roll back the
+         * buffer to its original state.
+         */
+        if (sdslen(c->obuf) > buf_len)
+            sdsrange(c->obuf, 0, -(buf_len+1));
+        else
+            sdsclear(c->obuf);
+
+        return -1;
+    }
+
+    /* If we're done, free up everything. We may have written more than
+     * buf_len (if c->obuf was not initially empty) but we don't have to
+     * tell.
+     */
+    if (done) {
+        sdsclear(c->obuf);
+        return buf_len;
+    }
+
+    /* Write was successful but we have some leftovers which we should
+     * remove from the buffer.
+     *
+     * Do we still have data that was there prior to our buf? If so,
+     * restore buffer to it's original state and report no new data was
+     * written.
+     */
+    if (sdslen(c->obuf) > buf_len) {
+        sdsrange(c->obuf, 0, -(buf_len+1));
+        return 0;
+    }
+
+    /* At this point we're sure no prior data is left. We flush the buffer
+     * and report how much we've written.
+     */
+    size_t left = sdslen(c->obuf);
+    sdsclear(c->obuf);
+    return buf_len - left;
+}
+
+/* Wrapper around OpenSSL (libssl and libcrypto) initialisation.
+ * No-op (always REDIS_OK) when built without TLS support.
+ */
+int cliSecureInit(void)
+{
+#ifdef USE_OPENSSL
+    ERR_load_crypto_strings();
+    SSL_load_error_strings();
+    SSL_library_init();
+#endif
+    return REDIS_OK;
+}
+
+/* Create an sds from stdin. Reads until EOF; exits the process on a read
+ * error. Caller owns the returned sds.
+ * NOTE(review): a short read with nread == -1/EINTR aborts rather than
+ * retrying — presumably acceptable for a CLI tool; confirm if reused. */
+sds readArgFromStdin(void) {
+    char buf[1024];
+    sds arg = sdsempty();
+
+    while(1) {
+        int nread = read(fileno(stdin),buf,1024);
+
+        if (nread == 0) break;
+        else if (nread == -1) {
+            perror("Reading from standard input");
+            exit(1);
+        }
+        arg = sdscatlen(arg,buf,nread);
+    }
+    return arg;
+}
+
+/* Create an sds array from argv, either as-is or by dequoting every
+ * element.
When quoted is non-zero, may return a NULL to indicate an + * invalid quoted string. + * + * The caller should free the resulting array of sds strings with + * sdsfreesplitres(). + */ +sds *getSdsArrayFromArgv(int argc,char **argv, int quoted) { + sds *res = sds_malloc(sizeof(sds) * argc); + + for (int j = 0; j < argc; j++) { + if (quoted) { + sds unquoted = unquoteCString(argv[j]); + if (!unquoted) { + while (--j >= 0) sdsfree(res[j]); + sds_free(res); + return NULL; + } + res[j] = unquoted; + } else { + res[j] = sdsnew(argv[j]); + } + } + + return res; +} + +/* Unquote a null-terminated string and return it as a binary-safe sds. */ +sds unquoteCString(char *str) { + int count; + sds *unquoted = sdssplitargs(str, &count); + sds res = NULL; + + if (unquoted && count == 1) { + res = unquoted[0]; + unquoted[0] = NULL; + } + + if (unquoted) + sdsfreesplitres(unquoted, count); + + return res; +} + + +/* URL-style percent decoding. */ +#define isHexChar(c) (isdigit(c) || ((c) >= 'a' && (c) <= 'f')) +#define decodeHexChar(c) (isdigit(c) ? (c) - '0' : (c) - 'a' + 10) +#define decodeHex(h, l) ((decodeHexChar(h) << 4) + decodeHexChar(l)) + +static sds percentDecode(const char *pe, size_t len) { + const char *end = pe + len; + sds ret = sdsempty(); + const char *curr = pe; + + while (curr < end) { + if (*curr == '%') { + if ((end - curr) < 2) { + fprintf(stderr, "Incomplete URI encoding\n"); + exit(1); + } + + char h = tolower(*(++curr)); + char l = tolower(*(++curr)); + if (!isHexChar(h) || !isHexChar(l)) { + fprintf(stderr, "Illegal character in URI encoding\n"); + exit(1); + } + char c = decodeHex(h, l); + ret = sdscatlen(ret, &c, 1); + curr++; + } else { + ret = sdscatlen(ret, curr++, 1); + } + } + + return ret; +} + +/* Parse a URI and extract the server connection information. + * URI scheme is based on the provisional specification[1] excluding support + * for query parameters. 
Valid URIs are: + * scheme: "redis://" + * authority: [[ ":"] "@"] [ [":" ]] + * path: ["/" []] + * + * [1]: https://www.iana.org/assignments/uri-schemes/prov/redis */ +void parseRedisUri(const char *uri, const char* tool_name, cliConnInfo *connInfo, int *tls_flag) { +#ifdef USE_OPENSSL + UNUSED(tool_name); +#else + UNUSED(tls_flag); +#endif + + const char *scheme = "redis://"; + const char *tlsscheme = "rediss://"; + const char *curr = uri; + const char *end = uri + strlen(uri); + const char *userinfo, *username, *port, *host, *path; + + /* URI must start with a valid scheme. */ + if (!strncasecmp(tlsscheme, curr, strlen(tlsscheme))) { +#ifdef USE_OPENSSL + *tls_flag = 1; + curr += strlen(tlsscheme); +#else + fprintf(stderr,"rediss:// is only supported when %s is compiled with OpenSSL\n", tool_name); + exit(1); +#endif + } else if (!strncasecmp(scheme, curr, strlen(scheme))) { + curr += strlen(scheme); + } else { + fprintf(stderr,"Invalid URI scheme\n"); + exit(1); + } + if (curr == end) return; + + /* Extract user info. */ + if ((userinfo = strchr(curr,'@'))) { + if ((username = strchr(curr, ':')) && username < userinfo) { + connInfo->user = percentDecode(curr, username - curr); + curr = username + 1; + } + + connInfo->auth = percentDecode(curr, userinfo - curr); + curr = userinfo + 1; + } + if (curr == end) return; + + /* Extract host and port. */ + path = strchr(curr, '/'); + if (*curr != '/') { + host = path ? path - 1 : end; + if (*curr == '[') { + curr += 1; + if ((port = strchr(curr, ']'))) { + if (*(port+1) == ':') { + connInfo->hostport = atoi(port + 2); + } + host = port - 1; + } + } else { + if ((port = strchr(curr, ':'))) { + connInfo->hostport = atoi(port + 1); + host = port - 1; + } + } + sdsfree(connInfo->hostip); + connInfo->hostip = sdsnewlen(curr, host - curr + 1); + } + curr = path ? path + 1 : end; + if (curr == end) return; + + /* Extract database number. 
*/ + connInfo->input_dbnum = atoi(curr); +} + +void freeCliConnInfo(cliConnInfo connInfo){ + if (connInfo.hostip) sdsfree(connInfo.hostip); + if (connInfo.auth) sdsfree(connInfo.auth); + if (connInfo.user) sdsfree(connInfo.user); +} + +/* + * Escape a Unicode string for JSON output (--json), following RFC 7159: + * https://datatracker.ietf.org/doc/html/rfc7159#section-7 +*/ +sds escapeJsonString(sds s, const char *p, size_t len) { + s = sdscatlen(s,"\"",1); + while(len--) { + switch(*p) { + case '\\': + case '"': + s = sdscatprintf(s,"\\%c",*p); + break; + case '\n': s = sdscatlen(s,"\\n",2); break; + case '\f': s = sdscatlen(s,"\\f",2); break; + case '\r': s = sdscatlen(s,"\\r",2); break; + case '\t': s = sdscatlen(s,"\\t",2); break; + case '\b': s = sdscatlen(s,"\\b",2); break; + default: + s = sdscatprintf(s,*(unsigned char *)p <= 0x1f ? "\\u%04x" : "%c",*p); + } + p++; + } + return sdscatlen(s,"\"",1); +} + +sds cliVersion(void) { + sds version = sdscatprintf(sdsempty(), "%s", REDIS_VERSION); + + /* Add git commit and working tree status when available. */ + if (strtoll(redisGitSHA1(),NULL,16)) { + version = sdscatprintf(version, " (git:%s", redisGitSHA1()); + if (strtoll(redisGitDirty(),NULL,10)) + version = sdscatprintf(version, "-dirty"); + version = sdscat(version, ")"); + } + return version; +} + +/* This is a wrapper to call redisConnect or redisConnectWithTimeout. */ +redisContext *redisConnectWrapper(const char *ip, int port, const struct timeval tv) { + if (tv.tv_sec == 0 && tv.tv_usec == 0) { + return redisConnect(ip, port); + } else { + return redisConnectWithTimeout(ip, port, tv); + } +} + +/* This is a wrapper to call redisConnectUnix or redisConnectUnixWithTimeout. 
*/ +redisContext *redisConnectUnixWrapper(const char *path, const struct timeval tv) { + if (tv.tv_sec == 0 && tv.tv_usec == 0) { + return redisConnectUnix(path); + } else { + return redisConnectUnixWithTimeout(path, tv); + } +} diff --git a/examples/redis-unstable/src/cli_common.h b/examples/redis-unstable/src/cli_common.h new file mode 100644 index 0000000..a5b8e44 --- /dev/null +++ b/examples/redis-unstable/src/cli_common.h @@ -0,0 +1,59 @@ +#ifndef __CLICOMMON_H +#define __CLICOMMON_H + +#include +#include /* Use hiredis' sds compat header that maps sds calls to their hi_ variants */ + +typedef struct cliSSLconfig { + /* Requested SNI, or NULL */ + char *sni; + /* CA Certificate file, or NULL */ + char *cacert; + /* Directory where trusted CA certificates are stored, or NULL */ + char *cacertdir; + /* Skip server certificate verification. */ + int skip_cert_verify; + /* Client certificate to authenticate with, or NULL */ + char *cert; + /* Private key file to authenticate with, or NULL */ + char *key; + /* Preferred cipher list, or NULL (applies only to <= TLSv1.2) */ + char* ciphers; + /* Preferred ciphersuites list, or NULL (applies only to TLSv1.3) */ + char* ciphersuites; +} cliSSLconfig; + + +/* server connection information object, used to describe an ip:port pair, db num user input, and user:pass. 
*/ +typedef struct cliConnInfo { + char *hostip; + int hostport; + int input_dbnum; + char *auth; + char *user; +} cliConnInfo; + +int cliSecureConnection(redisContext *c, cliSSLconfig config, const char **err); + +ssize_t cliWriteConn(redisContext *c, const char *buf, size_t buf_len); + +int cliSecureInit(void); + +sds readArgFromStdin(void); + +sds *getSdsArrayFromArgv(int argc,char **argv, int quoted); + +sds unquoteCString(char *str); + +void parseRedisUri(const char *uri, const char* tool_name, cliConnInfo *connInfo, int *tls_flag); + +void freeCliConnInfo(cliConnInfo connInfo); + +sds escapeJsonString(sds s, const char *p, size_t len); + +sds cliVersion(void); + +redisContext *redisConnectWrapper(const char *ip, int port, const struct timeval tv); +redisContext *redisConnectUnixWrapper(const char *path, const struct timeval tv); + +#endif /* __CLICOMMON_H */ diff --git a/examples/redis-unstable/src/cluster.c b/examples/redis-unstable/src/cluster.c new file mode 100644 index 0000000..d07c31c --- /dev/null +++ b/examples/redis-unstable/src/cluster.c @@ -0,0 +1,2263 @@ +/* + * Copyright (c) 2009-Present, Redis Ltd. + * All rights reserved. + * + * Copyright (c) 2024-present, Valkey contributors. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + * + * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. + */ + +/* + * cluster.c contains the common parts of a clustering + * implementation, the parts that are shared between + * any implementation of Redis clustering. 
+ */
+
+#include "server.h"
+#include "cluster.h"
+#include "cluster_asm.h"
+#include "cluster_slot_stats.h"
+
+#include
+
+/* -----------------------------------------------------------------------------
+ * Key space handling
+ * -------------------------------------------------------------------------- */
+
+/* If it can be inferred that the given glob-style pattern, as implemented in
+ * stringmatchlen() in util.c, only can match keys belonging to a single slot,
+ * that slot is returned. Otherwise -1 is returned. */
+int patternHashSlot(char *pattern, int length) {
+    int s = -1; /* index of the first '{' */
+
+    for (int i = 0; i < length; i++) {
+        if (pattern[i] == '*' || pattern[i] == '?' || pattern[i] == '[') {
+            /* Wildcard or character class found. Keys can be in any slot. */
+            return -1;
+        } else if (pattern[i] == '\\') {
+            /* Escaped character. Computing slot in this case is not
+             * implemented. We would need a temp buffer. */
+            return -1;
+        } else if (s == -1 && pattern[i] == '{') {
+            /* Opening brace '{' found. */
+            s = i;
+        } else if (s >= 0 && pattern[i] == '}' && i == s + 1) {
+            /* Empty tag '{}' found. The whole key is hashed. Ignore braces. */
+            s = -2;
+        } else if (s >= 0 && pattern[i] == '}') {
+            /* Non-empty tag '{...}' found. Hash what's between braces,
+             * masked to 14 bits (the 16384-slot space). */
+            return crc16(pattern + s + 1, i - s - 1) & 0x3FFF;
+        }
+    }
+
+    /* The pattern matches a single key. Hash the whole pattern.
*/ + return crc16(pattern, length) & 0x3FFF; +} + +int getSlotOrReply(client *c, robj *o) { + long long slot; + + if (getLongLongFromObject(o,&slot) != C_OK || + slot < 0 || slot >= CLUSTER_SLOTS) + { + addReplyError(c,"Invalid or out of range slot"); + return -1; + } + return (int) slot; +} + +ConnectionType *connTypeOfCluster(void) { + if (server.tls_cluster) { + return connectionTypeTls(); + } + + return connectionTypeTcp(); +} + +/* ----------------------------------------------------------------------------- + * DUMP, RESTORE and MIGRATE commands + * -------------------------------------------------------------------------- */ + +/* Generates a DUMP-format representation of the object 'o', adding it to the + * io stream pointed by 'rio'. This function can't fail. */ +void createDumpPayload(rio *payload, robj *o, robj *key, int dbid, int skip_checksum) { + unsigned char buf[2]; + uint64_t crc = 0; + + /* Serialize the object in an RDB-like format. It consist of an object type + * byte followed by the serialized object. This is understood by RESTORE. */ + rioInitWithBuffer(payload,sdsempty()); + + /* Save key metadata if present without (handles TTL separately via command args) */ + if (getModuleMetaBits(o->metabits)) + serverAssert(rdbSaveKeyMetadata(payload, key, o, dbid) != -1); + serverAssert(rdbSaveObjectType(payload,o)); + serverAssert(rdbSaveObject(payload,o,key,dbid)); + + /* Write the footer, this is how it looks like: + * ----------------+---------------------+---------------+ + * ... RDB payload | 2 bytes RDB version | 8 bytes CRC64 | + * ----------------+---------------------+---------------+ + * RDB version and CRC are both in little endian. + */ + + /* RDB version */ + buf[0] = RDB_VERSION & 0xff; + buf[1] = (RDB_VERSION >> 8) & 0xff; + payload->io.buffer.ptr = sdscatlen(payload->io.buffer.ptr,buf,2); + + /* If crc checksum is disabled, crc is set to 0 and no checksum validation + * will be performed on RESTORE. 
 */
    if (!skip_checksum) {
        /* CRC64 of everything serialized so far (header + body + RDB
         * version footer). NOTE(review): when skip_checksum is set, 'crc'
         * is presumably zero-initialized earlier in this function so that
         * a zero "no checksum" marker is appended below — not visible in
         * this chunk, confirm at the function head. */
        crc = crc64(0,(unsigned char*)payload->io.buffer.ptr,
                sdslen(payload->io.buffer.ptr));
        memrev64ifbe(&crc); /* CRC is stored in little endian on disk/wire. */
    }
    payload->io.buffer.ptr = sdscatlen(payload->io.buffer.ptr,&crc,8);
}

/* Verify that the RDB version of the dump payload matches the one of this Redis
 * instance and that the checksum is ok.
 * If the DUMP payload looks valid C_OK is returned, otherwise C_ERR
 * is returned. If rdbver_ptr is not NULL, its populated with the value read
 * from the input buffer. */
int verifyDumpPayload(unsigned char *p, size_t len, uint16_t *rdbver_ptr) {
    unsigned char *footer;
    uint16_t rdbver;
    uint64_t crc;

    /* At least 2 bytes of RDB version and 8 of CRC64 should be present. */
    if (len < 10) return C_ERR;
    footer = p+(len-10);

    /* Set and verify RDB version: stored little endian in the footer. */
    rdbver = (footer[1] << 8) | footer[0];
    if (rdbver_ptr) {
        *rdbver_ptr = rdbver;
    }
    if (rdbver > RDB_VERSION) return C_ERR;

    /* Checksum verification can be disabled by configuration. */
    if (server.skip_checksum_validation)
        return C_OK;

    uint64_t crc_payload;
    memcpy(&crc_payload, footer+2, 8);
    if (crc_payload == 0) /* No checksum. */
        return C_OK;

    /* Verify CRC64: computed over everything but the trailing 8-byte
     * checksum itself (so the 2-byte version footer is included). */
    crc = crc64(0,p,len-8);
    memrev64ifbe(&crc);
    return crc == crc_payload ? C_OK : C_ERR;
}

/* DUMP keyname
 * DUMP is actually not used by Redis Cluster but it is the obvious
 * complement of RESTORE and can be useful for different applications. */
void dumpCommand(client *c) {
    kvobj *o;
    rio payload;

    /* Check if the key is here. */
    if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) {
        addReplyNull(c);
        return;
    }

    /* Create the DUMP encoded representation. */
    createDumpPayload(&payload,o,c->argv[1],c->db->id,0);

    /* Transfer to the client: addReplyBulkSds() takes ownership of the
     * payload buffer, so no explicit free here. */
    addReplyBulkSds(c,payload.io.buffer.ptr);
    return;
}

/* RESTORE key ttl serialized-value [REPLACE] [ABSTTL] [IDLETIME seconds] [FREQ frequency] */
void restoreCommand(client *c) {
    long long ttl, lfu_freq = -1, lru_idle = -1, lru_clock = -1;
    rio payload;
    int j, type, replace = 0, absttl = 0;
    robj *obj;

    /* Parse additional options. IDLETIME and FREQ are mutually exclusive:
     * each arm requires that the other's value is still -1 (unset). */
    for (j = 4; j < c->argc; j++) {
        int additional = c->argc-j-1;
        if (!strcasecmp(c->argv[j]->ptr,"replace")) {
            replace = 1;
        } else if (!strcasecmp(c->argv[j]->ptr,"absttl")) {
            absttl = 1;
        } else if (!strcasecmp(c->argv[j]->ptr,"idletime") && additional >= 1 &&
                   lfu_freq == -1)
        {
            if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lru_idle,NULL)
                    != C_OK) return;
            if (lru_idle < 0) {
                addReplyError(c,"Invalid IDLETIME value, must be >= 0");
                return;
            }
            lru_clock = LRU_CLOCK();
            j++; /* Consume additional arg. */
        } else if (!strcasecmp(c->argv[j]->ptr,"freq") && additional >= 1 &&
                   lru_idle == -1)
        {
            if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lfu_freq,NULL)
                    != C_OK) return;
            if (lfu_freq < 0 || lfu_freq > 255) {
                addReplyError(c,"Invalid FREQ value, must be >= 0 and <= 255");
                return;
            }
            j++; /* Consume additional arg. */
        } else {
            addReplyErrorObject(c,shared.syntaxerr);
            return;
        }
    }

    /* Make sure this key does not already exist here... */
    robj *key = c->argv[1];
    kvobj *oldval = lookupKeyWrite(c->db,key);
    /* Remember the old type so we can emit a type_changed event on REPLACE. */
    int oldtype = oldval ? oldval->type : -1;
    if (!replace && oldval) {
        addReplyErrorObject(c,shared.busykeyerr);
        return;
    }

    /* Check if the TTL value makes sense */
    if (getLongLongFromObjectOrReply(c,c->argv[2],&ttl,NULL) != C_OK) {
        return;
    } else if (ttl < 0) {
        addReplyError(c,"Invalid TTL value, must be >= 0");
        return;
    }

    /* Verify RDB version and data checksum. */
    if (verifyDumpPayload(c->argv[3]->ptr,sdslen(c->argv[3]->ptr),NULL) == C_ERR)
    {
        addReplyError(c,"DUMP payload version or checksum are wrong");
        return;
    }

    rioInitWithBuffer(&payload,c->argv[3]->ptr);

    /* Initialize metadata spec to collect metadata+expiry from payload. */
    KeyMetaSpec keymeta;
    keyMetaSpecInit(&keymeta);

    /* Compute TTL early so we can add it to metadata spec in correct order */
    if (ttl) {
        if (!absttl) ttl+=commandTimeSnapshot();
        keyMetaSpecAdd(&keymeta, KEY_META_ID_EXPIRE, ttl);
    }

    /* With metadata, type = RDB_OPCODE_KEY_META. NOTE(review): the original
     * layout comment's placeholders were garbled in this copy — it appears
     * to be "[<metadata>],<type>,<object>"; confirm against the RDB format
     * documentation in rdb.h. */
    type = rdbLoadType(&payload);
    if (rdbResolveKeyType(&payload, &type, c->db->id, &keymeta) == -1) {
        addReplyError(c,"Bad data format");
        return;
    }

    /* Load the object */
    if ((obj = rdbLoadObject(type,&payload,key->ptr,c->db->id,NULL)) == NULL)
    {
        keyMetaSpecCleanup(&keymeta);
        addReplyError(c,"Bad data format");
        return;
    }

    /* Remove the old key if needed. */
    int deleted = 0;
    if (replace)
        deleted = dbDelete(c->db,key);

    /* If the computed absolute TTL is already in the past, don't add the key
     * at all: reply OK, and if we deleted an old value propagate the command
     * as a DEL/UNLINK of the key instead. */
    if (ttl && checkAlreadyExpired(ttl)) {
        if (deleted) {
            robj *aux = server.lazyfree_lazy_server_del ? shared.unlink : shared.del;
            rewriteClientCommandVector(c, 2, aux, key);
            keyModified(c,c->db,key,NULL,1);
            notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id);
            server.dirty++;
        }
        keyMetaSpecCleanup(&keymeta);
        decrRefCount(obj);
        addReply(c, shared.ok);
        return;
    }

    /* Create the key and set the TTL if any */
    kvobj *kv = dbAddInternal(c->db, key, &obj, NULL, &keymeta);

    /* If minExpiredField was set, then the object is hash with expiration
     * on fields and need to register it in global HFE DS */
    if (kv->type == OBJ_HASH) {
        uint64_t minExpiredField = hashTypeGetMinExpire(kv, 1);
        if (minExpiredField != EB_EXPIRE_TIME_INVALID)
            estoreAdd(c->db->subexpires, getKeySlot(key->ptr), kv, minExpiredField);
    }

    if (ttl) {
        if (!absttl) {
            /* Propagate TTL as absolute timestamp so replicas/AOF agree on
             * the expiry regardless of when they apply the command. */
            robj *ttl_obj = createStringObjectFromLongLong(ttl);
            rewriteClientCommandArgument(c,2,ttl_obj);
            decrRefCount(ttl_obj);
            rewriteClientCommandArgument(c,c->argc,shared.absttl);
        }
    }
    objectSetLRUOrLFU(kv, lfu_freq, lru_idle, lru_clock, 1000);
    keyModified(c,c->db,key,NULL,1);
    notifyKeyspaceEvent(NOTIFY_GENERIC,"restore",key,c->db->id);

    /* If we deleted a key that means REPLACE parameter was passed and the
     * destination key existed. */
    if (deleted) {
        notifyKeyspaceEvent(NOTIFY_OVERWRITTEN, "overwritten", key, c->db->id);
        if (oldtype != kv->type) {
            notifyKeyspaceEvent(NOTIFY_TYPE_CHANGED, "type_changed", key, c->db->id);
        }
    }
    addReply(c,shared.ok);
    server.dirty++;
}
/* MIGRATE socket cache implementation.
 *
 * We take a map between host:port and a TCP socket that we used to connect
 * to this instance in recent time.
 * These sockets are closed when the max number we cache is reached, and also
 * in serverCron() when they are around for more than a few seconds. */
#define MIGRATE_SOCKET_CACHE_ITEMS 64 /* max num of items in the cache. */
#define MIGRATE_SOCKET_CACHE_TTL 10 /* close cached sockets after 10 sec.
 */

/* One entry of the MIGRATE socket cache: a live connection to a target
 * instance plus the DB last SELECTed on it and the last time it was used. */
typedef struct migrateCachedSocket {
    connection *conn;
    long last_dbid;        /* Last SELECTed DB, -1 = unknown/none. */
    time_t last_use_time;  /* For TTL-based eviction in serverCron(). */
} migrateCachedSocket;

/* Return a migrateCachedSocket containing a TCP socket connected with the
 * target instance, possibly returning a cached one.
 *
 * This function is responsible of sending errors to the client if a
 * connection can't be established. In this case NULL is returned.
 * Otherwise on success the socket is returned, and the caller should not
 * attempt to free it after usage.
 *
 * If the caller detects an error while using the socket, migrateCloseSocket()
 * should be called so that the connection will be created from scratch
 * the next time. */
migrateCachedSocket* migrateGetSocket(client *c, robj *host, robj *port, long timeout) {
    connection *conn;
    sds name = sdsempty();
    migrateCachedSocket *cs;

    /* Check if we have an already cached socket for this ip:port pair. */
    name = sdscatlen(name,host->ptr,sdslen(host->ptr));
    name = sdscatlen(name,":",1);
    name = sdscatlen(name,port->ptr,sdslen(port->ptr));
    cs = dictFetchValue(server.migrate_cached_sockets,name);
    if (cs) {
        sdsfree(name);
        cs->last_use_time = server.unixtime;
        return cs;
    }

    /* No cached socket, create one. */
    if (dictSize(server.migrate_cached_sockets) == MIGRATE_SOCKET_CACHE_ITEMS) {
        /* Too many items, drop one at random. */
        dictEntry *de = dictGetRandomKey(server.migrate_cached_sockets);
        cs = dictGetVal(de);
        connClose(cs->conn);
        zfree(cs);
        dictDelete(server.migrate_cached_sockets,dictGetKey(de));
    }

    /* Create the connection */
    conn = connCreate(server.el, connTypeOfCluster());
    if (connBlockingConnect(conn, host->ptr, atoi(port->ptr), timeout)
            != C_OK) {
        addReplyError(c,"-IOERR error or timeout connecting to the client");
        connClose(conn);
        sdsfree(name);
        return NULL;
    }
    connEnableTcpNoDelay(conn);

    /* Add to the cache and return it to the caller. The dict takes ownership
     * of 'name' as the key. */
    cs = zmalloc(sizeof(*cs));
    cs->conn = conn;

    cs->last_dbid = -1; /* Force a SELECT on first use. */
    cs->last_use_time = server.unixtime;
    dictAdd(server.migrate_cached_sockets,name,cs);
    return cs;
}

/* Free a migrate cached connection. */
void migrateCloseSocket(robj *host, robj *port) {
    sds name = sdsempty();
    migrateCachedSocket *cs;

    name = sdscatlen(name,host->ptr,sdslen(host->ptr));
    name = sdscatlen(name,":",1);
    name = sdscatlen(name,port->ptr,sdslen(port->ptr));
    cs = dictFetchValue(server.migrate_cached_sockets,name);
    if (!cs) {
        sdsfree(name);
        return;
    }

    connClose(cs->conn);
    zfree(cs);
    dictDelete(server.migrate_cached_sockets,name);
    sdsfree(name);
}

/* Close cached MIGRATE sockets unused for more than
 * MIGRATE_SOCKET_CACHE_TTL seconds. Called periodically from serverCron(). */
void migrateCloseTimedoutSockets(void) {
    dictIterator di;
    dictEntry *de;

    /* Safe iterator: we delete entries while iterating. */
    dictInitSafeIterator(&di, server.migrate_cached_sockets);
    while((de = dictNext(&di)) != NULL) {
        migrateCachedSocket *cs = dictGetVal(de);

        if ((server.unixtime - cs->last_use_time) > MIGRATE_SOCKET_CACHE_TTL) {
            connClose(cs->conn);
            zfree(cs);
            dictDelete(server.migrate_cached_sockets,dictGetKey(de));
        }
    }
    dictResetIterator(&di);
}

/* MIGRATE host port key dbid timeout [COPY | REPLACE | AUTH password |
 *         AUTH2 username password]
 *
 * Or in the multiple keys form:
 *
 * MIGRATE host port "" dbid timeout [COPY | REPLACE | AUTH password |
 *         AUTH2 username password] KEYS key1 key2 ... keyN */
void migrateCommand(client *c) {
    migrateCachedSocket *cs;
    int copy = 0, replace = 0, j;
    char *username = NULL;
    char *password = NULL;
    long timeout;
    long dbid;
    robj **kvArray = NULL; /* Objects to migrate. */
    robj **keyArray = NULL; /* Key names. */
    robj **newargv = NULL; /* Used to rewrite the command as DEL ... keys ... */
    rio cmd, payload;
    int may_retry = 1;
    int write_error = 0;
    int argv_rewritten = 0;

    /* To support the KEYS option we need the following additional state. */
    int first_key = 3; /* Argument index of the first key. */
    int num_keys = 1;  /* By default only migrate the 'key' argument. */

    /* Parse additional options */
    for (j = 6; j < c->argc; j++) {
        int moreargs = (c->argc-1) - j;
        if (!strcasecmp(c->argv[j]->ptr,"copy")) {
            copy = 1;
        } else if (!strcasecmp(c->argv[j]->ptr,"replace")) {
            replace = 1;
        } else if (!strcasecmp(c->argv[j]->ptr,"auth")) {
            if (!moreargs) {
                addReplyErrorObject(c,shared.syntaxerr);
                return;
            }
            j++;
            password = c->argv[j]->ptr;
            /* Hide the password from MONITOR / slowlog. */
            redactClientCommandArgument(c,j);
        } else if (!strcasecmp(c->argv[j]->ptr,"auth2")) {
            if (moreargs < 2) {
                addReplyErrorObject(c,shared.syntaxerr);
                return;
            }
            username = c->argv[++j]->ptr;
            redactClientCommandArgument(c,j);
            password = c->argv[++j]->ptr;
            redactClientCommandArgument(c,j);
        } else if (!strcasecmp(c->argv[j]->ptr,"keys")) {
            if (sdslen(c->argv[3]->ptr) != 0) {
                addReplyError(c,
                    "When using MIGRATE KEYS option, the key argument"
                    " must be set to the empty string");
                return;
            }
            first_key = j+1;
            num_keys = c->argc - j - 1;
            break; /* All the remaining args are keys. */
        } else {
            addReplyErrorObject(c,shared.syntaxerr);
            return;
        }
    }

    /* Sanity check */
    if (getLongFromObjectOrReply(c,c->argv[5],&timeout,NULL) != C_OK ||
        getLongFromObjectOrReply(c,c->argv[4],&dbid,NULL) != C_OK)
    {
        return;
    }
    if (timeout <= 0) timeout = 1000;

    /* Check if the keys are here. If at least one key is to migrate, do it
     * otherwise if all the keys are missing reply with "NOKEY" to signal
     * the caller there was nothing to migrate. We don't return an error in
     * this case, since often this is due to a normal condition like the key
     * expiring in the meantime. */
    kvArray = zrealloc(kvArray,sizeof(kvobj*)*num_keys);
    keyArray = zrealloc(keyArray,sizeof(robj*)*num_keys);
    int num_exists = 0;

    for (j = 0; j < num_keys; j++) {
        if ((kvArray[num_exists] = lookupKeyRead(c->db,c->argv[first_key+j])) != NULL) {
            keyArray[num_exists] = c->argv[first_key+j];
            num_exists++;
        }
    }
    num_keys = num_exists;
    if (num_keys == 0) {
        zfree(kvArray); zfree(keyArray);
        addReplySds(c,sdsnew("+NOKEY\r\n"));
        return;
    }

try_again:
    write_error = 0;

    /* Connect */
    cs = migrateGetSocket(c,c->argv[1],c->argv[2],timeout);
    if (cs == NULL) {
        zfree(kvArray); zfree(keyArray);
        return; /* error sent to the client by migrateGetSocket() */
    }

    rioInitWithBuffer(&cmd,sdsempty());

    /* Authentication */
    if (password) {
        int arity = username ? 3 : 2;
        serverAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',arity));
        serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"AUTH",4));
        if (username) {
            serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,username,
                                 sdslen(username)));
        }
        serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,password,
            sdslen(password)));
    }

    /* Send the SELECT command if the current DB is not already selected. */
    int select = cs->last_dbid != dbid; /* Should we emit SELECT? */
    if (select) {
        serverAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',2));
        serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"SELECT",6));
        serverAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,dbid));
    }

    int non_expired = 0; /* Number of keys that we'll find non expired.
                            Note that serializing large keys may take some time
                            so certain keys that were found non expired by the
                            lookupKey() function, may be expired later. */

    /* Create RESTORE payload and generate the protocol to call the command. */
    for (j = 0; j < num_keys; j++) {
        long long ttl = 0;
        long long expireat = kvobjGetExpire(kvArray[j]);

        if (expireat != -1) {
            ttl = expireat-commandTimeSnapshot();
            if (ttl < 0) {
                continue; /* Key expired since the first lookup: skip it. */
            }
            if (ttl < 1) ttl = 1;
        }

        /* Relocate valid (non expired) keys and values into the array in successive
         * positions to remove holes created by the keys that were present
         * in the first lookup but are now expired after the second lookup. */
        kvArray[non_expired] = kvArray[j];
        keyArray[non_expired++] = keyArray[j];

        serverAssertWithInfo(c,NULL,
            rioWriteBulkCount(&cmd,'*',replace ? 5 : 4));

        if (server.cluster_enabled)
            serverAssertWithInfo(c,NULL,
                rioWriteBulkString(&cmd,"RESTORE-ASKING",14));
        else
            serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"RESTORE",7));
        serverAssertWithInfo(c,NULL,sdsEncodedObject(keyArray[j]));
        serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,keyArray[j]->ptr,
                sdslen(keyArray[j]->ptr)));
        serverAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,ttl));

        /* Emit the payload argument, that is the serialized object using
         * the DUMP format. */
        createDumpPayload(&payload,kvArray[j],keyArray[j],dbid,0);
        serverAssertWithInfo(c,NULL,
            rioWriteBulkString(&cmd,payload.io.buffer.ptr,
                               sdslen(payload.io.buffer.ptr)));
        sdsfree(payload.io.buffer.ptr);

        /* Add the REPLACE option to the RESTORE command if it was specified
         * as a MIGRATE option. */
        if (replace)
            serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"REPLACE",7));
    }

    /* Fix the actual number of keys we are migrating. */
    num_keys = non_expired;

    /* Transfer the query to the other node in 64K chunks. */
    errno = 0;
    {
        sds buf = cmd.io.buffer.ptr;
        size_t pos = 0, towrite;
        int nwritten = 0;

        while ((towrite = sdslen(buf)-pos) > 0) {
            towrite = (towrite > (64*1024) ? (64*1024) : towrite);
            nwritten = connSyncWrite(cs->conn,buf+pos,towrite,timeout);
            if (nwritten != (signed)towrite) {
                write_error = 1;
                goto socket_err;
            }
            pos += nwritten;
        }
    }

    char buf0[1024]; /* Auth reply. */
    char buf1[1024]; /* Select reply. */
    char buf2[1024]; /* Restore reply. */

    /* Read the AUTH reply if needed. */
    if (password && connSyncReadLine(cs->conn, buf0, sizeof(buf0), timeout) <= 0)
        goto socket_err;

    /* Read the SELECT reply if needed. */
    if (select && connSyncReadLine(cs->conn, buf1, sizeof(buf1), timeout) <= 0)
        goto socket_err;

    /* Read the RESTORE replies. */
    int error_from_target = 0;
    int socket_error = 0;
    int del_idx = 1; /* Index of the key argument for the replicated DEL op. */

    /* Allocate the new argument vector that will replace the current command,
     * to propagate the MIGRATE as a DEL command (if no COPY option was given).
     * We allocate num_keys+1 because the additional argument is for "DEL"
     * command name itself. */
    if (!copy) newargv = zmalloc(sizeof(robj*)*(num_keys+1));

    for (j = 0; j < num_keys; j++) {
        if (connSyncReadLine(cs->conn, buf2, sizeof(buf2), timeout) <= 0) {
            socket_error = 1;
            break;
        }
        if ((password && buf0[0] == '-') ||
            (select && buf1[0] == '-') ||
            buf2[0] == '-')
        {
            /* On error assume that last_dbid is no longer valid. */
            if (!error_from_target) {
                cs->last_dbid = -1;
                char *errbuf;
                if (password && buf0[0] == '-') errbuf = buf0;
                else if (select && buf1[0] == '-') errbuf = buf1;
                else errbuf = buf2;

                error_from_target = 1;
                addReplyErrorFormat(c,"Target instance replied with error: %s",
                    errbuf+1);
            }
        } else {
            if (!copy) {
                /* No COPY option: remove the local key, signal the change. */
                dbDelete(c->db,keyArray[j]);
                keyModified(c,c->db,keyArray[j],NULL,1);
                notifyKeyspaceEvent(NOTIFY_GENERIC,"del",keyArray[j],c->db->id);
                server.dirty++;

                /* Populate the argument vector to replace the old one. */
                newargv[del_idx++] = keyArray[j];
                incrRefCount(keyArray[j]);
            }
        }
    }

    /* On socket error, if we want to retry, do it now before rewriting the
     * command vector. We only retry if we are sure nothing was processed
     * and we failed to read the first reply (j == 0 test). */
    if (!error_from_target && socket_error && j == 0 && may_retry &&
        errno != ETIMEDOUT)
    {
        goto socket_err; /* A retry is guaranteed because of tested conditions.*/
    }

    /* On socket errors, close the migration socket now that we still have
     * the original host/port in the ARGV. Later the original command may be
     * rewritten to DEL and will be too later. */
    if (socket_error) migrateCloseSocket(c->argv[1],c->argv[2]);

    if (!copy) {
        /* Translate MIGRATE as DEL for replication/AOF. Note that we do
         * this only for the keys for which we received an acknowledgement
         * from the receiving Redis server, by using the del_idx index. */
        if (del_idx > 1) {
            newargv[0] = createStringObject("DEL",3);
            /* Note that the following call takes ownership of newargv. */
            replaceClientCommandVector(c,del_idx,newargv);
            argv_rewritten = 1;
        } else {
            /* No key transfer acknowledged, no need to rewrite as DEL. */
            zfree(newargv);
        }
        newargv = NULL; /* Make it safe to call zfree() on it in the future. */
    }

    /* If we are here and a socket error happened, we don't want to retry.
     * Just signal the problem to the client, but only do it if we did not
     * already queue a different error reported by the destination server. */
    if (!error_from_target && socket_error) {
        may_retry = 0;
        goto socket_err;
    }

    if (!error_from_target) {
        /* Success! Update the last_dbid in migrateCachedSocket, so that we can
         * avoid SELECT the next time if the target DB is the same. Reply +OK.
         *
         * Note: If we reached this point, even if socket_error is true
         * still the SELECT command succeeded (otherwise the code jumps to
         * socket_err label. */
        cs->last_dbid = dbid;
        addReply(c,shared.ok);
    } else {
        /* On error we already sent it in the for loop above, and set
         * the currently selected socket to -1 to force SELECT the next time. */
    }

    sdsfree(cmd.io.buffer.ptr);
    zfree(kvArray); zfree(keyArray); zfree(newargv);
    return;

/* On socket errors we try to close the cached socket and try again.
 * It is very common for the cached socket to get closed, if just reopening
 * it works it's a shame to notify the error to the caller. */
socket_err:
    /* Cleanup we want to perform in both the retry and no retry case.
     * Note: Closing the migrate socket will also force SELECT next time. */
    sdsfree(cmd.io.buffer.ptr);

    /* If the command was rewritten as DEL and there was a socket error,
     * we already closed the socket earlier. While migrateCloseSocket()
     * is idempotent, the host/port arguments are now gone, so don't do it
     * again. */
    if (!argv_rewritten) migrateCloseSocket(c->argv[1],c->argv[2]);
    zfree(newargv);
    newargv = NULL; /* This will get reallocated on retry. */

    /* Retry only if it's not a timeout and we never attempted a retry
     * (or the code jumping here did not set may_retry to zero). */
    if (errno != ETIMEDOUT && may_retry) {
        may_retry = 0;
        goto try_again;
    }

    /* Cleanup we want to do if no retry is attempted. */
    zfree(kvArray); zfree(keyArray);
    addReplyErrorSds(c, sdscatprintf(sdsempty(),
                     "-IOERR error or timeout %s to target instance",
                     write_error ? "writing" : "reading"));
    return;
}

/* Cluster node sanity check. Returns C_OK if the node id
 * is valid and C_ERR otherwise.
*/ +int verifyClusterNodeId(const char *name, int length) { + if (length != CLUSTER_NAMELEN) return C_ERR; + for (int i = 0; i < length; i++) { + if (name[i] >= 'a' && name[i] <= 'z') continue; + if (name[i] >= '0' && name[i] <= '9') continue; + return C_ERR; + } + return C_OK; +} + +int isValidAuxChar(int c) { + return isalnum(c) || (strchr("!#$%&()*+:;<>?@[]^{|}~", c) == NULL); +} + +int isValidAuxString(char *s, unsigned int length) { + for (unsigned i = 0; i < length; i++) { + if (!isValidAuxChar(s[i])) return 0; + } + return 1; +} + +void clusterCommandMyId(client *c) { + char *name = clusterNodeGetName(getMyClusterNode()); + if (name) { + addReplyBulkCBuffer(c,name, CLUSTER_NAMELEN); + } else { + addReplyError(c, "No ID yet"); + } +} + +char* getMyClusterId(void) { + return clusterNodeGetName(getMyClusterNode()); +} + +void clusterCommandMyShardId(client *c) { + char *sid = clusterNodeGetShardId(getMyClusterNode()); + if (sid) { + addReplyBulkCBuffer(c,sid, CLUSTER_NAMELEN); + } else { + addReplyError(c, "No shard ID yet"); + } +} + +/* When a cluster command is called, we need to decide whether to return TLS info or + * non-TLS info by the client's connection type. However if the command is called by + * a Lua script or RM_call, there is no connection in the fake client, so we use + * server.current_client here to get the real client if available. And if it is not + * available (modules may call commands without a real client), we return the default + * info, which is determined by server.tls_cluster. */ +static int shouldReturnTlsInfo(void) { + if (server.current_client && server.current_client->conn) { + return connIsTLS(server.current_client->conn); + } else { + return server.tls_cluster; + } +} + +unsigned int countKeysInSlot(unsigned int slot) { + return kvstoreDictSize(server.db->keys, slot); +} + +/* Add detailed information of a node to the output buffer of the given client. 
 */
void addNodeDetailsToShardReply(client *c, clusterNode *node) {

    /* The map length is not known up front (port/tls-port/hostname are
     * optional), so use a deferred length and count emitted pairs. */
    int reply_count = 0;
    char *hostname;
    void *node_replylen = addReplyDeferredLen(c);

    addReplyBulkCString(c, "id");
    addReplyBulkCBuffer(c, clusterNodeGetName(node), CLUSTER_NAMELEN);
    reply_count++;

    if (clusterNodeTcpPort(node)) {
        addReplyBulkCString(c, "port");
        addReplyLongLong(c, clusterNodeTcpPort(node));
        reply_count++;
    }

    if (clusterNodeTlsPort(node)) {
        addReplyBulkCString(c, "tls-port");
        addReplyLongLong(c, clusterNodeTlsPort(node));
        reply_count++;
    }

    addReplyBulkCString(c, "ip");
    addReplyBulkCString(c, clusterNodeIp(node));
    reply_count++;

    addReplyBulkCString(c, "endpoint");
    addReplyBulkCString(c, clusterNodePreferredEndpoint(node));
    reply_count++;

    hostname = clusterNodeHostname(node);
    if (hostname != NULL && *hostname != '\0') {
        addReplyBulkCString(c, "hostname");
        addReplyBulkCString(c, hostname);
        reply_count++;
    }

    /* For ourselves the locally-tracked replication offset is authoritative;
     * for other nodes use the offset learned via the cluster bus. */
    long long node_offset;
    if (clusterNodeIsMyself(node)) {
        node_offset = clusterNodeIsSlave(node) ? replicationGetSlaveOffset() : server.master_repl_offset;
    } else {
        node_offset = clusterNodeReplOffset(node);
    }

    addReplyBulkCString(c, "role");
    addReplyBulkCString(c, clusterNodeIsSlave(node) ? "replica" : "master");
    reply_count++;

    addReplyBulkCString(c, "replication-offset");
    addReplyLongLong(c, node_offset);
    reply_count++;

    /* A replica that never reported an offset is considered still loading. */
    addReplyBulkCString(c, "health");
    const char *health_msg = NULL;
    if (clusterNodeIsFailing(node)) {
        health_msg = "fail";
    } else if (clusterNodeIsSlave(node) && node_offset == 0) {
        health_msg = "loading";
    } else {
        health_msg = "online";
    }
    addReplyBulkCString(c, health_msg);
    reply_count++;

    setDeferredMapLen(c, node_replylen, reply_count);
}

/* Return the master of the shard's first non-failing node, or NULL if every
 * node in the shard is failing. */
static clusterNode *clusterGetMasterFromShard(void *shard_handle) {
    clusterNode *n = NULL;
    void *node_it = clusterShardHandleGetNodeIterator(shard_handle);
    while((n = clusterShardNodeIteratorNext(node_it)) != NULL) {
        if (!clusterNodeIsFailing(n)) {
            break;
        }
    }
    clusterShardNodeIteratorFree(node_it);
    if (!n) return NULL;
    return clusterNodeGetMaster(n);
}

/* Add the shard reply of a single shard based off the given primary node. */
void addShardReplyForClusterShards(client *c, void *shard_handle) {
    serverAssert(clusterGetShardNodeCount(shard_handle) > 0);
    addReplyMapLen(c, 2);
    addReplyBulkCString(c, "slots");

    /* Use slot_info_pairs from the primary only */
    clusterNode *master_node = clusterGetMasterFromShard(shard_handle);

    if (master_node && clusterNodeHasSlotInfo(master_node)) {
        /* Slot info is a flat list of (start, end) pairs, hence even count. */
        serverAssert((clusterNodeSlotInfoCount(master_node) % 2) == 0);
        addReplyArrayLen(c, clusterNodeSlotInfoCount(master_node));
        for (int i = 0; i < clusterNodeSlotInfoCount(master_node); i++)
            addReplyLongLong(c, (unsigned long)clusterNodeSlotInfoEntry(master_node, i));
    } else {
        /* If no slot info pair is provided, the node owns no slots */
        addReplyArrayLen(c, 0);
    }

    addReplyBulkCString(c, "nodes");
    addReplyArrayLen(c, clusterGetShardNodeCount(shard_handle));
    void *node_it = clusterShardHandleGetNodeIterator(shard_handle);
    for (clusterNode *n = clusterShardNodeIteratorNext(node_it); n != NULL; n = clusterShardNodeIteratorNext(node_it)) {
        addNodeDetailsToShardReply(c, n);
        /* Release the per-node slot info allocated by
         * clusterGenNodesSlotsInfo() in clusterCommandShards(). */
        clusterFreeNodesSlotsInfo(n);
    }
    clusterShardNodeIteratorFree(node_it);
}

/* Add to the output buffer of the given client, an array of slot (start, end)
 * pair owned by the shard, also the primary and set of replica(s) along with
 * information about each node. */
void clusterCommandShards(client *c) {
    addReplyArrayLen(c, clusterGetShardCount());
    /* This call will add slot_info_pairs to all nodes */
    clusterGenNodesSlotsInfo(0);
    dictIterator *shard_it = clusterGetShardIterator();
    for(void *shard_handle = clusterNextShardHandle(shard_it); shard_handle != NULL; shard_handle = clusterNextShardHandle(shard_it)) {
        addShardReplyForClusterShards(c, shard_handle);
    }
    clusterFreeShardIterator(shard_it);
}

/* Reply with the CLUSTER HELP text, including any extended help provided by
 * the active cluster implementation.
 * NOTE(review): the argument placeholders (e.g. "<slot>") appear to have been
 * stripped from these string literals in this copy of the file — compare with
 * upstream cluster.c before relying on the exact help text. */
void clusterCommandHelp(client *c) {
    const char *help[] = {
        "COUNTKEYSINSLOT ",
        "    Return the number of keys in .",
        "GETKEYSINSLOT ",
        "    Return key names stored by current node in a slot.",
        "INFO",
        "    Return information about the cluster.",
        "KEYSLOT ",
        "    Return the hash slot for .",
        "MYID",
        "    Return the node id.",
        "MYSHARDID",
        "    Return the node's shard id.",
        "NODES",
        "    Return cluster configuration seen by node. Output format:",
        "     ...",
        "REPLICAS ",
        "    Return  replicas.",
        "SLOTS",
        "    Return information about slots range mappings. Each range is made of:",
        "    start, end, master and replicas IP addresses, ports and ids",
        "SLOT-STATS",
        "    Return an array of slot usage statistics for slots assigned to the current node.",
        "SHARDS",
        "    Return information about slot range mappings and the nodes associated with them.",
        NULL
    };

    addExtendedReplyHelp(c, help, clusterCommandExtendedHelp());
}

/* CLUSTER command dispatcher: handles the generic subcommands here and
 * delegates implementation-specific ones to clusterCommandSpecial(). */
void clusterCommand(client *c) {
    if (server.cluster_enabled == 0) {
        addReplyError(c,"This instance has cluster support disabled");
        return;
    }

    if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) {
        clusterCommandHelp(c);
    } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) {
        /* CLUSTER NODES */
        /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */
        sds nodes = clusterGenNodesDescription(c, 0, shouldReturnTlsInfo());
        addReplyVerbatim(c,nodes,sdslen(nodes),"txt");
        sdsfree(nodes);
    } else if (!strcasecmp(c->argv[1]->ptr,"myid") && c->argc == 2) {
        /* CLUSTER MYID */
        clusterCommandMyId(c);
    } else if (!strcasecmp(c->argv[1]->ptr,"myshardid") && c->argc == 2) {
        /* CLUSTER MYSHARDID */
        clusterCommandMyShardId(c);
    } else if (!strcasecmp(c->argv[1]->ptr,"slots") && c->argc == 2) {
        /* CLUSTER SLOTS */
        clusterCommandSlots(c);
    } else if (!strcasecmp(c->argv[1]->ptr,"shards") && c->argc == 2) {
        /* CLUSTER SHARDS */
        clusterCommandShards(c);
    } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) {
        /* CLUSTER INFO */

        sds info = genClusterInfoString();

        /* Produce the reply protocol. */
        addReplyVerbatim(c,info,sdslen(info),"txt");
        sdsfree(info);
    } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) {
        /* CLUSTER KEYSLOT */
        sds key = c->argv[2]->ptr;

        addReplyLongLong(c,keyHashSlot(key,sdslen(key)));
    } else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) {
        /* CLUSTER COUNTKEYSINSLOT */
        long long slot;

        if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK)
            return;
        if (slot < 0 || slot >= CLUSTER_SLOTS) {
            addReplyError(c,"Invalid slot");
            return;
        }

        if (!clusterCanAccessKeysInSlot(slot)) {
            addReplyLongLong(c, 0);
            return;
        }
        addReplyLongLong(c,countKeysInSlot(slot));
    } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) {
        /* CLUSTER GETKEYSINSLOT */
        long long maxkeys, slot;

        if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK)
            return;
        if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL)
            != C_OK)
            return;
        if (slot < 0 || slot >= CLUSTER_SLOTS || maxkeys < 0) {
            addReplyError(c,"Invalid slot or number of keys");
            return;
        }

        if (!clusterCanAccessKeysInSlot(slot)) {
            addReplyArrayLen(c, 0);
            return;
        }

        /* Reply with at most maxkeys keys from the slot. */
        unsigned int keys_in_slot = countKeysInSlot(slot);
        unsigned int numkeys = maxkeys > keys_in_slot ? keys_in_slot : maxkeys;
        addReplyArrayLen(c,numkeys);
        kvstoreDictIterator kvs_di;
        dictEntry *de = NULL;
        kvstoreInitDictIterator(&kvs_di, server.db->keys, slot);
        for (unsigned int i = 0; i < numkeys; i++) {
            de = kvstoreDictIteratorNext(&kvs_di);
            serverAssert(de != NULL);
            sds sdskey = kvobjGetKey(dictGetKV(de));
            addReplyBulkCBuffer(c, sdskey, sdslen(sdskey));
        }
        kvstoreResetDictIterator(&kvs_di);
    } else if ((!strcasecmp(c->argv[1]->ptr,"slaves") ||
                !strcasecmp(c->argv[1]->ptr,"replicas")) && c->argc == 3) {
        /* CLUSTER SLAVES */
        /* CLUSTER REPLICAS */
        clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
        int j;

        /* Lookup the specified node in our table. */
        if (!n) {
            addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
            return;
        }

        if (clusterNodeIsSlave(n)) {
            addReplyError(c,"The specified node is not a master");
            return;
        }

        /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */
        addReplyArrayLen(c, clusterNodeNumSlaves(n));
        for (j = 0; j < clusterNodeNumSlaves(n); j++) {
            sds ni = clusterGenNodeDescription(c, clusterNodeGetSlave(n, j), shouldReturnTlsInfo());
            addReplyBulkCString(c,ni);
            sdsfree(ni);
        }
    } else if (!strcasecmp(c->argv[1]->ptr, "migration")) {
        clusterMigrationCommand(c);
    } else if (!strcasecmp(c->argv[1]->ptr,"syncslots") && c->argc >= 3) {
        clusterSyncSlotsCommand(c);
    } else if(!clusterCommandSpecial(c)) {
        addReplySubcommandSyntaxError(c);
        return;
    }
}

/* Extract slot number from keys in a keys_result structure and return to caller. 
+ * Returns: + * - The slot number if all keys belong to the same slot + * - INVALID_CLUSTER_SLOT if there are no keys or cluster is disabled + * - CLUSTER_CROSSSLOT if keys belong to different slots (cross-slot error) */ +int extractSlotFromKeysResult(robj **argv, getKeysResult *keys_result) { + if (keys_result->numkeys == 0 || !server.cluster_enabled) + return INVALID_CLUSTER_SLOT; + + int first_slot = INVALID_CLUSTER_SLOT; + for (int j = 0; j < keys_result->numkeys; j++) { + robj *this_key = argv[keys_result->keys[j].pos]; + int this_slot = (int)keyHashSlot((char*)this_key->ptr, sdslen(this_key->ptr)); + + if (first_slot == INVALID_CLUSTER_SLOT) + first_slot = this_slot; + else if (first_slot != this_slot) { + return CLUSTER_CROSSSLOT; + } + } + return first_slot; +} + +/* Return the pointer to the cluster node that is able to serve the command. + * For the function to succeed the command should only target either: + * + * 1) A single key (even multiple times like RPOPLPUSH mylist mylist). + * 2) Multiple keys in the same hash slot, while the slot is stable (no + * resharding in progress). + * + * On success the function returns the node that is able to serve the request. + * If the node is not 'myself' a redirection must be performed. The kind of + * redirection is specified setting the integer passed by reference + * 'error_code', which will be set to CLUSTER_REDIR_ASK or + * CLUSTER_REDIR_MOVED. + * + * When the node is 'myself' 'error_code' is set to CLUSTER_REDIR_NONE. + * + * If the command fails NULL is returned, and the reason of the failure is + * provided via 'error_code', which will be set to: + * + * CLUSTER_REDIR_CROSS_SLOT if the request contains multiple keys that + * don't belong to the same hash slot. + * + * CLUSTER_REDIR_UNSTABLE if the request contains multiple keys + * belonging to the same slot, but the slot is not stable (in migration or + * importing state, likely because a resharding is in progress). 
+ *
+ * CLUSTER_REDIR_DOWN_UNBOUND if the request addresses a slot which is
+ * not bound to any node. In this case the cluster global state should be
+ * already "down" but it is fragile to rely on the update of the global state,
+ * so we also handle it here.
+ *
+ * CLUSTER_REDIR_DOWN_STATE and CLUSTER_REDIR_DOWN_RO_STATE if the cluster is
+ * down but the user attempts to execute a command that addresses one or more keys. */
+clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot,
+                            getKeysResult *keys_result, uint8_t read_error, uint64_t cmd_flags, int *error_code)
+{
+    clusterNode *myself = getMyClusterNode();
+    clusterNode *n = NULL;
+    robj *firstkey = NULL;
+    int multiple_keys = 0;
+    multiState *ms, _ms;
+    pendingCommand mc;
+    pendingCommand *mcp = &mc;
+    /* Migration/import state and key-existence counters accumulated across
+     * every key of every queued command. */
+    int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0,
+        existing_keys = 0;
+    int pubsubshard_included = 0; /* Flag to indicate if a pubsub shard cmd is included. */
+
+    /* Allow any key to be set if a module disabled cluster redirections. */
+    if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
+        return myself;
+
+    /* Set error code optimistically for the base case. */
+    if (error_code) *error_code = CLUSTER_REDIR_NONE;
+
+    /* Modules can turn off Redis Cluster redirection: this is useful
+     * when writing a module that implements a completely different
+     * distributed system. */
+
+    /* We handle all the cases as if they were EXEC commands, so we have
+     * a common code path for everything */
+    if (cmd->proc == execCommand) {
+        /* If CLIENT_MULTI flag is not set EXEC is just going to return an
+         * error. */
+        if (!(c->flags & CLIENT_MULTI)) return myself;
+        ms = &c->mstate;
+    } else {
+        /* In order to have a single codepath create a fake Multi State
+         * structure if the client is not in MULTI/EXEC state, this way
+         * we have a single codepath below.
+         */
+        ms = &_ms;
+        _ms.commands = &mcp;
+        _ms.count = 1;
+
+        /* Properly initialize the fake pendingCommand */
+        initPendingCommand(&mc);
+        mc.argv = argv;
+        mc.argc = argc;
+        mc.cmd = cmd;
+        mc.slot = hashslot ? *hashslot : INVALID_CLUSTER_SLOT;
+        mc.read_error = read_error;
+        if (keys_result) {
+            mc.keys_result = *keys_result;
+            mc.flags |= PENDING_CMD_KEYS_RESULT_VALID;
+        }
+    }
+
+    /* Check that all the keys are in the same hash slot, and obtain this
+     * slot and the node associated. */
+    for (i = 0; i < ms->count; i++) {
+        struct redisCommand *mcmd;
+        robj **margv;
+        int margc, j;
+        keyReference *keyindex;
+
+        pendingCommand *pcmd = ms->commands[i];
+
+        mcmd = pcmd->cmd;
+        margc = pcmd->argc;
+        margv = pcmd->argv;
+
+        /* Only valid for sharded pubsub as regular pubsub can operate on any node and bypasses this layer. */
+        if (!pubsubshard_included &&
+            doesCommandHaveChannelsWithFlags(mcmd, CMD_CHANNEL_PUBLISH | CMD_CHANNEL_SUBSCRIBE))
+        {
+            pubsubshard_included = 1;
+        }
+
+        /* If we have a cached keys result from preprocessCommand(), use it.
+         * Otherwise, extract keys result. */
+        int use_cache_keys_result = pcmd->flags & PENDING_CMD_KEYS_RESULT_VALID;
+        getKeysResult result = GETKEYS_RESULT_INIT;
+        if (use_cache_keys_result)
+            result = pcmd->keys_result;
+        else
+            getKeysFromCommand(mcmd,margv,margc,&result);
+        keyindex = result.keys;
+
+        for (j = 0; j < result.numkeys; j++) {
+            /* The command has keys and was checked for cross-slot between its keys in preprocessCommand() */
+            if (pcmd->read_error == CLIENT_READ_CROSS_SLOT) {
+                /* Error: multiple keys from different slots. */
+                if (error_code)
+                    *error_code = CLUSTER_REDIR_CROSS_SLOT;
+                return NULL;
+            }
+
+            robj *thiskey = margv[keyindex[j].pos];
+            /* Use the slot cached by preprocessCommand() when valid,
+             * otherwise hash the key here. */
+            int thisslot = pcmd->slot;
+            if (thisslot == INVALID_CLUSTER_SLOT)
+                thisslot = keyHashSlot((char*)thiskey->ptr, sdslen(thiskey->ptr));
+
+            if (firstkey == NULL) {
+                /* This is the first key we see. Check what is the slot
+                 * and node.
+                 */
+                firstkey = thiskey;
+                slot = thisslot;
+                n = getNodeBySlot(slot);
+
+                /* Error: If a slot is not served, we are in "cluster down"
+                 * state. However the state is yet to be updated, so this was
+                 * not trapped earlier in processCommand(). Report the same
+                 * error to the client. */
+                if (n == NULL) {
+                    if (!use_cache_keys_result) getKeysFreeResult(&result);
+                    if (error_code)
+                        *error_code = CLUSTER_REDIR_DOWN_UNBOUND;
+                    return NULL;
+                }
+
+                /* If we are migrating or importing this slot, we need to check
+                 * if we have all the keys in the request (the only way we
+                 * can safely serve the request, otherwise we return a TRYAGAIN
+                 * error). To do so we set the importing/migrating state and
+                 * increment a counter for every missing key. */
+                if (n == myself &&
+                    getMigratingSlotDest(slot) != NULL)
+                {
+                    migrating_slot = 1;
+                } else if (getImportingSlotSource(slot) != NULL) {
+                    importing_slot = 1;
+                }
+            } else {
+                /* If it is not the first key/channel, make sure it is exactly
+                 * the same key/channel as the first we saw. */
+                if (slot != thisslot) {
+                    /* Error: multiple keys from different slots. */
+                    if (!use_cache_keys_result) getKeysFreeResult(&result);
+                    if (error_code)
+                        *error_code = CLUSTER_REDIR_CROSS_SLOT;
+                    return NULL;
+                }
+                if (importing_slot && !multiple_keys && !equalStringObjects(firstkey,thiskey)) {
+                    /* Flag this request as one with multiple different
+                     * keys/channels when the slot is in importing state. */
+                    multiple_keys = 1;
+                }
+            }
+
+            /* Migrating / Importing slot? Count keys we don't have.
+             * If it is pubsubshard command, it isn't required to check
+             * the channel being present or not in the node during the
+             * slot migration, the channel will be served from the source
+             * node until the migration completes with CLUSTER SETSLOT
+             * <slot> NODE <node-id>.
+             */
+            /* LOOKUP_* flags: probe existence only, without touching LRU,
+             * stats, notifications or expiry. */
+            int flags = LOOKUP_NOTOUCH | LOOKUP_NOSTATS | LOOKUP_NONOTIFY | LOOKUP_NOEXPIRE;
+            if ((migrating_slot || importing_slot) && !pubsubshard_included)
+            {
+                if (lookupKeyReadWithFlags(&server.db[0], thiskey, flags) == NULL) missing_keys++;
+                else existing_keys++;
+            }
+        }
+        if (!use_cache_keys_result) getKeysFreeResult(&result);
+    }
+
+    /* No key at all in command? then we can serve the request
+     * without redirections or errors in all the cases. */
+    if (n == NULL) return myself;
+
+    /* Cluster is globally down but we got keys? We only serve the request
+     * if it is a read command and when allow_reads_when_down is enabled. */
+    if (!isClusterHealthy()) {
+        if (pubsubshard_included) {
+            if (!server.cluster_allow_pubsubshard_when_down) {
+                if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE;
+                return NULL;
+            }
+        } else if (!server.cluster_allow_reads_when_down) {
+            /* The cluster is configured to block commands when the
+             * cluster is down. */
+            if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE;
+            return NULL;
+        } else if (cmd_flags & CMD_WRITE) {
+            /* The cluster is configured to allow read only commands */
+            if (error_code) *error_code = CLUSTER_REDIR_DOWN_RO_STATE;
+            return NULL;
+        } else {
+            /* Fall through and allow the command to be executed:
+             * this happens when server.cluster_allow_reads_when_down is
+             * true and the command is not a write command */
+        }
+    }
+
+    /* Return the hashslot by reference. */
+    if (hashslot) *hashslot = slot;
+
+    /* MIGRATE always works in the context of the local node if the slot
+     * is open (migrating or importing state). We need to be able to freely
+     * move keys among instances in this case. */
+    if ((migrating_slot || importing_slot) && cmd->proc == migrateCommand)
+        return myself;
+
+    /* If we don't have all the keys and we are migrating the slot, send
+     * an ASK redirection or TRYAGAIN.
+     */
+    if (migrating_slot && missing_keys) {
+        /* If we have keys but we don't have all keys, we return TRYAGAIN */
+        if (existing_keys) {
+            if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE;
+            return NULL;
+        } else {
+            if (error_code) *error_code = CLUSTER_REDIR_ASK;
+            return getMigratingSlotDest(slot);
+        }
+    }
+
+    /* If we are receiving the slot, and the client correctly flagged the
+     * request as "ASKING", we can serve the request. However if the request
+     * involves multiple keys and we don't have them all, the only option is
+     * to send a TRYAGAIN error. */
+    if (importing_slot &&
+        (c->flags & CLIENT_ASKING || cmd_flags & CMD_ASKING))
+    {
+        if (multiple_keys && missing_keys) {
+            if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE;
+            return NULL;
+        } else {
+            return myself;
+        }
+    }
+
+    /* Handle the read-only client case reading from a slave: if this
+     * node is a slave and the request is about a hash slot our master
+     * is serving, we can reply without redirection. */
+    int is_write_command = (cmd_flags & CMD_WRITE) ||
+                           (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE));
+    if (((c->flags & CLIENT_READONLY) || pubsubshard_included) &&
+        !is_write_command &&
+        clusterNodeIsSlave(myself) &&
+        clusterNodeGetSlaveof(myself) == n)
+    {
+        return myself;
+    }
+
+    /* Base case: just return the right node. However, if this node is not
+     * myself, set error_code to MOVED since we need to issue a redirection. */
+    if (n != myself && error_code) *error_code = CLUSTER_REDIR_MOVED;
+    return n;
+}
+
+/* Send the client the right redirection code, according to error_code
+ * that should be set to one of CLUSTER_REDIR_* macros.
+ *
+ * If CLUSTER_REDIR_ASK or CLUSTER_REDIR_MOVED error codes
+ * are used, then the node 'n' should not be NULL, but should be the
+ * node we want to mention in the redirection. Moreover hashslot should
+ * be set to the hash slot that caused the redirection.
+ */
+void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code) {
+    /* Each branch emits a protocol-level error status (leading '-') that
+     * cluster-aware clients parse, so the exact strings must not change. */
+    if (error_code == CLUSTER_REDIR_CROSS_SLOT) {
+        addReplyError(c,"-CROSSSLOT Keys in request don't hash to the same slot");
+    } else if (error_code == CLUSTER_REDIR_UNSTABLE) {
+        /* The request spawns multiple keys in the same slot,
+         * but the slot is not "stable" currently as there is
+         * a migration or import in progress. */
+        addReplyError(c,"-TRYAGAIN Multiple keys request during rehashing of slot");
+    } else if (error_code == CLUSTER_REDIR_DOWN_STATE) {
+        addReplyError(c,"-CLUSTERDOWN The cluster is down");
+    } else if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) {
+        addReplyError(c,"-CLUSTERDOWN The cluster is down and only accepts read commands");
+    } else if (error_code == CLUSTER_REDIR_DOWN_UNBOUND) {
+        addReplyError(c,"-CLUSTERDOWN Hash slot not served");
+    } else if (error_code == CLUSTER_REDIR_MOVED ||
+               error_code == CLUSTER_REDIR_ASK)
+    {
+        /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client. */
+        int port = clusterNodeClientPort(n, shouldReturnTlsInfo());
+        addReplyErrorSds(c,sdscatprintf(sdsempty(),
+            "-%s %d %s:%d",
+            (error_code == CLUSTER_REDIR_ASK) ? "ASK" : "MOVED",
+            hashslot, clusterNodePreferredEndpoint(n), port));
+    } else {
+        serverPanic("getNodeByQuery() unknown error.");
+    }
+}
+
+/* This function is called by the function processing clients incrementally
+ * to detect timeouts, in order to handle the following case:
+ *
+ * 1) A client blocks with BLPOP or similar blocking operation.
+ * 2) The master migrates the hash slot elsewhere or turns into a slave.
+ * 3) The client may remain blocked forever (or up to the max timeout time)
+ *    waiting for a key change that will never happen.
+ *
+ * If the client is found to be blocked into a hash slot this node no
+ * longer handles, the client is sent a redirection error, and the function
+ * returns 1. Otherwise 0 is returned and no operation is performed.
+ */
+int clusterRedirectBlockedClientIfNeeded(client *c) {
+    clusterNode *myself = getMyClusterNode();
+    if (c->flags & CLIENT_BLOCKED &&
+        (c->bstate.btype == BLOCKED_LIST ||
+         c->bstate.btype == BLOCKED_ZSET ||
+         c->bstate.btype == BLOCKED_STREAM ||
+         c->bstate.btype == BLOCKED_MODULE))
+    {
+        dictEntry *de;
+        dictIterator di;
+
+        /* If the cluster is down, unblock the client with the right error.
+         * If the cluster is configured to allow reads on cluster down, we
+         * still want to emit this error since a write will be required
+         * to unblock them which may never come. */
+        if (!isClusterHealthy()) {
+            clusterRedirectClient(c,NULL,0,CLUSTER_REDIR_DOWN_STATE);
+            return 1;
+        }
+
+        /* If the client is blocked on module, but not on a specific key,
+         * don't unblock it (except for the CLUSTER_FAIL case above). */
+        if (c->bstate.btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c))
+            return 0;
+
+        /* All keys must belong to the same slot, so check first key only. */
+        dictInitIterator(&di, c->bstate.keys);
+        if ((de = dictNext(&di)) != NULL) {
+            robj *key = dictGetKey(de);
+            int slot = keyHashSlot((char*)key->ptr, sdslen(key->ptr));
+            clusterNode *node = getNodeBySlot(slot);
+
+            /* if the client is read-only and attempting to access key that our
+             * replica can handle, allow it. */
+            if ((c->flags & CLIENT_READONLY) &&
+                !(c->lastcmd->flags & CMD_WRITE) &&
+                clusterNodeIsSlave(myself) && clusterNodeGetSlaveof(myself) == node)
+            {
+                node = myself;
+            }
+
+            /* We send an error and unblock the client if:
+             * 1) The slot is unassigned, emitting a cluster down error.
+             * 2) The slot is not handled by this node, nor being imported.
+             */
+            if (node != myself && getImportingSlotSource(slot) == NULL)
+            {
+                if (node == NULL) {
+                    clusterRedirectClient(c,NULL,0,
+                        CLUSTER_REDIR_DOWN_UNBOUND);
+                } else {
+                    clusterRedirectClient(c,node,slot,
+                        CLUSTER_REDIR_MOVED);
+                }
+                dictResetIterator(&di);
+                return 1;
+            }
+        }
+        dictResetIterator(&di);
+    }
+    return 0;
+}
+
+/* Returns an indication if the replica node is fully available
+ * and should be listed in CLUSTER SLOTS response.
+ * Returns 1 for available nodes, 0 for nodes that have
+ * not finished their initial sync, in failed state, or are
+ * otherwise considered not available to serve read commands. */
+static int isReplicaAvailable(clusterNode *node) {
+    if (clusterNodeIsFailing(node)) {
+        return 0;
+    }
+    long long repl_offset = clusterNodeReplOffset(node);
+    if (clusterNodeIsMyself(node)) {
+        /* Nodes do not update their own information
+         * in the cluster node list. */
+        repl_offset = replicationGetSlaveOffset();
+    }
+    /* A zero replication offset means the initial sync never completed. */
+    return (repl_offset != 0);
+}
+
+/* Emit the 4-element node description used in CLUSTER SLOTS replies:
+ * preferred endpoint, client port, node name, and a map of any additional
+ * known endpoints (ip/hostname) that are not the preferred one. */
+void addNodeToNodeReply(client *c, clusterNode *node) {
+    char* hostname = clusterNodeHostname(node);
+    addReplyArrayLen(c, 4);
+    if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_IP) {
+        addReplyBulkCString(c, clusterNodeIp(node));
+    } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_HOSTNAME) {
+        if (hostname != NULL && hostname[0] != '\0') {
+            addReplyBulkCString(c, hostname);
+        } else {
+            addReplyBulkCString(c, "?");
+        }
+    } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT) {
+        addReplyNull(c);
+    } else {
+        serverPanic("Unrecognized preferred endpoint type");
+    }
+
+    /* Report TLS ports to TLS client, and report non-TLS port to non-TLS client.
+     */
+    addReplyLongLong(c, clusterNodeClientPort(node, shouldReturnTlsInfo()));
+    addReplyBulkCBuffer(c, clusterNodeGetName(node), CLUSTER_NAMELEN);
+
+    /* Add the additional endpoint information, this is all the known networking information
+     * that is not the preferred endpoint. Note the logic is evaluated twice so we can
+     * correctly report the number of additional network arguments without using a deferred
+     * map, an assertion is made at the end to check we set the right length. */
+    int length = 0;
+    if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) {
+        length++;
+    }
+    if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME
+        && hostname != NULL && hostname[0] != '\0')
+    {
+        length++;
+    }
+    addReplyMapLen(c, length);
+
+    if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) {
+        addReplyBulkCString(c, "ip");
+        addReplyBulkCString(c, clusterNodeIp(node));
+        length--;
+    }
+    if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME
+        && hostname != NULL && hostname[0] != '\0')
+    {
+        addReplyBulkCString(c, "hostname");
+        addReplyBulkCString(c, hostname);
+        length--;
+    }
+    serverAssert(length == 0);
+}
+
+/* Emit one CLUSTER SLOTS entry for the slot range [start_slot, end_slot]:
+ * the range boundaries, the master's node reply, then one entry per
+ * available replica. */
+void addNodeReplyForClusterSlot(client *c, clusterNode *node, int start_slot, int end_slot) {
+    int i, nested_elements = 3; /* slots (2) + master addr (1) */
+    for (i = 0; i < clusterNodeNumSlaves(node); i++) {
+        if (!isReplicaAvailable(clusterNodeGetSlave(node, i))) continue;
+        nested_elements++;
+    }
+    addReplyArrayLen(c, nested_elements);
+    addReplyLongLong(c, start_slot);
+    addReplyLongLong(c, end_slot);
+    addNodeToNodeReply(c, node);
+
+    /* Remaining nodes in reply are replicas for slot range */
+    for (i = 0; i < clusterNodeNumSlaves(node); i++) {
+        /* This loop is copy/pasted from clusterGenNodeDescription()
+         * with modifications for per-slot node aggregation.
+         */
+        if (!isReplicaAvailable(clusterNodeGetSlave(node, i))) continue;
+        addNodeToNodeReply(c, clusterNodeGetSlave(node, i));
+        nested_elements--;
+    }
+    serverAssert(nested_elements == 3); /* Original 3 elements */
+}
+
+/* CLUSTER SLOTS: reply with one entry per contiguous slot range owned by a
+ * single master, scanning the full slot space once. */
+void clusterCommandSlots(client * c) {
+    /* Format: 1) 1) start slot
+     *            2) end slot
+     *            3) 1) master IP
+     *               2) master port
+     *               3) node ID
+     *            4) 1) replica IP
+     *               2) replica port
+     *               3) node ID
+     *           ... continued until done
+     */
+    clusterNode *n = NULL;
+    int num_masters = 0, start = -1;
+    void *slot_replylen = addReplyDeferredLen(c);
+
+    /* Note the loop runs one step past the last slot (i == CLUSTER_SLOTS)
+     * so the final open range is flushed. */
+    for (int i = 0; i <= CLUSTER_SLOTS; i++) {
+        /* Find start node and slot id. */
+        if (n == NULL) {
+            if (i == CLUSTER_SLOTS) break;
+            n = getNodeBySlot(i);
+            start = i;
+            continue;
+        }
+
+        /* Add cluster slots info when occur different node with start
+         * or end of slot. */
+        if (i == CLUSTER_SLOTS || n != getNodeBySlot(i)) {
+            addNodeReplyForClusterSlot(c, n, start, i-1);
+            num_masters++;
+            if (i == CLUSTER_SLOTS) break;
+            n = getNodeBySlot(i);
+            start = i;
+        }
+    }
+    setDeferredArrayLen(c, slot_replylen, num_masters);
+}
+
+/* -----------------------------------------------------------------------------
+ * Cluster functions related to serving / redirecting clients
+ * -------------------------------------------------------------------------- */
+
+/* The ASKING command is required after a -ASK redirection.
+ * The client should issue ASKING before to actually send the command to
+ * the target instance. See the Redis Cluster specification for more
+ * information. */
+void askingCommand(client *c) {
+    if (server.cluster_enabled == 0) {
+        addReplyError(c,"This instance has cluster support disabled");
+        return;
+    }
+    c->flags |= CLIENT_ASKING;
+    addReply(c,shared.ok);
+}
+
+/* The READONLY command is used by clients to enter the read-only mode.
+ * In this mode slaves will not redirect clients as long as clients access
+ * with read-only commands to keys that are served by the slave's master.
+ */
+void readonlyCommand(client *c) {
+    if (server.cluster_enabled == 0) {
+        addReplyError(c,"This instance has cluster support disabled");
+        return;
+    }
+    c->flags |= CLIENT_READONLY;
+    addReply(c,shared.ok);
+}
+
+/* Remove all the keys in the specified hash slot.
+ * The number of removed items is returned.
+ * If 'by_command' is non-zero the deletion is driven by a propagated command
+ * (keyspace events are notified but no DEL is propagated); otherwise a DEL is
+ * propagated per key and only modules are notified. */
+unsigned int clusterDelKeysInSlot(unsigned int hashslot, int by_command) {
+    unsigned int j = 0;
+
+    if (!kvstoreDictSize(server.db->keys, (int) hashslot))
+        return 0;
+
+    kvstoreDictIterator kvs_di;
+    dictEntry *de = NULL;
+    kvstoreInitDictSafeIterator(&kvs_di, server.db->keys, (int) hashslot);
+    while((de = kvstoreDictIteratorNext(&kvs_di)) != NULL) {
+        enterExecutionUnit(1, 0);
+        sds sdskey = kvobjGetKey(dictGetKV(de));
+        robj *key = createStringObject(sdskey, sdslen(sdskey));
+        dbDelete(&server.db[0], key);
+
+        keyModified(NULL, &server.db[0], key, NULL, 1);
+        if (by_command) {
+            /* Keys are deleted by a command (trimslots), we need to notify the
+             * keyspace event. Though, we don't need to propagate the DEL
+             * command, as the command (trimslots) will be propagated. */
+            notifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, server.db[0].id);
+        } else {
+            /* Propagate the DEL command */
+            propagateDeletion(&server.db[0], key, server.lazyfree_lazy_server_del);
+            /* The keys are not actually logically deleted from the database,
+             * just moved to another node. The modules needs to know that these
+             * keys are no longer available locally, so just send the keyspace
+             * notification to the modules, but not to clients. */
+            moduleNotifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, server.db[0].id);
+        }
+        exitExecutionUnit();
+        postExecutionUnitOperations();
+        decrRefCount(key);
+        j++;
+        server.dirty++;
+    }
+    kvstoreResetDictIterator(&kvs_di);
+    return j;
+}
+
+/* Delete the keys in the slot ranges.
Returns the number of deleted items */ +unsigned int clusterDelKeysInSlotRangeArray(slotRangeArray *slots, int by_command) { + unsigned int j = 0; + for (int i = 0; i < slots->num_ranges; i++) { + for (int slot = slots->ranges[i].start; slot <= slots->ranges[i].end; slot++) { + j += clusterDelKeysInSlot(slot, by_command); + } + } + return j; +} + +int clusterIsMySlot(int slot) { + return getMyClusterNode() == getNodeBySlot(slot); +} + +void replySlotsFlushAndFree(client *c, slotRangeArray *slots) { + addReplyArrayLen(c, slots->num_ranges); + for (int i = 0 ; i < slots->num_ranges ; i++) { + addReplyArrayLen(c, 2); + addReplyLongLong(c, slots->ranges[i].start); + addReplyLongLong(c, slots->ranges[i].end); + } + slotRangeArrayFree(slots); +} + +/* Normalizes (sorts and merges adjacent ranges), checks that slot ranges are + * well-formed and non-overlapping. */ +int slotRangeArrayNormalizeAndValidate(slotRangeArray *slots, sds *err) { + unsigned char used_slots[CLUSTER_SLOTS] = {0}; + + if (slots->num_ranges <= 0 || slots->num_ranges >= CLUSTER_SLOTS) { + *err = sdscatprintf(sdsempty(), "invalid number of slot ranges: %d", slots->num_ranges); + return C_ERR; + } + + /* Sort and merge adjacent slot ranges. 
*/ + slotRangeArraySortAndMerge(slots); + + for (int i = 0; i < slots->num_ranges; i++) { + if (slots->ranges[i].start >= CLUSTER_SLOTS || + slots->ranges[i].end >= CLUSTER_SLOTS) + { + *err = sdscatprintf(sdsempty(), "slot range is out of range: %d-%d", + slots->ranges[i].start, slots->ranges[i].end); + return C_ERR; + } + + if (slots->ranges[i].start > slots->ranges[i].end) { + *err = sdscatprintf(sdsempty(), "start slot number %d is greater than end slot number %d", + slots->ranges[i].start, slots->ranges[i].end); + return C_ERR; + } + + for (int j = slots->ranges[i].start; j <= slots->ranges[i].end; j++) { + if (used_slots[j]) { + *err = sdscatprintf(sdsempty(), "Slot %d specified multiple times", j); + return C_ERR; + } + used_slots[j]++; + } + } + return C_OK; +} + +/* Create a slot range array with the specified number of ranges. */ +slotRangeArray *slotRangeArrayCreate(int num_ranges) { + slotRangeArray *slots = zcalloc(sizeof(slotRangeArray) + num_ranges * sizeof(slotRange)); + slots->num_ranges = num_ranges; + return slots; +} + +/* Duplicate the slot range array. */ +slotRangeArray *slotRangeArrayDup(slotRangeArray *slots) { + slotRangeArray *dup = slotRangeArrayCreate(slots->num_ranges); + memcpy(dup->ranges, slots->ranges, sizeof(slotRange) * slots->num_ranges); + return dup; +} + +/* Set the slot range at the specified index. */ +void slotRangeArraySet(slotRangeArray *slots, int idx, int start, int end) { + slots->ranges[idx].start = start; + slots->ranges[idx].end = end; +} + +/* Create a slot range string in the format of: "1000-2000 3000-4000 ..." 
*/ +sds slotRangeArrayToString(slotRangeArray *slots) { + sds s = sdsempty(); + if (slots == NULL || slots->num_ranges == 0) return s; + + for (int i = 0; i < slots->num_ranges; i++) { + slotRange *sr = &slots->ranges[i]; + s = sdscatprintf(s, "%d-%d ", sr->start, sr->end); + } + sdssetlen(s, sdslen(s) - 1); + s[sdslen(s)] = '\0'; + + return s; +} + +/* Parse a slot range string in the format "1000-2000 3000-4000 ..." into a slotRangeArray. + * Returns a new slotRangeArray on success, NULL on failure. */ +slotRangeArray *slotRangeArrayFromString(sds data) { + int num_ranges; + long long start, end; + slotRangeArray *slots = NULL; + if (!data || sdslen(data) == 0) return NULL; + + sds *parts = sdssplitlen(data, sdslen(data), " ", 1, &num_ranges); + if (num_ranges <= 0) goto err; + + slots = slotRangeArrayCreate(num_ranges); + + /* Parse each slot range */ + for (int i = 0; i < num_ranges; i++) { + char *dash = strchr(parts[i], '-'); + if (!dash) goto err; + + if (string2ll(parts[i], dash - parts[i], &start) == 0 || + string2ll(dash + 1, sdslen(parts[i]) - (dash - parts[i]) - 1, &end) == 0) + goto err; + slotRangeArraySet(slots, i, start, end); + } + + /* Validate all ranges */ + sds err_msg = NULL; + if (slotRangeArrayNormalizeAndValidate(slots, &err_msg) != C_OK) { + if (err_msg) sdsfree(err_msg); + goto err; + } + sdsfreesplitres(parts, num_ranges); + return slots; + +err: + if (slots) slotRangeArrayFree(slots); + sdsfreesplitres(parts, num_ranges); + return NULL; +} + +static int compareSlotRange(const void *a, const void *b) { + const slotRange *sa = a; + const slotRange *sb = b; + if (sa->start < sb->start) return -1; + if (sa->start > sb->start) return 1; + return 0; +} + +/* Sort slot ranges by start slot and merge adjacent ranges. + * Adjacent means: prev.end + 1 == next.start. + * e.g. 
+ *      1000-2000 2001-3000 0-100 => 0-100 1000-3000
+ *
+ * Note: Overlapping ranges are not merged.*/
+void slotRangeArraySortAndMerge(slotRangeArray *slots) {
+    if (slots == NULL || slots->num_ranges < 2) return;
+
+    qsort(slots->ranges, slots->num_ranges, sizeof(slotRange), compareSlotRange);
+
+    /* Compact the sorted ranges in place: 'w' indexes the last range that
+     * has been written out so far. */
+    int w = 0;
+    for (int r = 1; r < slots->num_ranges; r++) {
+        if (slots->ranges[w].end + 1 == slots->ranges[r].start) {
+            /* Exactly contiguous with the previous range: extend it. */
+            slots->ranges[w].end = slots->ranges[r].end;
+        } else {
+            slots->ranges[++w] = slots->ranges[r];
+        }
+    }
+    slots->num_ranges = w + 1;
+}
+
+/* Compare two slot range arrays, return 1 if equal, 0 otherwise.
+ * Both arrays are normalized first, so equal slot sets compare equal even
+ * when originally expressed with different (but adjacent) range splits. */
+int slotRangeArrayIsEqual(slotRangeArray *slots1, slotRangeArray *slots2) {
+    slotRangeArraySortAndMerge(slots1);
+    slotRangeArraySortAndMerge(slots2);
+
+    if (slots1->num_ranges != slots2->num_ranges) return 0;
+
+    int i = 0;
+    while (i < slots1->num_ranges) {
+        slotRange *a = &slots1->ranges[i];
+        slotRange *b = &slots2->ranges[i];
+        if (a->start != b->start) return 0;
+        if (a->end != b->end) return 0;
+        i++;
+    }
+    return 1;
+}
+
+/* Add a slot to the slot range array.
+ * Usage:
+ *  slotRangeArray *slots = NULL
+ *  slots = slotRangeArrayAppend(slots, 1000);
+ *  slots = slotRangeArrayAppend(slots, 1001);
+ *  slots = slotRangeArrayAppend(slots, 1003);
+ *  slots = slotRangeArrayAppend(slots, 1004);
+ *  slots = slotRangeArrayAppend(slots, 1005);
+ *
+ *  Result: 1000-1001, 1003-1005
+ * Note: `slot` must be greater than the previous slot.
+ * */ +slotRangeArray *slotRangeArrayAppend(slotRangeArray *slots, int slot) { + if (slots == NULL) { + slots = slotRangeArrayCreate(4); + slots->ranges[0].start = slot; + slots->ranges[0].end = slot; + slots->num_ranges = 1; + return slots; + } + + serverAssert(slots->num_ranges >= 0 && slots->num_ranges <= CLUSTER_SLOTS); + serverAssert(slot > slots->ranges[slots->num_ranges - 1].end); + + /* Check if we can extend the last range */ + slotRange *last = &slots->ranges[slots->num_ranges - 1]; + if (slot == last->end + 1) { + last->end = slot; + return slots; + } + + /* Calculate current capacity and reallocate if needed */ + int cap = (int) ((zmalloc_size(slots) - sizeof(slotRangeArray)) / sizeof(slotRange)); + if (slots->num_ranges >= cap) + slots = zrealloc(slots, sizeof(slotRangeArray) + sizeof(slotRange) * cap * 2); + + /* Add new single-slot range */ + slots->ranges[slots->num_ranges].start = slot; + slots->ranges[slots->num_ranges].end = slot; + slots->num_ranges++; + + return slots; +} + +/* Returns 1 if the slot range array contains the given slot, 0 otherwise. */ +int slotRangeArrayContains(slotRangeArray *slots, unsigned int slot) { + for (int i = 0; i < slots->num_ranges; i++) + if (slots->ranges[i].start <= slot && slots->ranges[i].end >= slot) + return 1; + return 0; +} + +/* Free the slot range array. */ +void slotRangeArrayFree(slotRangeArray *slots) { + zfree(slots); +} + +/* Generic version of slotRangeArrayFree(). */ +void slotRangeArrayFreeGeneric(void *slots) { + slotRangeArrayFree(slots); +} + +/* Slot range array iterator */ +slotRangeArrayIter *slotRangeArrayGetIterator(slotRangeArray *slots) { + slotRangeArrayIter *it = zmalloc(sizeof(*it)); + it->slots = slots; + it->range_index = 0; + it->cur_slot = slots->num_ranges > 0 ? slots->ranges[0].start : -1; + return it; +} + +/* Returns the next slot in the array, or -1 if there are no more slots. 
+ */
+int slotRangeArrayNext(slotRangeArrayIter *it) {
+    if (it->range_index >= it->slots->num_ranges) return -1;
+
+    if (it->cur_slot < it->slots->ranges[it->range_index].end) {
+        it->cur_slot++;
+    } else {
+        /* Current range exhausted: advance to the next one, if any. */
+        it->range_index++;
+        if (it->range_index < it->slots->num_ranges)
+            it->cur_slot = it->slots->ranges[it->range_index].start;
+        else
+            it->cur_slot = -1; /* finished */
+    }
+    return it->cur_slot;
+}
+
+/* Return the slot the iterator currently points at, -1 when exhausted. */
+int slotRangeArrayGetCurrentSlot(slotRangeArrayIter *it) {
+    return it->cur_slot;
+}
+
+/* Release an iterator obtained from slotRangeArrayGetIterator(). Does not
+ * free the underlying slot range array. */
+void slotRangeArrayIteratorFree(slotRangeArrayIter *it) {
+    zfree(it);
+}
+
+/* Parse slot range pairs from argv starting at `pos`.
+ * `argc` is the argument count, `pos` is the first slot argument index.
+ * Returns a slotRangeArray or NULL on error. On error an error reply has
+ * already been sent to the client. */
+slotRangeArray *parseSlotRangesOrReply(client *c, int argc, int pos) {
+    int start, end, count;
+    slotRangeArray *slots;
+
+    /* Ensure there is at least one (start,end) slot range pairs. */
+    if (argc < 0 || pos < 0 || pos >= argc || (argc - pos) < 2 || ((argc - pos) % 2) != 0) {
+        addReplyErrorArity(c);
+        return NULL;
+    }
+
+    count = (argc - pos) / 2;
+    slots = slotRangeArrayCreate(count);
+    slots->num_ranges = 0;
+
+    for (int j = pos; j < argc; j += 2) {
+        /* getSlotOrReply() replies to the client itself on invalid input. */
+        if ((start = getSlotOrReply(c, c->argv[j])) == -1 ||
+            (end = getSlotOrReply(c, c->argv[j + 1])) == -1)
+        {
+            slotRangeArrayFree(slots);
+            return NULL;
+        }
+        slotRangeArraySet(slots, slots->num_ranges, start, end);
+        slots->num_ranges++;
+    }
+
+    sds err = NULL;
+    if (slotRangeArrayNormalizeAndValidate(slots, &err) != C_OK) {
+        addReplyErrorSds(c, err);
+        slotRangeArrayFree(slots);
+        return NULL;
+    }
+    return slots;
+}
+
+/* Return 1 if the keys in the slot can be accessed, 0 otherwise. */
+int clusterCanAccessKeysInSlot(int slot) {
+    /* If not in cluster mode, all keys are accessible */
+    if (server.cluster_enabled == 0) return 1;
+
+    /* If the slot is being imported under old slot migration approach, we should
+     * allow to list keys from the slot as previously.
+     */
+    if (getImportingSlotSource(slot)) return 1;
+
+    /* If using atomic slot migration, check if the slot belongs to the current
+     * node or its master, return 1 if so. */
+    clusterNode *myself = getMyClusterNode();
+    if (clusterNodeIsSlave(myself)) {
+        clusterNode *master = clusterNodeGetMaster(myself);
+        if (master && clusterNodeCoversSlot(master, slot))
+            return 1;
+    } else {
+        if (clusterNodeCoversSlot(myself, slot))
+            return 1;
+    }
+    return 0;
+}
+
+/* Return the slot ranges that belong to the current node or its master.
+ * With cluster disabled the full slot space is returned. The caller owns
+ * the returned array (possibly empty, never NULL). */
+slotRangeArray *clusterGetLocalSlotRanges(void) {
+    slotRangeArray *slots = NULL;
+
+    if (!server.cluster_enabled) {
+        slots = slotRangeArrayCreate(1);
+        slotRangeArraySet(slots, 0, 0, CLUSTER_SLOTS - 1);
+        return slots;
+    }
+
+    clusterNode *master = clusterNodeGetMaster(getMyClusterNode());
+    if (master) {
+        for (int i = 0; i < CLUSTER_SLOTS; i++) {
+            if (clusterNodeCoversSlot(master, i))
+                slots = slotRangeArrayAppend(slots, i);
+        }
+    }
+    return slots ? slots : slotRangeArrayCreate(0);
+}
+
+/* Partially flush destination DB in a cluster node, based on the slot range.
+ *
+ * Usage: SFLUSH <start-slot> <end-slot> [<start-slot> <end-slot>]* [SYNC|ASYNC]
+ *
+ * This is an initial implementation of SFLUSH (slots flush) which is limited to
+ * flushing a single shard as a whole, but in the future the same command may be
+ * used to partially flush a shard based on hash slots. Currently only if provided
+ * slots cover entirely the slots of a node, the node will be flushed and the
+ * return value will be pairs of slot ranges. Otherwise, a single empty set will
+ * be returned. If possible, SFLUSH SYNC will be run as blocking ASYNC as an
+ * optimization.
+ */
+void sflushCommand(client *c) {
+    int flags = EMPTYDB_NO_FLAGS, argc = c->argc;
+
+    if (server.cluster_enabled == 0) {
+        addReplyError(c,"This instance has cluster support disabled");
+        return;
+    }
+
+    /* check if last argument is SYNC or ASYNC */
+    if (!strcasecmp(c->argv[c->argc-1]->ptr,"sync")) {
+        flags = EMPTYDB_NO_FLAGS;
+        argc--;
+    } else if (!strcasecmp(c->argv[c->argc-1]->ptr,"async")) {
+        flags = EMPTYDB_ASYNC;
+        argc--;
+    } else if (server.lazyfree_lazy_user_flush) {
+        /* No explicit modifier: fall back to the configured lazy-flush
+         * default. */
+        flags = EMPTYDB_ASYNC;
+    }
+
+    /* parse the slot range */
+    if (argc % 2 == 0) {
+        addReplyErrorArity(c);
+        return;
+    }
+
+    /* Parse slot ranges from the command arguments. */
+    slotRangeArray *slots = parseSlotRangesOrReply(c, argc, 1);
+    if (!slots) return;
+
+    /* Iterate and find the slot ranges that belong to this node. Save them in
+     * a new slotRangeArray. It is allocated on heap since there is a chance
+     * that FLUSH SYNC will be running as blocking ASYNC and only later reply
+     * with slot ranges */
+    unsigned char slots_to_flush[CLUSTER_SLOTS] = {0}; /* Requested slots to flush */
+    slotRangeArray *myslots = NULL;
+    for (int i = 0; i < slots->num_ranges; i++) {
+        for (int j = slots->ranges[i].start; j <= slots->ranges[i].end; j++) {
+            if (clusterIsMySlot(j)) {
+                myslots = slotRangeArrayAppend(myslots, j);
+                slots_to_flush[j] = 1;
+            }
+        }
+    }
+
+    /* Verify that all slots of mynode got covered. See sflushCommand() comment. */
+    int all_slots_covered = 1;
+    for (int i = 0; i < CLUSTER_SLOTS; i++) {
+        if (clusterIsMySlot(i) && !slots_to_flush[i]) {
+            all_slots_covered = 0;
+            break;
+        }
+    }
+    if (myslots == NULL || !all_slots_covered) {
+        /* Partial coverage: reply with an empty set and flush nothing. */
+        addReplyArrayLen(c, 0);
+        slotRangeArrayFree(slots);
+        slotRangeArrayFree(myslots);
+        return;
+    }
+    slotRangeArrayFree(slots);
+
+    /* Flush selected slots.
+       If not flush as blocking async, then reply immediately */
+    if (flushCommandCommon(c, FLUSH_TYPE_SLOTS, flags, myslots) == 0)
+        replySlotsFlushAndFree(c, myslots);
+}
+
+/* The READWRITE command just clears the READONLY command state. */
+void readwriteCommand(client *c) {
+    if (server.cluster_enabled == 0) {
+        addReplyError(c,"This instance has cluster support disabled");
+        return;
+    }
+    c->flags &= ~CLIENT_READONLY;
+    addReply(c,shared.ok);
+}
+
+/* Resets transient cluster stats that we expose via INFO or other means that we want
+ * to reset via CONFIG RESETSTAT. The function is also used in order to
+ * initialize these fields in clusterInit() at server startup. */
+void resetClusterStats(void) {
+    if (!server.cluster_enabled) return;
+
+    clusterSlotStatResetAll();
+}
+
+/* This function is called at server startup in order to initialize cluster data
+ * structures that are shared between the different cluster implementations. */
+void clusterCommonInit(void) {
+    resetClusterStats();
+    asmInit();
+}
+
+/* This function is called after the node startup in order to check if there
+ * are any slots that we have keys for, but are not assigned to us. If so,
+ * we delete the keys. */
+void clusterDeleteKeysInUnownedSlots(void) {
+    if (clusterNodeIsSlave(getMyClusterNode())) return;
+
+    /* Check that all the slots we have keys for are assigned to us. Otherwise,
+     * delete the keys. */
+    for (int i = 0; i < CLUSTER_SLOTS; i++) {
+        /* Skip if: no keys in the slot, it's our slot, or we are importing it. */
+        if (!countKeysInSlot(i) ||
+            clusterIsMySlot(i) ||
+            getImportingSlotSource(i))
+        {
+            continue;
+        }
+
+        serverLog(LL_NOTICE, "I have keys for slot %d, but the slot is "
+                             "assigned to another node. "
+                             "Deleting keys in the slot.", i);
+        /* With atomic slot migration, it is safe to drop keys from slots
+         * that are not owned.
+           This will not result in data loss under the
+         * legacy slot migration approach either, since the importing state
+         * has already been persisted in node.conf. */
+        clusterDelKeysInSlot(i, 0);
+    }
+}
+
+
+/* This function is called after the node startup in order to verify that data
+ * loaded from disk is in agreement with the cluster configuration:
+ *
+ * 1) If we find keys about hash slots we have no responsibility for, the
+ *    following happens:
+ *    A) If no other node is in charge according to the current cluster
+ *       configuration, we add these slots to our node.
+ *    B) If according to our config other nodes are already in charge for
+ *       this slots, we set the slots as IMPORTING from our point of view
+ *       in order to justify we have those slots, and in order to make
+ *       redis-cli aware of the issue, so that it can try to fix it.
+ * 2) If we find data in a DB different than DB0 we return C_ERR to
+ *    signal the caller it should quit the server with an error message
+ *    or take other actions.
+ *
+ * The function always returns C_OK even if it will try to correct
+ * the error described in "1". However if data is found in DB different
+ * from DB0, C_ERR is returned.
+ *
+ * The function also uses the logging facility in order to warn the user
+ * about desynchronizations between the data we have in memory and the
+ * cluster configuration. */
+int verifyClusterConfigWithData(void) {
+    /* Return ASAP if a module disabled cluster redirections. In that case
+     * every master can store keys about every possible hash slot. */
+    if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
+        return C_OK;
+
+    /* If this node is a slave, don't perform the check at all as we
+     * completely depend on the replication stream. */
+    if (clusterNodeIsSlave(getMyClusterNode())) return C_OK;
+
+    /* Make sure we only have keys in DB0.
*/ + for (int i = 1; i < server.dbnum; i++) { + if (kvstoreSize(server.db[i].keys)) return C_ERR; + } + + /* Take over slots that we have keys for, but are assigned to no one. */ + clusterClaimUnassignedSlots(); + /* Delete keys in unowned slots */ + clusterDeleteKeysInUnownedSlots(); + return C_OK; +} diff --git a/examples/redis-unstable/src/cluster.h b/examples/redis-unstable/src/cluster.h new file mode 100644 index 0000000..7daf093 --- /dev/null +++ b/examples/redis-unstable/src/cluster.h @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2009-Present, Redis Ltd. + * All rights reserved. + * + * Copyright (c) 2024-present, Valkey contributors. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + * + * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information. + */ + +#ifndef __CLUSTER_H +#define __CLUSTER_H + +/*----------------------------------------------------------------------------- + * Redis cluster exported API. + *----------------------------------------------------------------------------*/ + +#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ +#define CLUSTER_SLOTS (1<<CLUSTER_SLOT_MASK_BITS) +/* NOTE(review): the cluster.h content that originally followed this #define (remaining macros, typedefs, and function prototypes up to the ASM overview comment below) appears to have been lost to angle-bracket stripping during extraction; restore from upstream cluster.h. */ + * - clusterAsmProcess(...) cluster impl -> redis: initiates/advances/cancels ASM operations + * - clusterAsmOnEvent(...) redis -> impl: notifies state changes + * + * Generic steps for an alternative implementation: + * - On destination side, implementation calls clusterAsmProcess(ASM_EVENT_IMPORT_START) + * to start an import operation. + * - Redis calls clusterAsmOnEvent() when an ASM event occurs. + * - On the source side, Redis will call clusterAsmOnEvent(ASM_EVENT_HANDOFF_PREP) + * when slots are ready to be handed off and the write pause is needed. 
+ * - Implementation stops the traffic to the slots and calls clusterAsmProcess(ASM_EVENT_HANDOFF) + * - On the destination side, Redis calls clusterAsmOnEvent(ASM_EVENT_TAKEOVER) + * when destination node is ready to take over the slot, waiting for ownership change. + * - Cluster implementation updates the config and calls clusterAsmProcess(ASM_EVENT_DONE) + * to notify Redis that the slots ownership has changed. + * + * Sequence diagram for import: + * - Note: shows only the events that cluster implementation needs to react. + * + * ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ + * │ Destination │ │ Destination │ │ Source │ │ Source │ + * │ Cluster impl │ │ Master │ │ Master │ │ Cluster impl │ + * └───────┬───────┘ └───────┬───────┘ └───────┬───────┘ └───────┬───────┘ + * │ │ │ │ + * │ ASM_EVENT_IMPORT_START │ │ │ + * ├─────────────────────────────►│ │ │ + * │ │ CLUSTER SYNCSLOTS │ │ + * │ ├────────────────────────►│ │ + * │ │ │ │ + * │ │ SNAPSHOT(restore cmds) │ │ + * │ │◄────────────────────────┤ │ + * │ │ Repl stream │ │ + * │ │◄────────────────────────┤ │ + * │ │ │ ASM_EVENT_HANDOFF_PREP │ + * │ │ ├────────────────────────────►│ + * │ │ │ ASM_EVENT_HANDOFF │ + * │ │ │◄────────────────────────────┤ + * │ │ Drain repl stream │ │ + * │ │◄────────────────────────┤ │ + * │ ASM_EVENT_TAKEOVER │ │ │ + * │◄─────────────────────────────┤ │ │ + * │ │ │ │ + * │ ASM_EVENT_DONE │ │ │ + * ├─────────────────────────────►│ │ ASM_EVENT_DONE │ + * │ │ │◄────────────────────────────┤ + * │ │ │ │ + */ + +#define ASM_EVENT_IMPORT_START 1 /* Start a new import operation (destination side) */ +#define ASM_EVENT_CANCEL 2 /* Cancel an ongoing import/migrate operation (source and destination side) */ +#define ASM_EVENT_HANDOFF_PREP 3 /* Slot is ready to be handed off to the destination shard (source side) */ +#define ASM_EVENT_HANDOFF 4 /* Notify that the slot can be handed off (source side) */ +#define ASM_EVENT_TAKEOVER 5 /* Ready to take over the slot, 
waiting for config change (destination side) */ +#define ASM_EVENT_DONE 6 /* Notify that import/migrate is completed, config is updated (source and destination side) */ + +#define ASM_EVENT_IMPORT_PREP 7 /* Import is about to start, the implementation may reject by returning C_ERR */ +#define ASM_EVENT_IMPORT_STARTED 8 /* Import started */ +#define ASM_EVENT_IMPORT_FAILED 9 /* Import failed */ +#define ASM_EVENT_IMPORT_COMPLETED 10 /* Import completed (config updated) */ +#define ASM_EVENT_MIGRATE_PREP 11 /* Migrate is about to start, the implementation may reject by returning C_ERR */ +#define ASM_EVENT_MIGRATE_STARTED 12 /* Migrate started */ +#define ASM_EVENT_MIGRATE_FAILED 13 /* Migrate failed */ +#define ASM_EVENT_MIGRATE_COMPLETED 14 /* Migrate completed (config updated) */ + + +/* Called by cluster implementation to request an ASM operation. (cluster impl --> redis) + * Valid values for 'event': + * ASM_EVENT_IMPORT_START + * ASM_EVENT_CANCEL + * ASM_EVENT_HANDOFF + * ASM_EVENT_DONE + * + * For ASM_EVENT_IMPORT_START, 'task_id' should be a unique string. + * For other events (ASM_EVENT_CANCEL, ASM_EVENT_HANDOFF, ASM_EVENT_DONE), + * 'task_id' should match the ID from the corresponding import operation. + * Usage: + * char *task_id = malloc(CLUSTER_NAMELEN + 1); + * getRandomHexChars(task_id, CLUSTER_NAMELEN); + * task_id[CLUSTER_NAMELEN] = '\0'; + * + * slotRangeArray *slots = slotRangeArrayCreate(1); + * slotRangeArraySet(slots, 0, 0, 1000); + * + * const char *err = NULL; + * int ret = clusterAsmProcess(task_id, ASM_EVENT_IMPORT_START, slots, &err); + * zfree(task_id); + * slotRangeArrayFree(slots); + * + * if (ret != C_OK) { + * perror(err); + * return; + * } + * + * For ASM_EVENT_CANCEL, if `task_id` is NULL, all tasks will be cancelled. + * If `arg` parameter is provided, it should be a pointer to an int. It will be + * set to the number of tasks cancelled. 
+ * + * Return value: + * - Returns C_OK on success, C_ERR on failure and 'err' will be set to the + * error message. + * + * Memory management: + * - There is no ownership transfer of 'task_id', 'err' or `slotRangeArray`. + * - `task_id` and `slotRangeArray` should be allocated and be freed by the + * caller. Redis internally will make a copy of these. + * - `err` is allocated by Redis and should NOT be freed by the caller. + **/ +int clusterAsmProcess(const char *task_id, int event, void *arg, char **err); + +/* Called when an ASM event occurs to notify the cluster implementation. (redis --> cluster impl) + * + * `arg` will point to a `slotRangeArray` for the following events: + * ASM_EVENT_IMPORT_PREP + * ASM_EVENT_IMPORT_STARTED + * ASM_EVENT_MIGRATE_PREP + * ASM_EVENT_MIGRATE_STARTED + * ASM_EVENT_HANDOFF_PREP + * + * Memory management: + * - Redis owns the `task_id` and `slotRangeArray`. + * + * Returns C_OK on success. + * + * If the cluster implementation returns C_ERR for ASM_EVENT_IMPORT_PREP or + * ASM_EVENT_MIGRATE_PREP, operation will not start. + **/ +int clusterAsmOnEvent(const char *task_id, int event, void *arg); + +#endif /* __CLUSTER_H */ diff --git a/examples/redis-unstable/src/cluster_asm.c b/examples/redis-unstable/src/cluster_asm.c new file mode 100644 index 0000000..a090453 --- /dev/null +++ b/examples/redis-unstable/src/cluster_asm.c @@ -0,0 +1,3602 @@ +/* cluster_asm.c -- Atomic slot migration implementation for cluster + * + * Copyright (c) 2025-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ + +#include "server.h" +#include "cluster.h" +#include "functions.h" +#include "cluster_asm.h" +#include "cluster_slot_stats.h" + +#define ASM_IMPORT (1 << 1) +#define ASM_MIGRATE (1 << 2) + +#define ASM_DEBUG_TRIM_DEFAULT 0 +#define ASM_DEBUG_TRIM_NONE 1 +#define ASM_DEBUG_TRIM_BG 2 +#define ASM_DEBUG_TRIM_ACTIVE 3 + +#define ASM_AOF_MIN_ITEMS_PER_KEY 512 /* Minimum number of items per key to use AOF format encoding */ + +typedef struct asmTask { + sds id; /* Task ID */ + int operation; /* Either ASM_IMPORT or ASM_MIGRATE */ + slotRangeArray *slots; /* List of slot ranges for this migration task */ + int state; /* Current state of the task */ + int dest_state; /* Destination node's main state (approximate) */ + char source[CLUSTER_NAMELEN]; /* Source node name */ + char dest[CLUSTER_NAMELEN]; /* Destination node name */ + clusterNode *source_node; /* Source node */ + connection *main_channel_conn; /* Main channel connection */ + connection *rdb_channel_conn; /* RDB channel connection */ + int rdb_channel_state; /* State of the RDB channel */ + unsigned long long dest_offset; /* Destination offset */ + unsigned long long source_offset; /* Source offset */ + int cross_slot_during_propagating; /* If cross-slot commands are encountered during propagating */ + int stream_eof_during_streaming; /* If STREAM-EOF is received during streaming buffer */ + replDataBuf sync_buffer; /* Buffer for the stream */ + client *main_channel_client; /* Client for the main channel on the source side */ + client *rdb_channel_client; /* Client for the RDB channel on the source side */ + long long retry_count; /* Number of retries for this task */ + mstime_t create_time; /* Task creation time */ + mstime_t start_time; /* Task start time */ + mstime_t end_time; /* Task end time */ + mstime_t paused_time; /* The time when the slot writes were paused */ + mstime_t dest_slots_snapshot_time; /* The time when the destination starts applying the slot snapshot */ + mstime_t 
dest_accum_applied_time; /* The time when the destination finishes applying the accumulated buffer */ + sds error; /* Error message for this task */ + redisOpArray *pre_snapshot_module_cmds; /* Module commands to be propagated at the beginning of slot migration */ +} asmTask; + +struct asmManager { + list *tasks; /* List of asmTask to be processed */ + list *archived_tasks; /* List of archived asmTask */ + list *pending_trim_jobs; /* List of pending trim jobs (due to write pause) */ + list *active_trim_jobs; /* List of active trim jobs */ + slotRangeArrayIter *active_trim_it; /* Iterator of the current active trim job */ + size_t sync_buffer_peak; /* Peak size of sync buffer */ + asmTask *master_task; /* The task that is currently active on the master */ + + /* Fail point injection for debugging */ + int debug_fail_channel; /* Channel where the task will fail */ + int debug_fail_state; /* State where the task will fail */ + int debug_trim_method; /* Method to trim the buffer */ + int debug_active_trim_delay; /* Sleep before trimming each key */ + + /* Active trim stats */ + unsigned long long active_trim_started; /* Number of times active trim was started */ + unsigned long long active_trim_completed; /* Number of times active trim was completed */ + unsigned long long active_trim_cancelled; /* Number of times active trim was cancelled */ + unsigned long long active_trim_current_job_keys; /* Total number of keys to trim in the current job */ + unsigned long long active_trim_current_job_trimmed; /* Number of keys trimmed in the current job */ +}; + +enum asmState { + /* Common state */ + ASM_NONE = 0, + ASM_CONNECTING, + ASM_AUTH_REPLY, + ASM_CANCELED, + ASM_FAILED, + ASM_COMPLETED, + + /* Import state */ + ASM_SEND_HANDSHAKE, + ASM_HANDSHAKE_REPLY, + ASM_SEND_SYNCSLOTS, + ASM_SYNCSLOTS_REPLY, + ASM_INIT_RDBCHANNEL, + ASM_ACCUMULATE_BUF, + ASM_READY_TO_STREAM, + ASM_STREAMING_BUF, + ASM_WAIT_STREAM_EOF, + ASM_TAKEOVER, + + /* Migrate state */ + ASM_WAIT_RDBCHANNEL, 
+ ASM_WAIT_BGSAVE_START, + ASM_SEND_BULK_AND_STREAM, + ASM_SEND_STREAM, + ASM_HANDOFF_PREP, + ASM_HANDOFF, + ASM_STREAM_EOF, + + /* RDB channel state */ + ASM_RDBCHANNEL_REQUEST, + ASM_RDBCHANNEL_REPLY, + ASM_RDBCHANNEL_TRANSFER, +}; + +enum asmChannel { + ASM_IMPORT_MAIN_CHANNEL = 1, /* Main channel for the import task */ + ASM_IMPORT_RDB_CHANNEL, /* RDB channel for the import task */ + ASM_MIGRATE_MAIN_CHANNEL, /* Main channel for the migrate task */ + ASM_MIGRATE_RDB_CHANNEL /* RDB channel for the migrate task */ +}; + +/* Global ASM manager */ +struct asmManager *asmManager = NULL; + +/* replication.c */ +char *sendCommand(connection *conn, ...); +char *sendCommandArgv(connection *conn, int argc, char **argv, size_t *argv_lens); +char *receiveSynchronousResponse(connection *conn); +ConnectionType *connTypeOfReplication(void); +int startBgsaveForReplication(int mincapa, int req); +void createReplicationBacklogIfNeeded(void); +/* cluster.c */ +void createDumpPayload(rio *payload, robj *o, robj *key, int dbid, int skip_checksum); +/* cluster_asm.c */ +static void asmStartImportTask(asmTask *task); +static void asmTaskCancel(asmTask *task, const char *reason); +static void asmSyncBufferReadFromConn(connection *conn); +static void propagateTrimSlots(slotRangeArray *slots); +void asmTrimJobSchedule(slotRangeArray *slots); +void asmTrimJobProcessPending(void); +void asmCancelPendingTrimJobs(void); +void asmTriggerActiveTrim(slotRangeArray *slots); +void asmActiveTrimEnd(void); +int asmIsAnyTrimJobOverlaps(slotRangeArray *slots); +void asmTrimSlotsIfNotOwned(slotRangeArray *slots); +void asmNotifyStateChange(asmTask *task, int event); + +void asmInit(void) { + asmManager = zcalloc(sizeof(*asmManager)); + asmManager->tasks = listCreate(); + asmManager->archived_tasks = listCreate(); + asmManager->pending_trim_jobs = listCreate(); + asmManager->sync_buffer_peak = 0; + asmManager->master_task = NULL; + asmManager->debug_fail_channel = -1; + asmManager->debug_fail_state = 
-1; + asmManager->debug_trim_method = ASM_DEBUG_TRIM_DEFAULT; + asmManager->debug_active_trim_delay = 0; + asmManager->active_trim_jobs = listCreate(); + asmManager->active_trim_started = 0; + asmManager->active_trim_completed = 0; + asmManager->active_trim_cancelled = 0; + listSetFreeMethod(asmManager->active_trim_jobs, slotRangeArrayFreeGeneric); +} + +char *asmTaskStateToString(int state) { + switch (state) { + case ASM_NONE: return "none"; + case ASM_CONNECTING: return "connecting"; + case ASM_AUTH_REPLY: return "auth-reply"; + case ASM_CANCELED: return "canceled"; + case ASM_FAILED: return "failed"; + case ASM_COMPLETED: return "completed"; + + /* Import state */ + case ASM_SEND_HANDSHAKE: return "send-handshake"; + case ASM_HANDSHAKE_REPLY: return "handshake-reply"; + case ASM_SEND_SYNCSLOTS: return "send-syncslots"; + case ASM_SYNCSLOTS_REPLY: return "syncslots-reply"; + case ASM_INIT_RDBCHANNEL: return "init-rdbchannel"; + case ASM_ACCUMULATE_BUF: return "accumulate-buffer"; + case ASM_READY_TO_STREAM: return "ready-to-stream"; + case ASM_STREAMING_BUF: return "streaming-buffer"; + case ASM_WAIT_STREAM_EOF: return "wait-stream-eof"; + case ASM_TAKEOVER: return "takeover"; + + /* Migrate state */ + case ASM_WAIT_RDBCHANNEL: return "wait-rdbchannel"; + case ASM_WAIT_BGSAVE_START: return "wait-bgsave-start"; + case ASM_SEND_BULK_AND_STREAM: return "send-bulk-and-stream"; + case ASM_SEND_STREAM: return "send-stream"; + case ASM_HANDOFF_PREP: return "handoff-prep"; + case ASM_HANDOFF: return "handoff"; + case ASM_STREAM_EOF: return "stream-eof"; + + /* RDB channel state */ + case ASM_RDBCHANNEL_REQUEST: return "rdbchannel-request"; + case ASM_RDBCHANNEL_REPLY: return "rdbchannel-reply"; + case ASM_RDBCHANNEL_TRANSFER: return "rdbchannel-transfer"; + + default: return "unknown"; + } + serverAssert(0); /* Unreachable */ +} + +const char *asmChannelToString(int channel) { + switch (channel) { + case ASM_IMPORT_MAIN_CHANNEL: return "import-main-channel"; + case 
ASM_IMPORT_RDB_CHANNEL: return "import-rdb-channel"; + case ASM_MIGRATE_MAIN_CHANNEL: return "migrate-main-channel"; + case ASM_MIGRATE_RDB_CHANNEL: return "migrate-rdb-channel"; + default: return "unknown"; + } +} + +int asmDebugSetFailPoint(char *channel, char *state) { + if (!asmManager) { + serverLog(LL_WARNING, "ASM manager is not initialized"); + return C_ERR; + } + asmManager->debug_fail_channel = -1; + asmManager->debug_fail_state = -1; + if (!channel && !state) return C_ERR; + if (sdslen(channel) == 0 && sdslen(state) == 0) { + serverLog(LL_WARNING, "ASM fail point is cleared"); + return C_OK; + } + + for (int i = ASM_IMPORT_MAIN_CHANNEL; i <= ASM_MIGRATE_RDB_CHANNEL; i++) { + if (!strcasecmp(channel, asmChannelToString(i))) { + asmManager->debug_fail_channel = i; + break; + } + } + if (asmManager->debug_fail_channel == -1) return C_ERR; + + for (int i = ASM_NONE; i <= ASM_RDBCHANNEL_TRANSFER; i++) { + if (!strcasecmp(state, asmTaskStateToString(i))) { + asmManager->debug_fail_state = i; + break; + } + } + if (asmManager->debug_fail_state == -1) return C_ERR; + + serverLog(LL_NOTICE, "ASM fail point set: channel=%s, state=%s", channel, state); + return C_OK; +} + +int asmDebugSetTrimMethod(const char *method, int active_trim_delay) { + if (!asmManager) { + serverLog(LL_WARNING, "ASM manager is not initialized"); + return C_ERR; + } + int prev = asmManager->debug_trim_method; + if (!strcasecmp(method, "default")) asmManager->debug_trim_method = ASM_DEBUG_TRIM_DEFAULT; + else if (!strcasecmp(method, "none")) asmManager->debug_trim_method = ASM_DEBUG_TRIM_NONE; + else if (!strcasecmp(method, "bg")) asmManager->debug_trim_method = ASM_DEBUG_TRIM_BG; + else if (!strcasecmp(method, "active")) asmManager->debug_trim_method = ASM_DEBUG_TRIM_ACTIVE; + else return C_ERR; + + /* If we are switching from none to default, delete all the keys in the + * slots we don't own */ + if (prev == ASM_DEBUG_TRIM_NONE && asmManager->debug_trim_method != ASM_DEBUG_TRIM_NONE) { + 
for (int i = 0; i < CLUSTER_SLOTS; i++) + if (!clusterIsMySlot(i)) + clusterDelKeysInSlot(i, 0); + } + asmManager->debug_active_trim_delay = active_trim_delay; + serverLog(LL_NOTICE, "ASM trim method was set=%s, active_trim_delay=%d", method, active_trim_delay); + return C_OK; +} + +int asmDebugIsFailPointActive(int channel, int state) { + if (!asmManager) return 0; /* ASM manager not initialized */ + if (asmManager->debug_fail_channel == channel && asmManager->debug_fail_state == state) { + serverLog(LL_NOTICE, "ASM fail point active: channel=%s, state=%s", + asmChannelToString(channel), asmTaskStateToString(state)); + return 1; + } + return 0; +} + +sds asmCatInfoString(sds info) { + int active_tasks = 0; + + listIter li; + listNode *ln; + listRewind(asmManager->tasks, &li); + while ((ln = listNext(&li)) != NULL) { + asmTask *task = listNodeValue(ln); + if (task->operation == ASM_IMPORT || + (task->operation == ASM_MIGRATE && task->state != ASM_FAILED)) + { + active_tasks++; + } + } + + return sdscatprintf(info ? 
info : sdsempty(), + "cluster_slot_migration_active_tasks:%d\r\n" + "cluster_slot_migration_active_trim_running:%lu\r\n" + "cluster_slot_migration_active_trim_current_job_keys:%llu\r\n" + "cluster_slot_migration_active_trim_current_job_trimmed:%llu\r\n" + "cluster_slot_migration_stats_active_trim_started:%llu\r\n" + "cluster_slot_migration_stats_active_trim_completed:%llu\r\n" + "cluster_slot_migration_stats_active_trim_cancelled:%llu\r\n", + active_tasks, + listLength(asmManager->active_trim_jobs), + asmManager->active_trim_current_job_keys, + asmManager->active_trim_current_job_trimmed, + asmManager->active_trim_started, + asmManager->active_trim_completed, + asmManager->active_trim_cancelled); +} + +void asmTaskReset(asmTask *task) { + task->state = ASM_NONE; + task->dest_state = ASM_NONE; + task->rdb_channel_state = ASM_NONE; + task->main_channel_conn = NULL; + task->rdb_channel_conn = NULL; + task->dest_offset = 0; + task->source_offset = 0; + task->stream_eof_during_streaming = 0; + task->cross_slot_during_propagating = 0; + replDataBufInit(&task->sync_buffer); + task->main_channel_client = NULL; + task->rdb_channel_client = NULL; + task->paused_time = 0; + task->dest_slots_snapshot_time = 0; + task->dest_accum_applied_time = 0; + task->pre_snapshot_module_cmds = NULL; +} + +asmTask *asmTaskCreate(const char *task_id) { + asmTask *task = zcalloc(sizeof(*task)); + task->error = sdsempty(); + asmTaskReset(task); + task->slots = NULL; + task->source_node = NULL; + task->retry_count = 0; + task->create_time = server.mstime; + task->start_time = -1; + task->end_time = -1; + if (task_id) { + task->id = sdsnew(task_id); + } else { + task->id = sdsnewlen(NULL, CLUSTER_NAMELEN); + getRandomHexChars(task->id, CLUSTER_NAMELEN); + } + + return task; +} + +void asmTaskFree(asmTask *task) { + replDataBufClear(&task->sync_buffer); + sdsfree(task->id); + slotRangeArrayFree(task->slots); + sdsfree(task->error); + zfree(task); +} + +/* Convert the task state to the 
corresponding event. */ +int asmTaskStateToEvent(asmTask *task) { + if (task->operation == ASM_IMPORT) { + if (task->state == ASM_COMPLETED) return ASM_EVENT_IMPORT_COMPLETED; + else if (task->state == ASM_FAILED) return ASM_EVENT_IMPORT_FAILED; + else return ASM_EVENT_IMPORT_STARTED; + } else { + if (task->state == ASM_COMPLETED) return ASM_EVENT_MIGRATE_COMPLETED; + else if (task->state == ASM_FAILED) return ASM_EVENT_MIGRATE_FAILED; + else return ASM_EVENT_MIGRATE_STARTED; + } +} + +/* Serialize ASM task information into a string for transmission to replicas. + * Format: "task_id:source_node:dest_node:operation:state:slot_ranges" + * Where slot_ranges is in the format "1000-2000 3000-4000 ..." */ +sds asmTaskSerialize(asmTask *task) { + sds serialized = sdsempty(); + + /* Add task ID */ + serialized = sdscatprintf(serialized, "%s:", task->id); + + /* Add source node ID (40 chars) */ + serialized = sdscatlen(serialized, task->source, CLUSTER_NAMELEN); + serialized = sdscat(serialized, ":"); + + /* Add destination node ID (40 chars) */ + serialized = sdscatlen(serialized, task->dest, CLUSTER_NAMELEN); + serialized = sdscat(serialized, ":"); + + /* Add operation type */ + serialized = sdscatprintf(serialized, "%s:", task->operation == ASM_IMPORT ? + "import" : "migrate"); + + /* Add current state */ + serialized = sdscatprintf(serialized, "%s:", asmTaskStateToString(task->state)); + + /* Add slot ranges sds */ + sds slots_str = slotRangeArrayToString(task->slots); + serialized = sdscatprintf(serialized, "%s", slots_str); + sdsfree(slots_str); + + return serialized; +} + +/* Deserialize ASM task information from a string and create a complete asmTask. + * Format: "task_id:source_node:dest_node:operation:state:slot_ranges" + * Returns a new asmTask on success, NULL on failure. 
*/ +asmTask *asmTaskDeserialize(sds data) { + int count, idx = 0; + asmTask *task = NULL; + if (!data || sdslen(data) == 0) return NULL; + + sds *parts = sdssplitlen(data, sdslen(data), ":", 1, &count); + if (count < 6) goto err; + + /* Parse task ID */ + if (sdslen(parts[idx]) == 0) goto err; + task = asmTaskCreate(parts[idx]); + if (!task) goto err; + idx++; + + /* Parse source node ID */ + if (sdslen(parts[idx]) != CLUSTER_NAMELEN) goto err; + memcpy(task->source, parts[idx], CLUSTER_NAMELEN); + idx++; + + /* Parse destination node ID */ + if (sdslen(parts[idx]) != CLUSTER_NAMELEN) goto err; + memcpy(task->dest, parts[idx], CLUSTER_NAMELEN); + idx++; + + /* Parse operation type */ + if (!strcasecmp(parts[idx], "import")) { + task->operation = ASM_IMPORT; + } else if (!strcasecmp(parts[idx], "migrate")) { + task->operation = ASM_MIGRATE; + } else { + goto err; + } + idx++; + + /* Parse state */ + task->state = ASM_NONE; /* Default state */ + for (int state = ASM_NONE; state <= ASM_RDBCHANNEL_TRANSFER; state++) { + if (!strcasecmp(parts[idx], asmTaskStateToString(state))) { + task->state = state; + break; + } + } + idx++; + + /* Parse slot ranges */ + task->slots = slotRangeArrayFromString(parts[idx]); + if (!task->slots) goto err; + idx++; + + /* Ignore any extra fields for future compatibility */ + + sdsfreesplitres(parts, count); + return task; + +err: + if (task) asmTaskFree(task); + sdsfreesplitres(parts, count); + return NULL; +} + +/* Notify replicas about ASM task information to maintain consistency during + * slot migration. This function sends a CLUSTER SYNCSLOTS CONF ASM-TASK command + * to all connected replicas with the serialized task information. 
*/ +void asmNotifyReplicasStateChange(struct asmTask *task) { + if (!server.cluster_enabled || !clusterNodeIsMaster(getMyClusterNode())) return; + + /* Create command arguments for CLUSTER SYNCSLOTS CONF ASM-TASK */ + robj *argv[5]; + argv[0] = createStringObject("CLUSTER", 7); + argv[1] = createStringObject("SYNCSLOTS", 9); + argv[2] = createStringObject("CONF", 4); + argv[3] = createStringObject("ASM-TASK", 8); + argv[4] = createObject(OBJ_STRING, asmTaskSerialize(task)); + + /* Send the command to all replicas */ + replicationFeedSlaves(server.slaves, -1, argv, 5); + + /* Clean up command objects */ + for (int i = 0; i < 5; i++) { + decrRefCount(argv[i]); + } +} + +/* Dump the active import ASM task information. */ +sds asmDumpActiveImportTask(void) { + if (!server.cluster_enabled) return NULL; + + /* For replica, dump the master active task. */ + if (clusterNodeIsSlave(getMyClusterNode()) && + asmManager->master_task && + asmManager->master_task->state != ASM_FAILED && + asmManager->master_task->state != ASM_COMPLETED) + { + return asmTaskSerialize(asmManager->master_task); + } + + /* For master, dump the first active task. */ + if (!asmManager || listLength(asmManager->tasks) == 0) return NULL; + asmTask *task = listNodeValue(listFirst(asmManager->tasks)); + if (task->state == ASM_NONE || task->state == ASM_FAILED || + task->state == ASM_COMPLETED) return NULL; + + return asmTaskSerialize(task); +} + +size_t asmGetPeakSyncBufferSize(void) { + if (!asmManager) return 0; + /* Compute peak sync buffer usage. The current task's peak may not + * reflect in asmManager->sync_buffer_peak immediately. */ + size_t peak = asmManager->sync_buffer_peak; + asmTask *task = listFirst(asmManager->tasks) ? 
+ listNodeValue(listFirst(asmManager->tasks)) : NULL; + if (task && task->operation == ASM_IMPORT) + peak = max(task->sync_buffer.peak, asmManager->sync_buffer_peak); + + return peak; +} + +size_t asmGetImportInputBufferSize(void) { + if (!asmManager || listLength(asmManager->tasks) == 0) return 0; + + asmTask *task = listNodeValue(listFirst(asmManager->tasks)); + if (task->operation == ASM_IMPORT) + return task->sync_buffer.mem_used; + + return 0; +} + +size_t asmGetMigrateOutputBufferSize(void) { + if (!asmManager || listLength(asmManager->tasks) == 0) return 0; + + asmTask *task = listNodeValue(listFirst(asmManager->tasks)); + if (task->operation == ASM_MIGRATE && task->main_channel_client) + return getClientOutputBufferMemoryUsage(task->main_channel_client); + + return 0; +} + +/* Returns the ASM task with the given ID, or NULL if no such task exists. */ +static asmTask *asmLookupTaskAt(list *tasks, const char *id) { + listIter li; + listNode *ln; + + listRewind(tasks, &li); + while ((ln = listNext(&li)) != NULL) { + asmTask *task = listNodeValue(ln); + if (!strcmp(task->id, id)) return task; + } + return NULL; +} + +/* Returns the ASM task with the given ID, or NULL if no such task exists. */ +asmTask *asmLookupTaskById(const char *id) { + return asmLookupTaskAt(asmManager->tasks, id); +} + +/* Returns the ASM task that is identical to the given slot range array, or NULL + * if no such task exists. 
*/ +asmTask *asmLookupTaskBySlotRangeArray(slotRangeArray *slots) { + listIter li; + listNode *ln; + + listRewind(asmManager->tasks, &li); + while ((ln = listNext(&li)) != NULL) { + asmTask *task = listNodeValue(ln); + if (slotRangeArrayIsEqual(task->slots, slots)) + return task; + } + return NULL; +} + +/* Returns the slot range array for the given task ID */ +slotRangeArray *asmTaskGetSlotRanges(const char *task_id) { + asmTask *task = NULL; + if (!task_id || (task = asmLookupTaskById(task_id)) == NULL) return NULL; + + return task->slots; +} + +/* Returns 1 if the slot range array overlaps with the given slot range. */ +static int slotRangeArrayOverlaps(slotRangeArray *slots, slotRange *req) { + for (int i = 0; i < slots->num_ranges; i++) { + slotRange *sr = &slots->ranges[i]; + if (sr->start <= req->end && sr->end >= req->start) + return 1; + } + return 0; +} + +/* Returns 1 if the two slot range arrays overlap, 0 otherwise. */ +static int slotRangeArraysOverlap(slotRangeArray *slots1, slotRangeArray *slots2) { + for (int i = 0; i < slots1->num_ranges; i++) { + slotRange *sr1 = &slots1->ranges[i]; + if (slotRangeArrayOverlaps(slots2, sr1)) return 1; + } + return 0; +} + +/* Returns the ASM task that overlaps with the given slot range, or NULL if + * no such task exists. */ +static asmTask *lookupAsmTaskBySlotRange(slotRange *req) { + listIter li; + listNode *ln; + + listRewind(asmManager->tasks, &li); + while ((ln = listNext(&li)) != NULL) { + asmTask *task = listNodeValue(ln); + if (slotRangeArrayOverlaps(task->slots, req)) + return task; + } + return NULL; +} + +/* Validates the given slot ranges for a migration task: + * - Ensures the current node is a master. + * - Verifies all slots are in a STABLE state. + * - Confirms all slots belong to a single source node. + * - Confirms no ongoing import task that overlaps with the slot ranges. + * + * Returns the source node if validation succeeds. + * Otherwise, returns NULL and sets 'err' variable. 
*/ +static clusterNode *validateImportSlotRanges(slotRangeArray *slots, sds *err, asmTask *current) { + clusterNode *source = NULL; + + *err = NULL; + + /* Ensure this is a master node */ + if (!clusterNodeIsMaster(getMyClusterNode())) { + *err = sdsnew("slot migration not allowed on replica."); + goto out; + } + + /* Ensure no manual migration is in progress. */ + for (int i = 0; i < CLUSTER_SLOTS; i++) { + if (getImportingSlotSource(i) != NULL || + getMigratingSlotDest(i) != NULL) + { + *err = sdsnew("all slot states must be STABLE to start a slot migration task."); + goto out; + } + } + + for (int i = 0; i < slots->num_ranges; i++) { + slotRange *sr = &slots->ranges[i]; + + /* Ensure no import task overlaps with this slot range. + * Skip check current task that is running for this slot range. */ + asmTask *task = lookupAsmTaskBySlotRange(sr); + if (task && task != current && task->operation == ASM_IMPORT) { + *err = sdscatprintf(sdsempty(), + "overlapping import exists for slot range: %d-%d", + sr->start, sr->end); + goto out; + } + + /* Validate if we can start migration task for this slot range. */ + for (int j = sr->start; j <= sr->end; j++) { + clusterNode *node = getNodeBySlot(j); + if (node == NULL) { + *err = sdscatprintf(sdsempty(), "slot has no owner: %d", j); + goto out; + } + + if (!source) { + source = node; + } else if (source != node) { + *err = sdsnew("slots belong to different source nodes"); + goto out; + } + } + } + +out: + return *err ? NULL : source; +} + +/* Returns 1 if a task with the specified operation is in progress, 0 otherwise. */ +static int asmTaskInProgress(int operation) { + listIter li; + listNode *ln; + + if (!asmManager || listLength(asmManager->tasks) == 0) return 0; + + listRewind(asmManager->tasks, &li); + while ((ln = listNext(&li)) != NULL) { + asmTask *task = listNodeValue(ln); + if (task->operation == operation) return 1; + } + return 0; +} + +/* Returns 1 if a migrate task is in progress, 0 otherwise. 
*/ +int asmMigrateInProgress(void) { + return asmTaskInProgress(ASM_MIGRATE); +} + +/* Returns 1 if an import task is in progress, 0 otherwise. */ +int asmImportInProgress(void) { + return asmTaskInProgress(ASM_IMPORT); +} + +/* Returns 1 if the task is in a state where it can receive replication stream +* for the slot range, 0 otherwise. */ +inline static int asmCanFeedMigrationClient(asmTask *task) { + return task->operation == ASM_MIGRATE && + !task->cross_slot_during_propagating && + (task->state == ASM_SEND_BULK_AND_STREAM || + task->state == ASM_SEND_STREAM || + task->state == ASM_HANDOFF_PREP); +} + +/* Feed the migration client with the replication stream for the slot range. */ +void asmFeedMigrationClient(robj **argv, int argc) { + asmTask *task = NULL; + + if (server.cluster_enabled == 0 || listLength(asmManager->tasks) == 0) + return; + + /* Check if there is a migrate task that can receive replication stream. */ + task = listNodeValue(listFirst(asmManager->tasks)); + if (!asmCanFeedMigrationClient(task)) return; + + /* Ensure all arguments are converted to string encoding if necessary, + * since getSlotFromCommand expects them to be string-encoded. + * Generally the arguments are string-encoded, but we may rewrite + * the command arguments to integer encoding. */ + for (int i = 0; i < argc; i++) { + if (!sdsEncodedObject(argv[i])) { + serverAssert(argv[i]->encoding == OBJ_ENCODING_INT); + robj *old = argv[i]; + argv[i] = createStringObjectFromLongLongWithSds((long)old->ptr); + decrRefCount(old); + } + } + + /* Check if the command belongs to the slot range. */ + struct redisCommand *cmd = lookupCommand(argv, argc); + serverAssert(cmd); + + int slot = getSlotFromCommand(cmd, argv, argc); + + /* If the command does not have keys, skip it now. + * SELECT is not propagated, since we only support a single db in cluster mode. + * MULTI/EXEC is not needed, since transaction semantics are unnecessary + * before the slot handoff. 
+ * FUNCTION subcommands should be executed on all nodes, so here we skip it, + * and even propagating them may cause an error when executing. + * + * NOTICE: if some keyless commands should be propagated to the destination, + * we should identify them here and send. */ + if (slot == INVALID_CLUSTER_SLOT) return; + + /* Generally we reject cross-slot commands before executing, but module may + * replicate this kind of command, so we check again. To guarantee data + * consistency, we cancel the task if we encounter a cross-slot command. */ + if (slot == CLUSTER_CROSSSLOT) { + /* We cannot cancel the task directly here, since it may lead to a recursive + * call: asmTaskCancel() --> moduleFireServerEvent() --> moduleFreeContext() + * --> postExecutionUnitOperations() --> propagateNow(). Even worse, this + * could result in propagating pending commands to the replication stream twice. + * To avoid this, we simply set a flag here, cancel the task in beforeSleep. */ + task->cross_slot_during_propagating = 1; + return; + } + + /* Check if the slot belongs to the task's slot range. */ + slotRange sr = {slot, slot}; + if (!slotRangeArrayOverlaps(task->slots, &sr)) return; + + if (unlikely(asmDebugIsFailPointActive(ASM_MIGRATE_MAIN_CHANNEL, task->state))) + freeClientAsync(task->main_channel_client); + + /* Feed main channel with the command. */ + client *c = task->main_channel_client; + size_t prev_bytes = getNormalClientPendingReplyBytes(c); + + addReplyArrayLen(c, argc); + for (int i = 0; i < argc; i++) + addReplyBulk(c, argv[i]); + + /* Update the task's source offset to reflect the bytes sent. */ + task->source_offset += (getNormalClientPendingReplyBytes(c) - prev_bytes); +} + +asmTask *asmCreateImportTask(const char *task_id, slotRangeArray *slots, sds *err) { + clusterNode *source; + + *err = NULL; + /* Validate that the slot ranges are valid and that migration can be + * initiated for them. 
*/ + source = validateImportSlotRanges(slots, err, NULL); + if (!source) + goto err; + + if (source == getMyClusterNode()) { + *err = sdsnew("this node is already the owner of the slot range"); + goto err; + } + + /* Only support a single task at a time now. */ + if (listLength(asmManager->tasks) != 0) { + asmTask *current = listNodeValue(listFirst(asmManager->tasks)); + if (current->state == ASM_FAILED) { + /* We can create a new import task only if the current one is failed, + * cancel the failed task to create a new one. */ + asmTaskCancel(current, "new import requested"); + } else { + *err = sdsnew("another ASM task is already in progress"); + goto err; + } + } + /* There should be no task in progress. */ + serverAssert(listLength(asmManager->tasks) == 0); + + /* Create a slot migration task */ + asmTask *task = asmTaskCreate(task_id); + task->slots = slots; + task->state = ASM_NONE; + task->operation = ASM_IMPORT; + task->source_node = source; + memcpy(task->source, clusterNodeGetName(source), CLUSTER_NAMELEN); + memcpy(task->dest, getMyClusterId(), CLUSTER_NAMELEN); + + listAddNodeTail(asmManager->tasks, task); + sds slots_str = slotRangeArrayToString(slots); + serverLog(LL_NOTICE, "Import task %s created: src=%.40s, dest=%.40s, slots=%s", + task->id, task->source, task->dest, slots_str); + sdsfree(slots_str); + + return task; + +err: + slotRangeArrayFree(slots); + return NULL; +} + +/* CLUSTER MIGRATION IMPORT + * + * Sent by operator to the destination node to start the migration. 
*/ +static void clusterMigrationCommandImport(client *c) { + /* Validate slot range arg count */ + int remaining = c->argc - 3; + if (remaining == 0 || remaining % 2 != 0) { + addReplyErrorArity(c); + return; + } + + slotRangeArray *slots = parseSlotRangesOrReply(c, c->argc, 3); + if (!slots) return; + + sds err = NULL; + asmTask *task = asmCreateImportTask(NULL, slots, &err); + if (!task) { + addReplyErrorSds(c, err); + return; + } + + addReplyBulkCString(c, task->id); +} + +/* CLUSTER MIGRATION CANCEL [ID | ALL] + * - Reply: Number of cancelled tasks + * + * Cancels import tasks that overlap with the specified slot ranges. + * Multiple tasks may be cancelled. */ +static void clusterMigrationCommandCancel(client *c) { + sds task_id = NULL; + int num_cancelled = 0; + + /* Validate slot range arg count */ + if (c->argc != 4 && c->argc != 5) { + addReplyErrorArity(c); + return; + } + + if (!strcasecmp(c->argv[3]->ptr, "id")) { + if (c->argc != 5) { + addReplyErrorArity(c); + return; + } + task_id = c->argv[4]->ptr; + } else if (!strcasecmp(c->argv[3]->ptr, "all")) { + if (c->argc != 4) { + addReplyErrorArity(c); + return; + } + } else { + addReplyError(c, "unknown argument"); + return; + } + + num_cancelled = clusterAsmCancel(task_id, "user request"); + addReplyLongLong(c, num_cancelled); +} + +/* Reply with the status of the task. */ +static void replyTaskStatus(client *c, asmTask *task) { + mstime_t p = 0; + + addReplyMapLen(c, 12); + addReplyBulkCString(c, "id"); + addReplyBulkCString(c, task->id); + addReplyBulkCString(c, "slots"); + addReplyBulkSds(c, slotRangeArrayToString(task->slots)); + addReplyBulkCString(c, "source"); + addReplyBulkCBuffer(c, task->source, CLUSTER_NAMELEN); + addReplyBulkCString(c, "dest"); + addReplyBulkCBuffer(c, task->dest, CLUSTER_NAMELEN); + addReplyBulkCString(c, "operation"); + addReplyBulkCString(c, task->operation == ASM_IMPORT ? 
"import" : "migrate"); + addReplyBulkCString(c, "state"); + addReplyBulkCString(c, asmTaskStateToString(task->state)); + addReplyBulkCString(c, "last_error"); + addReplyBulkCBuffer(c, task->error, sdslen(task->error)); + addReplyBulkCString(c, "retries"); + addReplyLongLong(c, task->retry_count); + addReplyBulkCString(c, "create_time"); + addReplyLongLong(c, task->create_time); + addReplyBulkCString(c, "start_time"); + addReplyLongLong(c, task->start_time); + addReplyBulkCString(c, "end_time"); + addReplyLongLong(c, task->end_time); + + if (task->operation == ASM_MIGRATE && task->state == ASM_COMPLETED) + p = task->end_time - task->paused_time; + addReplyBulkCString(c, "write_pause_ms"); + addReplyLongLong(c, p); +} + +/* CLUSTER MIGRATION STATUS [ID | ALL] + * - Reply: Array of atomic slot migration tasks */ +static void clusterMigrationCommandStatus(client *c) { + listIter li; + listNode *ln; + + if (c->argc != 4 && c->argc != 5) { + addReplyErrorArity(c); + return; + } + + if (!strcasecmp(c->argv[3]->ptr, "id")) { + if (c->argc != 5) { + addReplyErrorArity(c); + return; + } + sds id = c->argv[4]->ptr; + asmTask *task = asmLookupTaskAt(asmManager->tasks, id); + if (!task) task = asmLookupTaskAt(asmManager->archived_tasks, id); + if (!task) { + addReplyArrayLen(c, 0); + return; + } + + addReplyArrayLen(c, 1); + replyTaskStatus(c, task); + } else if (!strcasecmp(c->argv[3]->ptr, "all")) { + if (c->argc != 4) { + addReplyErrorArity(c); + return; + } + addReplyArrayLen(c, listLength(asmManager->tasks) + + listLength(asmManager->archived_tasks)); + listRewind(asmManager->tasks, &li); + while ((ln = listNext(&li)) != NULL) + replyTaskStatus(c, listNodeValue(ln)); + + listRewind(asmManager->archived_tasks, &li); + while ((ln = listNext(&li)) != NULL) + replyTaskStatus(c, listNodeValue(ln)); + } else { + addReplyError(c, "unknown argument"); + return; + } +} + +/* CLUSTER MIGRATION + * | + * STATUS [ID | ALL] | + * CANCEL [ID | ALL]> +*/ +void 
clusterMigrationCommand(client *c) { + if (c->argc < 4) { + addReplyErrorArity(c); + return; + } + + if (strcasecmp(c->argv[2]->ptr, "import") == 0) { + clusterMigrationCommandImport(c); + } else if (strcasecmp(c->argv[2]->ptr, "status") == 0) { + clusterMigrationCommandStatus(c); + } else if (strcasecmp(c->argv[2]->ptr, "cancel") == 0) { + clusterMigrationCommandCancel(c); + } else { + addReplyError(c, "unknown argument"); + } +} + +/* Return the number of keys in the specified slot ranges. */ +unsigned long long asmCountKeysInSlots(slotRangeArray *slots) { + if (!slots) return 0; + + unsigned long long key_count = 0; + for (int i = 0; i < slots->num_ranges; i++) { + for (int j = slots->ranges[i].start; j <= slots->ranges[i].end; j++) { + key_count += kvstoreDictSize(server.db[0].keys, j); + } + } + return key_count; +} + +/* Log a human-readable message for ASM task lifecycle events. */ +void asmLogTaskEvent(asmTask *task, int event) { + sds str = slotRangeArrayToString(task->slots); + + switch (event) { + case ASM_EVENT_IMPORT_STARTED: + serverLog(LL_NOTICE, "Import task %s started for slots: %s", task->id, str); + break; + case ASM_EVENT_IMPORT_FAILED: + serverLog(LL_NOTICE, "Import task %s failed for slots: %s", task->id, str); + break; + case ASM_EVENT_TAKEOVER: + serverLog(LL_NOTICE, "Import task %s is ready to takeover slots: %s", task->id, str); + break; + case ASM_EVENT_IMPORT_COMPLETED: + serverLog(LL_NOTICE, "Import task %s completed for slots: %s (imported %llu keys)", + task->id, str, asmCountKeysInSlots(task->slots)); + break; + case ASM_EVENT_MIGRATE_STARTED: + serverLog(LL_NOTICE, "Migrate task %s started for slots: %s (keys at start: %llu)", + task->id, str, asmCountKeysInSlots(task->slots)); + break; + case ASM_EVENT_MIGRATE_FAILED: + serverLog(LL_NOTICE, "Migrate task %s failed for slots: %s", task->id, str); + break; + case ASM_EVENT_HANDOFF_PREP: + serverLog(LL_NOTICE, "Migrate task %s preparing to handoff for slots: %s", task->id, str); + 
break; + case ASM_EVENT_MIGRATE_COMPLETED: + serverLog(LL_NOTICE, "Migrate task %s completed for slots: %s (migrated %llu keys)", + task->id, str, asmCountKeysInSlots(task->slots)); + break; + default: + break; + } + + sdsfree(str); +} + +/* Notify the state change to the module and the cluster implementation. */ +void asmNotifyStateChange(asmTask *task, int event) { + RedisModuleClusterSlotMigrationInfo info = { + .version = REDISMODULE_CLUSTER_SLOT_MIGRATION_INFO_VERSION, + .task_id = task->id, + .slots = (RedisModuleSlotRangeArray *) task->slots + }; + memcpy(info.source_node_id, task->source, CLUSTER_NAMELEN); + memcpy(info.destination_node_id, task->dest, CLUSTER_NAMELEN); + + int module_event = -1; + if (event == ASM_EVENT_IMPORT_STARTED) module_event = REDISMODULE_SUBEVENT_CLUSTER_SLOT_MIGRATION_IMPORT_STARTED; + else if (event == ASM_EVENT_IMPORT_COMPLETED) module_event = REDISMODULE_SUBEVENT_CLUSTER_SLOT_MIGRATION_IMPORT_COMPLETED; + else if (event == ASM_EVENT_IMPORT_FAILED) module_event = REDISMODULE_SUBEVENT_CLUSTER_SLOT_MIGRATION_IMPORT_FAILED; + else if (event == ASM_EVENT_MIGRATE_STARTED) module_event = REDISMODULE_SUBEVENT_CLUSTER_SLOT_MIGRATION_MIGRATE_STARTED; + else if (event == ASM_EVENT_MIGRATE_COMPLETED) module_event = REDISMODULE_SUBEVENT_CLUSTER_SLOT_MIGRATION_MIGRATE_COMPLETED; + else if (event == ASM_EVENT_MIGRATE_FAILED) module_event = REDISMODULE_SUBEVENT_CLUSTER_SLOT_MIGRATION_MIGRATE_FAILED; + serverAssert(module_event != -1); + + moduleFireServerEvent(REDISMODULE_EVENT_CLUSTER_SLOT_MIGRATION, module_event, &info); + serverLog(LL_DEBUG, "Fire cluster asm module event, task %s: state=%s", + task->id, asmTaskStateToString(task->state)); + + if (clusterNodeIsMaster(getMyClusterNode())) { + /* Notify the cluster impl only if it is a real active import task. 
*/ + if (task != asmManager->master_task) { + asmLogTaskEvent(task, event); + clusterAsmOnEvent(task->id, event, task->slots); + } + asmNotifyReplicasStateChange(task); /* Propagate state change to replicas */ + } +} + +void asmImportSetFailed(asmTask *task) { + serverAssert(task->operation == ASM_IMPORT); + if (task->state == ASM_FAILED) return; + + /* If we are in the RDB channel transfer state, we need to + * close the client that was created for the RDB channel. */ + if (task->rdb_channel_conn && task->rdb_channel_state == ASM_RDBCHANNEL_TRANSFER) { + client *c = connGetPrivateData(task->rdb_channel_conn); + serverAssert(c->task == task); + task->rdb_channel_conn = NULL; + c->task = NULL; + c->flags &= ~CLIENT_MASTER; + freeClientAsync(c); + } + + /* If in the wait stream EOF or streaming buffer state, we need to close the + * client that was created for the main channel. */ + if (task->main_channel_conn && + (task->state == ASM_STREAMING_BUF || task->state == ASM_WAIT_STREAM_EOF)) + { + client *c = connGetPrivateData(task->main_channel_conn); + serverAssert(c->task == task); + task->main_channel_conn = NULL; + c->task = NULL; + c->flags &= ~CLIENT_MASTER; + freeClientAsync(c); + } + + /* Close the connections */ + if (task->rdb_channel_conn) connClose(task->rdb_channel_conn); + if (task->main_channel_conn) connClose(task->main_channel_conn); + task->rdb_channel_conn = NULL; + task->main_channel_conn = NULL; + + /* Clear the replication data buffer */ + asmManager->sync_buffer_peak = max(asmManager->sync_buffer_peak, task->sync_buffer.peak); + replDataBufClear(&task->sync_buffer); + + /* Mark the task as failed and notify the cluster */ + task->state = ASM_FAILED; + asmNotifyStateChange(task, ASM_EVENT_IMPORT_FAILED); + /* This node may become replica, only master can setup new slot trimming jobs. 
*/ + if (clusterNodeIsMaster(getMyClusterNode())) + asmTrimJobSchedule(task->slots); +} + +void asmMigrateSetFailed(asmTask *task) { + serverAssert(task->operation == ASM_MIGRATE); + if (task->state == ASM_FAILED) return; + + /* Close the RDB and main channel clients*/ + if (task->rdb_channel_client) { + task->rdb_channel_client->task = NULL; + freeClientAsync(task->rdb_channel_client); + task->rdb_channel_client = NULL; + } + if (task->main_channel_client) { + task->main_channel_client->task = NULL; + freeClientAsync(task->main_channel_client); + task->main_channel_client = NULL; + } + + /* Actually it is not necessary to clear the sync buffer here, + * to make asmTaskReset work properly after migrate task failed */ + replDataBufClear(&task->sync_buffer); + + /* Mark the task as failed and notify the cluster */ + task->state = ASM_FAILED; + asmNotifyStateChange(task, ASM_EVENT_MIGRATE_FAILED); +} + +void asmTaskSetFailed(asmTask *task, const char *fmt, ...) { + va_list ap; + sds error = sdsempty(); + + /* Set the error message */ + va_start(ap, fmt); + error = sdscatvprintf(error, fmt, ap); + va_end(ap); + error = sdscatprintf(error, " (state: %s, rdb_channel_state: %s)", + asmTaskStateToString(task->state), + asmTaskStateToString(task->rdb_channel_state)); + sdsfree(task->error); + task->error = error; + + /* Log the error */ + sds slots_str = slotRangeArrayToString(task->slots); + serverLog(LL_WARNING, "%s task %s failed: slots=%s, err=%s", + task->operation == ASM_IMPORT ? "Import" : "Migrate", + task->id, slots_str, task->error); + sdsfree(slots_str); + + if (task->operation == ASM_IMPORT) + asmImportSetFailed(task); + else + asmMigrateSetFailed(task); +} + +/* The task is completed or canceled. Update stats and move it to + * the archived list. 
*/ +void asmTaskFinalize(asmTask *task) { + listNode *ln = listFirst(asmManager->tasks); + serverAssert(ln->value == task); + + task->source_node = NULL; /* Should never access it */ + task->end_time = server.mstime; + + if (task->operation == ASM_IMPORT) { + asmManager->sync_buffer_peak = max(asmManager->sync_buffer_peak, + task->sync_buffer.peak); + replDataBufClear(&task->sync_buffer); /* Not used, so save memory */ + } + + /* Move the task to the archived list */ + listUnlinkNode(asmManager->tasks, ln); + listLinkNodeHead(asmManager->archived_tasks, ln); +} + +static void asmTaskCancel(asmTask *task, const char *reason) { + if (task->state == ASM_CANCELED) return; + + asmTaskSetFailed(task, "Cancelled due to %s", reason); + task->state = ASM_CANCELED; + asmTaskFinalize(task); +} + +void asmImportTakeover(asmTask *task) { + serverAssert(task->state == ASM_WAIT_STREAM_EOF || + task->state == ASM_STREAMING_BUF); + + /* Free the main channel connection since it is no longer needed. */ + serverAssert(task->main_channel_conn != NULL); + client *c = connGetPrivateData(task->main_channel_conn); + c->task = NULL; + c->flags &= ~CLIENT_MASTER; + freeClientAsync(c); + task->main_channel_conn = NULL; + + task->state = ASM_TAKEOVER; + asmLogTaskEvent(task, ASM_EVENT_TAKEOVER); + clusterAsmOnEvent(task->id, ASM_EVENT_TAKEOVER, task->slots); +} + +void asmCallbackOnFreeClient(client *c) { + asmTask *task = c->task; + if (!task) return; + + /* If the RDB channel connection is closed, mark the task as failed. 
*/ + if (c->conn && task->rdb_channel_conn == c->conn) { + /* We create the client only when transferring data on the RDB channel */ + serverAssert(task->rdb_channel_state == ASM_RDBCHANNEL_TRANSFER); + task->rdb_channel_conn = NULL; /* Will be freed by freeClient */ + c->flags &= ~CLIENT_MASTER; + asmTaskSetFailed(task, "RDB channel - Connection is closed"); + return; + } + + if (c->conn && task->main_channel_conn == c->conn) { + /* After or in the process of streaming buffer to DB, a client will be + * created based on the main channel connection. */ + serverAssert(task->state == ASM_STREAMING_BUF || + task->state == ASM_WAIT_STREAM_EOF); + task->main_channel_conn = NULL; /* Will be freed by freeClient */ + c->flags &= ~CLIENT_MASTER; + asmTaskSetFailed(task, "Main channel - Connection is closed"); + return; + } + + if (c == task->rdb_channel_client) { + /* TODO: Detect whether the bgsave is completed successfully and + * update the state properly. */ + task->rdb_channel_state = ASM_COMPLETED; + /* We may not have detected whether the child process has exited yet, + * so we can't determine whether the client has completed the slots + * snapshot transfer. If the RDB channel is interrupted unexpectedly, + * the destination side will also close the main channel. + * So here we just reset the RDB channel client of task. */ + task->rdb_channel_client = NULL; + return; + } + + /* If the main channel client is closed, we need to mark the task as failed + * and clean up the RDB channel client if it exists. */ + if (c == task->main_channel_client) { + task->main_channel_client = NULL; + /* The rdb channel client will be cleaned up */ + asmTaskSetFailed(task, "Main and RDB channel clients are disconnected."); + return; + } +} + +/* Sends an AUTH command to the source node using the internal secret. + * Returns an error string if the command fails, or NULL on success. 
*/ +char *asmSendInternalAuth(connection *conn) { + size_t len = 0; + const char *internal_secret = clusterGetSecret(&len); + serverAssert(internal_secret != NULL); + + sds secret = sdsnewlen(internal_secret, len); + char *err = sendCommand(conn, "AUTH", "internal connection", secret, NULL); + sdsfree(secret); + return err; +} + +/* Handles the RDB channel sync with the source node. + * This function is called when the RDB channel is established + * and ready to sync with the source node. */ +void asmRdbChannelSyncWithSource(connection *conn) { + asmTask *task = connGetPrivateData(conn); + char *err = NULL; + sds task_error_msg = NULL; + + /* Check for errors in the socket: after a non blocking connect() we + * may find that the socket is in error state. */ + if (connGetState(conn) != CONN_STATE_CONNECTED) + goto error; + + /* Check if the task is in a fail point state */ + if (unlikely(asmDebugIsFailPointActive(ASM_IMPORT_RDB_CHANNEL, task->rdb_channel_state))) { + char buf[1]; + /* Simulate a failure by shutting down the connection. On some operating + * systems (e.g. Linux), the socket's receive buffer is not flushed + * immediately, so we issue a dummy read to drain any pending data and + * surface the error condition. + * using shutdown() instead of connShutdown() because connTLSShutdown() + * will free the connection directly, which is not what we want. 
*/ + shutdown(conn->fd, SHUT_RDWR); + connRead(conn, buf, 1); + } + + if (task->rdb_channel_state == ASM_CONNECTING) { + connSetReadHandler(conn, asmRdbChannelSyncWithSource); + connSetWriteHandler(conn, NULL); + + /* Send AUTH command to source node using internal auth */ + err = asmSendInternalAuth(conn); + if (err) goto write_error; + task->rdb_channel_state = ASM_AUTH_REPLY; + return; + } + + if (task->rdb_channel_state == ASM_AUTH_REPLY) { + err = receiveSynchronousResponse(conn); + /* The source node did not reply */ + if (err == NULL) goto no_response_error; + + /* Check `+OK` reply */ + if (!strcmp(err, "+OK")) { + sdsfree(err); + err = NULL; + task->rdb_channel_state = ASM_RDBCHANNEL_REQUEST; + serverLog(LL_NOTICE, "Source node replied to AUTH command, syncslots rdb channel operation can continue..."); + } else { + task_error_msg = sdscatprintf(sdsempty(), + "Error reply to AUTH from source: %s", err); + sdsfree(err); + goto error; + } + } + + if (task->rdb_channel_state == ASM_RDBCHANNEL_REQUEST) { + err = sendCommand(conn, "CLUSTER", "SYNCSLOTS", "RDBCHANNEL", task->id, NULL); + if (err) goto write_error; + task->rdb_channel_state = ASM_RDBCHANNEL_REPLY; + return; + } + + if (task->rdb_channel_state == ASM_RDBCHANNEL_REPLY) { + err = receiveSynchronousResponse(conn); + /* The source node did not reply */ + if (err == NULL) goto no_response_error; + + /* Ignore ‘\n' sent from the source node to keep the connection alive. */ + if (sdslen(err) == 0) { + serverLog(LL_DEBUG, "Received an empty line in RDBCHANNEL reply, slots snapshot delivery will start later"); + sdsfree(err); + return; + } + + /* Check `+SLOTSSNAPSHOT` reply */ + if (!strncmp(err, "+SLOTSSNAPSHOT", strlen("+SLOTSSNAPSHOT"))) { + sdsfree(err); + err = NULL; + task->state = ASM_ACCUMULATE_BUF; + /* The main channel buffers pending commands. 
*/ + connSetReadHandler(task->main_channel_conn, asmSyncBufferReadFromConn); + + task->rdb_channel_state = ASM_RDBCHANNEL_TRANSFER; + client *c = createClient(conn); + c->flags |= (CLIENT_MASTER | CLIENT_INTERNAL | CLIENT_ASM_IMPORTING); + c->querybuf = sdsempty(); + c->authenticated = 1; + c->user = NULL; + c->task = task; + serverLog(LL_NOTICE, + "Source node replied to SLOTSSNAPSHOT, syncing slots snapshot can continue..."); + } else { + task_error_msg = sdscatprintf(sdsempty(), + "Error reply to CLUSTER SYNCSLOTS RDBCHANNEL from the source: %s", err); + sdsfree(err); + goto error; + } + return; + } + return; + +no_response_error: + task_error_msg = sdsnew("Source node did not respond to command during RDBCHANNELSYNCSLOTS handshake"); + /* Fall through to regular error handling */ + +error: + asmTaskSetFailed(task, "RDB channel - Failed to sync with the source node: %s", + task_error_msg ? task_error_msg : connGetLastError(conn)); + sdsfree(task_error_msg); + return; + +write_error: /* Handle sendCommand() errors. 
*/ + task_error_msg = sdscatprintf(sdsempty(), "Failed to send command to the source node: %s", err); + sdsfree(err); + goto error; +} + +char *asmSendSlotRangesSync(connection *conn, asmTask *task) { + /* Prepare CLUSTER SYNCSLOTS SYNC command */ + serverAssert(task->slots->num_ranges <= CLUSTER_SLOTS); + int argc = task->slots->num_ranges * 2 + 4; + char **args = zcalloc(sizeof(char*) * argc); + size_t *lens = zcalloc(sizeof(size_t) * argc); + + args[0] = "CLUSTER"; + args[1] = "SYNCSLOTS"; + args[2] = "SYNC"; + args[3] = task->id; + lens[0] = strlen("CLUSTER"); + lens[1] = strlen("SYNCSLOTS"); + lens[2] = strlen("SYNC"); + lens[3] = sdslen(task->id); + + int i = 4; + for (int j = 0; j < task->slots->num_ranges; j++) { + slotRange *sr = &task->slots->ranges[j]; + args[i] = sdscatprintf(sdsempty(), "%d", sr->start); + lens[i] = sdslen(args[i]); + args[i+1] = sdscatprintf(sdsempty(), "%d", sr->end); + lens[i+1] = sdslen(args[i+1]); + i += 2; + } + serverAssert(i == argc); + + /* Send command to source node */ + char *err = sendCommandArgv(conn, argc, args, lens); + + /* Free allocated memory */ + for (int j = 4; j < argc; j++) { + sdsfree(args[j]); + } + zfree(args); + zfree(lens); + + return err; +} + +void asmSyncWithSource(connection *conn) { + asmTask *task = connGetPrivateData(conn); + char *err = NULL; + + /* Some task errors are not network issues, we record them explicitly. */ + sds task_error_msg = NULL; + + /* Check for errors in the socket: after a non blocking connect() we + * may find that the socket is in error state. 
*/ + if (connGetState(conn) != CONN_STATE_CONNECTED) + goto error; + + /* Check if the fail point is active for this channel and state */ + if (unlikely(asmDebugIsFailPointActive(ASM_IMPORT_MAIN_CHANNEL, task->state))) { + char buf[1]; + shutdown(conn->fd, SHUT_RDWR); + connRead(conn, buf, 1); + } + + if (task->state == ASM_CONNECTING) { + connSetReadHandler(conn, asmSyncWithSource); + connSetWriteHandler(conn, NULL); + /* Send AUTH command to source node using internal auth */ + err = asmSendInternalAuth(conn); + if (err) goto write_error; + task->state = ASM_AUTH_REPLY; + return; + } + + if (task->state == ASM_AUTH_REPLY) { + err = receiveSynchronousResponse(conn); + /* The source node did not reply */ + if (err == NULL) goto no_response_error; + + /* Check `+OK` reply */ + if (!strcmp(err, "+OK")) { + sdsfree(err); + err = NULL; + task->state = ASM_SEND_HANDSHAKE; + serverLog(LL_NOTICE, "Source node replied to AUTH command, syncslots can continue..."); + } else { + task_error_msg = sdscatprintf(sdsempty(), + "Error reply to AUTH from the source: %s", err); + sdsfree(err); + goto error; + } + } + + if (task->state == ASM_SEND_HANDSHAKE) { + sds node_id = sdsnewlen(clusterNodeGetName(getMyClusterNode()), CLUSTER_NAMELEN); + err = sendCommand(conn, "CLUSTER", "SYNCSLOTS", "CONF", "NODE-ID", node_id, NULL); + sdsfree(node_id); + if (err) goto write_error; + task->state = ASM_HANDSHAKE_REPLY; + return; + } + + if (task->state == ASM_HANDSHAKE_REPLY) { + err = receiveSynchronousResponse(conn); + /* The source node did not reply */ + if (err == NULL) goto no_response_error; + + /* Check `+OK` reply */ + if (!strcmp(err, "+OK")) { + sdsfree(err); + err = NULL; + task->state = ASM_SEND_SYNCSLOTS; + serverLog(LL_NOTICE, "Source node replied to SYNCSLOTS CONF command, syncslots can continue..."); + } else { + task_error_msg = sdscatprintf(sdsempty(), + "Error reply to CLUSTER SYNCSLOTS CONF from the source: %s", err); + sdsfree(err); + goto error; + } + } + + if 
(task->state == ASM_SEND_SYNCSLOTS) { + err = asmSendSlotRangesSync(conn, task); + if (err) goto write_error; + + task->state = ASM_SYNCSLOTS_REPLY; + return; + } + + if (task->state == ASM_SYNCSLOTS_REPLY) { + err = receiveSynchronousResponse(conn); + /* The source node did not reply */ + if (err == NULL) goto no_response_error; + + /* Check `+RDBCHANNELSYNCSLOTS` reply */ + if (!strncmp(err, "+RDBCHANNELSYNCSLOTS", strlen("+RDBCHANNELSYNCSLOTS"))) { + sdsfree(err); + err = NULL; + task->state = ASM_INIT_RDBCHANNEL; + serverLog(LL_NOTICE, + "Source node replied to SYNCSLOTS SYNC, syncslots can continue..."); + } else if (!strncmp(err, "-NOTREADY", strlen("-NOTREADY"))) { + /* The source-side cluster is temporarily not ready to start a + * migration and replied -NOTREADY. We could fail this attempt and + * let the import task start another attempt later but that could + * trigger unnecessary cleanup in the cluster implementation. + * Instead, we'll retry sending SYNCSLOTS later in asmCron(). */ + sdsfree(err); + task->state = ASM_SEND_SYNCSLOTS; + serverLog(LL_NOTICE, + "Source node replied to SYNCSLOTS SYNC with -NOTREADY, will retry later..."); + return; + } else { + task_error_msg = sdscatprintf(sdsempty(), + "Error reply to CLUSTER SYNCSLOTS SYNC from the source: %s", err); + sdsfree(err); + goto error; + } + } + + if (task->state == ASM_INIT_RDBCHANNEL) { + /* Create RDB channel connection */ + char *ip = clusterNodeIp(task->source_node); + int port = server.tls_replication ? 
clusterNodeTlsPort(task->source_node) : + clusterNodeTcpPort(task->source_node); + task->rdb_channel_conn = connCreate(server.el, connTypeOfReplication()); + if (connConnect(task->rdb_channel_conn, ip, port, + server.bind_source_addr, asmRdbChannelSyncWithSource) == C_ERR) + { + serverLog(LL_WARNING, "Unable to connect to the source node: %s", + connGetLastError(task->rdb_channel_conn)); + goto error; + } + task->rdb_channel_state = ASM_CONNECTING; + connSetPrivateData(task->rdb_channel_conn, task); + serverLog(LL_NOTICE, + "RDB channel connection to source node %.40s established, waiting for AUTH reply...", + task->source); + + /* Main channel waits for the new event */ + connSetReadHandler(conn, NULL); + return; + } + return; + +no_response_error: + serverLog(LL_WARNING, "Source node did not respond to command during SYNCSLOTS handshake"); + /* Fall through to regular error handling */ + +error: + asmTaskSetFailed(task, "Main channel - Failed to sync with source node: %s", + task_error_msg ? task_error_msg : connGetLastError(conn)); + sdsfree(task_error_msg); + return; + +write_error: /* Handle sendCommand() errors. */ + serverLog(LL_WARNING, "Failed to send command to source node: %s", err); + sdsfree(err); + goto error; +} + +int asmImportSendACK(asmTask *task) { + serverAssert(task->operation == ASM_IMPORT && task->state == ASM_WAIT_STREAM_EOF); + serverLog(LL_DEBUG, "Destination node applied offset is %lld", task->dest_offset); + + char offset[64]; + ull2string(offset, sizeof(offset), task->dest_offset); + + char *err = sendCommand(task->main_channel_conn, "CLUSTER", "SYNCSLOTS", "ACK", + asmTaskStateToString(task->state), offset, NULL); + if (err) { + asmTaskSetFailed(task, "Main channel - Failed to send ACK: %s", err); + sdsfree(err); + return C_ERR; + } + return C_OK; +} + +/* Called when the RDB channel begins sending the snapshot. + * From this point on, the main channel also starts sending incremental streams. 
 */
void asmSlotSnapshotAndStreamStart(struct asmTask *task) {
    /* Only valid while the source-side task is waiting for BGSAVE to start. */
    if (task == NULL || task->state != ASM_WAIT_BGSAVE_START) return;

    /* Debug fail point: abruptly shut down the RDB channel socket to simulate
     * a network failure at this stage of the migration. */
    if (unlikely(asmDebugIsFailPointActive(ASM_MIGRATE_RDB_CHANNEL, task->state))) {
        shutdown(task->rdb_channel_client->conn->fd, SHUT_RDWR);
        return;
    }
    /* The main channel starts delivering the incremental stream while the RDB
     * channel carries the bulk slots snapshot. */
    task->main_channel_client->replstate = SLAVE_STATE_SEND_BULK_AND_STREAM;

    task->state = ASM_SEND_BULK_AND_STREAM;
    task->rdb_channel_state = ASM_RDBCHANNEL_TRANSFER;

    /* From the source node's perspective, the destination node begins to accumulate
     * the buffer while the RDB channel starts applying the slot snapshot data. */
    task->dest_state = ASM_ACCUMULATE_BUF;
    task->dest_slots_snapshot_time = server.mstime;
}

/* Called when the RDB channel has succeeded in sending the snapshot. */
void asmSlotSnapshotSucceed(struct asmTask *task) {
    if (task == NULL || task->state != ASM_SEND_BULK_AND_STREAM) return;

    /* The destination starts sending ACKs to keep the main channel alive after
     * receiving the snapshot, so here we need to update the last interaction
     * time to avoid false timeout. */
    task->main_channel_client->lastinteraction = server.unixtime;

    /* Snapshot done: only the incremental stream remains on the main channel. */
    task->state = ASM_SEND_STREAM;
    task->rdb_channel_state = ASM_COMPLETED;
}

/* Called when the RDB channel fails to send the snapshot.
 * Marks the whole task failed; only meaningful in ASM_SEND_BULK_AND_STREAM. */
void asmSlotSnapshotFailed(struct asmTask *task) {
    if (task == NULL || task->state != ASM_SEND_BULK_AND_STREAM) return;

    asmTaskSetFailed(task, "RDB channel - Failed to send slots snapshot");
}

/* CLUSTER SYNCSLOTS SNAPSHOT-EOF
 *
 * This command is sent by the source node to the destination node to indicate
 * that the slots snapshot has ended. */
void clusterSyncSlotsSnapshotEOF(client *c) {
    /* This client is RDB channel connection.
     */
    asmTask *task = c->task;
    /* SNAPSHOT-EOF must arrive on the task's RDB channel connection while the
     * snapshot transfer is in progress; anything else is a protocol violation
     * and we drop the client. */
    if (!task || task->rdb_channel_state != ASM_RDBCHANNEL_TRANSFER ||
        c->conn != task->rdb_channel_conn)
    {
        /* Unexpected SNAPSHOT-EOF command */
        serverLog(LL_WARNING, "Unexpected CLUSTER SYNCSLOTS SNAPSHOT-EOF command: "
                  "rdb_channel_state=%s",
                  asmTaskStateToString(task ? task->rdb_channel_state : ASM_NONE));
        freeClientAsync(c);
        return;
    }

    /* RDB channel state: ASM_RDBCHANNEL_TRANSFER */
    if (unlikely(asmDebugIsFailPointActive(ASM_IMPORT_RDB_CHANNEL, task->rdb_channel_state))) {
        freeClientAsync(c); /* Simulate a failure */
        return;
    }

    /* Clear the RDB channel connection */
    task->rdb_channel_conn = NULL;
    task->rdb_channel_state = ASM_COMPLETED;
    serverLog(LL_NOTICE, "RDB channel snapshot transfer completed for the import task.");

    /* Free the RDB channel connection. Detach it from the task first; the
     * CLIENT_MASTER flag is cleared before freeing — NOTE(review): presumably
     * so the free path does not treat this as losing the master link, confirm. */
    c->task = NULL;
    c->flags &= ~CLIENT_MASTER;
    freeClientAsync(c);

    /* Will start streaming the buffer to DB, don't start here since now
     * we are in the context of executing command, otherwise, redis will
     * generate a big MULTI-EXEC including all the commands in the buffer.
     * just update the state here, and do it in beforeSleep(). */
    task->state = ASM_READY_TO_STREAM;
    connSetReadHandler(task->main_channel_conn, NULL);
}

/* CLUSTER SYNCSLOTS STREAM-EOF
 *
 * This command is sent by the source node to the destination node to indicate
 * that the slot sync stream has ended and the slots can be handed off. */
void clusterSyncSlotsStreamEOF(client *c) {
    asmTask *task = c->task;

    /* STREAM-EOF only makes sense for a client bound to an import task. */
    if (!task || task->operation != ASM_IMPORT) {
        serverLog(LL_WARNING, "Unexpected CLUSTER SYNCSLOTS STREAM-EOF command");
        freeClientAsync(c);
        return;
    }

    if (task->state == ASM_STREAMING_BUF) {
        /* We are still streaming the buffer to DB, mark the EOF received, and we
         * can take over after streaming is EOF. Since we may release the context
         * in asmImportTakeover, this breaks the context for streaming buffer.
         */
        task->stream_eof_during_streaming = 1;
        serverLog(LL_NOTICE, "CLUSTER SYNCSLOTS STREAM-EOF received during streaming buffer");
        return;
    }

    /* Outside of buffer streaming, the only state in which STREAM-EOF is
     * legal is ASM_WAIT_STREAM_EOF; anything else drops the client. */
    if (task->state != ASM_WAIT_STREAM_EOF) {
        serverLog(LL_WARNING, "Unexpected CLUSTER SYNCSLOTS STREAM-EOF state: %s",
                  asmTaskStateToString(task->state));
        freeClientAsync(c);
        return;
    }
    serverLog(LL_NOTICE, "CLUSTER SYNCSLOTS STREAM-EOF received when waiting for STREAM-EOF");

    /* STREAM-EOF received, the source is ready to handoff, takeover now. */
    asmImportTakeover(task);
}

/* Start the import task.
 * No-op unless the task is an import task that has not started yet
 * (state == ASM_NONE). Presumably invoked repeatedly (e.g. from a cron) until
 * the preconditions below hold — the static log flag suppresses duplicate
 * warnings across retries; TODO confirm against the caller. */
static void asmStartImportTask(asmTask *task) {
    if (task->operation != ASM_IMPORT || task->state != ASM_NONE) return;
    sds slots_str = slotRangeArrayToString(task->slots);

    /* Sanity check: Clean up any keys that exist in slots not owned by this node.
     * This handles cases where users previously migrated slots using legacy method
     * but left behind orphaned keys, or maybe cluster missed cleaning up during
     * previous operations, which could interfere with the ASM import process. */
    asmTrimSlotsIfNotOwned(task->slots);

    /* Check if there is any trim job in progress for the slot ranges.
     * We can't start the import task since the trim job will modify the data.*/
    int trim_in_progress = asmIsAnyTrimJobOverlaps(task->slots);

    /* Notify the cluster implementation to prepare for the import task. */
    int impl_ret = clusterAsmOnEvent(task->id, ASM_EVENT_IMPORT_PREP, task->slots);

    /* We do not start the import task if trim is disabled by module. */
    int disabled_by_module = server.cluster_module_trim_disablers > 0;

    /* Log the "can not start" warning only once per blocked period. */
    static int start_blocked_logged = 0;
    /* Cannot start import task since pause action is performed. Otherwise, we
     * will break the promise that no writes are performed during the pause.
     */
    if (isPausedActions(PAUSE_ACTION_CLIENT_ALL) ||
        isPausedActions(PAUSE_ACTION_CLIENT_WRITE) ||
        trim_in_progress ||
        impl_ret != C_OK ||
        disabled_by_module)
    {
        /* Pick the most specific blocking reason for the log message. */
        const char *reason = disabled_by_module ? "trim is disabled by module" :
                             impl_ret != C_OK ? "cluster is not ready" :
                             trim_in_progress ? "trim in progress for some of the slots" :
                             "server paused";
        if (start_blocked_logged == 0) {
            serverLog(LL_WARNING, "Can not start import task %s for slots: %s due to %s",
                      task->id, slots_str, reason);
            start_blocked_logged = 1;
        }
        sdsfree(slots_str);
        return;
    }
    start_blocked_logged = 0; /* Reset the log flag */

    /* Detect if the cluster topology is changed. We should cancel the task if
     * we can not schedule it, and update the source node if needed. */
    sds err = NULL;
    clusterNode *source = validateImportSlotRanges(task->slots, &err, task);
    if (!source) {
        /* Slots can no longer be imported as requested: cancel with the
         * validation error as the reason. */
        asmTaskCancel(task, err);
        sdsfree(slots_str);
        sdsfree(err);
        return;
    }
    /* Now I'm the owner of the slot range, cancel the import task. */
    if (source == getMyClusterNode()) {
        asmTaskCancel(task, "slots owned by myself now");
        sdsfree(slots_str);
        return;
    }
    /* Change the source node if needed (topology changed since the task was
     * created). */
    if (source != task->source_node) {
        task->source_node = source;
        memcpy(task->source, clusterNodeGetName(source), CLUSTER_NAMELEN);
        serverLog(LL_NOTICE, "Import task %s source node changed: slots=%s, "
                  "new_source=%.40s", task->id, slots_str, clusterNodeGetName(source));
    }
    sdsfree(slots_str);

    /* All preconditions hold: begin connecting the main channel to the source. */
    task->state = ASM_CONNECTING;
    task->start_time = server.mstime;
    asmNotifyStateChange(task, ASM_EVENT_IMPORT_STARTED);

    task->main_channel_conn = connCreate(server.el, connTypeOfReplication());
    char *ip = clusterNodeIp(task->source_node);
    /* Use the TLS port when replication runs over TLS. */
    int port = server.tls_replication ?
clusterNodeTlsPort(task->source_node) : + clusterNodeTcpPort(task->source_node); + if (connConnect(task->main_channel_conn, ip, port, server.bind_source_addr, + asmSyncWithSource) == C_ERR) + { + asmTaskSetFailed(task, "Main channel - Failed to connect to source node: %s", + connGetLastError(task->main_channel_conn)); + return; + } + connSetPrivateData(task->main_channel_conn, task); +} + +void clusterSyncSlotsCommand(client *c) { + /* Only internal clients are allowed to execute this command to avoid + * potential attack, since some state changes are not well protected, + * external clients may damage the slot migration state. */ + if (!(c->flags & (CLIENT_INTERNAL | CLIENT_MASTER))) { + addReplyError(c, "CLUSTER SYNCSLOTS subcommands are only allowed for internal clients"); + c->flags |= CLIENT_CLOSE_AFTER_REPLY; + return; + } + + /* On replica, only allow master client to execute CONF subcommand. */ + if (!clusterNodeIsMaster(getMyClusterNode())) { + if (!(c->flags & CLIENT_MASTER)) { + /* Not master client, reject all subcommands and close the connection. */ + addReplyError(c, "CLUSTER SYNCSLOTS subcommands are only allowed for master"); + c->flags |= CLIENT_CLOSE_AFTER_REPLY; + return; + } else { + /* Only allow CONF subcommand on replica. */ + if (strcasecmp(c->argv[2]->ptr, "conf")) return; + } + } + + if (!strcasecmp(c->argv[2]->ptr, "sync") && c->argc >= 6) { + /* CLUSTER SYNCSLOTS SYNC [ ] */ + if (c->argc % 2 == 1) { + addReplyErrorArity(c); + return; + } + + slotRangeArray *slots = parseSlotRangesOrReply(c, c->argc, 4); + if (!slots) return; + + /* Validate that the slot ranges are valid and that migration can be + * initiated for them. */ + sds err = NULL; + clusterNode *source = validateImportSlotRanges(slots, &err, NULL); + if (!source) { + addReplyErrorSds(c, err); + slotRangeArrayFree(slots); + return; + } + + /* Check if the source node is the same as the current node. 
*/ + if (source != getMyClusterNode()) { + addReplyError(c, "This node is not the owner of the slots"); + slotRangeArrayFree(slots); + return; + } + + /* Verify the destination node is known and is a master. */ + if (c->node_id) { + clusterNode *dest = clusterLookupNode(c->node_id, CLUSTER_NAMELEN); + if (dest == NULL || !clusterNodeIsMaster(dest)) { + addReplyErrorFormat(c, "Destination node %.40s is not a master", c->node_id); + slotRangeArrayFree(slots); + return; + } + } + + sds task_id = c->argv[3]->ptr; + /* Notify the cluster implementation to prepare for the migrate task. */ + if (clusterAsmOnEvent(task_id, ASM_EVENT_MIGRATE_PREP, slots) != C_OK || + asmDebugIsFailPointActive(ASM_MIGRATE_MAIN_CHANNEL, ASM_NONE)) + { + addReplyError(c, "-NOTREADY Cluster is not ready to migrate slots"); + slotRangeArrayFree(slots); + return; + } + + /* We do not start the migrate task if trim is disabled by module. */ + int disabled_by_module = server.cluster_module_trim_disablers > 0; + if (disabled_by_module) { + addReplyError(c, "Trim is disabled by module"); + slotRangeArrayFree(slots); + return; + } + + asmTask *task = listLength(asmManager->tasks) == 0 ? NULL : + listNodeValue(listFirst(asmManager->tasks)); + if (task && !strcmp(task->id, task_id) && + task->operation == ASM_MIGRATE && task->state == ASM_FAILED && + slotRangeArrayIsEqual(slots, task->slots) && + memcmp(task->dest, c->node_id, CLUSTER_NAMELEN) == 0) + { + /* Reuse the failed task */ + asmTaskReset(task); + slotRangeArrayFree(task->slots); /* Will be set again later */ + task->retry_count++; + } else if (task) { + if (task->state == ASM_FAILED) { + /* We can create a new migrate task only if the current one is + * failed, cancel the failed task to create a new one. 
*/ + asmTaskCancel(task, "new migration requested"); + task = NULL; + } else { + addReplyError(c, "Another ASM task is already in progress"); + slotRangeArrayFree(slots); + return; + } + } + + /* Create the migrate slots task and add it to the list, + * otherwise reuse the existing one */ + if (task == NULL) { + task = asmTaskCreate(task_id); + task->start_time = server.mstime; /* Start immediately */ + serverAssert(listLength(asmManager->tasks) == 0); + listAddNodeTail(asmManager->tasks, task); + } + + task->slots = slots; + task->operation = ASM_MIGRATE; + memcpy(task->source, clusterNodeGetName(getMyClusterNode()), CLUSTER_NAMELEN); + if (c->node_id) memcpy(task->dest, c->node_id, CLUSTER_NAMELEN); + + task->main_channel_client = c; + c->task = task; + + /* We mark the main channel client as a replica, so this client is limited + * by the client output buffer settings for replicas. The replstate has + * no real significance, just to prevent it from going online. */ + c->flags |= (CLIENT_SLAVE | CLIENT_ASM_MIGRATING); + c->replstate = SLAVE_STATE_WAIT_RDB_CHANNEL; + if (server.repl_disable_tcp_nodelay) + connDisableTcpNoDelay(c->conn); /* Non-critical if it fails. */ + listAddNodeTail(server.slaves, c); + createReplicationBacklogIfNeeded(); + + /* Wait for RDB channel to be ready */ + task->state = ASM_WAIT_RDBCHANNEL; + + sds slots_str = slotRangeArrayToString(slots); + serverLog(LL_NOTICE, "Migrate task %s created: src=%.40s, dest=%.40s, slots=%s", + task->id, task->source, task->dest, slots_str); + sdsfree(slots_str); + + asmNotifyStateChange(task, ASM_EVENT_MIGRATE_STARTED); + + /* Keep the client in the main thread to avoid data races between the + * connWrite call below and the client's event handler in IO threads. */ + if (c->tid != IOTHREAD_MAIN_THREAD_ID) keepClientInMainThread(c); + + /* addReply*() is not suitable for clients in SLAVE_STATE_WAIT_RDB_CHANNEL state. 
*/ + if (connWrite(c->conn, "+RDBCHANNELSYNCSLOTS\r\n", 22) != 22) + freeClientAsync(c); + } else if (!strcasecmp(c->argv[2]->ptr, "rdbchannel") && c->argc == 4) { + /* CLUSTER SYNCSLOTS RDBCHANNEL */ + sds task_id = c->argv[3]->ptr; + if (sdslen(task_id) != CLUSTER_NAMELEN) { + addReplyError(c, "Invalid task id"); + return; + } + + if (listLength(asmManager->tasks) == 0) { + addReplyError(c, "No slot migration task in progress"); + return; + } + + asmTask *task = listNodeValue(listFirst(asmManager->tasks)); + if (task->operation != ASM_MIGRATE || task->state != ASM_WAIT_RDBCHANNEL || + strcmp(task->id, task_id) != 0) + { + addReplyError(c, "Another migration task is already in progress"); + return; + } + + if (unlikely(asmDebugIsFailPointActive(ASM_MIGRATE_MAIN_CHANNEL, task->state))) { + /* Close the main channel client before rdb channel client connects */ + if (task->main_channel_client) + freeClient(task->main_channel_client); + } + + /* The main channel client must be present when setting RDB channel client */ + if (task->main_channel_client == NULL) { + /* Maybe the main channel connection is closed. */ + addReplyError(c, "Main channel connection is not established"); + return; + } + + /* Mark the client as a slave to generate slots snapshot */ + c->flags |= (CLIENT_SLAVE | CLIENT_REPL_RDB_CHANNEL | CLIENT_REPL_RDBONLY | CLIENT_ASM_MIGRATING); + c->slave_capa |= SLAVE_CAPA_EOF; + c->slave_req |= (SLAVE_REQ_SLOTS_SNAPSHOT | SLAVE_REQ_RDB_CHANNEL); + c->replstate = SLAVE_STATE_WAIT_BGSAVE_START; + c->repldbfd = -1; + if (server.repl_disable_tcp_nodelay) + connDisableTcpNoDelay(c->conn); /* Non-critical if it fails. 
*/ + listAddNodeTail(server.slaves, c); + + /* Wait for bgsave to start for slots sync */ + task->state = ASM_WAIT_BGSAVE_START; + task->rdb_channel_state = ASM_WAIT_BGSAVE_START; + task->rdb_channel_client = c; + c->task = task; + + /* Keep the client in the main thread to avoid data races between the + * connWrite call in startBgsaveForReplication and the client's event + * handler in IO threads. */ + if (c->tid != IOTHREAD_MAIN_THREAD_ID) keepClientInMainThread(c); + + if (!hasActiveChildProcess()) { + startBgsaveForReplication(c->slave_capa, c->slave_req); + } else { + serverLog(LL_NOTICE, "BGSAVE for slots snapshot sync delayed"); + } + } else if (!strcasecmp(c->argv[2]->ptr, "snapshot-eof") && c->argc == 3) { + /* CLUSTER SYNCSLOTS SNAPSHOT-EOF */ + clusterSyncSlotsSnapshotEOF(c); + } else if (!strcasecmp(c->argv[2]->ptr, "stream-eof") && c->argc == 3) { + /* CLUSTER SYNCSLOTS STREAM-EOF */ + clusterSyncSlotsStreamEOF(c); + } else if (!strcasecmp(c->argv[2]->ptr, "ack") && c->argc == 5) { + /* CLUSTER SYNCSLOTS ACK */ + long long offset; + int dest_state; + + if (!strcasecmp(c->argv[3]->ptr, asmTaskStateToString(ASM_STREAMING_BUF))) { + dest_state = ASM_STREAMING_BUF; + } else if (!strcasecmp(c->argv[3]->ptr, asmTaskStateToString(ASM_WAIT_STREAM_EOF))) { + dest_state = ASM_WAIT_STREAM_EOF; + } else { + return; /* Not support now. */ + } + + if ((getLongLongFromObject(c->argv[4], &offset) != C_OK)) + return; + + if (c->task && c->task->operation == ASM_MIGRATE) { + /* Update the state and ACKed offset from destination. 
*/ + asmTask *task = c->task; + task->dest_state = dest_state; + if (task->dest_offset > (unsigned long long) offset) { + serverLog(LL_WARNING, "CLUSTER SYNCSLOTS ACK received, dest state: %s, " + "but offset %lld is less than the current dest offset %lld", + asmTaskStateToString(dest_state), offset, task->dest_offset); + return; + } + task->dest_offset = offset; + serverLog(LL_DEBUG, "CLUSTER SYNCSLOTS ACK received, dest state: %s, " + "updated dest offset to %lld, source offset: %lld", + asmTaskStateToString(dest_state), task->dest_offset, task->source_offset); + + /* Record the time when the destination finishes applying the accumulated buffer */ + if (task->dest_state == ASM_WAIT_STREAM_EOF && task->dest_accum_applied_time == 0) + task->dest_accum_applied_time = server.mstime; + + /* Pause write if needed */ + if (task->state == ASM_SEND_BULK_AND_STREAM || task->state == ASM_SEND_STREAM) { + /* Pause writes on the main channel if the lag is less than the threshold. */ + if (task->dest_offset + server.asm_handoff_max_lag_bytes >= task->source_offset) { + if (unlikely(asmDebugIsFailPointActive(ASM_MIGRATE_MAIN_CHANNEL, ASM_HANDOFF_PREP))) + return; /* Do not enter handoff prep state for testing buffer drain timeout. */ + + serverLog(LL_NOTICE, "The applied offset lag %lld is less than the threshold %lld, " + "pausing writes for slot handoff", + task->source_offset - task->dest_offset, + server.asm_handoff_max_lag_bytes); + task->state = ASM_HANDOFF_PREP; + asmLogTaskEvent(task, ASM_EVENT_HANDOFF_PREP); + clusterAsmOnEvent(task->id, ASM_EVENT_HANDOFF_PREP, task->slots); + } + } + } + } else if (!strcasecmp(c->argv[2]->ptr, "fail") && c->argc == 4) { + /* CLUSTER SYNCSLOTS FAIL */ + return; /* This is a no-op, just to handle the command syntax. */ + } else if (!strcasecmp(c->argv[2]->ptr, "conf") && c->argc >= 5) { + /* CLUSTER SYNCSLOTS CONF