1 files changed, 261 insertions, 0 deletions
diff --git a/examples/redis-unstable/tests/integration/corrupt-dump-fuzzer.tcl b/examples/redis-unstable/tests/integration/corrupt-dump-fuzzer.tcl
new file mode 100644
index 0000000..5c7c992
--- /dev/null
+++ b/examples/redis-unstable/tests/integration/corrupt-dump-fuzzer.tcl
@@ -0,0 +1,261 @@
+# tests of corrupt listpack payload with valid CRC
+# The fuzzer can corrupt the state in many places, which could
+# mess up the reply, so we decided to skip logreqres.
+tags {"dump" "corruption" "external:skip" "logreqres:skip"} {
+# If the optional Tclx package can be loaded, have SIGTERM raised as a Tcl
+# error. When one of the random commands hangs the test (usually because
+# redis never put a response in the output buffers), that error lets the
+# test report which command it was.
+if {[catch {package require Tclx} err] == 0} {
+    signal error SIGTERM
+}
+# Fill one key of every collection type with `elements` members.
+#
+# suffix   - appended to each key name (hash, hashmd, list, zset, set,
+#            stream and, when `server_has_command vadd` is true, vset).
+# elements - number of members added to each key.
+#
+# All commands are queued on a deferring client; their replies are read
+# back and discarded before the client is closed.
+proc generate_collections {suffix elements} {
+    set client [redis_deferring_client]
+    set with_vsets [server_has_command vadd]
+    # seven commands are queued per element, plus one VADD when supported
+    set cmds_per_element 7
+    if {$with_vsets} {
+        set cmds_per_element 8
+    }
+    set i 0
+    while {$i < $elements} {
+        # even indexes are stored as integers, odd ones as "_<index>" strings
+        if {$i % 2} {
+            set member "_$i"
+        } else {
+            set member $i
+        }
+        $client hset hash$suffix $i $member
+        $client hset hashmd$suffix $i $member
+        # hashmd additionally gets a random per-field expiration
+        $client hexpire hashmd$suffix [expr {int(rand() * 10000)}] FIELDS 1 $i
+        $client lpush list$suffix $member
+        $client zadd zset$suffix $i $member
+        $client sadd set$suffix $member
+        $client xadd stream$suffix * item 1 value $member
+        if {$with_vsets} {
+            $client vadd vset$suffix VALUES 3 1 1 1 $i
+        }
+        incr i
+    }
+    # one reply is pending for every queued command; read and drop them all
+    for {set n 0} {$n < $elements * $cmds_per_element} {incr n} {
+        $client read
+    }
+    $client close
+}
+# Create the data set the fuzzer works on: keys of various types and
+# encodings. Takes no arguments; everything is sent through `r`.
+proc generate_types {} {
+    # cap every compact-encoding threshold at 5
+    foreach {option limit} {
+        list-max-ziplist-size 5
+        hash-max-ziplist-entries 5
+        set-max-listpack-entries 5
+        zset-max-ziplist-entries 5
+        stream-node-max-entries 5
+    } {
+        r config set $option $limit
+    }
+    # small (ziplist / listpack encoded) objects with 3 items each
+    generate_collections "" 3
+    # attach consumer-group metadata to the small stream: read two entries
+    # as consumer Alice, delete the one at index 1 and ack the one at index 0
+    r xgroup create stream mygroup 0
+    set reply [r xreadgroup GROUP mygroup Alice COUNT 2 STREAMS stream >]
+    set entries [lindex $reply 0 1]
+    r xdel stream [lindex $entries 1 0]
+    r xack stream mygroup [lindex $entries 0 0]
+    # non-collection types
+    r incr int
+    r set string str
+    # bigger objects with 10 items (more than a single ziplist / listpack)
+    generate_collections big 10
+    # add entries to the big stream, the last one using field names that
+    # differ from those of the earlier entries
+    r xadd streambig * item 1 value 1
+    r xadd streambig * item 1 unique value
+}
+# Return a copy of `payload` with one byte (or, about 10% of the time, two
+# bytes) overwritten by a random value.
+#
+# payload - binary string to corrupt, e.g. the output of DUMP.
+#
+# Positions are picked independently, so the same position may be hit twice
+# and a replacement may happen to equal the byte it overwrites. The length
+# of the payload never changes; an empty payload is returned unchanged.
+proc corrupt_payload {payload} {
+    set len [string length $payload]
+    set count 1 ;# usually corrupt only one byte
+    if {rand() > 0.9} { set count 2 }
+    while { $count > 0 } {
+        set idx [expr {int(rand() * $len)}]
+        # rand() is always below 1, so scale by 256 to cover every byte
+        # value 0x00..0xFF; scaling by 255 could never produce 0xFF.
+        set ch [binary format c [expr {int(rand() * 256)}]]
+        set payload [string replace $payload $idx $idx $ch]
+        incr count -1
+    }
+    return $payload
+}
+# fuzzy tester for corrupt RESTORE payloads
+# valgrind will make sure there were no leaks in the rdb loader error handling code
+#
+# The test body runs twice: once with sanitize-dump-payload set to "no" and
+# once with it set to "yes". Each run repeatedly DUMPs a random key, corrupts
+# the payload, RESTOREs it under a new name and, when the RESTORE succeeds,
+# sends random traffic to the restored key while watching for server
+# termination and valgrind / sanitizer reports.
+foreach sanitize_dump {no yes} {
+    # lower bounds for the fuzzing loop; it only stops once both are reached
+    if {$::accurate} {
+        set min_duration [expr {60 * 10}] ;# run at least 10 minutes
+        set min_cycles 1000 ;# run at least 1k cycles (max 16 minutes)
+    } else {
+        set min_duration 10 ; # run at least 10 seconds
+        set min_cycles 10 ; # run at least 10 cycles
+    }
+    # Don't execute this on FreeBSD due to a yet-undiscovered memory issue
+    # which causes tclsh to bloat.
+    # (the loop still runs there, but with the bounds cut down to 1 cycle / 1 second)
+    if {[exec uname] == "FreeBSD"} {
+        set min_cycles 1
+        set min_duration 1
+    }
+    test "Fuzzer corrupt restore payloads - sanitize_dump: $sanitize_dump" {
+        # refuse to run when the timeout is shorter than twice the minimum duration
+        if {$min_duration * 2 > $::timeout} {
+            fail "insufficient timeout"
+        }
+        # start a server, fill with data and save an RDB file once (avoid re-save)
+        start_server [list overrides [list "save" "" use-exit-on-panic yes crash-memcheck-enabled no loglevel verbose] ] {
+            # NOTE(review): this variable is not read anywhere else in this test block
+            set stdout [srv 0 stdout]
+            r config set sanitize-dump-payload $sanitize_dump
+            r debug set-skip-checksum-validation 1
+            set start_time [clock seconds]
+            generate_types
+            # reference key count; checked again after every traffic round
+            set dbsize [r dbsize]
+            r save
+            # loop counter and statistics; the stat_terminated_* counters are
+            # asserted on after the server block, the others are only printed
+            set cycle 0
+            set stat_terminated_in_restore 0
+            set stat_terminated_in_traffic 0
+            set stat_terminated_by_signal 0
+            set stat_successful_restore 0
+            set stat_rejected_restore 0
+            set stat_traffic_commands_sent 0
+            # repeatedly DUMP a random key, corrupt it and try RESTORE into a new key
+            while true {
+                set k [r randomkey]
+                set dump [r dump $k]
+                set dump [corrupt_payload $dump]
+                # printable form of the corrupted payload, used in every report below
+                set printable_dump [string2printable $dump]
+                set restore_failed false
+                set report_and_restart false
+                set sent {}
+                # value of the expired_subkeys stat before the RESTORE; compared
+                # below when the restored key turns out to be gone
+                set expired_subkeys [s expired_subkeys]
+                # RESTORE can fail, but hopefully not terminate
+                if { [catch { r restore "_$k" 0 $dump REPLACE } err] } {
+                    set restore_failed true
+                    # skip if return failed with an error response.
+                    if {[string match "ERR*" $err]} {
+                        incr stat_rejected_restore
+                    } else {
+                        # any failure that is not an ERR reply is counted as a
+                        # server termination during RESTORE
+                        set report_and_restart true
+                        incr stat_terminated_in_restore
+                        write_log_line 0 "corrupt payload: $printable_dump"
+                        if {$sanitize_dump == yes} {
+                            puts "Server crashed in RESTORE with payload: $printable_dump"
+                        }
+                    }
+                } else {
+                    r ping ;# an attempt to check if the server didn't terminate (this will throw an error that will terminate the tests)
+                }
+                set print_commands false
+                if {!$restore_failed} {
+                    # if RESTORE didn't fail or terminate, run some random traffic on the new key
+                    incr stat_successful_restore
+                    if { [ catch {
+                        set type [r type "_$k"]
+                        if {$type eq {none}} {
+                            # The key has been removed due to expiration.
+                            # Ensure the server didn't terminate during expiration and verify
+                            # expire stats to confirm the key was removed due to expiration.
+                            r ping
+                            assert_morethan [s expired_subkeys] $expired_subkeys
+                        } else {
+                            set sent [generate_fuzzy_traffic_on_key "_$k" $type 1] ;# traffic for 1 second
+                        }
+                        incr stat_traffic_commands_sent [llength $sent]
+                        r del "_$k" ;# in case the server terminated, here's where we'll detect it.
+                        # with the restored key deleted, the key count must be back
+                        # at the reference value; otherwise print the keys and the
+                        # commands that were sent, then exit with status 1
+                        if {$dbsize != [r dbsize]} {
+                            puts "unexpected keys"
+                            puts "keys: [r keys *]"
+                            puts "commands leading to it:"
+                            foreach cmd $sent {
+                                foreach arg $cmd {
+                                    puts -nonewline "[string2printable $arg] "
+                                }
+                                puts ""
+                            }
+                            exit 1
+                        }
+                    } err ] } {
+                        set err [format "%s" $err] ;# convert to string for pattern matching
+                        # an error mentioning SIGTERM is treated as a hung test:
+                        # report the payload, optionally dump the server log, and exit
+                        if {[string match "*SIGTERM*" $err]} {
+                            puts "payload that caused test to hang: $printable_dump"
+                            if {$::dump_logs} {
+                                set srv [get_srv 0]
+                                dump_server_log $srv
+                            }
+                            exit 1
+                        }
+                        # if the server terminated update stats and restart it
+                        set report_and_restart true
+                        incr stat_terminated_in_traffic
+                        # presumably the number of "crashed by signal" lines in the
+                        # server log -- TODO confirm against count_log_message
+                        set by_signal [count_log_message 0 "crashed by signal"]
+                        incr stat_terminated_by_signal $by_signal
+                        # only report in detail for signal crashes, or for any
+                        # termination when sanitization is enabled
+                        if {$by_signal != 0 || $sanitize_dump == yes} {
+                            if {$::dump_logs} {
+                                set srv [get_srv 0]
+                                dump_server_log $srv
+                            }
+                            puts "Server crashed (by signal: $by_signal, err: $err), with payload: $printable_dump"
+                            set print_commands true
+                        }
+                    }
+                }
+                # check valgrind report for invalid reads after each RESTORE
+                # payload so that we have a report that is easier to reproduce
+                set valgrind_errors [find_valgrind_errors [srv 0 stderr] false]
+                set asan_errors [sanitizer_errors_from_file [srv 0 stderr]]
+                if {$valgrind_errors != "" || $asan_errors != ""} {
+                    puts "valgrind or asan found an issue for payload: $printable_dump"
+                    set report_and_restart true
+                    set print_commands true
+                }
+                if {$report_and_restart} {
+                    if {$print_commands} {
+                        # print each command sent to the restored key, one per line
+                        puts "violating commands:"
+                        foreach cmd $sent {
+                            foreach arg $cmd {
+                                puts -nonewline "[string2printable $arg] "
+                            }
+                            puts ""
+                        }
+                    }
+                    # restart the server and re-apply debug configuration
+                    write_log_line 0 "corrupt payload: $printable_dump"
+                    restart_server 0 true true
+                    r config set sanitize-dump-payload $sanitize_dump
+                    r debug set-skip-checksum-validation 1
+                }
+                incr cycle
+                # stop only when both the minimum run time and the minimum
+                # number of cycles have been reached
+                if { ([clock seconds]-$start_time) >= $min_duration && $cycle >= $min_cycles} {
+                    break
+                }
+            }
+            if {$::verbose} {
+                puts "Done $cycle cycles in [expr {[clock seconds]-$start_time}] seconds."
+                puts "RESTORE: successful: $stat_successful_restore, rejected: $stat_rejected_restore"
+                puts "Total commands sent in traffic: $stat_traffic_commands_sent, crashes during traffic: $stat_terminated_in_traffic ($stat_terminated_by_signal by signal)."
+            }
+        }
+        # if we run sanitization we never expect the server to crash at runtime
+        if {$sanitize_dump == yes} {
+            assert_equal $stat_terminated_in_restore 0
+            assert_equal $stat_terminated_in_traffic 0
+        }
+        # make sure all terminations were due to assertion and not a SIGSEGV
+        assert_equal $stat_terminated_by_signal 0
+    }
+}
+} ;# tags

diff --git a/examples/redis-unstable/tests/integration/corrupt-dump-fuzzer.tcl b/examples/redis-unstable/tests/integration/corrupt-dump-fuzzer.tcl new file mode 100644 index 0000000..5c7c992 --- /dev/null +++ b/examples/redis-unstable/tests/integration/corrupt-dump-fuzzer.tcl
@@ -0,0 +1,261 @@
	1	# tests of corrupt listpack payload with valid CRC
	2
	3	# The fuzzer can corrupt the state in many places, which could
	4	# mess up the reply, so we decided to skip logreqres.
	5	tags {"dump" "corruption" "external:skip" "logreqres:skip"} {
	6
	7	# catch sigterm so that in case one of the random command hangs the test,
	8	# usually due to redis not putting a response in the output buffers,
	9	# we'll know which command it was
	10	if { ! [ catch {
	11	package require Tclx
	12	} err ] } {
	13	signal error SIGTERM
	14	}
	15
	16	proc generate_collections {suffix elements} {
	17	set rd [redis_deferring_client]
	18	set numcmd 7
	19	set has_vsets [server_has_command vadd]
	20	if {$has_vsets} {incr numcmd}
	21
	22	for {set j 0} {$j < $elements} {incr j} {
	23	# add both string values and integers
	24	if {$j % 2 == 0} {set val $j} else {set val "_$j"}
	25	$rd hset hash$suffix $j $val
	26	$rd hset hashmd$suffix $j $val
	27	$rd hexpire hashmd$suffix [expr {int(rand() * 10000)}] FIELDS 1 $j
	28	$rd lpush list$suffix $val
	29	$rd zadd zset$suffix $j $val
	30	$rd sadd set$suffix $val
	31	$rd xadd stream$suffix * item 1 value $val
	32	if {$has_vsets} {
	33	$rd vadd vset$suffix VALUES 3 1 1 1 $j
	34	}
	35	}
	36	for {set j 0} {$j < $elements * $numcmd} {incr j} {
	37	$rd read ; # Discard replies
	38	}
	39	$rd close
	40	}
	41
	42	# generate keys with various types and encodings
	43	proc generate_types {} {
	44	r config set list-max-ziplist-size 5
	45	r config set hash-max-ziplist-entries 5
	46	r config set set-max-listpack-entries 5
	47	r config set zset-max-ziplist-entries 5
	48	r config set stream-node-max-entries 5
	49
	50	# create small (ziplist / listpack encoded) objects with 3 items
	51	generate_collections "" 3
	52
	53	# add some metadata to the stream
	54	r xgroup create stream mygroup 0
	55	set records [r xreadgroup GROUP mygroup Alice COUNT 2 STREAMS stream >]
	56	r xdel stream [lindex [lindex [lindex [lindex $records 0] 1] 1] 0]
	57	r xack stream mygroup [lindex [lindex [lindex [lindex $records 0] 1] 0] 0]
	58
	59	# create other non-collection types
	60	r incr int
	61	r set string str
	62
	63	# create bigger objects with 10 items (more than a single ziplist / listpack)
	64	generate_collections big 10
	65
	66	# make sure our big stream also has a listpack record that has different
	67	# field names than the master recorded
	68	r xadd streambig * item 1 value 1
	69	r xadd streambig * item 1 unique value
	70	}
	71
	72	proc corrupt_payload {payload} {
	73	set len [string length $payload]
	74	set count 1 ;# usually corrupt only one byte
	75	if {rand() > 0.9} { set count 2 }
	76	while { $count > 0 } {
	77	set idx [expr {int(rand() * $len)}]
	78	set ch [binary format c [expr {int(rand()*255)}]]
	79	set payload [string replace $payload $idx $idx $ch]
	80	incr count -1
	81	}
	82	return $payload
	83	}
	84
	85	# fuzzy tester for corrupt RESTORE payloads
	86	# valgrind will make sure there were no leaks in the rdb loader error handling code
	87	foreach sanitize_dump {no yes} {
	88	if {$::accurate} {
	89	set min_duration [expr {60 * 10}] ;# run at least 10 minutes
	90	set min_cycles 1000 ;# run at least 1k cycles (max 16 minutes)
	91	} else {
	92	set min_duration 10 ; # run at least 10 seconds
	93	set min_cycles 10 ; # run at least 10 cycles
	94	}
	95
	96	# Don't execute this on FreeBSD due to a yet-undiscovered memory issue
	97	# which causes tclsh to bloat.
	98	if {[exec uname] == "FreeBSD"} {
	99	set min_cycles 1
	100	set min_duration 1
	101	}
	102
	103	test "Fuzzer corrupt restore payloads - sanitize_dump: $sanitize_dump" {
	104	if {$min_duration * 2 > $::timeout} {
	105	fail "insufficient timeout"
	106	}
	107	# start a server, fill with data and save an RDB file once (avoid re-save)
	108	start_server [list overrides [list "save" "" use-exit-on-panic yes crash-memcheck-enabled no loglevel verbose] ] {
	109	set stdout [srv 0 stdout]
	110	r config set sanitize-dump-payload $sanitize_dump
	111	r debug set-skip-checksum-validation 1
	112	set start_time [clock seconds]
	113	generate_types
	114	set dbsize [r dbsize]
	115	r save
	116	set cycle 0
	117	set stat_terminated_in_restore 0
	118	set stat_terminated_in_traffic 0
	119	set stat_terminated_by_signal 0
	120	set stat_successful_restore 0
	121	set stat_rejected_restore 0
	122	set stat_traffic_commands_sent 0
	123	# repeatedly DUMP a random key, corrupt it and try RESTORE into a new key
	124	while true {
	125	set k [r randomkey]
	126	set dump [r dump $k]
	127	set dump [corrupt_payload $dump]
	128	set printable_dump [string2printable $dump]
	129	set restore_failed false
	130	set report_and_restart false
	131	set sent {}
	132	set expired_subkeys [s expired_subkeys]
	133	# RESTORE can fail, but hopefully not terminate
	134	if { [catch { r restore "_$k" 0 $dump REPLACE } err] } {
	135	set restore_failed true
	136	# skip if return failed with an error response.
	137	if {[string match "ERR*" $err]} {
	138	incr stat_rejected_restore
	139	} else {
	140	set report_and_restart true
	141	incr stat_terminated_in_restore
	142	write_log_line 0 "corrupt payload: $printable_dump"
	143	if {$sanitize_dump == yes} {
	144	puts "Server crashed in RESTORE with payload: $printable_dump"
	145	}
	146	}
	147	} else {
	148	r ping ;# an attempt to check if the server didn't terminate (this will throw an error that will terminate the tests)
	149	}
	150
	151	set print_commands false
	152	if {!$restore_failed} {
	153	# if RESTORE didn't fail or terminate, run some random traffic on the new key
	154	incr stat_successful_restore
	155	if { [ catch {
	156	set type [r type "_$k"]
	157	if {$type eq {none}} {
	158	# The key has been removed due to expiration.
	159	# Ensure the server didn't terminate during expiration and verify
	160	# expire stats to confirm the key was removed due to expiration.
	161	r ping
	162	assert_morethan [s expired_subkeys] $expired_subkeys
	163	} else {
	164	set sent [generate_fuzzy_traffic_on_key "_$k" $type 1] ;# traffic for 1 second
	165	}
	166
	167	incr stat_traffic_commands_sent [llength $sent]
	168	r del "_$k" ;# in case the server terminated, here's where we'll detect it.
	169	if {$dbsize != [r dbsize]} {
	170	puts "unexpected keys"
	171	puts "keys: [r keys *]"
	172	puts "commands leading to it:"
	173	foreach cmd $sent {
	174	foreach arg $cmd {
	175	puts -nonewline "[string2printable $arg] "
	176	}
	177	puts ""
	178	}
	179	exit 1
	180	}
	181	} err ] } {
	182	set err [format "%s" $err] ;# convert to string for pattern matching
	183	if {[string match "*SIGTERM*" $err]} {
	184	puts "payload that caused test to hang: $printable_dump"
	185	if {$::dump_logs} {
	186	set srv [get_srv 0]
	187	dump_server_log $srv
	188	}
	189	exit 1
	190	}
	191	# if the server terminated update stats and restart it
	192	set report_and_restart true
	193	incr stat_terminated_in_traffic
	194	set by_signal [count_log_message 0 "crashed by signal"]
	195	incr stat_terminated_by_signal $by_signal
	196
	197	if {$by_signal != 0 || $sanitize_dump == yes} {
	198	if {$::dump_logs} {
	199	set srv [get_srv 0]
	200	dump_server_log $srv
	201	}
	202
	203	puts "Server crashed (by signal: $by_signal, err: $err), with payload: $printable_dump"
	204	set print_commands true
	205	}
	206	}
	207	}
	208
	209	# check valgrind report for invalid reads after each RESTORE
	210	# payload so that we have a report that is easier to reproduce
	211	set valgrind_errors [find_valgrind_errors [srv 0 stderr] false]
	212	set asan_errors [sanitizer_errors_from_file [srv 0 stderr]]
	213	if {$valgrind_errors != "" || $asan_errors != ""} {
	214	puts "valgrind or asan found an issue for payload: $printable_dump"
	215	set report_and_restart true
	216	set print_commands true
	217	}
	218
	219	if {$report_and_restart} {
	220	if {$print_commands} {
	221	puts "violating commands:"
	222	foreach cmd $sent {
	223	foreach arg $cmd {
	224	puts -nonewline "[string2printable $arg] "
	225	}
	226	puts ""
	227	}
	228	}
	229
	230	# restart the server and re-apply debug configuration
	231	write_log_line 0 "corrupt payload: $printable_dump"
	232	restart_server 0 true true
	233	r config set sanitize-dump-payload $sanitize_dump
	234	r debug set-skip-checksum-validation 1
	235	}
	236
	237	incr cycle
	238	if { ([clock seconds]-$start_time) >= $min_duration && $cycle >= $min_cycles} {
	239	break
	240	}
	241	}
	242	if {$::verbose} {
	243	puts "Done $cycle cycles in [expr {[clock seconds]-$start_time}] seconds."
	244	puts "RESTORE: successful: $stat_successful_restore, rejected: $stat_rejected_restore"
	245	puts "Total commands sent in traffic: $stat_traffic_commands_sent, crashes during traffic: $stat_terminated_in_traffic ($stat_terminated_by_signal by signal)."
	246	}
	247	}
	248	# if we run sanitization we never expect the server to crash at runtime
	249	if {$sanitize_dump == yes} {
	250	assert_equal $stat_terminated_in_restore 0
	251	assert_equal $stat_terminated_in_traffic 0
	252	}
	253	# make sure all terminations were due to assertion and not a SIGSEGV
	254	assert_equal $stat_terminated_by_signal 0
	255	}
	256	}
	257
	258
	259
	260	} ;# tags
	261