diff options
Diffstat (limited to 'examples/redis-unstable/tests/integration/corrupt-dump-fuzzer.tcl')
| -rw-r--r-- | examples/redis-unstable/tests/integration/corrupt-dump-fuzzer.tcl | 261 |
1 files changed, 261 insertions, 0 deletions
diff --git a/examples/redis-unstable/tests/integration/corrupt-dump-fuzzer.tcl b/examples/redis-unstable/tests/integration/corrupt-dump-fuzzer.tcl new file mode 100644 index 0000000..5c7c992 --- /dev/null +++ b/examples/redis-unstable/tests/integration/corrupt-dump-fuzzer.tcl | |||
| @@ -0,0 +1,261 @@ | |||
| 1 | # tests of corrupt listpack payload with valid CRC | ||
| 2 | |||
| 3 | # The fuzzer can corrupt the server state in many places, which could | ||
| 4 | # mess up the reply, so we decided to skip logreqres. | ||
| 5 | tags {"dump" "corruption" "external:skip" "logreqres:skip"} { | ||
| 6 | |||
# Turn SIGTERM into a Tcl error when the Tclx extension can be loaded, so that
# if one of the random commands hangs the test (usually because redis did not
# put a response in the output buffers) we get to know which command it was.
# When Tclx is unavailable the signal disposition is left untouched.
if {[catch {package require Tclx} err] == 0} {
    signal error SIGTERM
}
| 15 | |||
# Fill one key per collection type (hash, hash with field TTLs, list, zset,
# set, stream and - when the server knows VADD - a vector set) with
# $elements members each. Key names are the type name followed by $suffix.
#
# All commands are queued on a deferring client and the replies are drained
# at the end, one read per command sent.
proc generate_collections {suffix elements} {
    set client [redis_deferring_client]
    set with_vsets [server_has_command vadd]

    # commands queued per element: 2x hset, hexpire, lpush, zadd, sadd, xadd
    # (+ vadd when vector sets are available)
    set replies_per_element [expr {$with_vsets ? 8 : 7}]

    set idx 0
    while {$idx < $elements} {
        # alternate between integer members (even idx) and string members (odd idx)
        if {$idx % 2} {
            set member "_$idx"
        } else {
            set member $idx
        }

        $client hset hash$suffix $idx $member
        $client hset hashmd$suffix $idx $member
        $client hexpire hashmd$suffix [expr {int(rand() * 10000)}] FIELDS 1 $idx
        $client lpush list$suffix $member
        $client zadd zset$suffix $idx $member
        $client sadd set$suffix $member
        $client xadd stream$suffix * item 1 value $member
        if {$with_vsets} {
            $client vadd vset$suffix VALUES 3 1 1 1 $idx
        }
        incr idx
    }

    # Discard replies
    set pending [expr {$elements * $replies_per_element}]
    while {$pending > 0} {
        $client read
        incr pending -1
    }
    $client close
}
| 41 | |||
# Populate the database with keys of various types and encodings: small and
# big variants of every collection type, a stream carrying consumer-group
# metadata, plus a plain integer and a plain string.
proc generate_types {} {
    # lower every encoding / node-size threshold to 5
    foreach {option limit} {
        list-max-ziplist-size       5
        hash-max-ziplist-entries    5
        set-max-listpack-entries    5
        zset-max-ziplist-entries    5
        stream-node-max-entries     5
    } {
        r config set $option $limit
    }

    # small (ziplist / listpack encoded) objects: 3 items, below the limit of 5
    generate_collections "" 3

    # add some metadata to the stream: a consumer group that read two entries,
    # then XDEL on element [0][1][1][0] of the reply and XACK on element
    # [0][1][0][0] (presumably the IDs of the second and first entry read)
    r xgroup create stream mygroup 0
    set records [r xreadgroup GROUP mygroup Alice COUNT 2 STREAMS stream >]
    set second_id [lindex $records 0 1 1 0]
    set first_id [lindex $records 0 1 0 0]
    r xdel stream $second_id
    r xack stream mygroup $first_id

    # non-collection types
    r incr int
    r set string str

    # bigger objects with 10 items (more than a single ziplist / listpack)
    generate_collections big 10

    # make sure our big stream also has a listpack record that has different
    # field names than the master recorded
    r xadd streambig * item 1 value 1
    r xadd streambig * item 1 unique value
}
| 71 | |||
# Return a copy of $payload in which one randomly chosen position (two
# positions in roughly 10% of the calls) has been overwritten with a random
# byte value. The length of the payload is never changed.
#
# The replacement byte is drawn independently of the byte it replaces, so a
# position may occasionally be rewritten with the value it already had.
proc corrupt_payload {payload} {
    set len [string length $payload]
    set count 1 ;# usually corrupt only one byte
    if {rand() > 0.9} { set count 2 }
    while { $count > 0 } {
        set idx [expr {int(rand() * $len)}]
        # rand() is always below 1, so scale by 256 (not 255) to make every
        # byte value 0..255 reachable, including 0xFF
        set ch [binary format c [expr {int(rand()*256)}]]
        set payload [string replace $payload $idx $idx $ch]
        incr count -1
    }
    return $payload
}
| 84 | |||
# Fuzz tester for corrupt RESTORE payloads.
# Valgrind will make sure there were no leaks in the rdb loader error handling code.
#
# The test runs once with sanitize-dump-payload disabled and once enabled.
# Each cycle DUMPs a random key, corrupts the payload, RESTOREs it under a new
# name and, when that succeeds, runs random traffic against the restored key.
# A server that terminates is restarted and the event is counted; with
# sanitization enabled no termination is tolerated, and in both modes a
# termination by signal fails the test.
foreach sanitize_dump {no yes} {
    if {$::accurate} {
        set min_duration [expr {60 * 10}] ;# run at least 10 minutes
        set min_cycles 1000 ;# run at least 1k cycles (max 16 minutes)
    } else {
        set min_duration 10 ; # run at least 10 seconds
        set min_cycles 10 ; # run at least 10 cycles
    }

    # Don't execute this on FreeBSD due to a yet-undiscovered memory issue
    # which causes tclsh to bloat.
    if {[exec uname] eq "FreeBSD"} {
        set min_cycles 1
        set min_duration 1
    }

    test "Fuzzer corrupt restore payloads - sanitize_dump: $sanitize_dump" {
        if {$min_duration * 2 > $::timeout} {
            fail "insufficient timeout"
        }
        # start a server, fill with data and save an RDB file once (avoid re-save)
        start_server [list overrides [list "save" "" use-exit-on-panic yes crash-memcheck-enabled no loglevel verbose] ] {
            set stdout [srv 0 stdout]
            r config set sanitize-dump-payload $sanitize_dump
            r debug set-skip-checksum-validation 1
            set start_time [clock seconds]
            generate_types
            set dbsize [r dbsize]
            r save
            set cycle 0
            set stat_terminated_in_restore 0
            set stat_terminated_in_traffic 0
            set stat_terminated_by_signal 0
            set stat_successful_restore 0
            set stat_rejected_restore 0
            set stat_traffic_commands_sent 0
            # repeatedly DUMP a random key, corrupt it and try RESTORE into a new key
            while true {
                set k [r randomkey]
                set dump [r dump $k]
                set dump [corrupt_payload $dump]
                set printable_dump [string2printable $dump]
                set restore_failed false
                set report_and_restart false
                set sent {}
                set expired_subkeys [s expired_subkeys]
                # RESTORE can fail, but hopefully not terminate
                if { [catch { r restore "_$k" 0 $dump REPLACE } err] } {
                    set restore_failed true
                    # an error reply means the payload was rejected; anything
                    # else is treated as the server having terminated
                    if {[string match "ERR*" $err]} {
                        incr stat_rejected_restore
                    } else {
                        set report_and_restart true
                        incr stat_terminated_in_restore
                        write_log_line 0 "corrupt payload: $printable_dump"
                        if {$sanitize_dump eq "yes"} {
                            puts "Server crashed in RESTORE with payload: $printable_dump"
                        }
                    }
                } else {
                    r ping ;# an attempt to check if the server didn't terminate (this will throw an error that will terminate the tests)
                }

                set print_commands false
                if {!$restore_failed} {
                    # if RESTORE didn't fail or terminate, run some random traffic on the new key
                    incr stat_successful_restore
                    if { [ catch {
                        set type [r type "_$k"]
                        if {$type eq {none}} {
                            # The key has been removed due to expiration.
                            # Ensure the server didn't terminate during expiration and verify
                            # expire stats to confirm the key was removed due to expiration.
                            r ping
                            assert_morethan [s expired_subkeys] $expired_subkeys
                        } else {
                            set sent [generate_fuzzy_traffic_on_key "_$k" $type 1] ;# traffic for 1 second
                        }

                        incr stat_traffic_commands_sent [llength $sent]
                        r del "_$k" ;# in case the server terminated, here's where we'll detect it.
                        if {$dbsize != [r dbsize]} {
                            puts "unexpected keys"
                            puts "keys: [r keys *]"
                            puts "commands leading to it:"
                            foreach cmd $sent {
                                foreach arg $cmd {
                                    puts -nonewline "[string2printable $arg] "
                                }
                                puts ""
                            }
                            exit 1
                        }
                    } err ] } {
                        set err [format "%s" $err] ;# convert to string for pattern matching
                        if {[string match "*SIGTERM*" $err]} {
                            puts "payload that caused test to hang: $printable_dump"
                            if {$::dump_logs} {
                                set srv [get_srv 0]
                                dump_server_log $srv
                            }
                            exit 1
                        }
                        # if the server terminated update stats and restart it
                        set report_and_restart true
                        incr stat_terminated_in_traffic
                        set by_signal [count_log_message 0 "crashed by signal"]
                        incr stat_terminated_by_signal $by_signal

                        if {$by_signal != 0 || $sanitize_dump eq "yes"} {
                            if {$::dump_logs} {
                                set srv [get_srv 0]
                                dump_server_log $srv
                            }

                            puts "Server crashed (by signal: $by_signal, err: $err), with payload: $printable_dump"
                            set print_commands true
                        }
                    }
                }

                # check valgrind report for invalid reads after each RESTORE
                # payload so that we have a report that is easier to reproduce
                set valgrind_errors [find_valgrind_errors [srv 0 stderr] false]
                set asan_errors [sanitizer_errors_from_file [srv 0 stderr]]
                if {$valgrind_errors ne "" || $asan_errors ne ""} {
                    puts "valgrind or asan found an issue for payload: $printable_dump"
                    set report_and_restart true
                    set print_commands true
                }

                if {$report_and_restart} {
                    if {$print_commands} {
                        puts "violating commands:"
                        foreach cmd $sent {
                            foreach arg $cmd {
                                puts -nonewline "[string2printable $arg] "
                            }
                            puts ""
                        }
                    }

                    # restart the server and re-apply debug configuration
                    write_log_line 0 "corrupt payload: $printable_dump"
                    restart_server 0 true true
                    r config set sanitize-dump-payload $sanitize_dump
                    r debug set-skip-checksum-validation 1
                }

                # stop only once both the minimum duration and the minimum
                # number of cycles have been reached
                incr cycle
                if { ([clock seconds]-$start_time) >= $min_duration && $cycle >= $min_cycles} {
                    break
                }
            }
            if {$::verbose} {
                puts "Done $cycle cycles in [expr {[clock seconds]-$start_time}] seconds."
                puts "RESTORE: successful: $stat_successful_restore, rejected: $stat_rejected_restore, crashed: $stat_terminated_in_restore"
                puts "Total commands sent in traffic: $stat_traffic_commands_sent, crashes during traffic: $stat_terminated_in_traffic ($stat_terminated_by_signal by signal)."
            }
        }
        # if we run sanitization we never expect the server to crash at runtime
        if {$sanitize_dump eq "yes"} {
            assert_equal $stat_terminated_in_restore 0
            assert_equal $stat_terminated_in_traffic 0
        }
        # make sure all terminations were due to assertion and not a SIGSEGV
        assert_equal $stat_terminated_by_signal 0
    }
}
| 257 | |||
| 258 | |||
| 259 | |||
| 260 | } ;# tags | ||
| 261 | |||
