diff options
Diffstat (limited to 'examples/redis-unstable/src/syscheck.c')
| -rw-r--r-- | examples/redis-unstable/src/syscheck.c | 354 |
1 files changed, 354 insertions, 0 deletions
diff --git a/examples/redis-unstable/src/syscheck.c b/examples/redis-unstable/src/syscheck.c new file mode 100644 index 0000000..1251c1d --- /dev/null +++ b/examples/redis-unstable/src/syscheck.c | |||
| @@ -0,0 +1,354 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2016-Present, Redis Ltd. | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * Licensed under your choice of (a) the Redis Source Available License 2.0 | ||
| 6 | * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the | ||
| 7 | * GNU Affero General Public License v3 (AGPLv3). | ||
| 8 | */ | ||
| 9 | #include "fmacros.h" | ||
| 10 | #include "config.h" | ||
| 11 | #include "syscheck.h" | ||
| 12 | #include "sds.h" | ||
| 13 | #include "anet.h" | ||
| 14 | |||
| 15 | #include <time.h> | ||
| 16 | #include <sys/resource.h> | ||
| 17 | #include <unistd.h> | ||
| 18 | #include <stdio.h> | ||
| 19 | #include <stdlib.h> | ||
| 20 | #include <string.h> | ||
| 21 | #include <errno.h> | ||
| 22 | #include <sys/wait.h> | ||
| 23 | |||
| 24 | #ifdef __linux__ | ||
| 25 | #include <sys/mman.h> | ||
| 26 | #endif | ||
| 27 | |||
| 28 | |||
| 29 | #ifdef __linux__ | ||
| 30 | static sds read_sysfs_line(char *path) { | ||
| 31 | char buf[256]; | ||
| 32 | FILE *f = fopen(path, "r"); | ||
| 33 | if (!f) return NULL; | ||
| 34 | if (!fgets(buf, sizeof(buf), f)) { | ||
| 35 | fclose(f); | ||
| 36 | return NULL; | ||
| 37 | } | ||
| 38 | fclose(f); | ||
| 39 | sds res = sdsnew(buf); | ||
| 40 | res = sdstrim(res, " \n"); | ||
| 41 | return res; | ||
| 42 | } | ||
| 43 | |||
| 44 | /* Verify our clocksource implementation doesn't go through a system call (uses vdso). | ||
| 45 | * Going through a system call to check the time degrades Redis performance. */ | ||
| 46 | static int checkClocksource(sds *error_msg) { | ||
| 47 | unsigned long test_time_us, system_hz; | ||
| 48 | struct timespec ts; | ||
| 49 | unsigned long long start_us; | ||
| 50 | struct rusage ru_start, ru_end; | ||
| 51 | |||
| 52 | system_hz = sysconf(_SC_CLK_TCK); | ||
| 53 | |||
| 54 | if (getrusage(RUSAGE_SELF, &ru_start) != 0) | ||
| 55 | return 0; | ||
| 56 | if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) { | ||
| 57 | return 0; | ||
| 58 | } | ||
| 59 | start_us = (ts.tv_sec * 1000000 + ts.tv_nsec / 1000); | ||
| 60 | |||
| 61 | /* clock_gettime() busy loop of 5 times system tick (for a system_hz of 100 this is 50ms) | ||
| 62 | * Using system_hz is required to ensure accurate measurements from getrusage(). | ||
| 63 | * If our clocksource is configured correctly (vdso) this will result in no system calls. | ||
| 64 | * If our clocksource is inefficient it'll waste most of the busy loop in the kernel. */ | ||
| 65 | test_time_us = 5 * 1000000 / system_hz; | ||
| 66 | while (1) { | ||
| 67 | unsigned long long d; | ||
| 68 | if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) | ||
| 69 | return 0; | ||
| 70 | d = (ts.tv_sec * 1000000 + ts.tv_nsec / 1000) - start_us; | ||
| 71 | if (d >= test_time_us) break; | ||
| 72 | } | ||
| 73 | if (getrusage(RUSAGE_SELF, &ru_end) != 0) | ||
| 74 | return 0; | ||
| 75 | |||
| 76 | long long stime_us = (ru_end.ru_stime.tv_sec * 1000000 + ru_end.ru_stime.tv_usec) - (ru_start.ru_stime.tv_sec * 1000000 + ru_start.ru_stime.tv_usec); | ||
| 77 | long long utime_us = (ru_end.ru_utime.tv_sec * 1000000 + ru_end.ru_utime.tv_usec) - (ru_start.ru_utime.tv_sec * 1000000 + ru_start.ru_utime.tv_usec); | ||
| 78 | |||
| 79 | /* If more than 10% of the process time was in system calls we probably have an inefficient clocksource, print a warning */ | ||
| 80 | if (stime_us * 10 > stime_us + utime_us) { | ||
| 81 | sds avail = read_sysfs_line("/sys/devices/system/clocksource/clocksource0/available_clocksource"); | ||
| 82 | sds curr = read_sysfs_line("/sys/devices/system/clocksource/clocksource0/current_clocksource"); | ||
| 83 | *error_msg = sdscatprintf(sdsempty(), | ||
| 84 | "Slow system clocksource detected. This can result in degraded performance. " | ||
| 85 | "Consider changing the system's clocksource. " | ||
| 86 | "Current clocksource: %s. Available clocksources: %s. " | ||
| 87 | "For example: run the command 'echo tsc > /sys/devices/system/clocksource/clocksource0/current_clocksource' as root. " | ||
| 88 | "To permanently change the system's clocksource you'll need to set the 'clocksource=' kernel command line parameter.", | ||
| 89 | curr ? curr : "", avail ? avail : ""); | ||
| 90 | sdsfree(avail); | ||
| 91 | sdsfree(curr); | ||
| 92 | return -1; | ||
| 93 | } else { | ||
| 94 | return 1; | ||
| 95 | } | ||
| 96 | } | ||
| 97 | |||
| 98 | /* Verify we're not using the `xen` clocksource. The xen hypervisor's default clocksource is slow and affects | ||
| 99 | * Redis's performance. This has been measured on ec2 xen based instances. ec2 recommends using the non-default | ||
| 100 | * tsc clock source for these instances. */ | ||
| 101 | int checkXenClocksource(sds *error_msg) { | ||
| 102 | sds curr = read_sysfs_line("/sys/devices/system/clocksource/clocksource0/current_clocksource"); | ||
| 103 | int res = 1; | ||
| 104 | if (curr == NULL) { | ||
| 105 | res = 0; | ||
| 106 | } else if (strcmp(curr, "xen") == 0) { | ||
| 107 | *error_msg = sdsnew( | ||
| 108 | "Your system is configured to use the 'xen' clocksource which might lead to degraded performance. " | ||
| 109 | "Check the result of the [slow-clocksource] system check: run 'redis-server --check-system' to check if " | ||
| 110 | "the system's clocksource isn't degrading performance."); | ||
| 111 | res = -1; | ||
| 112 | } | ||
| 113 | sdsfree(curr); | ||
| 114 | return res; | ||
| 115 | } | ||
| 116 | |||
| 117 | /* Verify overcommit is enabled. | ||
| 118 | * When overcommit memory is disabled Linux will kill the forked child of a background save | ||
| 119 | * if we don't have enough free memory to satisfy double the current memory usage even though | ||
| 120 | * the forked child uses copy-on-write to reduce its actual memory usage. */ | ||
| 121 | int checkOvercommit(sds *error_msg) { | ||
| 122 | FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r"); | ||
| 123 | char buf[64]; | ||
| 124 | |||
| 125 | if (!fp) return 0; | ||
| 126 | if (fgets(buf,64,fp) == NULL) { | ||
| 127 | fclose(fp); | ||
| 128 | return 0; | ||
| 129 | } | ||
| 130 | fclose(fp); | ||
| 131 | |||
| 132 | if (strtol(buf, NULL, 10) != 1) { | ||
| 133 | *error_msg = sdsnew( | ||
| 134 | "Memory overcommit must be enabled! Without it, a background save or replication may fail under low memory condition. " | ||
| 135 | #if defined(USE_JEMALLOC) | ||
| 136 | "Being disabled, it can also cause failures without low memory condition, see https://github.com/jemalloc/jemalloc/issues/1328. " | ||
| 137 | #endif | ||
| 138 | "To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the " | ||
| 139 | "command 'sysctl vm.overcommit_memory=1' for this to take effect."); | ||
| 140 | return -1; | ||
| 141 | } else { | ||
| 142 | return 1; | ||
| 143 | } | ||
| 144 | } | ||
| 145 | |||
| 146 | /* Make sure transparent huge pages aren't always enabled. When they are this can cause copy-on-write logic | ||
| 147 | * to consume much more memory and reduce performance during forks. */ | ||
| 148 | int checkTHPEnabled(sds *error_msg) { | ||
| 149 | char buf[1024]; | ||
| 150 | |||
| 151 | FILE *fp = fopen("/sys/kernel/mm/transparent_hugepage/enabled","r"); | ||
| 152 | if (!fp) return 0; | ||
| 153 | if (fgets(buf,sizeof(buf),fp) == NULL) { | ||
| 154 | fclose(fp); | ||
| 155 | return 0; | ||
| 156 | } | ||
| 157 | fclose(fp); | ||
| 158 | |||
| 159 | if (strstr(buf,"[always]") != NULL) { | ||
| 160 | *error_msg = sdsnew( | ||
| 161 | "You have Transparent Huge Pages (THP) support enabled in your kernel. " | ||
| 162 | "This will create latency and memory usage issues with Redis. " | ||
| 163 | "To fix this issue run the command 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled' as root, " | ||
| 164 | "and add it to your /etc/rc.local in order to retain the setting after a reboot. " | ||
| 165 | "Redis must be restarted after THP is disabled (set to 'madvise' or 'never')."); | ||
| 166 | return -1; | ||
| 167 | } else { | ||
| 168 | return 1; | ||
| 169 | } | ||
| 170 | } | ||
| 171 | |||
| 172 | #ifdef __arm64__ | ||
/* Look up, in /proc/self/smaps, the memory map of the calling process that
 * contains `addr` and return the number found on its "Shared_Dirty:" line
 * (a size in kilobytes).
 *
 * Returns -1 when the file cannot be opened, when no mapping containing
 * `addr` has a "Shared_Dirty:" line, or when that line cannot be parsed. */
static int smapsGetSharedDirty(unsigned long addr) {
    static const char field[] = "Shared_Dirty:";
    unsigned long lo, hi;
    char line[64];
    int inside = 0, kb = -1;
    FILE *smaps;

    smaps = fopen("/proc/self/smaps", "r");
    if (smaps == NULL) return -1;

    while (fgets(line, sizeof(line), smaps) != NULL) {
        /* A line of the form "<hex>-<hex>" opens a new mapping; remember
         * whether it is the one that holds `addr`. Any other line leaves
         * the current state untouched. */
        if (sscanf(line, "%lx-%lx", &lo, &hi) == 2)
            inside = lo <= addr && addr < hi;

        if (!inside || memcmp(line, field, sizeof(field) - 1) != 0)
            continue;

        /* Found the field for our mapping. If parsing fails kb stays -1. */
        sscanf(line, "%*s %d", &kb);
        break;
    }

    fclose(smaps);
    return kb;
}
| 202 | |||
| 203 | /* Older arm64 Linux kernels have a bug that could lead to data corruption | ||
| 204 | * during background save in certain scenarios. This function checks if the | ||
| 205 | * kernel is affected. | ||
| 206 | * The bug was fixed in commit ff1712f953e27f0b0718762ec17d0adb15c9fd0b | ||
| 207 | * titled: "arm64: pgtable: Ensure dirty bit is preserved across pte_wrprotect()" | ||
| 208 | */ | ||
| 209 | int checkLinuxMadvFreeForkBug(sds *error_msg) { | ||
| 210 | int ret, pipefd[2] = { -1, -1 }; | ||
| 211 | pid_t pid; | ||
| 212 | char *p = NULL, *q; | ||
| 213 | int res = 1; | ||
| 214 | long page_size = sysconf(_SC_PAGESIZE); | ||
| 215 | long map_size = 3 * page_size; | ||
| 216 | |||
| 217 | /* Create a memory map that's in our full control (not one used by the allocator). */ | ||
| 218 | p = mmap(NULL, map_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); | ||
| 219 | if (p == MAP_FAILED) { | ||
| 220 | return 0; | ||
| 221 | } | ||
| 222 | |||
| 223 | q = p + page_size; | ||
| 224 | |||
| 225 | /* Split the memory map in 3 pages by setting their protection as RO|RW|RO to prevent | ||
| 226 | * Linux from merging this memory map with adjacent VMAs. */ | ||
| 227 | ret = mprotect(q, page_size, PROT_READ | PROT_WRITE); | ||
| 228 | if (ret < 0) { | ||
| 229 | res = 0; | ||
| 230 | goto exit; | ||
| 231 | } | ||
| 232 | |||
| 233 | /* Write to the page once to make it resident */ | ||
| 234 | *(volatile char*)q = 0; | ||
| 235 | |||
| 236 | /* Tell the kernel that this page is free to be reclaimed. */ | ||
| 237 | #ifndef MADV_FREE | ||
| 238 | #define MADV_FREE 8 | ||
| 239 | #endif | ||
| 240 | ret = madvise(q, page_size, MADV_FREE); | ||
| 241 | if (ret < 0) { | ||
| 242 | /* MADV_FREE is not available on older kernels that are presumably | ||
| 243 | * not affected. */ | ||
| 244 | if (errno == EINVAL) goto exit; | ||
| 245 | |||
| 246 | res = 0; | ||
| 247 | goto exit; | ||
| 248 | } | ||
| 249 | |||
| 250 | /* Write to the page after being marked for freeing, this is supposed to take | ||
| 251 | * ownership of that page again. */ | ||
| 252 | *(volatile char*)q = 0; | ||
| 253 | |||
| 254 | /* Create a pipe for the child to return the info to the parent. */ | ||
| 255 | ret = anetPipe(pipefd, 0, 0); | ||
| 256 | if (ret < 0) { | ||
| 257 | res = 0; | ||
| 258 | goto exit; | ||
| 259 | } | ||
| 260 | |||
| 261 | /* Fork the process. */ | ||
| 262 | pid = fork(); | ||
| 263 | if (pid < 0) { | ||
| 264 | res = 0; | ||
| 265 | goto exit; | ||
| 266 | } else if (!pid) { | ||
| 267 | /* Child: check if the page is marked as dirty, page_size in kb. | ||
| 268 | * A value of 0 means the kernel is affected by the bug. */ | ||
| 269 | ret = smapsGetSharedDirty((unsigned long) q); | ||
| 270 | if (!ret) | ||
| 271 | res = -1; | ||
| 272 | else if (ret == -1) /* Failed to read */ | ||
| 273 | res = 0; | ||
| 274 | |||
| 275 | ret = write(pipefd[1], &res, sizeof(res)); /* Assume success, ignore return value*/ | ||
| 276 | exit(0); | ||
| 277 | } else { | ||
| 278 | /* Read the result from the child. */ | ||
| 279 | ret = read(pipefd[0], &res, sizeof(res)); | ||
| 280 | if (ret < 0) { | ||
| 281 | res = 0; | ||
| 282 | } | ||
| 283 | |||
| 284 | /* Reap the child pid. */ | ||
| 285 | waitpid(pid, NULL, 0); | ||
| 286 | } | ||
| 287 | |||
| 288 | exit: | ||
| 289 | /* Cleanup */ | ||
| 290 | if (pipefd[0] != -1) close(pipefd[0]); | ||
| 291 | if (pipefd[1] != -1) close(pipefd[1]); | ||
| 292 | if (p != NULL) munmap(p, map_size); | ||
| 293 | |||
| 294 | if (res == -1) | ||
| 295 | *error_msg = sdsnew( | ||
| 296 | "Your kernel has a bug that could lead to data corruption during background save. " | ||
| 297 | "Please upgrade to the latest stable kernel."); | ||
| 298 | |||
| 299 | return res; | ||
| 300 | } | ||
| 301 | #endif /* __arm64__ */ | ||
| 302 | #endif /* __linux__ */ | ||
| 303 | |||
| 304 | /* | ||
| 305 | * Standard system check interface: | ||
| 306 | * Each check has a name `name` and a functions pointer `check_fn`. | ||
| 307 | * `check_fn` should return: | ||
| 308 | * -1 in case the check fails. | ||
| 309 | * 1 in case the check passes. | ||
| 310 | * 0 in case the check could not be completed (usually because of some unexpected failed system call). | ||
| 311 | * When (and only when) the check fails and -1 is returned and error description is places in a new sds pointer to by | ||
| 312 | * the single `sds*` argument to `check_fn`. This message should be freed by the caller via `sdsfree()`. | ||
| 313 | */ | ||
| 314 | typedef struct { | ||
| 315 | const char *name; | ||
| 316 | int (*check_fn)(sds*); | ||
| 317 | } check; | ||
| 318 | |||
| 319 | check checks[] = { | ||
| 320 | #ifdef __linux__ | ||
| 321 | {.name = "slow-clocksource", .check_fn = checkClocksource}, | ||
| 322 | {.name = "xen-clocksource", .check_fn = checkXenClocksource}, | ||
| 323 | {.name = "overcommit", .check_fn = checkOvercommit}, | ||
| 324 | {.name = "THP", .check_fn = checkTHPEnabled}, | ||
| 325 | #ifdef __arm64__ | ||
| 326 | {.name = "madvise-free-fork-bug", .check_fn = checkLinuxMadvFreeForkBug}, | ||
| 327 | #endif | ||
| 328 | #endif | ||
| 329 | {.name = NULL, .check_fn = NULL} | ||
| 330 | }; | ||
| 331 | |||
| 332 | /* Performs various system checks, returns 0 if any check fails, 1 otherwise. */ | ||
| 333 | int syscheck(void) { | ||
| 334 | check *cur_check = checks; | ||
| 335 | int ret = 1; | ||
| 336 | sds err_msg = NULL; | ||
| 337 | while (cur_check->check_fn) { | ||
| 338 | int res = cur_check->check_fn(&err_msg); | ||
| 339 | printf("[%s]...", cur_check->name); | ||
| 340 | if (res == 0) { | ||
| 341 | printf("skipped\n"); | ||
| 342 | } else if (res == 1) { | ||
| 343 | printf("OK\n"); | ||
| 344 | } else { | ||
| 345 | printf("WARNING:\n"); | ||
| 346 | printf("%s\n", err_msg); | ||
| 347 | sdsfree(err_msg); | ||
| 348 | ret = 0; | ||
| 349 | } | ||
| 350 | cur_check++; | ||
| 351 | } | ||
| 352 | |||
| 353 | return ret; | ||
| 354 | } | ||
