summaryrefslogtreecommitdiff
path: root/examples/redis-unstable/src/syscheck.c
diff options
context:
space:
mode:
Diffstat (limited to 'examples/redis-unstable/src/syscheck.c')
-rw-r--r--examples/redis-unstable/src/syscheck.c354
1 files changed, 354 insertions, 0 deletions
diff --git a/examples/redis-unstable/src/syscheck.c b/examples/redis-unstable/src/syscheck.c
new file mode 100644
index 0000000..1251c1d
--- /dev/null
+++ b/examples/redis-unstable/src/syscheck.c
@@ -0,0 +1,354 @@
1/*
2 * Copyright (c) 2016-Present, Redis Ltd.
3 * All rights reserved.
4 *
5 * Licensed under your choice of (a) the Redis Source Available License 2.0
6 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
7 * GNU Affero General Public License v3 (AGPLv3).
8 */
9#include "fmacros.h"
10#include "config.h"
11#include "syscheck.h"
12#include "sds.h"
13#include "anet.h"
14
15#include <time.h>
16#include <sys/resource.h>
17#include <unistd.h>
18#include <stdio.h>
19#include <stdlib.h>
20#include <string.h>
21#include <errno.h>
22#include <sys/wait.h>
23
24#ifdef __linux__
25#include <sys/mman.h>
26#endif
27
28
29#ifdef __linux__
30static sds read_sysfs_line(char *path) {
31 char buf[256];
32 FILE *f = fopen(path, "r");
33 if (!f) return NULL;
34 if (!fgets(buf, sizeof(buf), f)) {
35 fclose(f);
36 return NULL;
37 }
38 fclose(f);
39 sds res = sdsnew(buf);
40 res = sdstrim(res, " \n");
41 return res;
42}
43
44/* Verify our clocksource implementation doesn't go through a system call (uses vdso).
45 * Going through a system call to check the time degrades Redis performance. */
46static int checkClocksource(sds *error_msg) {
47 unsigned long test_time_us, system_hz;
48 struct timespec ts;
49 unsigned long long start_us;
50 struct rusage ru_start, ru_end;
51
52 system_hz = sysconf(_SC_CLK_TCK);
53
54 if (getrusage(RUSAGE_SELF, &ru_start) != 0)
55 return 0;
56 if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) {
57 return 0;
58 }
59 start_us = (ts.tv_sec * 1000000 + ts.tv_nsec / 1000);
60
61 /* clock_gettime() busy loop of 5 times system tick (for a system_hz of 100 this is 50ms)
62 * Using system_hz is required to ensure accurate measurements from getrusage().
63 * If our clocksource is configured correctly (vdso) this will result in no system calls.
64 * If our clocksource is inefficient it'll waste most of the busy loop in the kernel. */
65 test_time_us = 5 * 1000000 / system_hz;
66 while (1) {
67 unsigned long long d;
68 if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
69 return 0;
70 d = (ts.tv_sec * 1000000 + ts.tv_nsec / 1000) - start_us;
71 if (d >= test_time_us) break;
72 }
73 if (getrusage(RUSAGE_SELF, &ru_end) != 0)
74 return 0;
75
76 long long stime_us = (ru_end.ru_stime.tv_sec * 1000000 + ru_end.ru_stime.tv_usec) - (ru_start.ru_stime.tv_sec * 1000000 + ru_start.ru_stime.tv_usec);
77 long long utime_us = (ru_end.ru_utime.tv_sec * 1000000 + ru_end.ru_utime.tv_usec) - (ru_start.ru_utime.tv_sec * 1000000 + ru_start.ru_utime.tv_usec);
78
79 /* If more than 10% of the process time was in system calls we probably have an inefficient clocksource, print a warning */
80 if (stime_us * 10 > stime_us + utime_us) {
81 sds avail = read_sysfs_line("/sys/devices/system/clocksource/clocksource0/available_clocksource");
82 sds curr = read_sysfs_line("/sys/devices/system/clocksource/clocksource0/current_clocksource");
83 *error_msg = sdscatprintf(sdsempty(),
84 "Slow system clocksource detected. This can result in degraded performance. "
85 "Consider changing the system's clocksource. "
86 "Current clocksource: %s. Available clocksources: %s. "
87 "For example: run the command 'echo tsc > /sys/devices/system/clocksource/clocksource0/current_clocksource' as root. "
88 "To permanently change the system's clocksource you'll need to set the 'clocksource=' kernel command line parameter.",
89 curr ? curr : "", avail ? avail : "");
90 sdsfree(avail);
91 sdsfree(curr);
92 return -1;
93 } else {
94 return 1;
95 }
96}
97
98/* Verify we're not using the `xen` clocksource. The xen hypervisor's default clocksource is slow and affects
99 * Redis's performance. This has been measured on ec2 xen based instances. ec2 recommends using the non-default
100 * tsc clock source for these instances. */
101int checkXenClocksource(sds *error_msg) {
102 sds curr = read_sysfs_line("/sys/devices/system/clocksource/clocksource0/current_clocksource");
103 int res = 1;
104 if (curr == NULL) {
105 res = 0;
106 } else if (strcmp(curr, "xen") == 0) {
107 *error_msg = sdsnew(
108 "Your system is configured to use the 'xen' clocksource which might lead to degraded performance. "
109 "Check the result of the [slow-clocksource] system check: run 'redis-server --check-system' to check if "
110 "the system's clocksource isn't degrading performance.");
111 res = -1;
112 }
113 sdsfree(curr);
114 return res;
115}
116
117/* Verify overcommit is enabled.
118 * When overcommit memory is disabled Linux will kill the forked child of a background save
119 * if we don't have enough free memory to satisfy double the current memory usage even though
120 * the forked child uses copy-on-write to reduce its actual memory usage. */
121int checkOvercommit(sds *error_msg) {
122 FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
123 char buf[64];
124
125 if (!fp) return 0;
126 if (fgets(buf,64,fp) == NULL) {
127 fclose(fp);
128 return 0;
129 }
130 fclose(fp);
131
132 if (strtol(buf, NULL, 10) != 1) {
133 *error_msg = sdsnew(
134 "Memory overcommit must be enabled! Without it, a background save or replication may fail under low memory condition. "
135#if defined(USE_JEMALLOC)
136 "Being disabled, it can also cause failures without low memory condition, see https://github.com/jemalloc/jemalloc/issues/1328. "
137#endif
138 "To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the "
139 "command 'sysctl vm.overcommit_memory=1' for this to take effect.");
140 return -1;
141 } else {
142 return 1;
143 }
144}
145
146/* Make sure transparent huge pages aren't always enabled. When they are this can cause copy-on-write logic
147 * to consume much more memory and reduce performance during forks. */
148int checkTHPEnabled(sds *error_msg) {
149 char buf[1024];
150
151 FILE *fp = fopen("/sys/kernel/mm/transparent_hugepage/enabled","r");
152 if (!fp) return 0;
153 if (fgets(buf,sizeof(buf),fp) == NULL) {
154 fclose(fp);
155 return 0;
156 }
157 fclose(fp);
158
159 if (strstr(buf,"[always]") != NULL) {
160 *error_msg = sdsnew(
161 "You have Transparent Huge Pages (THP) support enabled in your kernel. "
162 "This will create latency and memory usage issues with Redis. "
163 "To fix this issue run the command 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled' as root, "
164 "and add it to your /etc/rc.local in order to retain the setting after a reboot. "
165 "Redis must be restarted after THP is disabled (set to 'madvise' or 'never').");
166 return -1;
167 } else {
168 return 1;
169 }
170}
171
172#ifdef __arm64__
/* Get size in kilobytes of the Shared_Dirty pages of the calling process for the
 * memory map corresponding to the provided address, or -1 on error.
 *
 * /proc/self/smaps is scanned line by line: a line starting with "<from>-<to>"
 * (hex) opens a new mapping, and the first "Shared_Dirty:" line seen while
 * inside the mapping that contains `addr` supplies the value.
 *
 * -1 is returned when the file cannot be opened, when no mapping contains
 * `addr`, or when the Shared_Dirty value cannot be parsed. */
static int smapsGetSharedDirty(unsigned long addr) {
    static const char field[] = "Shared_Dirty:";
    int in_mapping = 0, val = -1;
    int at_line_start = 1; /* Does the next fgets() chunk begin a new line? */
    char buf[256];
    FILE *f;

    f = fopen("/proc/self/smaps", "r");
    if (!f) return -1;

    while (fgets(buf, sizeof(buf), f)) {
        unsigned long from, to;
        size_t len = strlen(buf);
        int is_line_start = at_line_start;

        /* A chunk without a trailing newline means the line was longer than
         * the buffer (mapping headers carry arbitrary pathnames). The rest of
         * that line must not be parsed as if it were a fresh line, or a path
         * fragment such as "a-b.so" could be mistaken for an address range. */
        at_line_start = (len > 0 && buf[len - 1] == '\n');
        if (!is_line_start) continue;

        if (sscanf(buf, "%lx-%lx", &from, &to) == 2)
            in_mapping = from <= addr && addr < to;

        /* strncmp() stops at the terminating NUL, so short lines never cause
         * stale buffer bytes to be compared. */
        if (in_mapping && strncmp(buf, field, sizeof(field) - 1) == 0) {
            /* If parsing fails, report an error rather than a partial value. */
            if (sscanf(buf, "%*s %d", &val) != 1) val = -1;
            break;
        }
    }

    fclose(f);
    return val;
}
202
203/* Older arm64 Linux kernels have a bug that could lead to data corruption
204 * during background save in certain scenarios. This function checks if the
205 * kernel is affected.
206 * The bug was fixed in commit ff1712f953e27f0b0718762ec17d0adb15c9fd0b
207 * titled: "arm64: pgtable: Ensure dirty bit is preserved across pte_wrprotect()"
208 */
209int checkLinuxMadvFreeForkBug(sds *error_msg) {
210 int ret, pipefd[2] = { -1, -1 };
211 pid_t pid;
212 char *p = NULL, *q;
213 int res = 1;
214 long page_size = sysconf(_SC_PAGESIZE);
215 long map_size = 3 * page_size;
216
217 /* Create a memory map that's in our full control (not one used by the allocator). */
218 p = mmap(NULL, map_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
219 if (p == MAP_FAILED) {
220 return 0;
221 }
222
223 q = p + page_size;
224
225 /* Split the memory map in 3 pages by setting their protection as RO|RW|RO to prevent
226 * Linux from merging this memory map with adjacent VMAs. */
227 ret = mprotect(q, page_size, PROT_READ | PROT_WRITE);
228 if (ret < 0) {
229 res = 0;
230 goto exit;
231 }
232
233 /* Write to the page once to make it resident */
234 *(volatile char*)q = 0;
235
236 /* Tell the kernel that this page is free to be reclaimed. */
237#ifndef MADV_FREE
238#define MADV_FREE 8
239#endif
240 ret = madvise(q, page_size, MADV_FREE);
241 if (ret < 0) {
242 /* MADV_FREE is not available on older kernels that are presumably
243 * not affected. */
244 if (errno == EINVAL) goto exit;
245
246 res = 0;
247 goto exit;
248 }
249
250 /* Write to the page after being marked for freeing, this is supposed to take
251 * ownership of that page again. */
252 *(volatile char*)q = 0;
253
254 /* Create a pipe for the child to return the info to the parent. */
255 ret = anetPipe(pipefd, 0, 0);
256 if (ret < 0) {
257 res = 0;
258 goto exit;
259 }
260
261 /* Fork the process. */
262 pid = fork();
263 if (pid < 0) {
264 res = 0;
265 goto exit;
266 } else if (!pid) {
267 /* Child: check if the page is marked as dirty, page_size in kb.
268 * A value of 0 means the kernel is affected by the bug. */
269 ret = smapsGetSharedDirty((unsigned long) q);
270 if (!ret)
271 res = -1;
272 else if (ret == -1) /* Failed to read */
273 res = 0;
274
275 ret = write(pipefd[1], &res, sizeof(res)); /* Assume success, ignore return value*/
276 exit(0);
277 } else {
278 /* Read the result from the child. */
279 ret = read(pipefd[0], &res, sizeof(res));
280 if (ret < 0) {
281 res = 0;
282 }
283
284 /* Reap the child pid. */
285 waitpid(pid, NULL, 0);
286 }
287
288exit:
289 /* Cleanup */
290 if (pipefd[0] != -1) close(pipefd[0]);
291 if (pipefd[1] != -1) close(pipefd[1]);
292 if (p != NULL) munmap(p, map_size);
293
294 if (res == -1)
295 *error_msg = sdsnew(
296 "Your kernel has a bug that could lead to data corruption during background save. "
297 "Please upgrade to the latest stable kernel.");
298
299 return res;
300}
301#endif /* __arm64__ */
302#endif /* __linux__ */
303
304/*
305 * Standard system check interface:
306 * Each check has a name `name` and a functions pointer `check_fn`.
307 * `check_fn` should return:
308 * -1 in case the check fails.
309 * 1 in case the check passes.
310 * 0 in case the check could not be completed (usually because of some unexpected failed system call).
311 * When (and only when) the check fails and -1 is returned and error description is places in a new sds pointer to by
312 * the single `sds*` argument to `check_fn`. This message should be freed by the caller via `sdsfree()`.
313 */
314typedef struct {
315 const char *name;
316 int (*check_fn)(sds*);
317} check;
318
319check checks[] = {
320#ifdef __linux__
321 {.name = "slow-clocksource", .check_fn = checkClocksource},
322 {.name = "xen-clocksource", .check_fn = checkXenClocksource},
323 {.name = "overcommit", .check_fn = checkOvercommit},
324 {.name = "THP", .check_fn = checkTHPEnabled},
325#ifdef __arm64__
326 {.name = "madvise-free-fork-bug", .check_fn = checkLinuxMadvFreeForkBug},
327#endif
328#endif
329 {.name = NULL, .check_fn = NULL}
330};
331
332/* Performs various system checks, returns 0 if any check fails, 1 otherwise. */
333int syscheck(void) {
334 check *cur_check = checks;
335 int ret = 1;
336 sds err_msg = NULL;
337 while (cur_check->check_fn) {
338 int res = cur_check->check_fn(&err_msg);
339 printf("[%s]...", cur_check->name);
340 if (res == 0) {
341 printf("skipped\n");
342 } else if (res == 1) {
343 printf("OK\n");
344 } else {
345 printf("WARNING:\n");
346 printf("%s\n", err_msg);
347 sdsfree(err_msg);
348 ret = 0;
349 }
350 cur_check++;
351 }
352
353 return ret;
354}