aboutsummaryrefslogtreecommitdiff
path: root/examples/redis-unstable/src/latency.c
diff options
context:
space:
mode:
Diffstat (limited to 'examples/redis-unstable/src/latency.c')
-rw-r--r--examples/redis-unstable/src/latency.c721
1 file changed, 721 insertions, 0 deletions
diff --git a/examples/redis-unstable/src/latency.c b/examples/redis-unstable/src/latency.c
new file mode 100644
index 0000000..fcbe12d
--- /dev/null
+++ b/examples/redis-unstable/src/latency.c
@@ -0,0 +1,721 @@
1/* The latency monitor allows to easily observe the sources of latency
2 * in a Redis instance using the LATENCY command. Different latency
3 * sources are monitored, like disk I/O, execution of commands, fork
4 * system call, and so forth.
5 *
6 * ----------------------------------------------------------------------------
7 *
8 * Copyright (c) 2014-Present, Redis Ltd.
9 * All rights reserved.
10 *
11 * Licensed under your choice of (a) the Redis Source Available License 2.0
12 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
13 * GNU Affero General Public License v3 (AGPLv3).
14 */
15
16#include "server.h"
17#include "hdr_histogram.h"
18
19/* Dictionary type for latency events. */
20int dictStringKeyCompare(dictCmpCache *cache, const void *key1, const void *key2) {
21 UNUSED(cache);
22 return strcmp(key1,key2) == 0;
23}
24
/* Hash a NUL-terminated string key with the generic byte-wise hash. */
uint64_t dictStringHash(const void *key) {
    const char *s = key;
    return dictGenHashFunction(s, strlen(s));
}
28
29void dictVanillaFree(dict *d, void *val);
30
/* Dict type mapping event name -> struct latencyTimeSeries.
 * Keys are zstrdup()'d strings and values are zmalloc()'d time series
 * (see latencyAddSample()), so both destructors release heap memory. */
dictType latencyTimeSeriesDictType = {
    dictStringHash, /* hash function */
    NULL, /* key dup */
    NULL, /* val dup */
    dictStringKeyCompare, /* key compare */
    dictVanillaFree, /* key destructor */
    dictVanillaFree, /* val destructor */
    NULL /* allow to expand */
};
40
41/* ------------------------- Utility functions ------------------------------ */
42
/* Report the amount of AnonHugePages in smap, in bytes. If the return
 * value of the function is non-zero, the process is being targeted by
 * THP support, and is likely to have memory usage / latency issues. */
int THPGetAnonHugePagesSize(void) {
    /* NOTE(review): the -1 second argument presumably means "no pid /
     * current process" — confirm against zmalloc_get_smap_bytes_by_field(). */
    int thp_bytes = zmalloc_get_smap_bytes_by_field("AnonHugePages:",-1);
    return thp_bytes;
}
49
50/* ---------------------------- Latency API --------------------------------- */
51
/* Latency monitor initialization. We just need to create the dictionary
 * of time series, each time series is created on demand in order to avoid
 * having a fixed list to maintain. The dictionary maps event names
 * (zstrdup()'d strings) to struct latencyTimeSeries values. */
void latencyMonitorInit(void) {
    server.latency_events = dictCreate(&latencyTimeSeriesDictType);
}
58
59/* Add the specified sample to the specified time series "event".
60 * This function is usually called via latencyAddSampleIfNeeded(), that
61 * is a macro that only adds the sample if the latency is higher than
62 * server.latency_monitor_threshold. */
63void latencyAddSample(const char *event, mstime_t latency) {
64 struct latencyTimeSeries *ts = dictFetchValue(server.latency_events,event);
65 time_t now = time(NULL);
66 int prev;
67
68 /* Create the time series if it does not exist. */
69 if (ts == NULL) {
70 ts = zmalloc(sizeof(*ts));
71 ts->idx = 0;
72 ts->max = 0;
73 memset(ts->samples,0,sizeof(ts->samples));
74 dictAdd(server.latency_events,zstrdup(event),ts);
75 }
76
77 if (latency > ts->max) ts->max = latency;
78
79 /* If the previous sample is in the same second, we update our old sample
80 * if this latency is > of the old one, or just return. */
81 prev = (ts->idx + LATENCY_TS_LEN - 1) % LATENCY_TS_LEN;
82 if (ts->samples[prev].time == now) {
83 if (latency > ts->samples[prev].latency)
84 ts->samples[prev].latency = latency;
85 return;
86 }
87
88 ts->samples[ts->idx].time = now;
89 ts->samples[ts->idx].latency = latency;
90
91 ts->idx++;
92 if (ts->idx == LATENCY_TS_LEN) ts->idx = 0;
93}
94
95/* Reset data for the specified event, or all the events data if 'event' is
96 * NULL.
97 *
98 * Note: this is O(N) even when event_to_reset is not NULL because makes
99 * the code simpler and we have a small fixed max number of events. */
100int latencyResetEvent(char *event_to_reset) {
101 dictIterator di;
102 dictEntry *de;
103 int resets = 0;
104
105 dictInitSafeIterator(&di, server.latency_events);
106 while((de = dictNext(&di)) != NULL) {
107 char *event = dictGetKey(de);
108
109 if (event_to_reset == NULL || strcasecmp(event,event_to_reset) == 0) {
110 dictDelete(server.latency_events, event);
111 resets++;
112 }
113 }
114 dictResetIterator(&di);
115 return resets;
116}
117
118/* ------------------------ Latency reporting (doctor) ---------------------- */
119
/* Analyze the samples available for a given event and return a structure
 * populate with different metrics, average, MAD, min, max, and so forth.
 * Check latency.h definition of struct latencyStats for more info.
 * If the specified event has no elements the structure is populate with
 * zero values. */
void analyzeLatencyForEvent(char *event, struct latencyStats *ls) {
    struct latencyTimeSeries *ts = dictFetchValue(server.latency_events,event);
    int j;
    uint64_t sum;

    /* Start from all-zero stats; all_time_high is the only field we can
     * fill before scanning the samples. */
    ls->all_time_high = ts ? ts->max : 0;
    ls->avg = 0;
    ls->min = 0;
    ls->max = 0;
    ls->mad = 0;
    ls->samples = 0;
    ls->period = 0;
    if (!ts) return;

    /* First pass, populate everything but the MAD. */
    sum = 0;
    for (j = 0; j < LATENCY_TS_LEN; j++) {
        if (ts->samples[j].time == 0) continue; /* Unused slot. */
        ls->samples++;
        if (ls->samples == 1) {
            /* First sample seen initializes both extremes. */
            ls->min = ls->max = ts->samples[j].latency;
        } else {
            if (ls->min > ts->samples[j].latency)
                ls->min = ts->samples[j].latency;
            if (ls->max < ts->samples[j].latency)
                ls->max = ts->samples[j].latency;
        }
        sum += ts->samples[j].latency;

        /* Track the oldest event time in ls->period. */
        if (ls->period == 0 || ts->samples[j].time < ls->period)
            ls->period = ts->samples[j].time;
    }

    /* So far avg is actually the sum of the latencies, and period is
     * the oldest event time. We need to make the first an average and
     * the second a range of seconds. */
    if (ls->samples) {
        ls->avg = sum / ls->samples;
        ls->period = time(NULL) - ls->period;
        if (ls->period == 0) ls->period = 1; /* Never report a zero period. */
    }

    /* Second pass, compute MAD (mean absolute deviation from the average,
     * requires the average from the first pass). */
    sum = 0;
    for (j = 0; j < LATENCY_TS_LEN; j++) {
        int64_t delta;

        if (ts->samples[j].time == 0) continue;
        delta = (int64_t)ls->avg - ts->samples[j].latency;
        if (delta < 0) delta = -delta;
        sum += delta;
    }
    if (ls->samples) ls->mad = sum / ls->samples;
}
180
/* Create a human readable report of latency events for this Redis instance.
 * Returns a newly allocated sds string; the caller owns it and must
 * sdsfree() it (see latencyCommand's DOCTOR branch). */
sds createLatencyReport(void) {
    sds report = sdsempty();
    /* Advice flags: the per-event scan below turns these on, and the
     * matching suggestion paragraphs are appended at the end. */
    int advise_better_vm = 0; /* Better virtual machines. */
    int advise_slowlog_enabled = 0; /* Enable slowlog. */
    int advise_slowlog_tuning = 0; /* Reconfigure slowlog. */
    int advise_slowlog_inspect = 0; /* Check your slowlog. */
    int advise_disk_contention = 0; /* Try to lower disk contention. */
    int advise_scheduler = 0; /* Intrinsic latency. */
    int advise_data_writeback = 0; /* data=writeback. */
    int advise_no_appendfsync = 0; /* don't fsync during rewrites. */
    int advise_local_disk = 0; /* Avoid remote disks. */
    int advise_ssd = 0; /* Use an SSD drive. */
    int advise_write_load_info = 0; /* Print info about AOF and write load. */
    int advise_hz = 0; /* Use higher HZ. */
    int advise_large_objects = 0; /* Deletion of large objects. */
    int advise_mass_eviction = 0; /* Avoid mass eviction of keys. */
    int advise_relax_fsync_policy = 0; /* appendfsync always is slow. */
    int advise_disable_thp = 0; /* AnonHugePages detected. */
    int advices = 0;

    /* Return ASAP if the latency engine is disabled and it looks like it
     * was never enabled so far. */
    if (dictSize(server.latency_events) == 0 &&
        server.latency_monitor_threshold == 0)
    {
        report = sdscat(report,"I'm sorry, Dave, I can't do that. Latency monitoring is disabled in this Redis instance. You may use \"CONFIG SET latency-monitor-threshold <milliseconds>.\" in order to enable it. If we weren't in a deep space mission I'd suggest to take a look at https://redis.io/docs/latest/operate/oss_and_stack/management/optimization/latency-monitor.\n");
        return report;
    }

    /* Show all the events stats and add for each event some event-related
     * comment depending on the values. */
    dictIterator di;
    dictEntry *de;
    int eventnum = 0;

    dictInitSafeIterator(&di, server.latency_events);
    while((de = dictNext(&di)) != NULL) {
        char *event = dictGetKey(de);
        struct latencyTimeSeries *ts = dictGetVal(de);
        struct latencyStats ls;

        if (ts == NULL) continue;
        eventnum++;
        /* Intro paragraph only before the first event. */
        if (eventnum == 1) {
            report = sdscat(report,"Dave, I have observed latency spikes in this Redis instance. You don't mind talking about it, do you Dave?\n\n");
        }
        analyzeLatencyForEvent(event,&ls);

        /* Note: ls.samples is at least 1 here since ts exists and every
         * time series is created together with its first sample. */
        report = sdscatprintf(report,
            "%d. %s: %d latency spikes (average %lums, mean deviation %lums, period %.2f sec). Worst all time event %lums.",
            eventnum, event,
            ls.samples,
            (unsigned long) ls.avg,
            (unsigned long) ls.mad,
            (double) ls.period/ls.samples,
            (unsigned long) ts->max);

        /* Fork */
        if (!strcasecmp(event,"fork")) {
            char *fork_quality;
            if (server.stat_fork_rate < 10) {
                fork_quality = "terrible";
                advise_better_vm = 1;
                advices++;
            } else if (server.stat_fork_rate < 25) {
                fork_quality = "poor";
                advise_better_vm = 1;
                advices++;
            } else if (server.stat_fork_rate < 100) {
                fork_quality = "good";
            } else {
                fork_quality = "excellent";
            }
            report = sdscatprintf(report,
                " Fork rate is %.2f GB/sec (%s).", server.stat_fork_rate,
                fork_quality);
        }

        /* Potentially commands. */
        if (!strcasecmp(event,"command")) {
            /* Slowlog disabled, or its threshold (usecs) is above the
             * latency monitor threshold (msecs): suggest enabling/tuning. */
            if (server.slowlog_log_slower_than < 0 || server.slowlog_max_len == 0) {
                advise_slowlog_enabled = 1;
                advices++;
            } else if (server.slowlog_log_slower_than/1000 >
                       server.latency_monitor_threshold)
            {
                advise_slowlog_tuning = 1;
                advices++;
            }
            advise_slowlog_inspect = 1;
            advise_large_objects = 1;
            advices += 2;
        }

        /* fast-command. */
        if (!strcasecmp(event,"fast-command")) {
            advise_scheduler = 1;
            advices++;
        }

        /* AOF and I/O. */
        if (!strcasecmp(event,"aof-write-pending-fsync")) {
            advise_local_disk = 1;
            advise_disk_contention = 1;
            advise_ssd = 1;
            advise_data_writeback = 1;
            advices += 4;
        }

        if (!strcasecmp(event,"aof-write-active-child")) {
            advise_no_appendfsync = 1;
            advise_data_writeback = 1;
            advise_ssd = 1;
            advices += 3;
        }

        if (!strcasecmp(event,"aof-write-alone")) {
            advise_local_disk = 1;
            advise_data_writeback = 1;
            advise_ssd = 1;
            advices += 3;
        }

        if (!strcasecmp(event,"aof-fsync-always")) {
            advise_relax_fsync_policy = 1;
            advices++;
        }

        if (!strcasecmp(event,"aof-fstat") ||
            !strcasecmp(event,"rdb-unlink-temp-file")) {
            advise_disk_contention = 1;
            advise_local_disk = 1;
            advices += 2;
        }

        if (!strcasecmp(event,"aof-rewrite-diff-write") ||
            !strcasecmp(event,"aof-rename")) {
            advise_write_load_info = 1;
            advise_data_writeback = 1;
            advise_ssd = 1;
            advise_local_disk = 1;
            advices += 4;
        }

        /* Expire cycle. */
        if (!strcasecmp(event,"expire-cycle")) {
            advise_hz = 1;
            advise_large_objects = 1;
            advices += 2;
        }

        /* Eviction cycle. */
        if (!strcasecmp(event,"eviction-del")) {
            advise_large_objects = 1;
            advices++;
        }

        if (!strcasecmp(event,"eviction-cycle")) {
            advise_mass_eviction = 1;
            advices++;
        }

        report = sdscatlen(report,"\n",1);
    }
    dictResetIterator(&di);

    /* Add non event based advices. */
    if (THPGetAnonHugePagesSize() > 0) {
        advise_disable_thp = 1;
        advices++;
    }

    if (eventnum == 0 && advices == 0) {
        report = sdscat(report,"Dave, no latency spike was observed during the lifetime of this Redis instance, not in the slightest bit. I honestly think you ought to sit down calmly, take a stress pill, and think things over.\n");
    } else if (eventnum > 0 && advices == 0) {
        report = sdscat(report,"\nWhile there are latency events logged, I'm not able to suggest any easy fix. Please use the Redis community to get some help, providing this report in your help request.\n");
    } else {
        /* Add all the suggestions accumulated so far. */

        /* Better VM. */
        report = sdscat(report,"\nI have a few advices for you:\n\n");
        if (advise_better_vm) {
            report = sdscat(report,"- If you are using a virtual machine, consider upgrading it with a faster one using a hypervisior that provides less latency during fork() calls. Xen is known to have poor fork() performance. Even in the context of the same VM provider, certain kinds of instances can execute fork faster than others.\n");
        }

        /* Slow log. */
        if (advise_slowlog_enabled) {
            report = sdscatprintf(report,"- There are latency issues with potentially slow commands you are using. Try to enable the Slow Log Redis feature using the command 'CONFIG SET slowlog-log-slower-than %llu'. If the Slow log is disabled Redis is not able to log slow commands execution for you.\n", (unsigned long long)server.latency_monitor_threshold*1000);
        }

        if (advise_slowlog_tuning) {
            report = sdscatprintf(report,"- Your current Slow Log configuration only logs events that are slower than your configured latency monitor threshold. Please use 'CONFIG SET slowlog-log-slower-than %llu'.\n", (unsigned long long)server.latency_monitor_threshold*1000);
        }

        if (advise_slowlog_inspect) {
            report = sdscat(report,"- Check your Slow Log to understand what are the commands you are running which are too slow to execute. Please check https://redis.io/commands/slowlog for more information.\n");
        }

        /* Intrinsic latency. */
        if (advise_scheduler) {
            report = sdscat(report,"- The system is slow to execute Redis code paths not containing system calls. This usually means the system does not provide Redis CPU time to run for long periods. You should try to:\n"
                            " 1) Lower the system load.\n"
                            " 2) Use a computer / VM just for Redis if you are running other software in the same system.\n"
                            " 3) Check if you have a \"noisy neighbour\" problem.\n"
                            " 4) Check with 'redis-cli --intrinsic-latency 100' what is the intrinsic latency in your system.\n"
                            " 5) Check if the problem is allocator-related by recompiling Redis with MALLOC=libc, if you are using Jemalloc. However this may create fragmentation problems.\n");
        }

        /* AOF / Disk latency. */
        if (advise_local_disk) {
            report = sdscat(report,"- It is strongly advised to use local disks for persistence, especially if you are using AOF. Remote disks provided by platform-as-a-service providers are known to be slow.\n");
        }

        if (advise_ssd) {
            report = sdscat(report,"- SSD disks are able to reduce fsync latency, and total time needed for snapshotting and AOF log rewriting (resulting in smaller memory usage). With extremely high write load SSD disks can be a good option. However Redis should perform reasonably with high load using normal disks. Use this advice as a last resort.\n");
        }

        if (advise_data_writeback) {
            report = sdscat(report,"- Mounting ext3/4 filesystems with data=writeback can provide a performance boost compared to data=ordered, however this mode of operation provides less guarantees, and sometimes it can happen that after a hard crash the AOF file will have a half-written command at the end and will require to be repaired before Redis restarts.\n");
        }

        if (advise_disk_contention) {
            report = sdscat(report,"- Try to lower the disk contention. This is often caused by other disk intensive processes running in the same computer (including other Redis instances).\n");
        }

        if (advise_no_appendfsync) {
            report = sdscat(report,"- Assuming from the point of view of data safety this is viable in your environment, you could try to enable the 'no-appendfsync-on-rewrite' option, so that fsync will not be performed while there is a child rewriting the AOF file or producing an RDB file (the moment where there is high disk contention).\n");
        }

        if (advise_relax_fsync_policy && server.aof_fsync == AOF_FSYNC_ALWAYS) {
            report = sdscat(report,"- Your fsync policy is set to 'always'. It is very hard to get good performances with such a setup, if possible try to relax the fsync policy to 'onesec'.\n");
        }

        if (advise_write_load_info) {
            report = sdscat(report,"- Latency during the AOF atomic rename operation or when the final difference is flushed to the AOF file at the end of the rewrite, sometimes is caused by very high write load, causing the AOF buffer to get very large. If possible try to send less commands to accomplish the same work, or use Lua scripts to group multiple operations into a single EVALSHA call.\n");
        }

        if (advise_hz && server.hz < 100) {
            report = sdscat(report,"- In order to make the Redis keys expiring process more incremental, try to set the 'hz' configuration parameter to 100 using 'CONFIG SET hz 100'.\n");
        }

        if (advise_large_objects) {
            report = sdscat(report,"- Deleting, expiring or evicting (because of maxmemory policy) large objects is a blocking operation. If you have very large objects that are often deleted, expired, or evicted, try to fragment those objects into multiple smaller objects.\n");
        }

        if (advise_mass_eviction) {
            report = sdscat(report,"- Sudden changes to the 'maxmemory' setting via 'CONFIG SET', or allocation of large objects via sets or sorted sets intersections, STORE option of SORT, Redis Cluster large keys migrations (RESTORE command), may create sudden memory pressure forcing the server to block trying to evict keys. \n");
        }

        if (advise_disable_thp) {
            report = sdscat(report,"- I detected a non zero amount of anonymous huge pages used by your process. This creates very serious latency events in different conditions, especially when Redis is persisting on disk. To disable THP support use the command 'echo never > /sys/kernel/mm/transparent_hugepage/enabled', make sure to also add it into /etc/rc.local so that the command will be executed again after a reboot. Note that even if you have already disabled THP, you still need to restart the Redis process to get rid of the huge pages already created.\n");
        }
    }

    return report;
}
438
439/* ---------------------- Latency command implementation -------------------- */
440
441/* latencyCommand() helper to produce a map of time buckets,
442 * each representing a latency range,
443 * between 1 nanosecond and roughly 1 second.
444 * Each bucket covers twice the previous bucket's range.
445 * Empty buckets are not printed.
446 * Everything above 1 sec is considered +Inf.
447 * At max there will be log2(1000000000)=30 buckets */
448void fillCommandCDF(client *c, struct hdr_histogram* histogram) {
449 addReplyMapLen(c,2);
450 addReplyBulkCString(c,"calls");
451 addReplyLongLong(c,(long long) histogram->total_count);
452 addReplyBulkCString(c,"histogram_usec");
453 void *replylen = addReplyDeferredLen(c);
454 int samples = 0;
455 struct hdr_iter iter;
456 hdr_iter_log_init(&iter,histogram,1024,2);
457 int64_t previous_count = 0;
458 while (hdr_iter_next(&iter)) {
459 const int64_t micros = iter.highest_equivalent_value / 1000;
460 const int64_t cumulative_count = iter.cumulative_count;
461 if(cumulative_count > previous_count){
462 addReplyLongLong(c,(long long) micros);
463 addReplyLongLong(c,(long long) cumulative_count);
464 samples++;
465 }
466 previous_count = cumulative_count;
467 }
468 setDeferredMapLen(c,replylen,samples);
469}
470
471/* latencyCommand() helper to produce for all commands,
472 * a per command cumulative distribution of latencies. */
473void latencyAllCommandsFillCDF(client *c, dict *commands, int *command_with_data) {
474 dictIterator di;
475 dictEntry *de;
476 struct redisCommand *cmd;
477
478 dictInitSafeIterator(&di, commands);
479 while((de = dictNext(&di)) != NULL) {
480 cmd = (struct redisCommand *) dictGetVal(de);
481 if (cmd->latency_histogram) {
482 addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname));
483 fillCommandCDF(c, cmd->latency_histogram);
484 (*command_with_data)++;
485 }
486
487 if (cmd->subcommands) {
488 latencyAllCommandsFillCDF(c, cmd->subcommands_dict, command_with_data);
489 }
490 }
491 dictResetIterator(&di);
492}
493
494/* latencyCommand() helper to produce for a specific command set,
495 * a per command cumulative distribution of latencies. */
496void latencySpecificCommandsFillCDF(client *c) {
497 void *replylen = addReplyDeferredLen(c);
498 int command_with_data = 0;
499 for (int j = 2; j < c->argc; j++){
500 struct redisCommand *cmd = lookupCommandBySds(c->argv[j]->ptr);
501 /* If the command does not exist we skip the reply */
502 if (cmd == NULL) {
503 continue;
504 }
505
506 if (cmd->latency_histogram) {
507 addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname));
508 fillCommandCDF(c, cmd->latency_histogram);
509 command_with_data++;
510 }
511
512 if (cmd->subcommands_dict) {
513 dictEntry *de;
514 dictIterator di;
515
516 dictInitSafeIterator(&di, cmd->subcommands_dict);
517 while ((de = dictNext(&di)) != NULL) {
518 struct redisCommand *sub = dictGetVal(de);
519 if (sub->latency_histogram) {
520 addReplyBulkCBuffer(c, sub->fullname, sdslen(sub->fullname));
521 fillCommandCDF(c, sub->latency_histogram);
522 command_with_data++;
523 }
524 }
525 dictResetIterator(&di);
526 }
527 }
528 setDeferredMapLen(c,replylen,command_with_data);
529}
530
531/* latencyCommand() helper to produce a time-delay reply for all the samples
532 * in memory for the specified time series. */
533void latencyCommandReplyWithSamples(client *c, struct latencyTimeSeries *ts) {
534 void *replylen = addReplyDeferredLen(c);
535 int samples = 0, j;
536
537 for (j = 0; j < LATENCY_TS_LEN; j++) {
538 int i = (ts->idx + j) % LATENCY_TS_LEN;
539
540 if (ts->samples[i].time == 0) continue;
541 addReplyArrayLen(c,2);
542 addReplyLongLong(c,ts->samples[i].time);
543 addReplyLongLong(c,ts->samples[i].latency);
544 samples++;
545 }
546 setDeferredArrayLen(c,replylen,samples);
547}
548
549/* latencyCommand() helper to produce the reply for the LATEST subcommand,
550 * listing the last latency sample for every event type registered so far. */
551void latencyCommandReplyWithLatestEvents(client *c) {
552 dictIterator di;
553 dictEntry *de;
554
555 addReplyArrayLen(c,dictSize(server.latency_events));
556 dictInitIterator(&di, server.latency_events);
557 while((de = dictNext(&di)) != NULL) {
558 char *event = dictGetKey(de);
559 struct latencyTimeSeries *ts = dictGetVal(de);
560 int last = (ts->idx + LATENCY_TS_LEN - 1) % LATENCY_TS_LEN;
561
562 addReplyArrayLen(c,4);
563 addReplyBulkCString(c,event);
564 addReplyLongLong(c,ts->samples[last].time);
565 addReplyLongLong(c,ts->samples[last].latency);
566 addReplyLongLong(c,ts->max);
567 }
568 dictResetIterator(&di);
569}
570
#define LATENCY_GRAPH_COLS 80
/* Produce an ASCII sparkline graph (LATENCY GRAPH) for the given event's
 * time series. Returns a newly allocated sds string the caller must free. */
sds latencyCommandGenSparkeline(char *event, struct latencyTimeSeries *ts) {
    int j;
    struct sequence *seq = createSparklineSequence();
    sds graph = sdsempty();
    uint32_t min = 0, max = 0;

    /* Scan the circular buffer from the oldest to the newest sample. */
    for (j = 0; j < LATENCY_TS_LEN; j++) {
        int i = (ts->idx + j) % LATENCY_TS_LEN;
        int elapsed;
        char buf[64];

        if (ts->samples[i].time == 0) continue; /* Unused slot. */
        /* Update min and max. */
        if (seq->length == 0) {
            min = max = ts->samples[i].latency;
        } else {
            if (ts->samples[i].latency > max) max = ts->samples[i].latency;
            if (ts->samples[i].latency < min) min = ts->samples[i].latency;
        }
        /* Use as label the number of seconds / minutes / hours / days
         * ago the event happened. */
        elapsed = time(NULL) - ts->samples[i].time;
        if (elapsed < 60)
            snprintf(buf,sizeof(buf),"%ds",elapsed);
        else if (elapsed < 3600)
            snprintf(buf,sizeof(buf),"%dm",elapsed/60);
        else if (elapsed < 3600*24)
            snprintf(buf,sizeof(buf),"%dh",elapsed/3600);
        else
            snprintf(buf,sizeof(buf),"%dd",elapsed/(3600*24));
        sparklineSequenceAddSample(seq,ts->samples[i].latency,buf);
    }

    /* Summary header, a ruler of dashes, then the rendered sparkline. */
    graph = sdscatprintf(graph,
        "%s - high %lu ms, low %lu ms (all time high %lu ms)\n", event,
        (unsigned long) max, (unsigned long) min, (unsigned long) ts->max);
    for (j = 0; j < LATENCY_GRAPH_COLS; j++)
        graph = sdscatlen(graph,"-",1);
    graph = sdscatlen(graph,"\n",1);
    graph = sparklineRender(graph,seq,LATENCY_GRAPH_COLS,4,SPARKLINE_FILL);
    freeSparklineSequence(seq);
    return graph;
}
615
/* LATENCY command implementations.
 *
 * LATENCY HISTORY: return time-latency samples for the specified event.
 * LATENCY LATEST: return the latest latency for all the events classes.
 * LATENCY DOCTOR: returns a human readable analysis of instance latency.
 * LATENCY GRAPH: provide an ASCII graph of the latency of the specified event.
 * LATENCY RESET: reset data of a specified event or all the data if no event provided.
 * LATENCY HISTOGRAM: return a cumulative distribution of latencies in the format of an histogram for the specified command names.
 */
void latencyCommand(client *c) {
    struct latencyTimeSeries *ts;

    if (!strcasecmp(c->argv[1]->ptr,"history") && c->argc == 3) {
        /* LATENCY HISTORY <event> */
        ts = dictFetchValue(server.latency_events,c->argv[2]->ptr);
        if (ts == NULL) {
            /* Unknown event: reply with an empty array, not an error. */
            addReplyArrayLen(c,0);
        } else {
            latencyCommandReplyWithSamples(c,ts);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"graph") && c->argc == 3) {
        /* LATENCY GRAPH <event> */
        sds graph;
        dictEntry *de;
        char *event;

        de = dictFind(server.latency_events,c->argv[2]->ptr);
        if (de == NULL) goto nodataerr;
        ts = dictGetVal(de);
        event = dictGetKey(de);

        graph = latencyCommandGenSparkeline(event,ts);
        addReplyVerbatim(c,graph,sdslen(graph),"txt");
        sdsfree(graph);
    } else if (!strcasecmp(c->argv[1]->ptr,"latest") && c->argc == 2) {
        /* LATENCY LATEST */
        latencyCommandReplyWithLatestEvents(c);
    } else if (!strcasecmp(c->argv[1]->ptr,"doctor") && c->argc == 2) {
        /* LATENCY DOCTOR */
        sds report = createLatencyReport();

        addReplyVerbatim(c,report,sdslen(report),"txt");
        sdsfree(report);
    } else if (!strcasecmp(c->argv[1]->ptr,"reset") && c->argc >= 2) {
        /* LATENCY RESET [<event> ...]: with no events, reset everything;
         * otherwise reset each named event and reply with the total. */
        if (c->argc == 2) {
            addReplyLongLong(c,latencyResetEvent(NULL));
        } else {
            int j, resets = 0;

            for (j = 2; j < c->argc; j++)
                resets += latencyResetEvent(c->argv[j]->ptr);
            addReplyLongLong(c,resets);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"histogram") && c->argc >= 2) {
        /* LATENCY HISTOGRAM [COMMAND ...]: all commands when none named. */
        if (c->argc == 2) {
            int command_with_data = 0;
            void *replylen = addReplyDeferredLen(c);
            latencyAllCommandsFillCDF(c, server.commands, &command_with_data);
            setDeferredMapLen(c, replylen, command_with_data);
        } else {
            latencySpecificCommandsFillCDF(c);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"help") && c->argc == 2) {
        const char *help[] = {
"DOCTOR",
"    Return a human readable latency analysis report.",
"GRAPH <event>",
"    Return an ASCII latency graph for the <event> class.",
"HISTORY <event>",
"    Return time-latency samples for the <event> class.",
"LATEST",
"    Return the latest latency samples for all events.",
"RESET [<event> ...]",
"    Reset latency data of one or more <event> classes.",
"    (default: reset all data for all event classes)",
"HISTOGRAM [COMMAND ...]",
"    Return a cumulative distribution of latencies in the format of a histogram for the specified command names.",
"    If no commands are specified then all histograms are replied.",
NULL
        };
        addReplyHelp(c, help);
    } else {
        addReplySubcommandSyntaxError(c);
    }
    return;

nodataerr:
    /* Common error when the user asks for an event we have no latency
     * information about. */
    addReplyErrorFormat(c,
        "No samples available for event '%s'", (char*) c->argv[2]->ptr);
}
710
711void durationAddSample(int type, monotime duration) {
712 if (type >= EL_DURATION_TYPE_NUM) {
713 return;
714 }
715 durationStats* ds = &server.duration_stats[type];
716 ds->cnt++;
717 ds->sum += duration;
718 if (duration > ds->max) {
719 ds->max = duration;
720 }
721}