From 5bfadab86cb71030deac7d64061bf2fc7267ce3c Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Wed, 10 Feb 2016 01:36:51 -0800 Subject: [PATCH] dcstat --- README.md | 1 + man/man8/dcstat.8 | 61 ++++++++++++++++++++ tools/dcstat.py | 143 +++++++++++++++++++++++++++++++++++++++++++++++ tools/dcstat_example.txt | 108 +++++++++++++++++++++++++++++++++++ 4 files changed, 313 insertions(+) create mode 100644 man/man8/dcstat.8 create mode 100755 tools/dcstat.py create mode 100644 tools/dcstat_example.txt diff --git a/README.md b/README.md index 43355b6..a917643 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,7 @@ Tools: - tools/[bitesize](tools/bitesize.py): Show per process I/O size histogram. [Examples](tools/bitesize_example.txt). - tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt). - tools/[execsnoop](tools/execsnoop.py): Trace new processes via exec() syscalls. [Examples](tools/execsnoop_example.txt). +- tools/[dcstat](tools/dcstat.py): Directory entry cache (dcache) stats. [Examples](tools/dcstat_example.txt). - tools/[filelife](tools/filelife.py): Trace the lifespan of short-lived files. [Examples](tools/filelife_example.txt). - tools/[fileslower](tools/fileslower.py): Trace slow synchronous file reads and writes. [Examples](tools/fileslower_example.txt). - tools/[filetop](tools/filetop.py): File reads and writes by filename and process. Top for files. [Examples](tools/filetop_example.txt). diff --git a/man/man8/dcstat.8 b/man/man8/dcstat.8 new file mode 100644 index 0000000..e2bc4dc --- /dev/null +++ b/man/man8/dcstat.8 @@ -0,0 +1,61 @@ +.TH dcstat 8 "2016-02-09" "USER COMMANDS" +.SH NAME +dcstat \- Directory entry cache (dcache) stats. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B dcstat +[interval [count]] +.SH DESCRIPTION +The Linux directory entry cache (dcache) improves the performance of file and +directory name lookups. This tool provides per-second summary statistics of +dcache performance. + +This uses kernel dynamic tracing of kernel functions, lookup_fast() and +d_lookup(), which will need to be modified to match kernel changes. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH EXAMPLES +.TP +Print summaries each second: +# +.B dcstat +.TP +Print output every five seconds, three times: +# +.B dcstat 5 3 +.SH FIELDS +.TP +REFS/s +Number dcache lookups (references) per second. +.TP +SLOW/s +Number of dcache lookups that failed the lookup_fast() path and executed the +lookup_slow() path instead. +.TP +MISS/s +Number of dcache misses (failed both fast and slow lookups). +.TP +HIT% +Percentage of dcache hits over total references. +.SH OVERHEAD +The overhead depends on the frequency of file and directory name lookups. +While the per-event overhead is low, some applications may make over 100k +lookups per second, and the low per-event overhead will begin to add up, and +could begin to be measurable (over 10% CPU usage). Measure in a test +environment. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +dcsnoop(8) diff --git a/tools/dcstat.py b/tools/dcstat.py new file mode 100755 index 0000000..b526a43 --- /dev/null +++ b/tools/dcstat.py @@ -0,0 +1,143 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# dcstat Directory entry cache (dcache) stats. +# For Linux, uses BCC, eBPF. +# +# USAGE: dcstat [interval [count]] +# +# This uses kernel dynamic tracing of kernel functions, lookup_fast() and +# d_lookup(), which will need to be modified to match kernel changes. See +# code comments. +# +# Copyright 2016 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 09-Feb-2016 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +from ctypes import c_int +from time import sleep, strftime +from sys import argv + +def usage(): + print("USAGE: %s [interval [count]]" % argv[0]) + exit() + +# arguments +interval = 1 +count = -1 +if len(argv) > 1: + try: + interval = int(argv[1]) + if interval == 0: + raise + if len(argv) > 2: + count = int(argv[2]) + except: # also catches -h, --help + usage() + +# define BPF program +bpf_text = """ +#include + +enum stats { + S_REFS = 1, + S_SLOW, + S_MISS, + S_MAXSTAT +}; + +BPF_TABLE("array", int, u64, stats, S_MAXSTAT + 1); + +/* + * How this is instrumented, and how to interpret the statistics, is very much + * tied to the current kernel implementation (this was written on Linux 4.4). + * This will need maintenance to keep working as the implementation changes. To + * aid future adventurers, this is is what the current code does, and why. + * + * First problem: the current implementation takes a path and then does a + * lookup of each component. So how do we count a reference? Once for the path + * lookup, or once for every component lookup? I've chosen the latter + * since it seems to map more closely to actual dcache lookups (via + * __d_lookup_rcu()). It's counted via calls to lookup_fast(). + * + * The implementation tries different, progressively slower, approaches to + * lookup a file. At what point do we call it a dcache miss? I've choosen when + * a d_lookup() (which is called during lookup_slow()) returns zero. + * + * I've also included a "SLOW" statistic to show how often the fast lookup + * failed. Whether this exists or is interesting is an implementation detail, + * and the "SLOW" statistic may be removed in future versions. + */ +void count_fast(struct pt_regs *ctx) { + int key = S_REFS; + u64 *leaf = stats.lookup(&key); + if (leaf) (*leaf)++; +} + +void count_lookup(struct pt_regs *ctx) { + int key = S_SLOW; + u64 *leaf = stats.lookup(&key); + if (leaf) (*leaf)++; + if (ctx->ax == 0) { + key = S_MISS; + leaf = stats.lookup(&key); + if (leaf) (*leaf)++; + } +} +""" + +# load BPF program +b = BPF(text=bpf_text) +b.attach_kprobe(event="lookup_fast", fn_name="count_fast") +b.attach_kretprobe(event="d_lookup", fn_name="count_lookup") + +# stat column labels and indexes +stats = { + "REFS": 1, + "SLOW": 2, + "MISS": 3 +} + +# header +print("%-8s " % "TIME", end="") +for stype, idx in sorted(stats.iteritems(), key=lambda (k, v): (v, k)): + print(" %8s" % (stype + "/s"), end="") +print(" %8s" % "HIT%") + +# output +i = 0 +while (1): + if count > 0: + i += 1 + if i > count: + exit() + try: + sleep(interval) + except KeyboardInterrupt: + pass + exit() + + print("%-8s: " % strftime("%H:%M:%S"), end="") + + # print each statistic as a column + for stype, idx in sorted(stats.iteritems(), key=lambda (k, v): (v, k)): + try: + val = b["stats"][c_int(idx)].value / interval + print(" %8d" % val, end="") + except: + print(" %8d" % 0, end="") + + # print hit ratio percentage + try: + ref = b["stats"][c_int(stats["REFS"])].value + miss = b["stats"][c_int(stats["MISS"])].value + hit = ref - miss + pct = float(100) * hit / ref + print(" %8.2f" % pct) + except: + print(" %7s%%" % "-") + + b["stats"].clear() diff --git a/tools/dcstat_example.txt b/tools/dcstat_example.txt new file mode 100644 index 0000000..574473f --- /dev/null +++ b/tools/dcstat_example.txt @@ -0,0 +1,108 @@ +Demonstrations of dcstat, the Linux eBPF/bcc version. + + +dcstat shows directory entry cache (dcache) statistics. For example: + +# ./dcstat +TIME REFS/s SLOW/s MISS/s HIT% +08:11:47: 2059 141 97 95.29 +08:11:48: 79974 151 106 99.87 +08:11:49: 192874 146 102 99.95 +08:11:50: 2051 144 100 95.12 +08:11:51: 73373 17239 17194 76.57 +08:11:52: 54685 25431 25387 53.58 +08:11:53: 18127 8182 8137 55.12 +08:11:54: 22517 10345 10301 54.25 +08:11:55: 7524 2881 2836 62.31 +08:11:56: 2067 141 97 95.31 +08:11:57: 2115 145 101 95.22 + +The output shows the total references per second ("REFS/s"), the number that +took a slower code path to be processed ("SLOW/s"), the number of dcache misses +("MISS/s"), and the hit ratio as a percentage. By default, an interval of 1 +second is used. + +At 08:11:49, there were 192 thousand references, which almost entirely hit +from the dcache, with a hit ration of 99.95%. A little later, starting at +08:11:51, a workload began that walked many uncached files, reducing the hit +ratio to 53%, and more importantly, a miss rate of over 10 thousand per second. + + +Here's an interesting workload: + +# ./dcstat +TIME REFS/s SLOW/s MISS/s HIT% +08:15:53: 250683 141 97 99.96 +08:15:54: 266115 145 101 99.96 +08:15:55: 268428 141 97 99.96 +08:15:56: 260389 143 99 99.96 + +It's a 99.96% hit ratio, and these are all negative hits: accessing a file that +does not exist. Here's the C program that generated the workload: + +# cat -n badopen.c + 1 #include + 2 #include + 3 #include + 4 + 5 int + 6 main(int argc, char *argv[]) + 7 { + 8 int fd; + 9 while (1) { + 10 fd = open("bad", O_RDONLY); + 11 } + 12 return 0; + 13 } + +This is a simple workload generator than tries to open a missing file ("bad") +as quickly as possible. + + +Lets see what happens if the workload attempts to open a different filename +each time (which is also a missing file), using the following C code: + +# cat -n badopen2.c + 1 #include + 2 #include + 3 #include + 4 #include + 5 + 6 int + 7 main(int argc, char *argv[]) + 8 { + 9 int fd, i = 0; + 10 char buf[128] = {}; + 11 + 12 while (1) { + 13 sprintf(buf, "bad%d", i++); + 14 fd = open(buf, O_RDONLY); + 15 } + 16 return 0; + 17 } + +Here's dcstat: + +# ./dcstat +TIME REFS/s SLOW/s MISS/s HIT% +08:18:52: 241131 237544 237505 1.51 +08:18:53: 238210 236323 236278 0.82 +08:18:54: 235259 233307 233261 0.85 +08:18:55: 233144 231256 231214 0.83 +08:18:56: 231981 230097 230053 0.83 + + +dcstat also supports an optional interval and optional count. For example, +printing 5 second summaries 3 times: + +# ./dcstat 5 3 +TIME REFS/s SLOW/s MISS/s HIT% +08:20:03: 2085 143 99 95.23 +08:20:08: 2077 143 98 95.24 +08:20:14: 2071 144 100 95.15 + + +USAGE message: + +# ./dcstat -h +USAGE: ./dcstat [interval [count]] -- 2.7.4