From e350115b6ccbda7a13bce6c0cfbd3bb11bbd322b Mon Sep 17 00:00:00 2001 From: Sasha Goldshtein Date: Sat, 13 Feb 2016 03:56:29 -0800 Subject: [PATCH] Finalized $entry, $latency, and $retval implementation including examples and man --- man/man8/argdist.8 | 26 +++++++++++--- tools/argdist.py | 65 ++++++++++++++++++++++------------ tools/argdist_examples.txt | 88 ++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 138 insertions(+), 41 deletions(-) diff --git a/man/man8/argdist.8 b/man/man8/argdist.8 index 2f74597..63669b8 100644 --- a/man/man8/argdist.8 +++ b/man/man8/argdist.8 @@ -2,7 +2,7 @@ .SH NAME argdist \- Trace a function and display a histogram or frequency count of its parameter values. Uses Linux eBPF/bcc. .SH SYNOPSIS -.B argdist [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT] [-H specifier [specifier ...]] [-C specifier [specifier ...]] +.B argdist [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT] [-v] [-T TOP] [-H specifier [specifier ...]] [-C specifier [specifier ...]] .SH DESCRIPTION argdist attaches to function entry and exit points, collects specified parameter values, and stores them in a histogram or a frequency collection that counts @@ -30,6 +30,12 @@ Print the collected data every INTERVAL seconds. The default is 1 second. -n NUMBER Print the collected data COUNT times and then exit. .TP +\-v +Display the generated BPF program, for debugging purposes. +.TP +\-T TOP +When collecting frequency counts, display only the top TOP entries. +.TP -H SPECIFIER, -C SPECIFIER One or more probe specifications that instruct argdist which functions to probe, which parameters to collect, how to aggregate them, and whether to perform @@ -47,8 +53,7 @@ count information, or aggregate the collected values into a histogram. Counting probes will collect the number of times every parameter value was observed, whereas histogram probes will collect the parameter values into a histogram. Only integral types can be used with histogram probes; there is no such limitation -for counting probes. Return probes can only use the pseudo-variable $retval, which -is an alias for the function's return value. +for counting probes. .TP .B [library] Library containing the probe. @@ -73,11 +78,20 @@ The expression to capture. These are the values that are assigned to the histogram or raw event collection. You may use the parameters directly, or valid C expressions that involve the parameters, such as "size % 10". +Return probes can use the argument values received by the +function when it was entered, through the $entry(paramname) special variable. +Return probes can also access the function's return value in $retval, and the +function's execution time in nanoseconds in $latency. Note that adding the +$latency or $entry(paramname) variables to the expression will introduce an +additional probe at the function's entry to collect this data, and therefore +introduce additional overhead. .TP .B [filter] The filter applied to the captured data. Only parameter values that pass the filter will be collected. This is any valid C expression that refers to the parameter values, such as "fd == 1 && length > 16". +The $entry, $retval, and $latency variables can be used here as well, in return +probes. .TP .B [label] The label that will be displayed when printing the probed values. By default, @@ -96,6 +110,10 @@ Snoop on all strings returned by gets(): # .B argdist.py -C 'r:c:gets():char*:$retval' .TP +Print a histogram of read sizes that were longer than 1ms: +# +.B argdist.py -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000' +.TP Print frequency counts of how many times writes were issued to a particular file descriptor number, in process 1005: # .B argdist.py -p 1005 -C 'p:c:write(int fd):int:fd' @@ -114,7 +132,7 @@ Count fork() calls in libc across all processes, grouped by pid: .TP Print histograms of sleep() and nanosleep() parameter values: # -.B argdist.py -H 'p:c:sleep(u32 seconds):u32:seconds' -H 'p:c:nanosleep(struct timespec { time_t tv_sec; long tv_nsec; } *req):long:req->tv_nsec' +.B argdist.py -H 'p:c:sleep(u32 seconds):u32:seconds' 'p:c:nanosleep(struct timespec { time_t tv_sec; long tv_nsec; } *req):long:req->tv_nsec' .TP Spy on writes to STDOUT performed by process 2780, up to a string size of 120 characters: # diff --git a/tools/argdist.py b/tools/argdist.py index b258827..dd111f7 100755 --- a/tools/argdist.py +++ b/tools/argdist.py @@ -74,22 +74,30 @@ int PROBENAME(struct pt_regs *ctx SIGNATURE) text = text.replace("PID_FILTER", pid_filter) collect = "" for pname in self.args_to_probe: - collect += "%s.update(&pid, &%s);\n" % \ - (self.hashname_prefix + pname, pname) + param_hash = self.hashname_prefix + pname + if pname == "__latency": + collect += """ +u64 __time = bpf_ktime_get_ns(); +%s.update(&pid, &__time); +""" % param_hash + else: + collect += "%s.update(&pid, &%s);\n" % \ + (param_hash, pname) text = text.replace("COLLECT", collect) return text def _generate_entry_probe(self): - # TODO $latency as a special keyword that should be traced # Any $entry(name) expressions result in saving that argument # when entering the function. self.args_to_probe = set() regex = r"\$entry\((\w+)\)" - for arg in re.finditer(regex, self.expr or ""): + for arg in re.finditer(regex, self.expr): self.args_to_probe.add(arg.group(1)) - for arg in re.finditer(regex, self.filter or ""): + for arg in re.finditer(regex, self.filter): self.args_to_probe.add(arg.group(1)) - + if "$latency" in self.expr or "$latency" in self.filter: + self.args_to_probe.add("__latency") + self.param_types["__latency"] = "u64" # nanoseconds for pname in self.args_to_probe: if pname not in self.param_types: raise ValueError("$entry(%s): no such param" \ @@ -124,12 +132,15 @@ int PROBENAME(struct pt_regs *ctx SIGNATURE) def _replace_entry_exprs(self): for pname, vname in self.param_val_names.items(): - entry_expr = "$entry(%s)" % pname - val_expr = "*" + vname # dereference the pointer + if pname == "__latency": + entry_expr = "$latency" + val_expr = "(bpf_ktime_get_ns() - *%s)" % vname + else: + entry_expr = "$entry(%s)" % pname + val_expr = "(*%s)" % vname self.expr = self.expr.replace(entry_expr, val_expr) - if self.filter is not None: - self.filter = self.filter.replace(entry_expr, - val_expr) + self.filter = self.filter.replace(entry_expr, + val_expr) def _attach_entry_probe(self): if self.is_user: @@ -177,14 +188,14 @@ int PROBENAME(struct pt_regs *ctx SIGNATURE) self.expr_type = \ "u64" if not self.is_ret_probe else "int" self.expr = "1" if not self.is_ret_probe else "$retval" - self.filter = None if len(parts) != 6 else parts[5] + self.filter = "" if len(parts) != 6 else parts[5] self._substitute_exprs() # Do we need to attach an entry probe so that we can collect an # argument that is required for an exit (return) probe? self.entry_probe_required = self.is_ret_probe and \ - ("$entry" in self.expr or \ - "$entry" in (self.filter or "")) + ("$entry" in self.expr or "$entry" in self.filter or + "$latency" in self.expr or "$latency" in self.filter) self.pid = pid # Generating unique names for probes means we can attach @@ -198,9 +209,8 @@ int PROBENAME(struct pt_regs *ctx SIGNATURE) def _substitute_exprs(self): self.expr = self.expr.replace("$retval", "(%s)ctx->ax" % self.expr_type) - if self.filter is not None: - self.filter = self.filter.replace("$retval", - "(%s)ctx->ax" % self.expr_type) + self.filter = self.filter.replace("$retval", + "(%s)ctx->ax" % self.expr_type) self.expr = self._substitute_aliases(self.expr) self.filter = self._substitute_aliases(self.filter) @@ -264,7 +274,8 @@ bpf_probe_read(&__key.key, sizeof(__key.key), %s); (self.expr_type, self.expr) program = program.replace("DATA_DECL", decl) program = program.replace("KEY_EXPR", key_expr) - program = program.replace("FILTER", self.filter or "1") + program = program.replace("FILTER", + "1" if len(self.filter) == 0 else self.filter) program = program.replace("COLLECT", collect) program = program.replace("PREFIX", prefix) return program @@ -293,9 +304,9 @@ bpf_probe_read(&__key.key, sizeof(__key.key), %s); self._attach_entry_probe() def display(self, top): - print(self.label or self.raw_spec) data = self.bpf.get_table(self.probe_hash_name) if self.type == "freq": + print(self.label or self.raw_spec) print("\t%-10s %s" % ("COUNT", "EVENT")) data = sorted(data.items(), key=lambda kv: kv[1].value) if top is not None: @@ -317,8 +328,9 @@ bpf_probe_read(&__key.key, sizeof(__key.key), %s); print("\t%-10s %s" % \ (str(value.value), key_str)) elif self.type == "hist": - label = self.expr if not self.is_default_expr \ - else "retval" + label = self.label or \ + (self.expr if not self.is_default_expr \ + else "retval") data.print_log2_hist(val_type=label) examples = """ @@ -326,7 +338,7 @@ Probe specifier syntax: {p,r}:[library]:function(signature)[:type:expr[:filter]][;label] Where: p,r -- probe at function entry or at function exit - in exit probes, only $retval is accessible + in exit probes: can use $retval, $entry(param), $latency library -- the library that contains the function (leave empty for kernel functions) function -- the function name to trace @@ -348,6 +360,9 @@ argdist.py -p 1005 -C 'p:c:malloc(size_t size):size_t:size:size==16' argdist.py -C 'r:c:gets():char*:$retval;snooped strings' Snoop on all strings returned by gets() +argdist.py -H 'r::__kmalloc(size_t size):u64:$latency/$entry(size);ns per byte' + Print a histogram of nanoseconds per byte from kmalloc allocations + argdist.py -p 1005 -C 'p:c:write(int fd):int:fd' -T 5 Print frequency counts of how many times writes were issued to a particular file descriptor number, in process 1005, but only show @@ -356,6 +371,12 @@ argdist.py -p 1005 -C 'p:c:write(int fd):int:fd' -T 5 argdist.py -p 1005 -H 'r:c:read()' Print a histogram of error codes returned by read() in process 1005 +argdist.py -C 'r::__vfs_read():u32:$PID:$latency > 100000' + Print frequency of reads by process where the latency was >0.1ms + +argdist.py -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000' + Print a histogram of read sizes that were longer than 1ms + argdist.py -H \\ 'p:c:write(int fd, const void *buf, size_t count):size_t:count:fd==1' Print a histogram of buffer sizes passed to write() across all diff --git a/tools/argdist_examples.txt b/tools/argdist_examples.txt index 46cb8d9..999b927 100644 --- a/tools/argdist_examples.txt +++ b/tools/argdist_examples.txt @@ -183,6 +183,54 @@ r:c:read() 1024 -> 2047 : 0 | | 2048 -> 4095 : 2 |** | +In return probes, you can also trace the latency of the function (unless it is +recursive) and the parameters it had on entry. For example, we can identify +which processes are performing slow synchronous filesystem reads -- say, +longer than 0.1ms (100,000ns): + +# ./argdist.py -C 'r::__vfs_read():u32:$PID:$latency > 100000' +[01:08:48] +r::__vfs_read():u32:$PID:$latency > 100000 + COUNT EVENT + 1 bpf_get_current_pid_tgid() = 10457 + 21 bpf_get_current_pid_tgid() = 2780 +[01:08:49] +r::__vfs_read():u32:$PID:$latency > 100000 + COUNT EVENT + 1 bpf_get_current_pid_tgid() = 10457 + 21 bpf_get_current_pid_tgid() = 2780 +^C + +As you see, the $PID alias is expanded to the BPF function bpf_get_current_pid_tgid(), +which returns the current process' pid. It looks like process 2780 performed +21 slow reads. + +Occasionally, entry parameter values are also interesting. For example, you +might be curious how long it takes malloc() to allocate memory -- nanoseconds +per byte allocated. Let's go: + +# ./argdist.py -H 'r:c:malloc(size_t size):u64:$latency/$entry(size);ns per byte' -n 1 -i 10 +[01:11:13] + ns per byte : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 4 |***************** | + 4 -> 7 : 3 |************* | + 8 -> 15 : 2 |******** | + 16 -> 31 : 1 |**** | + 32 -> 63 : 0 | | + 64 -> 127 : 7 |******************************* | + 128 -> 255 : 1 |**** | + 256 -> 511 : 0 | | + 512 -> 1023 : 1 |**** | + 1024 -> 2047 : 1 |**** | + 2048 -> 4095 : 9 |****************************************| + 4096 -> 8191 : 1 |**** | + +It looks like a tri-modal distribution. Some allocations are extremely cheap, +and take 2-15 nanoseconds per byte. Other allocations are slower, and take +64-127 nanoseconds per byte. And some allocations are slower still, and take +multiple microseconds per byte. + Here's a final example that finds how many write() system calls are performed by each process on the system: @@ -200,15 +248,13 @@ write by process 23 bpf_get_current_pid_tgid() = 7615 23 bpf_get_current_pid_tgid() = 2480 -As you see, the $PID alias is expanded to the BPF function bpf_get_current_pid_tgid(), -which returns the current process' pid. - USAGE message: -usage: argdist.py [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT] - [-H [HISTSPECIFIER [HISTSPECIFIER ...]]] - [-C [COUNTSPECIFIER [COUNTSPECIFIER ...]]] [-v] +# argdist.py -h +usage: argdist.py [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT] [-v] + [-T TOP] [-H [HISTSPECIFIER [HISTSPECIFIER ...]]] + [-C [COUNTSPECIFIER [COUNTSPECIFIER ...]]] Trace a function and display a summary of its parameter values. @@ -221,19 +267,21 @@ optional arguments: output interval, in seconds -n COUNT, --number COUNT number of outputs + -v, --verbose print resulting BPF program code before executing + -T TOP, --top TOP number of top results to show (not applicable to + histograms) -H [HISTSPECIFIER [HISTSPECIFIER ...]], --histogram [HISTSPECIFIER [HISTSPECIFIER ...]] probe specifier to capture histogram of (see examples below) -C [COUNTSPECIFIER [COUNTSPECIFIER ...]], --count [COUNTSPECIFIER [COUNTSPECIFIER ...]] probe specifier to capture count of (see examples below) - -v, --verbose print resulting BPF program code before executing Probe specifier syntax: {p,r}:[library]:function(signature)[:type:expr[:filter]][;label] Where: p,r -- probe at function entry or at function exit - in exit probes, only $retval is accessible + in exit probes: can use $retval, $entry(param), $latency library -- the library that contains the function (leave empty for kernel functions) function -- the function name to trace @@ -255,13 +303,23 @@ argdist.py -p 1005 -C 'p:c:malloc(size_t size):size_t:size:size==16' argdist.py -C 'r:c:gets():char*:$retval;snooped strings' Snoop on all strings returned by gets() -argdist.py -p 1005 -C 'p:c:write(int fd):int:fd' +argdist.py -H 'r::__kmalloc(size_t size):u64:$latency/$entry(size);ns per byte' + Print a histogram of nanoseconds per byte from kmalloc allocations + +argdist.py -p 1005 -C 'p:c:write(int fd):int:fd' -T 5 Print frequency counts of how many times writes were issued to a - particular file descriptor number, in process 1005 + particular file descriptor number, in process 1005, but only show + the top 5 busiest fds argdist.py -p 1005 -H 'r:c:read()' Print a histogram of error codes returned by read() in process 1005 +argdist.py -C 'r::__vfs_read():u32:$PID:$latency > 100000' + Print frequency of reads by process where the latency was >0.1ms + +argdist.py -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000' + Print a histogram of read sizes that were longer than 1ms + argdist.py -H \ 'p:c:write(int fd, const void *buf, size_t count):size_t:count:fd==1' Print a histogram of buffer sizes passed to write() across all @@ -269,15 +327,15 @@ argdist.py -H \ argdist.py -C 'p:c:fork();fork calls' Count fork() calls in libc across all processes - Can also use funccount.py, which is easier and more flexible + Can also use funccount.py, which is easier and more flexible -argdist.py \ - -H 'p:c:sleep(u32 seconds):u32:seconds' \ - -H 'p:c:nanosleep(struct timespec { time_t tv_sec; long tv_nsec; } *req):long:req->tv_nsec' +argdist.py -H \ + 'p:c:sleep(u32 seconds):u32:seconds' \ + 'p:c:nanosleep(struct timespec { time_t tv_sec; long tv_nsec; } *req):long:req->tv_nsec' Print histograms of sleep() and nanosleep() parameter values argdist.py -p 2780 -z 120 \ -C 'p:c:write(int fd, char* buf, size_t len):char*:buf:fd==1' Spy on writes to STDOUT performed by process 2780, up to a string size - of 120 characters + of 120 characters -- 2.7.4