From: Brendan Gregg Date: Fri, 12 Feb 2016 10:29:08 +0000 (-0800) Subject: ext4dist X-Git-Tag: v0.1.8~25^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1dcedc407b5a63ff2aeb8b7fd0e5f9905b024807;p=platform%2Fupstream%2Fbcc.git ext4dist --- diff --git a/README.md b/README.md index f3d13dc..232aed9 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,7 @@ Tools: - tools/[execsnoop](tools/execsnoop.py): Trace new processes via exec() syscalls. [Examples](tools/execsnoop_example.txt). - tools/[dcsnoop](tools/dcsnoop.py): Trace directory entry cache (dcache) lookups. [Examples](tools/dcsnoop_example.txt). - tools/[dcstat](tools/dcstat.py): Directory entry cache (dcache) stats. [Examples](tools/dcstat_example.txt). +- tools/[ext4dist](tools/ext4dist.py): Summarize ext4 operation latency. [Examples](tools/ext4dist_example.txt). - tools/[ext4slower](tools/ext4slower.py): Trace slow ext4 operations. [Examples](tools/ext4slower_example.txt). - tools/[filelife](tools/filelife.py): Trace the lifespan of short-lived files. [Examples](tools/filelife_example.txt). - tools/[fileslower](tools/fileslower.py): Trace slow synchronous file reads and writes. [Examples](tools/fileslower_example.txt). diff --git a/man/man8/ext4dist.8 b/man/man8/ext4dist.8 new file mode 100644 index 0000000..0cba12f --- /dev/null +++ b/man/man8/ext4dist.8 @@ -0,0 +1,80 @@ +.TH ext4dist 8 "2016-02-12" "USER COMMANDS" +.SH NAME +ext4dist \- Summarize ext4 operation latency. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B ext4dist [\-h] [\-T] [\-N] [\-d] [interval] [count] +.SH DESCRIPTION +This tool summarizes time (latency) spent in common ext4 file operations: reads, +writes, opens, and syncs, and presents it as a power-of-2 histogram. It uses an +in-kernel eBPF map to store the histogram for efficiency. + +Since this works by tracing the ext4_file_operations interface functions, it +will need updating to match any changes to these functions. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-T +Don't include timestamps on interval output. +.TP +\-m +Output in milliseconds. +.TP +\-p PID +Trace this PID only. +.SH EXAMPLES +.TP +Trace ext4 operation time, and print a summary on Ctrl-C: +# +.B ext4dist +.TP +Trace PID 181 only: +# +.B ext4dist -p 181 +.TP +Print 1 second summaries, 10 times: +# +.B ext4dist 1 10 +.TP +1 second summaries, printed in milliseconds +# +.B ext4dist \-m 1 +.SH FIELDS +.TP +msecs +Range of milliseconds for this bucket. +.TP +usecs +Range of microseconds for this bucket. +.TP +count +Number of operations in this time range. +.TP +distribution +ASCII representation of the distribution (the count column). +.SH OVERHEAD +This adds low-overhead instrumentation to these ext4 operations, +including reads and writes from the file system cache. Such reads and writes +can be very frequent (depending on the workload; eg, 1M/sec), at which +point the overhead of this tool may become noticeable. +Measure and quantify before use. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +ext4snoop(8) diff --git a/tools/ext4dist.py b/tools/ext4dist.py new file mode 100755 index 0000000..b59a227 --- /dev/null +++ b/tools/ext4dist.py @@ -0,0 +1,203 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# ext4dist Summarize ext4 operation latency. +# For Linux, uses BCC, eBPF. +# +# USAGE: ext4dist [-h] [-T] [-m] [-p PID] [interval] [count] +# +# Copyright 2016 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 12-Feb-2016 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +from time import sleep, strftime +import argparse + +# symbols +kallsyms = "/proc/kallsyms" + +# arguments +examples = """examples: + ./ext4dist # show operation latency as a histogram + ./ext4dist -p 181 # trace PID 181 only + ./ext4dist 1 10 # print 1 second summaries, 10 times + ./ext4dist -m 5 # 5s summaries, milliseconds +""" +parser = argparse.ArgumentParser( + description="Summarize ext4 operation latency", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-T", "--notimestamp", action="store_true", + help="don't include timestamp on interval output") +parser.add_argument("-m", "--milliseconds", action="store_true", + help="output in milliseconds") +parser.add_argument("-p", "--pid", + help="trace this PID only") +parser.add_argument("interval", nargs="?", + help="output interval, in seconds") +parser.add_argument("count", nargs="?", default=99999999, + help="number of outputs") +args = parser.parse_args() +pid = args.pid +countdown = int(args.count) +if args.milliseconds: + factor = 1000000 + label = "msecs" +else: + factor = 1000 + label = "usecs" +if args.interval and int(args.interval) == 0: + print("ERROR: interval 0. Exiting.") + exit() +debug = 0 + +# define BPF program +bpf_text = """ +#include +#include +#include + +#define OP_NAME_LEN 8 +typedef struct dist_key { + char op[OP_NAME_LEN]; + u64 slot; +} dist_key_t; +BPF_HASH(start, u32); +BPF_HISTOGRAM(dist, dist_key_t); + +// time operation +int trace_entry(struct pt_regs *ctx) +{ + u32 pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + u64 ts = bpf_ktime_get_ns(); + start.update(&pid, &ts); + return 0; +} + +// The current ext4 (Linux 4.5) uses generic_file_read_iter(), instead of it's +// own function, for reads. So we need to trace that and then filter on ext4, +// which I do by checking file->f_op. +int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb) +{ + u32 pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + + // ext4 filter on file->f_op == ext4_file_operations + struct file *fp = iocb->ki_filp; + if ((u64)fp->f_op != EXT4_FILE_OPERATIONS) + return 0; + + u64 ts = bpf_ktime_get_ns(); + start.update(&pid, &ts); + return 0; +} + +static int trace_return(struct pt_regs *ctx, const char *op) +{ + u64 *tsp; + u32 pid = bpf_get_current_pid_tgid(); + + // fetch timestamp and calculate delta + tsp = start.lookup(&pid); + if (tsp == 0) { + return 0; // missed start or filtered + } + u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR; + + // store as histogram + dist_key_t key = {.slot = bpf_log2l(delta)}; + __builtin_memcpy(&key.op, op, sizeof(key.op)); + dist.increment(key); + + start.delete(&pid); + return 0; +} + +int trace_read_return(struct pt_regs *ctx) +{ + char *op = "read"; + return trace_return(ctx, op); +} + +int trace_write_return(struct pt_regs *ctx) +{ + char *op = "write"; + return trace_return(ctx, op); +} + +int trace_open_return(struct pt_regs *ctx) +{ + char *op = "open"; + return trace_return(ctx, op); +} + +int trace_fsync_return(struct pt_regs *ctx) +{ + char *op = "fsync"; + return trace_return(ctx, op); +} +""" + +# code replacements +with open(kallsyms) as syms: + ops = '' + for line in syms: + (addr, size, name) = line.rstrip().split(" ", 2) + if name == "ext4_file_operations": + ops = "0x" + addr + break + if ops == '': + print("ERROR: no ext4_file_operations in /proc/kallsyms. Exiting.") + exit() + bpf_text = bpf_text.replace('EXT4_FILE_OPERATIONS', ops) +bpf_text = bpf_text.replace('FACTOR', str(factor)) +if args.pid: + bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) +else: + bpf_text = bpf_text.replace('FILTER_PID', '0') +if debug: + print(bpf_text) + +# load BPF program +b = BPF(text=bpf_text) + +# Common file functions. See earlier comment about generic_file_read_iter(). +b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry") +b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_entry") +b.attach_kprobe(event="ext4_file_open", fn_name="trace_entry") +b.attach_kprobe(event="ext4_sync_file", fn_name="trace_entry") +b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return") +b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return") +b.attach_kretprobe(event="ext4_file_open", fn_name="trace_open_return") +b.attach_kretprobe(event="ext4_sync_file", fn_name="trace_fsync_return") + +print("Tracing ext4 operation latency... Hit Ctrl-C to end.") + +# output +exiting = 0 +dist = b.get_table("dist") +while (1): + try: + if args.interval: + sleep(int(args.interval)) + else: + sleep(99999999) + except KeyboardInterrupt: + exiting = 1 + + print() + if args.interval and (not args.notimestamp): + print(strftime("%H:%M:%S:")) + + dist.print_log2_hist(label, "operation") + dist.clear() + + countdown -= 1 + if exiting or countdown == 0: + exit() diff --git a/tools/ext4dist_example.txt b/tools/ext4dist_example.txt new file mode 100644 index 0000000..def8e8b --- /dev/null +++ b/tools/ext4dist_example.txt @@ -0,0 +1,193 @@ +Demonstrations of ext4dist, the Linux eBPF/bcc version. + + +ext4dist traces ext4 reads, writes, opens, and fsyncs, and summarizes their +latency as a power-of-2 histogram. For example: + +# ./ext4dist +Tracing ext4 operation latency... Hit Ctrl-C to end. +^C + +operation = 'read' + usecs : count distribution + 0 -> 1 : 1210 |****************************************| + 2 -> 3 : 126 |**** | + 4 -> 7 : 376 |************ | + 8 -> 15 : 86 |** | + 16 -> 31 : 9 | | + 32 -> 63 : 47 |* | + 64 -> 127 : 6 | | + 128 -> 255 : 24 | | + 256 -> 511 : 137 |**** | + 512 -> 1023 : 66 |** | + 1024 -> 2047 : 13 | | + 2048 -> 4095 : 7 | | + 4096 -> 8191 : 13 | | + 8192 -> 16383 : 3 | | + +operation = 'write' + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 75 |****************************************| + 16 -> 31 : 5 |** | + +operation = 'open' + usecs : count distribution + 0 -> 1 : 1278 |****************************************| + 2 -> 3 : 40 |* | + 4 -> 7 : 4 | | + 8 -> 15 : 1 | | + 16 -> 31 : 1 | | + +This output shows a bi-modal distribution for read latency, with a faster +mode of less than 7 microseconds, and a slower mode of between 256 and 1023 +microseconds. The count column shows how many events fell into that latency +range. It's likely that the faster mode was a hit from the in-memory file +system cache, and the slower mode is a read from a storage device (disk). + +This "latency" is measured from when the operation was issued from the VFS +interface to the file system, to when it completed. This spans everything: +block device I/O (disk I/O), file system CPU cycles, file system locks, run +queue latency, etc. This is a better measure of the latency suffered by +applications reading from the file system than measuring this down at the +block device interface. + +Note that this only traces the common file system operations previously +listed: other file system operations (eg, inode operations including +getattr()) are not traced. + + +An optional interval and a count can be provided, as well as -m to show the +distributions in milliseconds. For example: + +# ./ext4dist -m 1 5 +Tracing ext4 operation latency... Hit Ctrl-C to end. + +10:19:00: + +operation = 'read' + msecs : count distribution + 0 -> 1 : 576 |****************************************| + 2 -> 3 : 5 | | + 4 -> 7 : 6 | | + 8 -> 15 : 13 | | + 16 -> 31 : 17 |* | + 32 -> 63 : 5 | | + 64 -> 127 : 1 | | + +operation = 'write' + msecs : count distribution + 0 -> 1 : 20 |****************************************| + +operation = 'open' + msecs : count distribution + 0 -> 1 : 346 |****************************************| + +10:19:01: + +operation = 'read' + msecs : count distribution + 0 -> 1 : 584 |****************************************| + 2 -> 3 : 10 | | + 4 -> 7 : 11 | | + 8 -> 15 : 16 |* | + 16 -> 31 : 6 | | + 32 -> 63 : 4 | | + 64 -> 127 : 2 | | + 128 -> 255 : 1 | | + +operation = 'write' + msecs : count distribution + 0 -> 1 : 20 |****************************************| + +operation = 'open' + msecs : count distribution + 0 -> 1 : 336 |****************************************| + +10:19:02: + +operation = 'read' + msecs : count distribution + 0 -> 1 : 678 |****************************************| + 2 -> 3 : 7 | | + 4 -> 7 : 9 | | + 8 -> 15 : 25 |* | + 16 -> 31 : 10 | | + 32 -> 63 : 3 | | + +operation = 'write' + msecs : count distribution + 0 -> 1 : 19 |****************************************| + 2 -> 3 : 1 |** | + +operation = 'open' + msecs : count distribution + 0 -> 1 : 390 |****************************************| + +10:19:03: + +operation = 'read' + msecs : count distribution + 0 -> 1 : 567 |****************************************| + 2 -> 3 : 7 | | + 4 -> 7 : 9 | | + 8 -> 15 : 20 |* | + 16 -> 31 : 15 |* | + 32 -> 63 : 5 | | + 64 -> 127 : 2 | | + +operation = 'write' + msecs : count distribution + 0 -> 1 : 20 |****************************************| + +operation = 'open' + msecs : count distribution + 0 -> 1 : 417 |****************************************| + +10:19:04: + +operation = 'read' + msecs : count distribution + 0 -> 1 : 762 |****************************************| + 2 -> 3 : 9 | | + 4 -> 7 : 9 | | + 8 -> 15 : 11 | | + 16 -> 31 : 20 |* | + 32 -> 63 : 4 | | + 64 -> 127 : 1 | | + +operation = 'write' + msecs : count distribution + 0 -> 1 : 20 |****************************************| + +operation = 'open' + msecs : count distribution + 0 -> 1 : 427 |****************************************| + +This shows a mixed read/write workload. + + +USAGE message: + +# ./ext4dist -h +usage: ext4dist [-h] [-T] [-m] [-p PID] [interval] [count] + +Summarize ext4 operation latency + +positional arguments: + interval output interval, in seconds + count number of outputs + +optional arguments: + -h, --help show this help message and exit + -T, --notimestamp don't include timestamp on interval output + -m, --milliseconds output in milliseconds + -p PID, --pid PID trace this PID only + +examples: + ./ext4dist # show operation latency as a histogram + ./ext4dist -p 181 # trace PID 181 only + ./ext4dist 1 10 # print 1 second summaries, 10 times + ./ext4dist -m 5 # 5s summaries, milliseconds