From: Brendan Gregg
Date: Sun, 7 Feb 2016 08:46:34 +0000 (-0800)
Subject: biotop
X-Git-Tag: v0.1.8~36^2~1
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=6f075b9338d930cc363bbb7da112fae3f49d0468;p=platform%2Fupstream%2Fbcc.git

biotop
---

diff --git a/README.md b/README.md
index c3bbccb..98f002f 100644
--- a/README.md
+++ b/README.md
@@ -67,6 +67,7 @@ Tools:
 - tools/[bashreadline](tools/bashreadline.py): Print entered bash commands system wide. [Examples](tools/bashreadline_example.txt).
 - tools/[biolatency](tools/biolatency.py): Summarize block device I/O latency as a histogram. [Examples](tools/biolatency_example.txt).
+- tools/[biotop](tools/biotop.py): Top for disks: Summarize block device I/O by process. [Examples](tools/biotop_example.txt).
 - tools/[biosnoop](tools/biosnoop.py): Trace block device I/O with PID and latency. [Examples](tools/biosnoop_example.txt).
 - tools/[bitesize](tools/bitesize.py): Show per process I/O size histogram. [Examples](tools/bitesize_example.txt).
 - tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt).
diff --git a/man/man8/biotop.8 b/man/man8/biotop.8
new file mode 100644
index 0000000..3ee021a
--- /dev/null
+++ b/man/man8/biotop.8
@@ -0,0 +1,90 @@
+.TH biotop 8 "2016-02-06" "USER COMMANDS"
+.SH NAME
+biotop \- Block device (disk) I/O by process top.
+.SH SYNOPSIS
+.B biotop [\-h] [\-C] [\-r MAXROWS] [interval] [count]
+.SH DESCRIPTION
+This is top for disks.
+
+This traces block device I/O (disk I/O), and prints a per-process summary every
+interval (by default, 1 second). The summary is sorted to show the top disk
+consumers by throughput (Kbytes). The PID and process name shown are captured
+when the I/O was first created, which usually identifies the responsible
+process.
+
+For efficiency, this uses in-kernel eBPF maps to cache process details (PID and
+comm) by I/O request, to store a starting timestamp for calculating I/O
+latency, and to hold the final summary.
+
+This works by tracing various kernel blk_*() functions using dynamic tracing,
+and will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Summarize block device I/O by process, 1 second screen refresh:
+#
+.B biotop
+.TP
+Don't clear the screen:
+#
+.B biotop -C
+.TP
+5 second summaries, 10 times only:
+#
+.B biotop 5 10
+.SH FIELDS
+.TP
+loadavg:
+The contents of /proc/loadavg
+.TP
+PID
+Cached process ID, if present. This usually, but not always, identifies the
+responsible process for the I/O.
+.TP
+COMM
+Cached process name, if present. This usually, but not always, identifies the
+responsible process for the I/O.
+.TP
+D
+Direction: R == read, W == write.
+.TP
+MAJ
+Major device number.
+.TP
+MIN
+Minor device number.
+.TP
+DISK
+Disk device name.
+.TP
+I/O
+Number of I/Os during the interval.
+.TP
+Kbytes
+Total Kbytes for these I/Os during the interval.
+.TP
+AVGms
+Average time for the I/O (latency), from issue to the device until completion,
+in milliseconds.
+.SH OVERHEAD
+Since block device I/O usually has a relatively low frequency (< 10,000/s),
+the overhead for this tool is expected to be low or negligible. For high-IOPS
+storage systems, test and quantify before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+biosnoop(8), biolatency(8), iostat(1)
diff --git a/tools/biotop.py b/tools/biotop.py
new file mode 100755
index 0000000..c26572a
--- /dev/null
+++ b/tools/biotop.py
@@ -0,0 +1,211 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# biotop   block device (disk) I/O by process.
+#          For Linux, uses BCC, eBPF.
+#
+# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
+#
+# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
+# request, as well as a starting timestamp for calculating I/O latency.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 06-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+import signal
+from subprocess import call
+
+# arguments
+examples = """examples:
+    ./biotop            # block device I/O top, 1 second refresh
+    ./biotop -C         # don't clear the screen
+    ./biotop 5          # 5 second summaries
+    ./biotop 5 10       # 5 second summaries, 10 times only
+"""
+parser = argparse.ArgumentParser(
+    description="Block device (disk) I/O by process",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-C", "--noclear", action="store_true",
+    help="don't clear the screen")
+parser.add_argument("-r", "--maxrows", default=20,
+    help="maximum rows to print, default 20")
+parser.add_argument("interval", nargs="?", default=1,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+args = parser.parse_args()
+interval = int(args.interval)
+countdown = int(args.count)
+maxrows = int(args.maxrows)
+clear = not int(args.noclear)
+
+# linux stats
+loadavg = "/proc/loadavg"
+diskstats = "/proc/diskstats"
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+// a value of one map, and a key for another:
+struct who_t {
+    u32 pid;
+    char name[TASK_COMM_LEN];
+};
+
+struct info_t {
+    u32 pid;
+    int type;
+    int major;
+    int minor;
+    char name[TASK_COMM_LEN];
+};
+
+struct val_t {
+    u64 bytes;
+    u64 us;
+    u32 io;
+};
+
+BPF_HASH(start, struct request *);
+BPF_HASH(whobyreq, struct request *, struct who_t);
+BPF_HASH(counts, struct info_t, struct val_t);
+
+// cache PID and comm by-req
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+    struct who_t who = {};
+
+    if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
+        who.pid = bpf_get_current_pid_tgid();
+        whobyreq.update(&req, &who);
+    }
+
+    return 0;
+}
+
+// time block I/O
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+    u64 ts;
+
+    ts = bpf_ktime_get_ns();
+    start.update(&req, &ts);
+
+    return 0;
+}
+
+// output
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+    u64 *tsp;
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&req);
+    if (tsp == 0) {
+        return 0;    // missed tracing issue
+    }
+
+    struct who_t *whop;
+    struct val_t *valp, zero = {};
+    u64 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
+
+    // setup info_t key
+    struct info_t info = {};
+    info.major = req->rq_disk->major;
+    info.minor = req->rq_disk->first_minor;
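+    // major/minor recorded here are matched in user space against the first
+    // two fields of /proc/diskstats to resolve the DISK column name; the
+    // cmd_flags & REQ_WRITE test on the next line records the direction
+    // shown in the D column (R or W).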
+    info.type = req->cmd_flags & REQ_WRITE;
+
+    whop = whobyreq.lookup(&req);
+    if (whop == 0) {
+        // missed pid who, save stats as pid 0
+        valp = counts.lookup_or_init(&info, &zero);
+    } else {
+        info.pid = whop->pid;
+        __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
+        valp = counts.lookup_or_init(&info, &zero);
+    }
+
+    // save stats
+    valp->us += delta_us;
+    valp->bytes += req->__data_len;
+    valp->io++;
+
+    start.delete(&req);
+    whobyreq.delete(&req);
+
+    return 0;
+}
+""", debug=0)
+b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_account_io_completion",
+    fn_name="trace_req_completion")
+
+print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
+
+# cache disk major,minor -> diskname
+disklookup = {}
+with open(diskstats) as stats:
+    for line in stats:
+        a = line.split()
+        disklookup[a[0] + "," + a[1]] = a[2]
+
+# output
+exiting = 0
+while 1:
+    try:
+        sleep(interval)
+    except KeyboardInterrupt:
+        exiting = 1
+
+    # header
+    if clear:
+        call("clear")
+    else:
+        print()
+    with open(loadavg) as stats:
+        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
+    print("%-6s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
+        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))
+
+    # by-PID output
+    counts = b.get_table("counts")
+    line = 0
+    for k, v in reversed(sorted(counts.items(),
+                                key=lambda counts: counts[1].bytes)):
+
+        # lookup disk
+        disk = str(k.major) + "," + str(k.minor)
+        if disk in disklookup:
+            diskname = disklookup[disk]
+        else:
+            diskname = "?"
+
+        # print line
+        avg_ms = (float(v.us) / 1000) / v.io
+        print("%-6d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid, k.name,
+            "W" if k.type else "R", k.major, k.minor, diskname, v.io,
+            v.bytes / 1024, avg_ms))
+
+        line += 1
+        if line >= maxrows:
+            break
+    counts.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        print("Detaching...")
+        exit()
diff --git a/tools/biotop_example.txt b/tools/biotop_example.txt
new file mode 100644
index 0000000..62e8f1c
--- /dev/null
+++ b/tools/biotop_example.txt
@@ -0,0 +1,187 @@
+Demonstrations of biotop, the Linux eBPF/bcc version.
+
+
+Short for block device I/O top, biotop summarizes which processes are
+performing disk I/O. It's top for disks. Sample output:
+
+# ./biotop
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+
+08:04:11 loadavg: 1.48 0.87 0.45 1/287 14547
+
+PID COMM D MAJ MIN DISK I/O Kbytes AVGms
+14501 cksum R 202 1 xvda1 361 28832 3.39
+6961 dd R 202 1 xvda1 1628 13024 0.59
+13855 dd R 202 1 xvda1 1627 13016 0.59
+326 jbd2/xvda1-8 W 202 1 xvda1 3 168 3.00
+1880 supervise W 202 1 xvda1 2 8 6.71
+1873 supervise W 202 1 xvda1 2 8 2.51
+1871 supervise W 202 1 xvda1 2 8 1.57
+1876 supervise W 202 1 xvda1 2 8 1.22
+1892 supervise W 202 1 xvda1 2 8 0.62
+1878 supervise W 202 1 xvda1 2 8 0.78
+1886 supervise W 202 1 xvda1 2 8 1.30
+1894 supervise W 202 1 xvda1 2 8 3.46
+1869 supervise W 202 1 xvda1 2 8 0.73
+1888 supervise W 202 1 xvda1 2 8 1.48
+
+By default the screen refreshes every 1 second, and shows the top 20 disk
+consumers, sorted on total Kbytes. The first line printed is the header,
+which has the time and then the contents of /proc/loadavg.
+
+For the interval summarized by the output above, the "cksum" command performed
+361 disk reads to the "xvda1" device, for a total of 28832 Kbytes, with an
+average I/O time of 3.39 ms.
Two "dd" processes were also reading from the +same disk, which a higher I/O rate and lower latency. While the average I/O +size is not printed, it can be determined by dividing the Kbytes column by +the I/O column. + +The columns through to Kbytes show the workload applied. The final column, +AVGms, shows resulting performance. Other bcc tools can be used to get more +details when needed: biolatency and biosnoop. + +Many years ago I created the original "iotop", and later regretted not calling +it diskiotop or blockiotop, as "io" alone is ambiguous. This time it is biotop. + + +The -C option can be used to prevent the screen from clearing (my preference). +Here's using it with a 5 second interval: + +# ./biotop -C 5 +Tracing... Output every 5 secs. Hit Ctrl-C to end + +08:09:44 loadavg: 0.42 0.44 0.39 2/282 22115 + +PID COMM D MAJ MIN DISK I/O Kbytes AVGms +22069 dd R 202 1 xvda1 5993 47976 0.33 +326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.67 +1866 svscan R 202 1 xvda1 33 132 1.24 +1880 supervise W 202 1 xvda1 10 40 0.56 +1873 supervise W 202 1 xvda1 10 40 0.79 +1871 supervise W 202 1 xvda1 10 40 0.78 +1876 supervise W 202 1 xvda1 10 40 0.68 +1892 supervise W 202 1 xvda1 10 40 0.71 +1878 supervise W 202 1 xvda1 10 40 0.65 +1886 supervise W 202 1 xvda1 10 40 0.78 +1894 supervise W 202 1 xvda1 10 40 0.80 +1869 supervise W 202 1 xvda1 10 40 0.91 +1888 supervise W 202 1 xvda1 10 40 0.63 +22069 bash R 202 1 xvda1 1 16 19.94 +9251 kworker/u16:2 W 202 16 xvdb 2 8 0.13 + +08:09:49 loadavg: 0.47 0.44 0.39 1/282 22231 + +PID COMM D MAJ MIN DISK I/O Kbytes AVGms +22069 dd R 202 1 xvda1 13450 107600 0.35 +22199 cksum R 202 1 xvda1 941 45548 4.63 +326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.93 +24467 kworker/0:2 W 202 16 xvdb 1 64 0.28 +1880 supervise W 202 1 xvda1 10 40 0.81 +1873 supervise W 202 1 xvda1 10 40 0.81 +1871 supervise W 202 1 xvda1 10 40 1.03 +1876 supervise W 202 1 xvda1 10 40 0.76 +1892 supervise W 202 1 xvda1 10 40 0.74 +1878 supervise W 202 1 xvda1 10 40 0.94 +1886 supervise W 202 1 xvda1 10 40 0.76 +1894 supervise W 202 1 xvda1 10 40 0.69 +1869 supervise W 202 1 xvda1 10 40 0.72 +1888 supervise W 202 1 xvda1 10 40 1.70 +22199 bash R 202 1 xvda1 2 20 0.35 +482 xfsaild/md0 W 202 16 xvdb 5 13 0.27 +482 xfsaild/md0 W 202 32 xvdc 2 8 0.33 +31331 pickup R 202 1 xvda1 1 4 0.31 + +08:09:54 loadavg: 0.51 0.45 0.39 2/282 22346 + +PID COMM D MAJ MIN DISK I/O Kbytes AVGms +22069 dd R 202 1 xvda1 14689 117512 0.32 +326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.33 +1880 supervise W 202 1 xvda1 10 40 0.65 +1873 supervise W 202 1 xvda1 10 40 1.08 +1871 supervise W 202 1 xvda1 10 40 0.66 +1876 supervise W 202 1 xvda1 10 40 0.79 +1892 supervise W 202 1 xvda1 10 40 0.67 +1878 supervise W 202 1 xvda1 10 40 0.66 +1886 supervise W 202 1 xvda1 10 40 1.02 +1894 supervise W 202 1 xvda1 10 40 0.88 +1869 supervise W 202 1 xvda1 10 40 0.89 +1888 supervise W 202 1 xvda1 10 40 1.25 + +08:09:59 loadavg: 0.55 0.46 0.40 2/282 22461 + +PID COMM D MAJ MIN DISK I/O Kbytes AVGms +22069 dd R 202 1 xvda1 14442 115536 0.33 +326 jbd2/xvda1-8 W 202 1 xvda1 3 168 3.46 +1880 supervise W 202 1 xvda1 10 40 0.87 +1873 supervise W 202 1 xvda1 10 40 0.87 +1871 supervise W 202 1 xvda1 10 40 0.78 +1876 supervise W 202 1 xvda1 10 40 0.86 +1892 supervise W 202 1 xvda1 10 40 0.89 +1878 supervise W 202 1 xvda1 10 40 0.87 +1886 supervise W 202 1 xvda1 10 40 0.86 +1894 supervise W 202 1 xvda1 10 40 1.06 +1869 supervise W 202 1 xvda1 10 40 1.12 +1888 supervise W 202 1 xvda1 10 40 0.98 + +08:10:04 loadavg: 0.59 0.47 0.40 3/282 22576 + +PID COMM D MAJ MIN 
DISK I/O Kbytes AVGms +22069 dd R 202 1 xvda1 14179 113432 0.34 +326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.39 +1880 supervise W 202 1 xvda1 10 40 0.81 +1873 supervise W 202 1 xvda1 10 40 1.02 +1871 supervise W 202 1 xvda1 10 40 1.15 +1876 supervise W 202 1 xvda1 10 40 1.10 +1892 supervise W 202 1 xvda1 10 40 0.77 +1878 supervise W 202 1 xvda1 10 40 0.72 +1886 supervise W 202 1 xvda1 10 40 0.81 +1894 supervise W 202 1 xvda1 10 40 0.86 +1869 supervise W 202 1 xvda1 10 40 0.83 +1888 supervise W 202 1 xvda1 10 40 0.79 +24467 kworker/0:2 R 202 32 xvdc 3 12 0.26 +1056 cron R 202 1 xvda1 2 8 0.30 +24467 kworker/0:2 R 202 16 xvdb 1 4 0.23 + +08:10:09 loadavg: 0.54 0.46 0.40 2/281 22668 + +PID COMM D MAJ MIN DISK I/O Kbytes AVGms +22069 dd R 202 1 xvda1 250 2000 0.34 +326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.40 +1880 supervise W 202 1 xvda1 8 32 0.93 +1873 supervise W 202 1 xvda1 8 32 0.76 +1871 supervise W 202 1 xvda1 8 32 0.60 +1876 supervise W 202 1 xvda1 8 32 0.61 +1892 supervise W 202 1 xvda1 8 32 0.68 +1878 supervise W 202 1 xvda1 8 32 0.90 +1886 supervise W 202 1 xvda1 8 32 0.57 +1894 supervise W 202 1 xvda1 8 32 0.97 +1869 supervise W 202 1 xvda1 8 32 0.69 +1888 supervise W 202 1 xvda1 8 32 0.67 + +This shows another "dd" command reading from xvda1. On this system, various +"supervise" processes do 8 disk writes per second, every second (they are +creating and updating "status" files). + + +USAGE message: + +# ./biotop.py -h +usage: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count] + +Block device (disk) I/O by process + +positional arguments: + interval output interval, in seconds + count number of outputs + +optional arguments: + -h, --help show this help message and exit + -C, --noclear don't clear the screen + -r MAXROWS, --maxrows MAXROWS + maximum rows to print, default 20 + +examples: + ./biotop # block device I/O top, 1 second refresh + ./biotop -C # don't clear the screen + ./biotop 5 # 5 second summaries + ./biotop 5 10 # 5 second summaries, 10 times only
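
The man page above notes that biotop works by dynamically tracing several
kernel blk_*() functions, and will need updating to match any changes to those
functions. A minimal pre-flight sketch (a hypothetical helper, not part of this
commit; the script name is made up) can check whether the four symbols that
biotop attaches kprobes to are present on the running kernel, by scanning
/proc/kallsyms:

check_biotop_probes.py:

    from __future__ import print_function

    # Kernel functions biotop attaches kprobes to (see tools/biotop.py above).
    required = [
        "blk_account_io_start",
        "blk_start_request",
        "blk_mq_start_request",
        "blk_account_io_completion",
    ]

    # /proc/kallsyms lines look like "<address> <type> <name> [module]";
    # the third field is the symbol name.
    with open("/proc/kallsyms") as f:
        syms = set(line.split()[2] for line in f)

    for fn in required:
        print("%-26s %s" % (fn, "found" if fn in syms else "MISSING"))

If a symbol is reported MISSING on a given kernel, the corresponding
attach_kprobe() call in biotop.py will fail, and the tool will need updating
for that kernel.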