- tools/[bashreadline](tools/bashreadline.py): Print entered bash commands system wide. [Examples](tools/bashreadline_example.txt).
- tools/[biolatency](tools/biolatency.py): Summarize block device I/O latency as a histogram. [Examples](tools/biolatency_example.txt).
+- tools/[biotop](tools/biotop.py): Top for disks: Summarize block device I/O by process. [Examples](tools/biotop_example.txt).
- tools/[biosnoop](tools/biosnoop.py): Trace block device I/O with PID and latency. [Examples](tools/biosnoop_example.txt).
- tools/[bitesize](tools/bitesize.py): Show per process I/O size histogram. [Examples](tools/bitesize_example.txt).
- tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt).
--- /dev/null
+.TH biotop 8 "2016-02-06" "USER COMMANDS"
+.SH NAME
+biotop \- Block device (disk) I/O by process top.
+.SH SYNOPSIS
+.B biotop [\-h] [\-C] [\-r MAXROWS] [interval] [count]
+.SH DESCRIPTION
+This is top for disks.
+
+This traces block device I/O (disk I/O), and prints a per-process summary every
+interval (by default, 1 second). The summary is sorted on the top disk
+consumers by throughput (Kbytes). The PID and process name shown are measured
+from when the I/O was first created, which usually identifies the responsible
+process.
+
+For efficiency, this uses in-kernel eBPF maps to cache process details (PID and
+comm) by I/O request, as well as a starting timestamp for calculating I/O
+latency, and the final summary.
+
+This works by tracing various kernel blk_*() functions using dynamic tracing,
+and will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Summarize block device I/O by process, 1 second screen refresh:
+#
+.B biotop
+.TP
+Don't clear the screen:
+#
+.B biotop -C
+.TP
+5 second summaries, 10 times only:
+#
+.B biotop 5 10
+.SH FIELDS
+.TP
+loadavg:
+The contents of /proc/loadavg
+.TP
+PID
+Cached process ID, if present. This usually (but isn't guaranteed) to identify
+the responsible process for the I/O.
+.TP
+COMM
+Cached process name, if present. This usually (but isn't guaranteed) to identify
+the responsible process for the I/O.
+.TP
+D
+Direction: R == read, W == write.
+.TP
+MAJ
+Major device number.
+.TP
+MIN
+Minor device number.
+.TP
+DISK
+Disk device name.
+.TP
+I/O
+Number of I/O during the interval.
+.TP
+Kbytes
+Total Kbytes for these I/O, during the interval.
+.TP
+AVGms
+Average time for the I/O (latency) from the issue to the device, to its
+completion, in milliseconds.
+.SH OVERHEAD
+Since block device I/O usually has a relatively low frequency (< 10,000/s),
+the overhead for this tool is expected to be low or negligible. For high IOPS
+storage systems, test and quantify before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+biosnoop(8), biolatency(8), iostat(1)
--- /dev/null
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# biotop block device (disk) I/O by process.
+# For Linux, uses BCC, eBPF.
+#
+# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
+#
+# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
+# request, as well as a starting timestamp for calculating I/O latency.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 06-Feb-2016 Brendan Gregg Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+import signal
+from subprocess import call
+
+# arguments
+examples = """examples:
+ ./biotop # block device I/O top, 1 second refresh
+ ./biotop -C # don't clear the screen
+ ./biotop 5 # 5 second summaries
+ ./biotop 5 10 # 5 second summaries, 10 times only
+"""
+parser = argparse.ArgumentParser(
+ description="Block device (disk) I/O by process",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog=examples)
+parser.add_argument("-C", "--noclear", action="store_true",
+ help="don't clear the screen")
+parser.add_argument("-r", "--maxrows", default=20,
+ help="maximum rows to print, default 20")
+parser.add_argument("interval", nargs="?", default=1,
+ help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+ help="number of outputs")
+args = parser.parse_args()
+interval = int(args.interval)
+countdown = int(args.count)
+maxrows = int(args.maxrows)
+clear = not int(args.noclear)
+
+# linux stats
+loadavg = "/proc/loadavg"
+diskstats = "/proc/diskstats"
+
+# signal handler
+def signal_ignore(signal, frame):
+ print()
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+// a value of one map, and a key for another:
+struct who_t {
+ u32 pid;
+ char name[TASK_COMM_LEN];
+};
+
+struct info_t {
+ u32 pid;
+ int type;
+ int major;
+ int minor;
+ char name[TASK_COMM_LEN];
+};
+
+struct val_t {
+ u64 bytes;
+ u64 us;
+ u32 io;
+};
+
+BPF_HASH(start, struct request *);
+BPF_HASH(whobyreq, struct request *, struct who_t);
+BPF_HASH(counts, struct info_t, struct val_t);
+
+// cache PID and comm by-req
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+ struct who_t who = {};
+
+ if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
+ who.pid = bpf_get_current_pid_tgid();
+ whobyreq.update(&req, &who);
+ }
+
+ return 0;
+}
+
+// time block I/O
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+ u64 ts;
+
+ ts = bpf_ktime_get_ns();
+ start.update(&req, &ts);
+
+ return 0;
+}
+
+// output
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+ u64 *tsp;
+
+ // fetch timestamp and calculate delta
+ tsp = start.lookup(&req);
+ if (tsp == 0) {
+ return 0; // missed tracing issue
+ }
+
+ struct who_t *whop;
+ struct val_t *valp, zero = {};
+ u64 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
+
+ // setup info_t key
+ struct info_t info = {};
+ info.major = req->rq_disk->major;
+ info.minor = req->rq_disk->first_minor;
+ info.type = req->cmd_flags & REQ_WRITE;
+ whop = whobyreq.lookup(&req);
+ if (whop == 0) {
+ // missed pid who, save stats as pid 0
+ valp = counts.lookup_or_init(&info, &zero);
+ } else {
+ info.pid = whop->pid;
+ __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
+ valp = counts.lookup_or_init(&info, &zero);
+ }
+
+ // save stats
+ valp->us += delta_us;
+ valp->bytes += req->__data_len;
+ valp->io++;
+
+ start.delete(&req);
+ whobyreq.delete(&req);
+
+ return 0;
+}
+""", debug=0)
+b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_account_io_completion",
+ fn_name="trace_req_completion")
+
+print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
+
+# cache disk major,minor -> diskname
+disklookup = {}
+with open(diskstats) as stats:
+ for line in stats:
+ a = line.split()
+ disklookup[a[0] + "," + a[1]] = a[2]
+
+# output
+exiting = 0
+while 1:
+ try:
+ sleep(interval)
+ except KeyboardInterrupt:
+ exiting = 1
+
+ # header
+ if clear:
+ call("clear")
+ else:
+ print()
+ with open(loadavg) as stats:
+ print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
+ print("%-6s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
+ "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))
+
+ # by-PID output
+ counts = b.get_table("counts")
+ line = 0
+ for k, v in reversed(sorted(counts.items(),
+ key=lambda counts: counts[1].bytes)):
+
+ # lookup disk
+ disk = str(k.major) + "," + str(k.minor)
+ if disk in disklookup:
+ diskname = disklookup[disk]
+ else:
+ diskname = "?"
+
+ # print line
+ avg_ms = (float(v.us) / 1000) / v.io
+ print("%-6d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid, k.name,
+ "W" if k.type else "R", k.major, k.minor, diskname, v.io,
+ v.bytes / 1024, avg_ms))
+
+ line += 1
+ if line >= maxrows:
+ break
+ counts.clear()
+
+ countdown -= 1
+ if exiting or countdown == 0:
+ print("Detaching...")
+ exit()
--- /dev/null
+Demonstrations of biotop, the Linux eBPF/bcc version.
+
+
+Short for block device I/O top, biotop summarizes which processes are
+performing disk I/O. It's top for disks. Sample output:
+
+# ./biotop
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+
+08:04:11 loadavg: 1.48 0.87 0.45 1/287 14547
+
+PID COMM D MAJ MIN DISK I/O Kbytes AVGms
+14501 cksum R 202 1 xvda1 361 28832 3.39
+6961 dd R 202 1 xvda1 1628 13024 0.59
+13855 dd R 202 1 xvda1 1627 13016 0.59
+326 jbd2/xvda1-8 W 202 1 xvda1 3 168 3.00
+1880 supervise W 202 1 xvda1 2 8 6.71
+1873 supervise W 202 1 xvda1 2 8 2.51
+1871 supervise W 202 1 xvda1 2 8 1.57
+1876 supervise W 202 1 xvda1 2 8 1.22
+1892 supervise W 202 1 xvda1 2 8 0.62
+1878 supervise W 202 1 xvda1 2 8 0.78
+1886 supervise W 202 1 xvda1 2 8 1.30
+1894 supervise W 202 1 xvda1 2 8 3.46
+1869 supervise W 202 1 xvda1 2 8 0.73
+1888 supervise W 202 1 xvda1 2 8 1.48
+
+By default the screen refreshes every 1 second, and shows the top 20 disk
+consumers, sorted on total Kbytes. The first line printed is the header,
+which has the time and then the contents of /proc/loadavg.
+
+For the interval summarized by the output above, the "cksum" command performed
+361 disk reads to the "xvda1" device, for a total of 28832 Kbytes, with an
+average I/O time of 3.39 ms. Two "dd" processes were also reading from the
+same disk, which a higher I/O rate and lower latency. While the average I/O
+size is not printed, it can be determined by dividing the Kbytes column by
+the I/O column.
+
+The columns through to Kbytes show the workload applied. The final column,
+AVGms, shows resulting performance. Other bcc tools can be used to get more
+details when needed: biolatency and biosnoop.
+
+Many years ago I created the original "iotop", and later regretted not calling
+it diskiotop or blockiotop, as "io" alone is ambiguous. This time it is biotop.
+
+
+The -C option can be used to prevent the screen from clearing (my preference).
+Here's using it with a 5 second interval:
+
+# ./biotop -C 5
+Tracing... Output every 5 secs. Hit Ctrl-C to end
+
+08:09:44 loadavg: 0.42 0.44 0.39 2/282 22115
+
+PID COMM D MAJ MIN DISK I/O Kbytes AVGms
+22069 dd R 202 1 xvda1 5993 47976 0.33
+326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.67
+1866 svscan R 202 1 xvda1 33 132 1.24
+1880 supervise W 202 1 xvda1 10 40 0.56
+1873 supervise W 202 1 xvda1 10 40 0.79
+1871 supervise W 202 1 xvda1 10 40 0.78
+1876 supervise W 202 1 xvda1 10 40 0.68
+1892 supervise W 202 1 xvda1 10 40 0.71
+1878 supervise W 202 1 xvda1 10 40 0.65
+1886 supervise W 202 1 xvda1 10 40 0.78
+1894 supervise W 202 1 xvda1 10 40 0.80
+1869 supervise W 202 1 xvda1 10 40 0.91
+1888 supervise W 202 1 xvda1 10 40 0.63
+22069 bash R 202 1 xvda1 1 16 19.94
+9251 kworker/u16:2 W 202 16 xvdb 2 8 0.13
+
+08:09:49 loadavg: 0.47 0.44 0.39 1/282 22231
+
+PID COMM D MAJ MIN DISK I/O Kbytes AVGms
+22069 dd R 202 1 xvda1 13450 107600 0.35
+22199 cksum R 202 1 xvda1 941 45548 4.63
+326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.93
+24467 kworker/0:2 W 202 16 xvdb 1 64 0.28
+1880 supervise W 202 1 xvda1 10 40 0.81
+1873 supervise W 202 1 xvda1 10 40 0.81
+1871 supervise W 202 1 xvda1 10 40 1.03
+1876 supervise W 202 1 xvda1 10 40 0.76
+1892 supervise W 202 1 xvda1 10 40 0.74
+1878 supervise W 202 1 xvda1 10 40 0.94
+1886 supervise W 202 1 xvda1 10 40 0.76
+1894 supervise W 202 1 xvda1 10 40 0.69
+1869 supervise W 202 1 xvda1 10 40 0.72
+1888 supervise W 202 1 xvda1 10 40 1.70
+22199 bash R 202 1 xvda1 2 20 0.35
+482 xfsaild/md0 W 202 16 xvdb 5 13 0.27
+482 xfsaild/md0 W 202 32 xvdc 2 8 0.33
+31331 pickup R 202 1 xvda1 1 4 0.31
+
+08:09:54 loadavg: 0.51 0.45 0.39 2/282 22346
+
+PID COMM D MAJ MIN DISK I/O Kbytes AVGms
+22069 dd R 202 1 xvda1 14689 117512 0.32
+326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.33
+1880 supervise W 202 1 xvda1 10 40 0.65
+1873 supervise W 202 1 xvda1 10 40 1.08
+1871 supervise W 202 1 xvda1 10 40 0.66
+1876 supervise W 202 1 xvda1 10 40 0.79
+1892 supervise W 202 1 xvda1 10 40 0.67
+1878 supervise W 202 1 xvda1 10 40 0.66
+1886 supervise W 202 1 xvda1 10 40 1.02
+1894 supervise W 202 1 xvda1 10 40 0.88
+1869 supervise W 202 1 xvda1 10 40 0.89
+1888 supervise W 202 1 xvda1 10 40 1.25
+
+08:09:59 loadavg: 0.55 0.46 0.40 2/282 22461
+
+PID COMM D MAJ MIN DISK I/O Kbytes AVGms
+22069 dd R 202 1 xvda1 14442 115536 0.33
+326 jbd2/xvda1-8 W 202 1 xvda1 3 168 3.46
+1880 supervise W 202 1 xvda1 10 40 0.87
+1873 supervise W 202 1 xvda1 10 40 0.87
+1871 supervise W 202 1 xvda1 10 40 0.78
+1876 supervise W 202 1 xvda1 10 40 0.86
+1892 supervise W 202 1 xvda1 10 40 0.89
+1878 supervise W 202 1 xvda1 10 40 0.87
+1886 supervise W 202 1 xvda1 10 40 0.86
+1894 supervise W 202 1 xvda1 10 40 1.06
+1869 supervise W 202 1 xvda1 10 40 1.12
+1888 supervise W 202 1 xvda1 10 40 0.98
+
+08:10:04 loadavg: 0.59 0.47 0.40 3/282 22576
+
+PID COMM D MAJ MIN DISK I/O Kbytes AVGms
+22069 dd R 202 1 xvda1 14179 113432 0.34
+326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.39
+1880 supervise W 202 1 xvda1 10 40 0.81
+1873 supervise W 202 1 xvda1 10 40 1.02
+1871 supervise W 202 1 xvda1 10 40 1.15
+1876 supervise W 202 1 xvda1 10 40 1.10
+1892 supervise W 202 1 xvda1 10 40 0.77
+1878 supervise W 202 1 xvda1 10 40 0.72
+1886 supervise W 202 1 xvda1 10 40 0.81
+1894 supervise W 202 1 xvda1 10 40 0.86
+1869 supervise W 202 1 xvda1 10 40 0.83
+1888 supervise W 202 1 xvda1 10 40 0.79
+24467 kworker/0:2 R 202 32 xvdc 3 12 0.26
+1056 cron R 202 1 xvda1 2 8 0.30
+24467 kworker/0:2 R 202 16 xvdb 1 4 0.23
+
+08:10:09 loadavg: 0.54 0.46 0.40 2/281 22668
+
+PID COMM D MAJ MIN DISK I/O Kbytes AVGms
+22069 dd R 202 1 xvda1 250 2000 0.34
+326 jbd2/xvda1-8 W 202 1 xvda1 3 168 2.40
+1880 supervise W 202 1 xvda1 8 32 0.93
+1873 supervise W 202 1 xvda1 8 32 0.76
+1871 supervise W 202 1 xvda1 8 32 0.60
+1876 supervise W 202 1 xvda1 8 32 0.61
+1892 supervise W 202 1 xvda1 8 32 0.68
+1878 supervise W 202 1 xvda1 8 32 0.90
+1886 supervise W 202 1 xvda1 8 32 0.57
+1894 supervise W 202 1 xvda1 8 32 0.97
+1869 supervise W 202 1 xvda1 8 32 0.69
+1888 supervise W 202 1 xvda1 8 32 0.67
+
+This shows another "dd" command reading from xvda1. On this system, various
+"supervise" processes do 8 disk writes per second, every second (they are
+creating and updating "status" files).
+
+
+USAGE message:
+
+# ./biotop.py -h
+usage: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
+
+Block device (disk) I/O by process
+
+positional arguments:
+ interval output interval, in seconds
+ count number of outputs
+
+optional arguments:
+ -h, --help show this help message and exit
+ -C, --noclear don't clear the screen
+ -r MAXROWS, --maxrows MAXROWS
+ maximum rows to print, default 20
+
+examples:
+ ./biotop # block device I/O top, 1 second refresh
+ ./biotop -C # don't clear the screen
+ ./biotop 5 # 5 second summaries
+ ./biotop 5 10 # 5 second summaries, 10 times only