--- /dev/null
+/*
+ * bitehist.c Block I/O size histogram.
+ * For Linux, uses BCC, eBPF. See .py file.
+ *
+ * Based on eBPF sample tracex2 by Alexi Starovoitov.
+ * Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 15-Aug-2015 Brendan Gregg Created this.
+ */
+
+#include <uapi/linux/ptrace.h>
+#include "../../linux-4.2-rc5/include/linux/blkdev.h"
+
+BPF_TABLE("array", int, u64, dist, 64);
+
+static unsigned int log2(unsigned int v)
+{
+ unsigned int r;
+ unsigned int shift;
+
+ r = (v > 0xFFFF) << 4; v >>= r;
+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+ shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+ shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+ r |= (v >> 1);
+ return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+ unsigned int hi = v >> 32;
+ if (hi)
+ return log2(hi) + 32 + 1;
+ else
+ return log2(v) + 1;
+}
+
+int do_request(struct pt_regs *ctx, struct request *req)
+{
+ int index = log2l(req->__data_len / 1024);
+ u64 *leaf = dist.lookup(&index);
+ if (leaf) (*leaf)++;
+
+ return 0;
+}
--- /dev/null
+#!/usr/bin/python
+#
+# bitehist.py Block I/O size histogram.
+# For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of using a histogram to show a distribution.
+#
+# USAGE: bitehist.py [interval [count]]
+#
+# The default interval is 5 seconds. A Ctrl-C will print the partially
+# gathered histogram then exit.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 15-Aug-2015 Brendan Gregg Created this.
+
+from bpf import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep
+from sys import argv
+
+def usage():
+ print("USAGE: %s [interval [count]]" % argv[0])
+ exit()
+
+# arguments
+interval = 5
+count = -1
+if len(argv) > 1:
+ try:
+ interval = int(argv[1])
+ if interval == 0:
+ raise
+ if len(argv) > 2:
+ count = int(argv[2])
+ except: # also catches -h, --help
+ usage()
+
+# load BPF program
+b = BPF(src_file = "bitehist.c")
+BPF.attach_kprobe(b.load_func("do_request", BPF.KPROBE), "blk_start_request")
+dist = b.get_table("dist", c_int, c_ulonglong)
+dist_max = 64
+
+# header
+print("Tracing... Hit Ctrl-C to end.")
+last = {}
+for i in range(1, dist_max + 1):
+ last[i] = 0
+
+# functions
+stars_max = 38
+def stars(val, val_max, width):
+ i = 0
+ text = ""
+ while (1):
+ if (i > (width * val / val_max) - 1) or (i > width - 1):
+ break
+ text += "*"
+ i += 1
+ if val > val_max:
+ text = text[:-1] + "+"
+ return text
+
+def print_log2_hist(d, val_type):
+ idx_max = -1
+ val_max = 0
+ for i in range(1, dist_max + 1):
+ try:
+ val = dist[c_int(i)].value - last[i]
+ if (val > 0):
+ idx_max = i
+ if (val > val_max):
+ val_max = val
+ except:
+ break
+ if idx_max > 0:
+ print(" %-15s : count distribution" % val_type);
+ for i in range(1, idx_max + 1):
+ low = (1 << i) >> 1
+ high = (1 << i) - 1
+ if (low == high):
+ low -= 1
+ try:
+ val = dist[c_int(i)].value - last[i]
+ print("%8d -> %-8d : %-8d |%-*s|" % (low, high, val,
+ stars_max, stars(val, val_max, stars_max)))
+ last[i] = dist[c_int(i)].value
+ except:
+ break
+
+# output
+loop = 0
+do_exit = 0
+while (1):
+ if count > 0:
+ loop += 1
+ if loop > count:
+ exit()
+ try:
+ sleep(interval)
+ except KeyboardInterrupt:
+ pass; do_exit = 1
+
+ print
+ print_log2_hist(dist, "kbytes")
+ if do_exit:
+ exit()
--- /dev/null
+Demonstrations of bitehist.py, the Linux eBPF/bcc version.
+
+This prints a power-of-2 histogram to show the block I/O size distribution.
+By default, a summary is printed every five seconds:
+
+# ./bitehist.py
+Tracing... Hit Ctrl-C to end.
+
+ kbytes : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 26 |************* |
+ 8 -> 15 : 3 |* |
+ 16 -> 31 : 5 |** |
+ 32 -> 63 : 6 |*** |
+ 64 -> 127 : 7 |*** |
+ 128 -> 255 : 75 |**************************************|
+
+ kbytes : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 83 |**************************************|
+ 8 -> 15 : 2 | |
+ 16 -> 31 : 6 |** |
+ 32 -> 63 : 6 |** |
+ 64 -> 127 : 5 |** |
+ 128 -> 255 : 21 |********* |
+^C
+ kbytes : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 8 |**************************************|
+
+The first output shows a bimodal distribution. The largest mode of 75 I/O were
+between 128 and 255 Kbytes in size, and another mode of 26 I/O were between 4
+and 7 Kbytes in size.
+
+The next output summary shows the workload is doing more 4 - 7 Kbyte I/O.
+
+The final output is partial, showing what was measured until Ctrl-C was hit.
+
+
+For an output interval of one second, and three summaries:
+
+# ./bitehist.py 1 3
+Tracing... Hit Ctrl-C to end.
+
+ kbytes : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 4 |**************************************|
+
+ kbytes : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 5 |**************************************|
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 0 | |
+ 32 -> 63 : 1 |******* |
+
+ kbytes : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 4 |**************************************|
+
+
+Full usage:
+
+# ./bitehist.py -h
+USAGE: ./bitehist.py [interval [count]]
--- /dev/null
+/*
+ * disksnoop.c Trace block device I/O: basic version of iosnoop.
+ * For Linux, uses BCC, eBPF. See .py file.
+ *
+ * Copyright (c) 2015 Brendan Gregg.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 11-Aug-2015 Brendan Gregg Created this.
+ */
+
+#include <uapi/linux/ptrace.h>
+#include "../../linux-4.2-rc5/include/linux/blkdev.h"
+
+struct key_t {
+ struct request *req;
+};
+BPF_TABLE("hash", struct key_t, u64, start, 10240);
+
+int do_request(struct pt_regs *ctx, struct request *req) {
+ struct key_t key = {};
+ u64 ts;
+
+ // stash start timestamp by request ptr
+ ts = bpf_ktime_get_ns();
+ key.req = req;
+ start.update(&key, &ts);
+
+ return 0;
+}
+
+int do_completion(struct pt_regs *ctx, struct request *req) {
+ struct key_t key = {};
+ u64 *tsp, delta;
+
+ key.req = req;
+ tsp = start.lookup(&key);
+
+ if (tsp != 0) {
+ delta = bpf_ktime_get_ns() - *tsp;
+ bpf_trace_printk("%d %x %d\n", req->__data_len,
+ req->cmd_flags, delta / 1000);
+ start.delete(&key);
+ }
+
+ return 0;
+}
--- /dev/null
+#!/usr/bin/python
+#
+# disksnoop.py Trace block device I/O: basic version of iosnoop.
+# For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of tracing latency.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 11-Aug-2015 Brendan Gregg Created this.
+
+from bpf import BPF
+import sys
+
+REQ_WRITE = 1 # from include/linux/blk_types.h
+
+# load BPF program
+b = BPF(src_file="disksnoop.c")
+BPF.attach_kprobe(b.load_func("do_request", BPF.KPROBE), "blk_start_request")
+BPF.attach_kprobe(b.load_func("do_completion", BPF.KPROBE), "blk_update_request")
+
+# header
+print "%-18s %-2s %-7s %8s" % ("TIME(s)", "T", "BYTES", "LAT(ms)")
+
+# open trace pipe
+try:
+ trace = open("/sys/kernel/debug/tracing/trace_pipe", "r")
+except:
+ print >> sys.stderr, "ERROR: opening trace_pipe"
+ exit(1)
+
+# format output
+while 1:
+ try:
+ line = trace.readline().rstrip()
+ except KeyboardInterrupt:
+ pass; exit()
+ prolog, time_s, colon, bytes_s, flags_s, us_s = \
+ line.rsplit(" ", 5)
+
+ time_s = time_s[:-1] # strip trailing ":"
+ flags = int(flags_s, 16)
+ if flags & REQ_WRITE:
+ type_s = "W"
+ elif bytes_s == "0": # see blk_fill_rwbs() for logic
+ type_s = "M"
+ else:
+ type_s = "R"
+ ms = float(int(us_s, 10)) / 1000
+
+ print "%-18s %-2s %-7s %8.2f" % (time_s, type_s, bytes_s, ms)
--- /dev/null
+Demonstrations of disksnoop.py, the Linux eBPF/bcc version.
+
+
+This traces block I/O, a prints a line to summarize each I/O completed:
+
+# ./disksnoop.py
+TIME(s) T BYTES LAT(ms)
+16458043.435457 W 4096 2.73
+16458043.435981 W 4096 3.24
+16458043.436012 W 4096 3.13
+16458043.437326 W 4096 4.44
+16458044.126545 R 4096 42.82
+16458044.129872 R 4096 3.24
+16458044.130705 R 4096 0.73
+16458044.142813 R 4096 12.01
+16458044.147302 R 4096 4.33
+16458044.148117 R 4096 0.71
+16458044.148950 R 4096 0.70
+16458044.164332 R 4096 15.29
+16458044.168003 R 4096 3.58
+16458044.171676 R 4096 3.59
+16458044.172453 R 4096 0.72
+16458044.173213 R 4096 0.71
+16458044.173989 R 4096 0.72
+16458044.174739 R 4096 0.70
+16458044.190334 R 4096 15.52
+16458044.196608 R 4096 6.17
+16458044.203091 R 4096 6.35
+
+The output includes a basic timestamp (in seconds), the type of I/O (W == write,
+R == read, M == metadata), the size of the I/O in bytes, and the latency (or
+duration) of the I/O in milliseconds.
+
+The latency is measured from I/O request to the device, to the device
+completion. This excludes latency spent queued in the OS.
+
+Most of the I/O in this example were 0.7 and 4 milliseconds in duration. There
+was an outlier of 42.82 milliseconds, a read which followed many writes (the
+high latency may have been caused by the writes still being serviced on the
+storage device).
--- /dev/null
+/*
+ * vfsreadlat.c VFS read latency distribution.
+ * For Linux, uses BCC, eBPF. See .py file.
+ *
+ * Based on eBPF sample tracex2 by Alexi Starovoitov.
+ * Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 15-Aug-2015 Brendan Gregg Created this.
+ */
+
+#include <uapi/linux/ptrace.h>
+
+struct key_t {
+ u32 pid;
+};
+
+BPF_TABLE("hash", struct key_t, u64, start, 10240);
+BPF_TABLE("array", int, u64, dist, 64);
+
+static unsigned int log2(unsigned int v)
+{
+ unsigned int r;
+ unsigned int shift;
+
+ r = (v > 0xFFFF) << 4; v >>= r;
+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+ shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+ shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+ r |= (v >> 1);
+ return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+ unsigned int hi = v >> 32;
+ if (hi)
+ return log2(hi) + 32 + 1;
+ else
+ return log2(v) + 1;
+}
+
+int do_entry(struct pt_regs *ctx)
+{
+ struct key_t key = {};
+ u64 ts, *val, zero = 0;
+
+ key.pid = bpf_get_current_pid_tgid();
+ ts = bpf_ktime_get_ns();
+ start.update(&key, &ts);
+ return 0;
+}
+
+int do_return(struct pt_regs *ctx)
+{
+ struct key_t key = {};
+ u64 *tsp, delta;
+
+ key.pid = bpf_get_current_pid_tgid();
+ tsp = start.lookup(&key);
+
+ if (tsp != 0) {
+ delta = bpf_ktime_get_ns() - *tsp;
+ int index = log2l(delta / 1000);
+ u64 *leaf = dist.lookup(&index);
+ if (leaf) (*leaf)++;
+ start.delete(&key);
+ }
+
+ return 0;
+}
--- /dev/null
+#!/usr/bin/python
+#
+# vfsreadlat.py VFS read latency distribution.
+# For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of a function latency distribution histogram.
+#
+# USAGE: vfsreadlat.py [interval [count]]
+#
+# The default interval is 5 seconds. A Ctrl-C will print the partially
+# gathered histogram then exit.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 15-Aug-2015 Brendan Gregg Created this.
+
+from bpf import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep
+from sys import argv
+
+def usage():
+ print("USAGE: %s [interval [count]]" % argv[0])
+ exit()
+
+# arguments
+interval = 5
+count = -1
+if len(argv) > 1:
+ try:
+ interval = int(argv[1])
+ if interval == 0:
+ raise
+ if len(argv) > 2:
+ count = int(argv[2])
+ except: # also catches -h, --help
+ usage()
+
+# load BPF program
+b = BPF(src_file = "vfsreadlat.c")
+BPF.attach_kprobe(b.load_func("do_entry", BPF.KPROBE), "vfs_read")
+BPF.attach_kretprobe(b.load_func("do_return", BPF.KPROBE), "vfs_read")
+dist = b.get_table("dist", c_int, c_ulonglong)
+dist_max = 64
+
+# header
+print("Tracing... Hit Ctrl-C to end.")
+last = {}
+for i in range(1, dist_max + 1):
+ last[i] = 0
+
+# functions
+stars_max = 38
+def stars(val, val_max, width):
+ i = 0
+ text = ""
+ while (1):
+ if (i > (width * val / val_max) - 1) or (i > width - 1):
+ break
+ text += "*"
+ i += 1
+ if val > val_max:
+ text = text[:-1] + "+"
+ return text
+
+def print_log2_hist(d, val_type):
+ idx_max = -1
+ val_max = 0
+ for i in range(1, dist_max + 1):
+ try:
+ val = dist[c_int(i)].value - last[i]
+ if (val > 0):
+ idx_max = i
+ if (val > val_max):
+ val_max = val
+ except:
+ break
+ if idx_max > 0:
+ print(" %-15s : count distribution" % val_type);
+ for i in range(1, idx_max + 1):
+ low = (1 << i) >> 1
+ high = (1 << i) - 1
+ if (low == high):
+ low -= 1
+ try:
+ val = dist[c_int(i)].value - last[i]
+ print("%8d -> %-8d : %-8d |%-*s|" % (low, high, val,
+ stars_max, stars(val, val_max, stars_max)))
+ last[i] = dist[c_int(i)].value
+ except:
+ break
+
+# output
+loop = 0
+do_exit = 0
+while (1):
+ if count > 0:
+ loop += 1
+ if loop > count:
+ exit()
+ try:
+ sleep(interval)
+ except KeyboardInterrupt:
+ pass; do_exit = 1
+
+ print
+ print_log2_hist(dist, "usecs")
+ if do_exit:
+ exit()
--- /dev/null
+Demonstrations of vfsreadlat.py, the Linux eBPF/bcc version.
+
+
+This example traces the latency of vfs_read (time from call to return), printing
+it as a histogram distribution. By default, output is every five seconds:
+
+# ./vfsreadlat.py
+Tracing... Hit Ctrl-C to end.
+
+ usecs : count distribution
+ 0 -> 1 : 4457 |*************************************+|
+ 2 -> 3 : 447 |*** |
+ 4 -> 7 : 2059 |***************** |
+ 8 -> 15 : 1179 |********** |
+ 16 -> 31 : 63 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 2 | |
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 3 | |
+ 512 -> 1023 : 1 | |
+ 1024 -> 2047 : 3 | |
+ 2048 -> 4095 : 2 | |
+ 4096 -> 8191 : 0 | |
+ 8192 -> 16383 : 0 | |
+ 16384 -> 32767 : 0 | |
+ 32768 -> 65535 : 0 | |
+ 65536 -> 131071 : 4 | |
+ 131072 -> 262143 : 2 | |
+ 262144 -> 524287 : 0 | |
+ 524288 -> 1048575 : 4 | |
+^C
+ usecs : count distribution
+ 0 -> 1 : 241 |*************************************+|
+ 2 -> 3 : 17 |** |
+ 4 -> 7 : 2 | |
+ 8 -> 15 : 4 | |
+ 16 -> 31 : 2 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 1 | |
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 1 | |
+ 512 -> 1023 : 1 | |
+ 1024 -> 2047 : 0 | |
+ 2048 -> 4095 : 1 | |
+ 4096 -> 8191 : 0 | |
+ 8192 -> 16383 : 0 | |
+ 16384 -> 32767 : 0 | |
+ 32768 -> 65535 : 0 | |
+ 65536 -> 131071 : 0 | |
+ 131072 -> 262143 : 0 | |
+ 262144 -> 524287 : 0 | |
+ 524288 -> 1048575 : 1 | |
+
+These examples show outliers in the 524 - 1048 milliseconds range. Since
+vfs_read() will catch many types of events, this could be anything including
+keystroke latency on ssh sessions. Further drilling with bcc will be necessary
+to identify more details.
+
+
+Full usage:
+
+# ./vfsreadlat.py -h
+USAGE: ./vfsreadlat.py [interval [count]]
--- /dev/null
+.TH pidpersec 8 "2015-08-18" "USER COMMANDS"
+.SH NAME
+pidpersec \- Count new processes (via fork()). Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B pidpersec
+.SH DESCRIPTION
+pidpersec shows how many new processes were created each second. There
+can be performance issues caused by many short-lived processes, which may not
+be visible in sampling tools like top(1). pidpersec provides one way to
+investigate this behavior.
+
+This works by tracing the kernel sched_fork() function using dynamic tracing,
+and will need updating to match any changes to this function.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Count new processes created each second:
+#
+.B pidpersec
+.SH OVERHEAD
+This traces the kernel fork function, and maintains an in-kernel count which is
+read asynchronously from user-space. As the rate of this is generally expected to
+be low (<< 1000/s), the overhead is also expected to be negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+top(1)
--- /dev/null
+.TH syncsnoop 8 "2015-08-18" "USER COMMANDS"
+.SH NAME
+syncsnoop \- Trace sync() syscall. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B syncsnoop
+.SH DESCRIPTION
+syncsnoop traces calls to sync(), which flushes file system buffers to
+storage devices. These calls can cause performance perturbations, and it can
+be useful to know if they are happening and how frequently.
+
+This works by tracing the kernel sys_sync() function using dynamic tracing, and
+will need updating to match any changes to this function.
+
+This program is also a basic example of eBPF/bcc.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Trace calls to sync():
+#
+.B syncsnoop
+.SH FIELDS
+.TP
+TIME(s)
+Time of the call, in seconds.
+.TP
+CALL
+Call traced.
+.SH OVERHEAD
+This traces the kernel sync function and prints output for each event. As the
+rate of this is generally expected to be low (<< 100/s), the overhead is also
+expected to be negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+iostat(1)
--- /dev/null
+.TH vfscount 8 "2015-08-18" "USER COMMANDS"
+.SH NAME
+vfscount \- Count some common VFS calls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B vfscount
+.SH DESCRIPTION
+This counts common VFS calls. This can be useful for general workload
+characterization of these operations.
+
+This works by tracing some kernel vfs functions using dynamic tracing, and will
+need updating to match any changes to these functions. Edit the script to
+customize and add functions to trace, which is easy to do.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Count some VFS calls until Ctrl-C is hit:
+#
+.B vfscount
+.SH FIELDS
+.TP
+ADDR
+Address of the instruction pointer that was traced (only useful if the FUNC column is suspicious and you would like to double check the translation).
+.TP
+FUNC
+Kernel function name
+.TP
+COUNT
+Number of calls while tracing
+.SH OVERHEAD
+This traces various kernel vfs functions and maintains in-kernel counts, which
+are asynchronously copied to user-space. While the rate of VFS operations can
+be very high (>1M/sec), this is a relatively efficient way to trace these
+events, and so the overhead is expected to be small for normal workloads.
+Measure in a test environment.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+vfsstat(8)
--- /dev/null
+.TH vfsstat 8 "2015-08-18" "USER COMMANDS"
+.SH NAME
+vfsstat \- Statistics for some common VFS calls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B vfsstat
+[interval [count]]
+.SH DESCRIPTION
+This traces some common VFS calls and prints per-second summaries. This can
+be useful for general workload characterization, and looking for patterns
+in operation usage over time.
+
+This works by tracing some kernel vfs functions using dynamic tracing, and will
+need updating to match any changes to these functions. Edit the script to
+customize which functions are traced. Also see vfscount, which is more
+easily customized to trace multiple functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Print summaries each second:
+#
+.B vfsstat
+.TP
+Print output every five seconds, three times:
+#
+.B vfsstat 5 3
+.SH FIELDS
+.TP
+READ/s
+Number of vfs_read() calls as a per-second average.
+.TP
+WRITE/s
+Number of vfs_write() calls as a per-second average.
+.TP
+CREATE/s
+Number of vfs_create() calls as a per-second average.
+.TP
+OPEN/s
+Number of vfs_open() calls as a per-second average.
+.TP
+FSYNC/s
+Number of vfs_fsync() calls as a per-second average.
+.SH OVERHEAD
+This traces various kernel vfs functions and maintains in-kernel counts, which
+are asynchronously copied to user-space. While the rate of VFS operations can
+be very high (>1M/sec), this is a relatively efficient way to trace these
+events, and so the overhead is expected to be small for normal workloads.
+Measure in a test environment.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+vfscount(8)
--- /dev/null
+#!/usr/bin/python
+#
+# pidpersec Count new processes (via fork).
+# For Linux, uses BCC, eBPF. See .c file.
+#
+# USAGE: pidpersec
+#
+# Written as a basic example of counting an event.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 11-Aug-2015 Brendan Gregg Created this.
+
+from bpf import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep, strftime
+
+# load BPF program
+b = BPF(src_file = "pidpersec.c")
+BPF.attach_kprobe(b.load_func("do_count", BPF.KPROBE), "sched_fork")
+stats = b.get_table("stats", c_int, c_ulonglong)
+
+# stat indexes
+S_COUNT = 1
+
+# header
+print "Tracing... Ctrl-C to end."
+
+# output
+last = 0
+while (1):
+ try:
+ sleep(1)
+ except KeyboardInterrupt:
+ pass; exit()
+
+ print "%s: PIDs/sec: %d" % (strftime("%H:%M:%S"),
+ (stats[c_int(S_COUNT)].value - last))
+ last = stats[c_int(S_COUNT)].value
--- /dev/null
+/*
+ * pidpersec.c Count new processes (via fork).
+ * For Linux, uses BCC, eBPF. See the Python front-end.
+ *
+ * USAGE: pidpersec.py
+ *
+ * Copyright (c) 2015 Brendan Gregg.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 11-Aug-2015 Brendan Gregg Created this.
+ */
+
+#include <uapi/linux/ptrace.h>
+
+enum stat_types {
+ S_COUNT = 1,
+ S_MAXSTAT
+};
+
+BPF_TABLE("array", int, u64, stats, S_MAXSTAT + 1);
+
+void stats_increment(int key) {
+ u64 *leaf = stats.lookup(&key);
+ if (leaf) (*leaf)++;
+}
+
+void do_count(struct pt_regs *ctx) { stats_increment(S_COUNT); }
--- /dev/null
+Demonstrations of pidpersec, the Linux eBPF/bcc version.
+
+
+This shows the number of new processes created per second, measured by tracing
+the kernel fork() routine:
+
+# ./pidpersec
+Tracing... Ctrl-C to end.
+18:33:06: PIDs/sec: 4
+18:33:07: PIDs/sec: 5
+18:33:08: PIDs/sec: 4
+18:33:09: PIDs/sec: 4
+18:33:10: PIDs/sec: 21
+18:33:11: PIDs/sec: 5
+18:33:12: PIDs/sec: 4
+18:33:13: PIDs/sec: 4
+
+Each second there are four new processes (this happens to be caused by a
+launcher script that is retrying in a loop, and encountering errors).
+
+At 18:33:10, I typed "man ls" in another server session, which caused an
+increase in the number of new processes as the necessary commands were run.
--- /dev/null
+#!/usr/bin/python
+#
+# syncsnoop Trace sync() syscall.
+# For Linux, uses BCC, eBPF. Embedded C.
+#
+# Written as a basic example of BCC trace & reformat. See
+# examples/hello_world.py for a BCC trace with default output example.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 13-Aug-2015 Brendan Gregg Created this.
+
+from bpf import BPF
+import sys
+
+# load BPF program
+b = BPF(text = """
+int do_sync(void *ctx) {
+ bpf_trace_printk("sync()\\n");
+ return 0;
+};
+""")
+BPF.attach_kprobe(b.load_func("do_sync", BPF.KPROBE), "sys_sync")
+
+# header
+print "%-18s %s" % ("TIME(s)", "CALL")
+
+# open trace pipe
+try:
+ trace = open("/sys/kernel/debug/tracing/trace_pipe", "r")
+except:
+ print >> sys.stderr, "ERROR: opening trace_pipe"
+ exit(1)
+
+# format output
+while 1:
+ try:
+ line = trace.readline().rstrip()
+ except KeyboardInterrupt:
+ pass; exit()
+
+ prolog, time_s, colon, word = line.rsplit(" ", 3)
+ time_s = time_s[:-1] # strip trailing ":"
+
+ print "%-18s %s" % (time_s, word)
--- /dev/null
+Demonstrations of syncsnoop, the Linux eBPF/bcc version.
+
+
+This program traces calls to the kernel sync() routine, with basic timestamps:
+
+# ./syncsnoop
+TIME(s) CALL
+16458148.611952 sync()
+16458151.533709 sync()
+^C
+
+While tracing, the "sync" command was executed in another server session.
+
+This can be useful to identify that sync() is being called, and its frequency.
--- /dev/null
+#!/usr/bin/python
+#
+# vfscount Count some VFS calls.
+# For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of counting functions.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Aug-2015 Brendan Gregg Created this.
+
+from bpf import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep, strftime
+from sys import stderr
+
+# kernel symbol translation
+ksym_addrs = [] # addresses for binary search
+ksym_names = [] # same index as ksym_addrs
+def load_kallsyms():
+ symfile = "/proc/kallsyms"
+ try:
+ syms = open(symfile, "r")
+ except:
+ print >> stderr, "ERROR: reading " + symfile
+ exit()
+ line = syms.readline()
+ for line in iter(syms):
+ cols = line.split()
+ name = cols[2]
+ if name[:4] != "vfs_": # perf optimization
+ continue
+ addr = int(cols[0], 16)
+ ksym_addrs.append(addr)
+ ksym_names.append(name)
+ syms.close()
+def ksym(addr):
+ start = -1
+ end = len(ksym_addrs)
+ while end != start + 1:
+ mid = (start + end) / 2
+ if addr < ksym_addrs[mid]:
+ end = mid
+ else:
+ start = mid
+ if start == -1:
+ return "[unknown]"
+ return ksym_names[start]
+load_kallsyms()
+
+# load BPF program
+b = BPF(src_file = "vfscount.c")
+fn = b.load_func("do_count", BPF.KPROBE)
+BPF.attach_kprobe(fn, "vfs_read")
+BPF.attach_kprobe(fn, "vfs_write")
+BPF.attach_kprobe(fn, "vfs_fsync")
+BPF.attach_kprobe(fn, "vfs_open")
+BPF.attach_kprobe(fn, "vfs_create")
+counts = b.get_table("counts")
+
+# header
+print "Tracing... Ctrl-C to end."
+
+# output
+try:
+ sleep(99999999)
+except KeyboardInterrupt:
+ pass
+
+print "\n%-16s %-12s %8s" % ("ADDR", "FUNC", "COUNT")
+for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+ print "%-16x %-12s %8d" % (k.ip, ksym(k.ip), v.value)
--- /dev/null
+/*
+ * vfscount.c Count some VFS calls.
+ * For Linux, uses BCC, eBPF. See the Python front-end.
+ *
+ * USAGE: vfscount.py
+ *
+ * Copyright (c) 2015 Brendan Gregg.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 14-Aug-2015 Brendan Gregg Created this.
+ */
+
+#include <uapi/linux/ptrace.h>
+
+struct key_t {
+ u64 ip;
+};
+
+BPF_TABLE("hash", struct key_t, u64, counts, 256);
+
+int do_count(struct pt_regs *ctx) {
+ struct key_t key = {};
+ u64 zero = 0, *val;
+ key.ip = ctx->ip;
+ val = counts.lookup_or_init(&key, &zero);
+ (*val)++;
+ return 0;
+}
--- /dev/null
+Demonstrations of vfscount, the Linux eBPF/bcc version.
+
+
+This counts VFS calls, by tracing various kernel calls beginning with "vfs_"
+(edit the script to customize):
+
+# ./vfscount
+Tracing... Ctrl-C to end.
+^C
+ADDR FUNC COUNT
+ffffffff811f2cc1 vfs_create 24
+ffffffff811e71c1 vfs_write 203
+ffffffff811e6061 vfs_open 765
+ffffffff811e7091 vfs_read 1852
+
+This can be useful for workload characterization, to see what types of
+operations are in use.
--- /dev/null
+#!/usr/bin/python
+#
+# vfsstat Count some VFS calls.
+# For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of counting multiple events as a stat tool.
+#
+# USAGE: vfsstat [interval [count]]
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Aug-2015 Brendan Gregg Created this.
+
+from __future__ import print_function
+from bpf import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep, strftime
+from sys import argv
+
+def usage():
+ print("USAGE: %s [interval [count]]" % argv[0])
+ exit()
+
+# arguments
+interval = 1
+count = -1
+if len(argv) > 1:
+ try:
+ interval = int(argv[1])
+ if interval == 0:
+ raise
+ if len(argv) > 2:
+ count = int(argv[2])
+ except: # also catches -h, --help
+ usage()
+
+# load BPF program
+b = BPF(src_file = "vfsstat.c")
+BPF.attach_kprobe(b.load_func("do_read", BPF.KPROBE), "vfs_read")
+BPF.attach_kprobe(b.load_func("do_write", BPF.KPROBE), "vfs_write")
+BPF.attach_kprobe(b.load_func("do_fsync", BPF.KPROBE), "vfs_fsync")
+BPF.attach_kprobe(b.load_func("do_open", BPF.KPROBE), "vfs_open")
+BPF.attach_kprobe(b.load_func("do_create", BPF.KPROBE), "vfs_create")
+stats = b.get_table("stats", c_int, c_ulonglong)
+
+# stat column labels and indexes
+stat_types = {
+ "READ" : 1,
+ "WRITE" : 2,
+ "FSYNC" : 3,
+ "OPEN" : 4,
+ "CREATE" : 5
+}
+
+# header
+print("%-8s " % "TIME", end="")
+last = {}
+for stype in stat_types.keys():
+ print(" %8s" % (stype + "/s"), end="")
+ idx = stat_types[stype]
+ last[idx] = 0
+print("")
+
+# output
+i = 0
+while (1):
+ if count > 0:
+ i += 1
+ if i > count:
+ exit()
+ try:
+ sleep(interval)
+ except KeyboardInterrupt:
+ pass; exit()
+
+ print("%-8s: " % strftime("%H:%M:%S"), end="")
+ # print each statistic as a column
+ for stype in stat_types.keys():
+ idx = stat_types[stype]
+ try:
+ delta = stats[c_int(idx)].value - last[idx]
+ print(" %8d" % (delta / interval), end="")
+ last[idx] = stats[c_int(idx)].value
+ except:
+ print(" %8d" % 0, end="")
+ print("")
--- /dev/null
+/*
+ * vfsstat.c Count some VFS calls.
+ * For Linux, uses BCC, eBPF. See the Python front-end.
+ *
+ * USAGE: vfsstat.py [interval [count]]
+ *
+ * Copyright (c) 2015 Brendan Gregg.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 14-Aug-2015 Brendan Gregg Created this.
+ */
+
+#include <uapi/linux/ptrace.h>
+
+enum stat_types {
+ S_READ = 1,
+ S_WRITE,
+ S_FSYNC,
+ S_OPEN,
+ S_CREATE,
+ S_MAXSTAT
+};
+
+BPF_TABLE("array", int, u64, stats, S_MAXSTAT + 1);
+
+void stats_increment(int key) {
+ u64 *leaf = stats.lookup(&key);
+ if (leaf) (*leaf)++;
+}
+
+void do_read(struct pt_regs *ctx) { stats_increment(S_READ); }
+void do_write(struct pt_regs *ctx) { stats_increment(S_WRITE); }
+void do_fsync(struct pt_regs *ctx) { stats_increment(S_FSYNC); }
+void do_open(struct pt_regs *ctx) { stats_increment(S_OPEN); }
+void do_create(struct pt_regs *ctx) { stats_increment(S_CREATE); }
--- /dev/null
+Demonstrations of vfsstat, the Linux eBPF/bcc version.
+
+
+This traces some common VFS calls and prints per-second summaries. By default,
+the output interval is one second:
+
+# ./vfsstat
+TIME READ/s WRITE/s CREATE/s OPEN/s FSYNC/s
+18:35:32: 231 12 4 98 0
+18:35:33: 274 13 4 106 0
+18:35:34: 586 86 4 251 0
+18:35:35: 241 15 4 99 0
+18:35:36: 232 10 4 98 0
+18:35:37: 244 10 4 107 0
+18:35:38: 235 13 4 97 0
+18:35:39: 6749 2633 4 1446 0
+18:35:40: 277 31 4 115 0
+18:35:41: 238 16 6 102 0
+18:35:42: 284 50 8 114 0
+^C
+
+
+Here we are using an output interval of five seconds, and printing three output
+lines:
+
+# ./vfsstat 5 3
+TIME READ/s WRITE/s CREATE/s OPEN/s FSYNC/s
+18:35:55: 238 8 3 101 0
+18:36:00: 962 233 4 247 0
+18:36:05: 241 8 3 100 0
+
+
+Full usage:
+
+# ./vfsstat -h
+USAGE: ./vfsstat [interval [count]]