From c937a9e6b46ca2d22a86a0aa84b82744096c4f35 Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Fri, 12 Feb 2016 02:23:39 -0800 Subject: [PATCH] xfsslower --- README.md | 1 + man/man8/xfsslower.8 | 113 +++++++++++++++++ tools/xfsslower.py | 288 ++++++++++++++++++++++++++++++++++++++++++++ tools/xfsslower_example.txt | 148 +++++++++++++++++++++++ 4 files changed, 550 insertions(+) create mode 100644 man/man8/xfsslower.8 create mode 100755 tools/xfsslower.py create mode 100644 tools/xfsslower_example.txt diff --git a/README.md b/README.md index 0de68ed..54df4e0 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,7 @@ Tools: - tools/[vfscount](tools/vfscount.py) tools/[vfscount.c](tools/vfscount.c): Count VFS calls. [Examples](tools/vfscount_example.txt). - tools/[vfsstat](tools/vfsstat.py) tools/[vfsstat.c](tools/vfsstat.c): Count some VFS calls, with column output. [Examples](tools/vfsstat_example.txt). - tools/[wakeuptime](tools/wakeuptime.py): Summarize sleep to wakeup time by waker kernel stack. [Examples](tools/wakeuptime_example.txt). +- tools/[xfsslower](tools/xfsslower.py): Trace slow XFS operations: read, write, open, fsync. [Examples](tools/xfsslower_example.txt). ### Networking diff --git a/man/man8/xfsslower.8 b/man/man8/xfsslower.8 new file mode 100644 index 0000000..533a701 --- /dev/null +++ b/man/man8/xfsslower.8 @@ -0,0 +1,113 @@ +.TH xfsslower 8 "2016-02-11" "USER COMMANDS" +.SH NAME +xfsslower \- Trace slow xfs file operations, with per-event details. +.SH SYNOPSIS +.B xfsslower [\-h] [\-j] [\-p PID] [min_ms] +.SH DESCRIPTION +This tool traces common XFS file operations: reads, writes, opens, and +syncs. It measures the time spent in these operations, and prints details +for each that exceeded a threshold. + +WARNING: See the OVERHEAD section. + +By default, a minimum millisecond threshold of 10 is used. If a threshold of 0 +is used, all events are printed (warning: verbose). + +Since this works by tracing the xfs_file_operations interface functions, it +will need updating to match any changes to these functions. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +\-p PID +Trace this PID only. +.TP +min_ms +Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms. +.SH EXAMPLES +.TP +Trace synchronous file reads and writes slower than 10 ms: +# +.B xfsslower +.TP +Trace slower than 1 ms: +# +.B xfsslower 1 +.TP +Trace slower than 1 ms, and output just the fields in parsable format (csv): +# +.B xfsslower \-j 1 +.TP +Trace all file reads and writes (warning: the output will be verbose): +# +.B xfsslower 0 +.TP +Trace slower than 1 ms, for PID 181 only: +# +.B xfsslower \-p 181 1 +.SH FIELDS +.TP +TIME(s) +Time of I/O completion since the first I/O seen, in seconds. +.TP +COMM +Process name. +.TP +PID +Process ID. +.TP +T +Type of operation. R == read, W == write, O == open, S == fsync. +.TP +OFF_KB +File offset for the I/O, in Kbytes. +.TP +BYTES +Size of I/O, in bytes. +.TP +LAT(ms) +Latency (duration) of I/O, measured from when it was issued by VFS to the +filesystem, to when it completed. This time is inclusive of block device I/O, +file system CPU cycles, file system locks, run queue latency, etc. It's a more +accurate measure of the latency suffered by applications performing file +system I/O, than to measure this down at the block device interface. +.TP +FILENAME +A cached kernel file name (comes from dentry->d_iname). +.TP +ENDTIME_us +Completion timestamp, microseconds (\-j only). +.TP +OFFSET_b +File offset, bytes (\-j only). +.TP +LATENCY_us +Latency (duration) of the I/O, in microseconds (\-j only). +.SH OVERHEAD +This adds low-overhead instrumentation to these XFS operations, +including reads and writes from the file system cache. Such reads and writes +can be very frequent (depending on the workload; eg, 1M/sec), at which +point the overhead of this tool (even if it prints no "slower" events) can +begin to become significant. Measure and quantify before use. If this +continues to be a problem, consider switching to a tool that prints in-kernel +summaries only. +.PP +Note that the overhead of this tool should be less than fileslower(8), as +this tool targets xfs functions only, and not all file read/write paths +(which can include socket I/O). +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +biosnoop(8), funccount(8), fileslower(8) diff --git a/tools/xfsslower.py b/tools/xfsslower.py new file mode 100755 index 0000000..26f8af6 --- /dev/null +++ b/tools/xfsslower.py @@ -0,0 +1,288 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# xfsslower Trace slow XFS operations. +# For Linux, uses BCC, eBPF. +# +# USAGE: xfsslower [-h] [-j] [-p PID] [min_ms] +# +# This script traces common XFS file operations: reads, writes, opens, and +# syncs. It measures the time spent in these operations, and prints details +# for each that exceeded a threshold. +# +# WARNING: This adds low-overhead instrumentation to these XFS operations, +# including reads and writes from the file system cache. Such reads and writes +# can be very frequent (depending on the workload; eg, 1M/sec), at which +# point the overhead of this tool (even if it prints no "slower" events) can +# begin to become significant. +# +# By default, a minimum millisecond threshold of 10 is used. +# +# Copyright 2016 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 11-Feb-2016 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +import argparse +from time import strftime +import ctypes as ct + +# arguments +examples = """examples: + ./xfsslower # trace operations slower than 10 ms (default) + ./xfsslower 1 # trace operations slower than 1 ms + ./xfsslower -j 1 # ... 1 ms, parsable output (csv) + ./xfsslower 0 # trace all operations (warning: verbose) + ./xfsslower -p 185 # trace PID 185 only +""" +parser = argparse.ArgumentParser( + description="Trace common XFS file operations slower than a threshold", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-j", "--csv", action="store_true", + help="just print fields: comma-separated values") +parser.add_argument("-p", "--pid", + help="trace this PID only") +parser.add_argument("min_ms", nargs="?", default='10', + help="minimum I/O duration to trace, in ms (default 10)") +args = parser.parse_args() +min_ms = int(args.min_ms) +pid = args.pid +csv = args.csv +debug = 0 + +# define BPF program +bpf_text = """ +#include +#include +#include +#include + +// XXX: switch these to char's when supported +#define TRACE_READ 0 +#define TRACE_WRITE 1 +#define TRACE_OPEN 2 +#define TRACE_FSYNC 3 + +struct val_t { + u64 ts; + u64 offset; + struct file *fp; +}; + +struct data_t { + // XXX: switch some to u32's when supported + u64 ts_us; + u64 type; + u64 size; + u64 offset; + u64 delta_us; + u64 pid; + char task[TASK_COMM_LEN]; + char file[DNAME_INLINE_LEN]; +}; + +BPF_HASH(entryinfo, pid_t, struct val_t); +BPF_PERF_OUTPUT(events); + +// +// Store timestamp and size on entry +// + +// xfs_file_read_iter(), xfs_file_write_iter(): +int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb) +{ + u32 pid; + pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + + // store filep and timestamp by pid + struct val_t val = {}; + val.ts = bpf_ktime_get_ns(); + val.fp = iocb->ki_filp; + val.offset = iocb->ki_pos; + if (val.fp) + entryinfo.update(&pid, &val); + + return 0; +} + +// xfs_file_open(): +int trace_open_entry(struct pt_regs *ctx, struct inode *inode, + struct file *file) +{ + u32 pid; + pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + + // store filep and timestamp by pid + struct val_t val = {}; + val.ts = bpf_ktime_get_ns(); + val.fp = file; + val.offset = 0; + if (val.fp) + entryinfo.update(&pid, &val); + + return 0; +} + +// xfs_file_fsync(): +int trace_fsync_entry(struct pt_regs *ctx, struct file *file) +{ + u32 pid; + pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + + // store filep and timestamp by pid + struct val_t val = {}; + val.ts = bpf_ktime_get_ns(); + val.fp = file; + val.offset = 0; + if (val.fp) + entryinfo.update(&pid, &val); + + return 0; +} + +// +// Output +// + +static int trace_return(struct pt_regs *ctx, int type) +{ + struct val_t *valp; + u32 pid = bpf_get_current_pid_tgid(); + + valp = entryinfo.lookup(&pid); + if (valp == 0) { + // missed tracing issue or filtered + return 0; + } + + // calculate delta + u64 ts = bpf_ktime_get_ns(); + u64 delta_us = (ts - valp->ts) / 1000; + entryinfo.delete(&pid); + if (FILTER_US) + return 0; + + // workaround (rewriter should handle file to d_iname in one step): + struct dentry *de = NULL; + bpf_probe_read(&de, sizeof(de), &valp->fp->f_path.dentry); + + // populate output struct + u32 size = ctx->ax; + struct data_t data = {.type = type, .size = size, .delta_us = delta_us, + .pid = pid}; + data.ts_us = ts / 1000; + data.offset = valp->offset; + bpf_probe_read(&data.file, sizeof(data.file), de->d_iname); + bpf_get_current_comm(&data.task, sizeof(data.task)); + + events.perf_submit(ctx, &data, sizeof(data)); + + return 0; +} + +int trace_read_return(struct pt_regs *ctx) +{ + return trace_return(ctx, TRACE_READ); +} + +int trace_write_return(struct pt_regs *ctx) +{ + return trace_return(ctx, TRACE_WRITE); +} + +int trace_open_return(struct pt_regs *ctx) +{ + return trace_return(ctx, TRACE_OPEN); +} + +int trace_fsync_return(struct pt_regs *ctx) +{ + return trace_return(ctx, TRACE_FSYNC); +} + +""" +if min_ms == 0: + bpf_text = bpf_text.replace('FILTER_US', '0') +else: + bpf_text = bpf_text.replace('FILTER_US', + 'delta_us <= %s' % str(min_ms * 1000)) +if args.pid: + bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) +else: + bpf_text = bpf_text.replace('FILTER_PID', '0') +if debug: + print(bpf_text) + +# kernel->user event data: struct data_t +DNAME_INLINE_LEN = 32 # linux/dcache.h +TASK_COMM_LEN = 16 # linux/sched.h +class Data(ct.Structure): + _fields_ = [ + ("ts_us", ct.c_ulonglong), + ("type", ct.c_ulonglong), + ("size", ct.c_ulonglong), + ("offset", ct.c_ulonglong), + ("delta_us", ct.c_ulonglong), + ("pid", ct.c_ulonglong), + ("task", ct.c_char * TASK_COMM_LEN), + ("file", ct.c_char * DNAME_INLINE_LEN) + ] + +# process event +def print_event(cpu, data, size): + event = ct.cast(data, ct.POINTER(Data)).contents + + type = 'R' + if event.type == 1: + type = 'W' + elif event.type == 2: + type = 'O' + elif event.type == 3: + type = 'S' + + if (csv): + print("%d,%s,%d,%s,%d,%d,%d,%s" % ( + event.ts_us, event.task, event.pid, type, event.size, + event.offset, event.delta_us, event.file)) + return + print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"), + event.task, event.pid, type, event.size, event.offset / 1024, + float(event.delta_us) / 1000, event.file)) + +# initialize BPF +b = BPF(text=bpf_text) + +# common file functions +b.attach_kprobe(event="xfs_file_read_iter", fn_name="trace_rw_entry") +b.attach_kprobe(event="xfs_file_write_iter", fn_name="trace_rw_entry") +b.attach_kprobe(event="xfs_file_open", fn_name="trace_open_entry") +b.attach_kprobe(event="xfs_file_fsync", fn_name="trace_fsync_entry") +b.attach_kretprobe(event="xfs_file_read_iter", fn_name="trace_read_return") +b.attach_kretprobe(event="xfs_file_write_iter", fn_name="trace_write_return") +b.attach_kretprobe(event="xfs_file_open", fn_name="trace_open_return") +b.attach_kretprobe(event="xfs_file_fsync", fn_name="trace_fsync_return") + +# header +if (csv): + print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") +else: + if min_ms == 0: + print("Tracing XFS operations") + else: + print("Tracing XFS operations slower than %d ms" % min_ms) + print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T", + "BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) + +# read events +b["events"].open_perf_buffer(print_event) +while 1: + b.kprobe_poll() diff --git a/tools/xfsslower_example.txt b/tools/xfsslower_example.txt new file mode 100644 index 0000000..4c6ae33 --- /dev/null +++ b/tools/xfsslower_example.txt @@ -0,0 +1,148 @@ +Demonstrations of xfsslower, the Linux eBPF/bcc version. + + +xfsslower shows XFS reads, writes, opens, and fsyncs, slower than a threshold. +For example: + +# ./xfsslower +Tracing XFS operations slower than 10 ms +TIME COMM PID T BYTES OFF_KB LAT(ms) FILENAME +06:23:06 randread.pl 32497 R 8192 24938024 17.93 data1 +06:23:06 randread.pl 32521 R 8192 13431528 18.27 data1 +06:23:08 randread.pl 32497 R 8192 5070904 16.37 data1 +06:23:08 randread.pl 32521 R 8192 12693016 16.06 data1 +06:23:18 randread.pl 32521 R 8192 27049136 21.68 data1 +06:23:18 randread.pl 32497 R 8192 257864 21.74 data1 +06:23:20 randread.pl 32497 R 8192 17797208 13.37 data1 +06:23:20 randread.pl 32497 R 8192 6088224 19.74 data1 + +This shows several reads from a "randread.pl" program, each 8 Kbytes in size, +and from a "data1" file. These all had over 10 ms latency. + +This "latency" is measured from when the operation was issued from the VFS +interface to the file system, to when it completed. This spans everything: +block device I/O (disk I/O), file system CPU cycles, file system locks, run +queue latency, etc. This is a better measure of the latency suffered by +applications reading from the file system than measuring this down at the +block device interface. + +Note that this only traces the common file system operations previously +listed: other file system operations (eg, inode operations including +getattr()) are not traced. + + +The threshold can be provided as an argument. Eg, I/O slower than 1 ms: + +# ./xfsslower 1 +Tracing XFS operations slower than 1 ms +TIME COMM PID T BYTES OFF_KB LAT(ms) FILENAME +06:26:59 randread.pl 5394 R 8192 9045728 1.24 data1 +06:26:59 randread.pl 5394 R 8192 23532136 1.17 data1 +06:26:59 randread.pl 5442 R 8192 2192376 2.06 data1 +06:27:00 randread.pl 5394 R 8192 3535176 1.27 data1 +06:27:00 randread.pl 5442 R 8192 21361784 3.18 data1 +06:27:00 randread.pl 5394 R 8192 2556336 3.23 data1 +06:27:00 randread.pl 5394 R 8192 20020880 2.87 data1 +06:27:00 randread.pl 5442 R 8192 20708888 3.32 data1 +06:27:00 randread.pl 5394 R 8192 4654680 2.00 data1 +06:27:00 randread.pl 5442 R 8192 5591744 1.98 data1 +06:27:00 randread.pl 5394 R 8192 2431056 1.22 data1 +06:27:00 randread.pl 5394 R 8192 384288 2.95 data1 +06:27:00 randread.pl 5442 R 8192 29277672 3.07 data1 +06:27:00 randread.pl 5442 R 8192 29508216 3.23 data1 +06:27:00 randread.pl 5394 R 8192 17200008 2.86 data1 +06:27:00 randread.pl 5442 R 8192 20693088 1.06 data1 +06:27:00 randread.pl 5394 R 8192 28124192 1.38 data1 +06:27:00 randread.pl 5442 R 8192 23821184 1.28 data1 +06:27:00 randread.pl 5394 R 8192 1623200 1.47 data1 +[...] + +There's now much more output (this spans only 2 seconds, the previous output +spanned 14 seconds), as the lower threshold is catching more I/O. + + +A threshold of 0 will trace all operations. Warning: the output will be +verbose, as it will include all file system cache hits. + +# ./xfsslower 0 +Tracing XFS operations +TIME COMM PID T BYTES OFF_KB LAT(ms) FILENAME +06:29:43 ls 9291 O 0 0 0.00 bench +06:29:47 cat 9361 O 0 0 0.00 date.txt +06:29:47 cat 9361 R 29 0 0.01 date.txt +06:29:47 cat 9361 R 0 0 0.00 date.txt +06:29:50 bash 20500 O 0 0 0.00 bench +06:29:50 bash 20500 O 0 0 0.00 bench +06:29:50 bash 20500 O 0 0 0.00 bench +06:29:50 bash 9431 O 0 0 0.00 bench +06:29:50 bash 9432 O 0 0 0.00 bench +06:29:50 bash 9456 O 0 0 0.00 newdate.txt +06:29:50 date 9456 W 29 0 0.01 newdate.txt +06:29:53 cksum 9503 O 0 0 0.00 data1 +06:29:53 cksum 9503 R 65536 0 0.06 data1 +06:29:53 cksum 9503 R 65536 64 0.01 data1 +06:29:53 cksum 9503 R 65536 128 0.02 data1 +06:29:53 cksum 9503 R 65536 192 0.01 data1 +06:29:53 cksum 9503 R 65536 256 0.01 data1 +06:29:53 cksum 9503 R 65536 320 0.01 data1 +06:29:53 cksum 9503 R 65536 384 0.01 data1 +06:29:53 cksum 9503 R 65536 448 0.04 data1 +06:29:53 cksum 9503 R 65536 512 0.01 data1 +06:29:53 cksum 9503 R 65536 576 0.02 data1 +06:29:53 cksum 9503 R 65536 640 0.01 data1 +06:29:53 cksum 9503 R 65536 704 0.01 data1 +06:29:53 cksum 9503 R 65536 768 0.01 data1 +06:29:53 cksum 9503 R 65536 832 0.01 data1 +06:29:53 cksum 9503 R 65536 896 0.01 data1 +06:29:53 cksum 9503 R 65536 960 0.01 data1 +06:29:53 cksum 9503 R 65536 1024 0.01 data1 +06:29:53 cksum 9503 R 65536 1088 0.02 data1 +06:29:53 cksum 9503 R 65536 1152 0.01 data1 +06:29:53 cksum 9503 R 65536 1216 0.01 data1 +[...] + +The output now includes open operations ("O"), and writes ("W"). A cksum(1) +command can be seen reading from a data1 file, from progressively increasing +offsets: a sequential workload. + + +A -j option will print just the fields (parsable output, csv): + +# ./xfsslower -j 1 +ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE +125563830632,randread.pl,12155,R,8192,27824193536,1057,data1 +125565050578,randread.pl,12155,R,8192,16908525568,1969,data1 +125566331140,randread.pl,12202,R,8192,16310689792,1738,data1 +125566427955,randread.pl,12155,R,8192,11127439360,1058,data1 +125567223494,randread.pl,12202,R,8192,8422031360,1131,data1 +125567331145,randread.pl,12155,R,8192,9233088512,1230,data1 +125567331220,randread.pl,12202,R,8192,12716326912,1148,data1 +125567334983,randread.pl,12155,R,8192,24545206272,2182,data1 +[...] + +This may be useful for visualizing with another tool, for example, for +producing a scatter plot of ENDTIME vs LATENCY, to look for time-based +patterns. + + +USAGE message: + +# ./xfsslower -h +usage: xfsslower [-h] [-j] [-p PID] [min_ms] + +Trace common XFS file operations slower than a threshold + +positional arguments: + min_ms minimum I/O duration to trace, in ms (default 10) + +optional arguments: + -h, --help show this help message and exit + -j, --csv just print fields: comma-separated values + -p PID, --pid PID trace this PID only + +examples: + ./xfsslower # trace operations slower than 10 ms (default) + ./xfsslower 1 # trace operations slower than 1 ms + ./xfsslower -j 1 # ... 1 ms, parsable output (csv) + ./xfsslower 0 # trace all operations (warning: verbose) + ./xfsslower -p 185 # trace PID 185 only -- 2.7.4