From 23c96fe4f1d9bca4824775390cbfc73ddd9ddd44 Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Fri, 12 Feb 2016 02:25:32 -0800 Subject: [PATCH] xfsdist --- README.md | 3 +- man/man8/xfsdist.8 | 80 ++++++++++++++++++++++ tools/xfsdist.py | 168 ++++++++++++++++++++++++++++++++++++++++++++++ tools/xfsdist_example.txt | 155 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 405 insertions(+), 1 deletion(-) create mode 100644 man/man8/xfsdist.8 create mode 100755 tools/xfsdist.py create mode 100644 tools/xfsdist_example.txt diff --git a/README.md b/README.md index 54df4e0..f21d7ac 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,8 @@ Tools: - tools/[vfscount](tools/vfscount.py) tools/[vfscount.c](tools/vfscount.c): Count VFS calls. [Examples](tools/vfscount_example.txt). - tools/[vfsstat](tools/vfsstat.py) tools/[vfsstat.c](tools/vfsstat.c): Count some VFS calls, with column output. [Examples](tools/vfsstat_example.txt). - tools/[wakeuptime](tools/wakeuptime.py): Summarize sleep to wakeup time by waker kernel stack. [Examples](tools/wakeuptime_example.txt). -- tools/[xfsslower](tools/xfsslower.py): Trace slow XFS operations: read, write, open, fsync. [Examples](tools/xfsslower_example.txt). +- tools/[xfsdist](tools/xfsdist.py): Summarize XFS operation latency. [Examples](tools/xfsdist_example.txt). +- tools/[xfsslower](tools/xfsslower.py): Trace slow XFS operations. [Examples](tools/xfsslower_example.txt). ### Networking diff --git a/man/man8/xfsdist.8 b/man/man8/xfsdist.8 new file mode 100644 index 0000000..12de643 --- /dev/null +++ b/man/man8/xfsdist.8 @@ -0,0 +1,80 @@ +.TH xfsdist 8 "2016-02-12" "USER COMMANDS" +.SH NAME +xfsdist \- Summarize XFS operation latency. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B xfsdist [\-h] [\-T] [\-N] [\-d] [interval] [count] +.SH DESCRIPTION +This tool summarizes time (latency) spent in common XFS file operations: reads, +writes, opens, and syncs, and presents it as a power-of-2 histogram. It uses an +in-kernel eBPF map to store the histogram for efficiency. + +Since this works by tracing the xfs_file_operations interface functions, it +will need updating to match any changes to these functions. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-T +Don't include timestamps on interval output. +.TP +\-m +Output in milliseconds. +.TP +\-p PID +Trace this PID only. +.SH EXAMPLES +.TP +Trace XFS operation time, and print a summary on Ctrl-C: +# +.B xfsdist +.TP +Trace PID 181 only: +# +.B xfsdist -p 181 +.TP +Print 1 second summaries, 10 times: +# +.B xfsdist 1 10 +.TP +1 second summaries, printed in milliseconds +# +.B xfsdist \-m 1 +.SH FIELDS +.TP +msecs +Range of milliseconds for this bucket. +.TP +usecs +Range of microseconds for this bucket. +.TP +count +Number of operations in this time range. +.TP +distribution +ASCII representation of the distribution (the count column). +.SH OVERHEAD +This adds low-overhead instrumentation to these XFS operations, +including reads and writes from the file system cache. Such reads and writes +can be very frequent (depending on the workload; eg, 1M/sec), at which +point the overhead of this tool may become noticeable. +Measure and quantify before use. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +xfssnoop(8) diff --git a/tools/xfsdist.py b/tools/xfsdist.py new file mode 100755 index 0000000..18d5a8e --- /dev/null +++ b/tools/xfsdist.py @@ -0,0 +1,168 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# xfsdist Summarize XFS operation latency. +# For Linux, uses BCC, eBPF. +# +# USAGE: xfsdist [-h] [-T] [-m] [-p PID] [interval] [count] +# +# Copyright 2016 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 12-Feb-2016 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +from time import sleep, strftime +import argparse + +# arguments +examples = """examples: + ./xfsdist # show operation latency as a histogram + ./xfsdist -p 181 # trace PID 181 only + ./xfsdist 1 10 # print 1 second summaries, 10 times + ./xfsdist -m 5 # 5s summaries, milliseconds +""" +parser = argparse.ArgumentParser( + description="Summarize XFS operation latency", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-T", "--notimestamp", action="store_true", + help="don't include timestamp on interval output") +parser.add_argument("-m", "--milliseconds", action="store_true", + help="output in milliseconds") +parser.add_argument("-p", "--pid", + help="trace this PID only") +parser.add_argument("interval", nargs="?", + help="output interval, in seconds") +parser.add_argument("count", nargs="?", default=99999999, + help="number of outputs") +args = parser.parse_args() +pid = args.pid +countdown = int(args.count) +if args.milliseconds: + factor = 1000000 + label = "msecs" +else: + factor = 1000 + label = "usecs" +if args.interval and int(args.interval) == 0: + print("ERROR: interval 0. Exiting.") + exit() +debug = 0 + +# define BPF program +bpf_text = """ +#include +#include +#include + +#define OP_NAME_LEN 8 +typedef struct dist_key { + char op[OP_NAME_LEN]; + u64 slot; +} dist_key_t; +BPF_HASH(start, u32); +BPF_HISTOGRAM(dist, dist_key_t); + +// time operation +int trace_entry(struct pt_regs *ctx) +{ + u32 pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + u64 ts = bpf_ktime_get_ns(); + start.update(&pid, &ts); + return 0; +} + +static int trace_return(struct pt_regs *ctx, const char *op) +{ + u64 *tsp; + u32 pid = bpf_get_current_pid_tgid(); + + // fetch timestamp and calculate delta + tsp = start.lookup(&pid); + if (tsp == 0) { + return 0; // missed start or filtered + } + u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR; + + // store as histogram + dist_key_t key = {.slot = bpf_log2l(delta)}; + __builtin_memcpy(&key.op, op, sizeof(key.op)); + dist.increment(key); + + start.delete(&pid); + return 0; +} + +int trace_read_return(struct pt_regs *ctx) +{ + char *op = "read"; + return trace_return(ctx, op); +} + +int trace_write_return(struct pt_regs *ctx) +{ + char *op = "write"; + return trace_return(ctx, op); +} + +int trace_open_return(struct pt_regs *ctx) +{ + char *op = "open"; + return trace_return(ctx, op); +} + +int trace_fsync_return(struct pt_regs *ctx) +{ + char *op = "fsync"; + return trace_return(ctx, op); +} +""" +bpf_text = bpf_text.replace('FACTOR', str(factor)) +if args.pid: + bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) +else: + bpf_text = bpf_text.replace('FILTER_PID', '0') +if debug: + print(bpf_text) + +# load BPF program +b = BPF(text=bpf_text) + +# common file functions +b.attach_kprobe(event="xfs_file_read_iter", fn_name="trace_entry") +b.attach_kprobe(event="xfs_file_write_iter", fn_name="trace_entry") +b.attach_kprobe(event="xfs_file_open", fn_name="trace_entry") +b.attach_kprobe(event="xfs_file_fsync", fn_name="trace_entry") +b.attach_kretprobe(event="xfs_file_read_iter", fn_name="trace_read_return") +b.attach_kretprobe(event="xfs_file_write_iter", fn_name="trace_write_return") +b.attach_kretprobe(event="xfs_file_open", fn_name="trace_open_return") +b.attach_kretprobe(event="xfs_file_fsync", fn_name="trace_fsync_return") + +print("Tracing XFS operation latency... Hit Ctrl-C to end.") + +# output +exiting = 0 +dist = b.get_table("dist") +while (1): + try: + if args.interval: + sleep(int(args.interval)) + else: + sleep(99999999) + except KeyboardInterrupt: + exiting = 1 + + print() + if args.interval and (not args.notimestamp): + print(strftime("%H:%M:%S:")) + + dist.print_log2_hist(label, "operation") + dist.clear() + + countdown -= 1 + if exiting or countdown == 0: + exit() diff --git a/tools/xfsdist_example.txt b/tools/xfsdist_example.txt new file mode 100644 index 0000000..c646501 --- /dev/null +++ b/tools/xfsdist_example.txt @@ -0,0 +1,155 @@ +Demonstrations of xfsdist, the Linux eBPF/bcc version. + + +xfsdist traces XFS reads, writes, opens, and fsyncs, and summarizes their +latency as a power-of-2 histogram. For example: + +# ./xfsdist +Tracing XFS operation latency... Hit Ctrl-C to end. +^C + +operation = 'read' + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 362 | | + 4 -> 7 : 807 |* | + 8 -> 15 : 20686 |****************************************| + 16 -> 31 : 512 | | + 32 -> 63 : 4 | | + 64 -> 127 : 2744 |***** | + 128 -> 255 : 7127 |************* | + 256 -> 511 : 2483 |**** | + 512 -> 1023 : 1281 |** | + 1024 -> 2047 : 39 | | + 2048 -> 4095 : 5 | | + 4096 -> 8191 : 1 | | + +operation = 'open' + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 3 |****************************************| + +This output shows a bi-modal distribution for read latency, with a faster +mode of 20,686 reads that took between 8 and 15 microseconds, and a slower +mode of over 10,000 reads that took between 64 and 1023 microseconds. It's +likely that the faster mode was a hit from the in-memory file system cache, +and the slower mode is a read from a storage device (disk). + +This "latency" is measured from when the operation was issued from the VFS +interface to the file system, to when it completed. This spans everything: +block device I/O (disk I/O), file system CPU cycles, file system locks, run +queue latency, etc. This is a better measure of the latency suffered by +applications reading from the file system than measuring this down at the +block device interface. + +Note that this only traces the common file system operations previously +listed: other file system operations (eg, inode operations including +getattr()) are not traced. + + +An optional interval and a count can be provided, as well as -m to show the +distributions in milliseconds. For example: + +# ./xfsdist -m 1 5 +Tracing XFS operation latency... Hit Ctrl-C to end. + +10:14:15: + +operation = 'read' + msecs : count distribution + 0 -> 1 : 1366 |****************************************| + 2 -> 3 : 86 |** | + 4 -> 7 : 95 |** | + 8 -> 15 : 132 |*** | + 16 -> 31 : 72 |** | + +operation = 'write' + msecs : count distribution + 0 -> 1 : 685 |****************************************| + +10:14:16: + +operation = 'read' + msecs : count distribution + 0 -> 1 : 984 |****************************************| + 2 -> 3 : 66 |** | + 4 -> 7 : 67 |** | + 8 -> 15 : 104 |**** | + 16 -> 31 : 70 |** | + 32 -> 63 : 12 | | + +operation = 'write' + msecs : count distribution + 0 -> 1 : 536 |****************************************| + +10:14:17: + +operation = 'read' + msecs : count distribution + 0 -> 1 : 1262 |****************************************| + 2 -> 3 : 75 |** | + 4 -> 7 : 80 |** | + 8 -> 15 : 119 |*** | + 16 -> 31 : 75 |** | + 32 -> 63 : 3 | | + +operation = 'write' + msecs : count distribution + 0 -> 1 : 639 |****************************************| + +10:14:18: + +operation = 'read' + msecs : count distribution + 0 -> 1 : 1070 |****************************************| + 2 -> 3 : 58 |** | + 4 -> 7 : 74 |** | + 8 -> 15 : 140 |***** | + 16 -> 31 : 60 |** | + 32 -> 63 : 5 | | + +operation = 'write' + msecs : count distribution + 0 -> 1 : 556 |****************************************| + +10:14:19: + +operation = 'read' + msecs : count distribution + 0 -> 1 : 1176 |****************************************| + 2 -> 3 : 53 |* | + 4 -> 7 : 94 |*** | + 8 -> 15 : 112 |*** | + 16 -> 31 : 77 |** | + 32 -> 63 : 3 | | + +operation = 'write' + msecs : count distribution + 0 -> 1 : 613 |****************************************| + +This shows a mixed read/write workload, where the slower read mode was around +10 ms. + + +USAGE message: + +# ./xfsdist -h +usage: xfsdist [-h] [-T] [-m] [-p PID] [interval] [count] + +Summarize XFS operation latency + +positional arguments: + interval output interval, in seconds + count number of outputs + +optional arguments: + -h, --help show this help message and exit + -T, --notimestamp don't include timestamp on interval output + -m, --milliseconds output in milliseconds + -p PID, --pid PID trace this PID only + +examples: + ./xfsdist # show operation latency as a histogram + ./xfsdist -p 181 # trace PID 181 only + ./xfsdist 1 10 # print 1 second summaries, 10 times + ./xfsdist -m 5 # 5s summaries, milliseconds -- 2.7.4