From 9c1f362983006d3f7a93ec1d417908bc5f5424c2 Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Sun, 14 Feb 2016 23:14:12 -0800 Subject: [PATCH] zfsdist --- README.md | 1 + man/man8/zfsdist.8 | 83 +++++++++++++++++++++ tools/zfsdist.py | 168 ++++++++++++++++++++++++++++++++++++++++++ tools/zfsdist_example.txt | 183 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 435 insertions(+) create mode 100644 man/man8/zfsdist.8 create mode 100755 tools/zfsdist.py create mode 100644 tools/zfsdist_example.txt diff --git a/README.md b/README.md index b813dbd..6e8a32d 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,7 @@ Tools: - tools/[wakeuptime](tools/wakeuptime.py): Summarize sleep to wakeup time by waker kernel stack. [Examples](tools/wakeuptime_example.txt). - tools/[xfsdist](tools/xfsdist.py): Summarize XFS operation latency. [Examples](tools/xfsdist_example.txt). - tools/[xfsslower](tools/xfsslower.py): Trace slow XFS operations. [Examples](tools/xfsslower_example.txt). +- tools/[zfsdist](tools/zfsdist.py): Summarize ZFS operation latency. [Examples](tools/zfsdist_example.txt). - tools/[zfsslower](tools/zfsslower.py): Trace slow ZFS operations. [Examples](tools/zfsslower_example.txt). ### Networking diff --git a/man/man8/zfsdist.8 b/man/man8/zfsdist.8 new file mode 100644 index 0000000..5532d8c --- /dev/null +++ b/man/man8/zfsdist.8 @@ -0,0 +1,83 @@ +.TH zfsdist 8 "2016-02-12" "USER COMMANDS" +.SH NAME +zfsdist \- Summarize ZFS operation latency. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B zfsdist [\-h] [\-T] [\-N] [\-d] [interval] [count] +.SH DESCRIPTION +This tool summarizes time (latency) spent in common ZFS file operations: reads, +writes, opens, and syncs, and presents it as a power-of-2 histogram. It uses an +in-kernel eBPF map to store the histogram for efficiency. + +This uses kernel dynamic tracing of the ZPL interface (ZFS POSIX +Layer), and will need updates to match any changes to this interface. +.TP +This is intended to work with the ZFS on Linux project: +http://zfsonlinux.org +.PP +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-T +Don't include timestamps on interval output. +.TP +\-m +Output in milliseconds. +.TP +\-p PID +Trace this PID only. +.SH EXAMPLES +.TP +Trace ZFS operation time, and print a summary on Ctrl-C: +# +.B zfsdist +.TP +Trace PID 181 only: +# +.B zfsdist -p 181 +.TP +Print 1 second summaries, 10 times: +# +.B zfsdist 1 10 +.TP +1 second summaries, printed in milliseconds +# +.B zfsdist \-m 1 +.SH FIELDS +.TP +msecs +Range of milliseconds for this bucket. +.TP +usecs +Range of microseconds for this bucket. +.TP +count +Number of operations in this time range. +.TP +distribution +ASCII representation of the distribution (the count column). +.SH OVERHEAD +This adds low-overhead instrumentation to these ZFS operations, +including reads and writes from the file system cache. Such reads and writes +can be very frequent (depending on the workload; eg, 1M/sec), at which +point the overhead of this tool may become noticeable. +Measure and quantify before use. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +zfssnoop(8) diff --git a/tools/zfsdist.py b/tools/zfsdist.py new file mode 100755 index 0000000..2d9ae21 --- /dev/null +++ b/tools/zfsdist.py @@ -0,0 +1,168 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# zfsdist Summarize ZFS operation latency. +# For Linux, uses BCC, eBPF. +# +# USAGE: zfsdist [-h] [-T] [-m] [-p PID] [interval] [count] +# +# Copyright 2016 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 14-Feb-2016 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +from time import sleep, strftime +import argparse + +# arguments +examples = """examples: + ./zfsdist # show operation latency as a histogram + ./zfsdist -p 181 # trace PID 181 only + ./zfsdist 1 10 # print 1 second summaries, 10 times + ./zfsdist -m 5 # 5s summaries, milliseconds +""" +parser = argparse.ArgumentParser( + description="Summarize ZFS operation latency", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-T", "--notimestamp", action="store_true", + help="don't include timestamp on interval output") +parser.add_argument("-m", "--milliseconds", action="store_true", + help="output in milliseconds") +parser.add_argument("-p", "--pid", + help="trace this PID only") +parser.add_argument("interval", nargs="?", + help="output interval, in seconds") +parser.add_argument("count", nargs="?", default=99999999, + help="number of outputs") +args = parser.parse_args() +pid = args.pid +countdown = int(args.count) +if args.milliseconds: + factor = 1000000 + label = "msecs" +else: + factor = 1000 + label = "usecs" +if args.interval and int(args.interval) == 0: + print("ERROR: interval 0. Exiting.") + exit() +debug = 0 + +# define BPF program +bpf_text = """ +#include +#include +#include + +#define OP_NAME_LEN 8 +typedef struct dist_key { + char op[OP_NAME_LEN]; + u64 slot; +} dist_key_t; +BPF_HASH(start, u32); +BPF_HISTOGRAM(dist, dist_key_t); + +// time operation +int trace_entry(struct pt_regs *ctx) +{ + u32 pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + u64 ts = bpf_ktime_get_ns(); + start.update(&pid, &ts); + return 0; +} + +static int trace_return(struct pt_regs *ctx, const char *op) +{ + u64 *tsp; + u32 pid = bpf_get_current_pid_tgid(); + + // fetch timestamp and calculate delta + tsp = start.lookup(&pid); + if (tsp == 0) { + return 0; // missed start or filtered + } + u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR; + + // store as histogram + dist_key_t key = {.slot = bpf_log2l(delta)}; + __builtin_memcpy(&key.op, op, sizeof(key.op)); + dist.increment(key); + + start.delete(&pid); + return 0; +} + +int trace_read_return(struct pt_regs *ctx) +{ + char *op = "read"; + return trace_return(ctx, op); +} + +int trace_write_return(struct pt_regs *ctx) +{ + char *op = "write"; + return trace_return(ctx, op); +} + +int trace_open_return(struct pt_regs *ctx) +{ + char *op = "open"; + return trace_return(ctx, op); +} + +int trace_fsync_return(struct pt_regs *ctx) +{ + char *op = "fsync"; + return trace_return(ctx, op); +} +""" +bpf_text = bpf_text.replace('FACTOR', str(factor)) +if args.pid: + bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) +else: + bpf_text = bpf_text.replace('FILTER_PID', '0') +if debug: + print(bpf_text) + +# load BPF program +b = BPF(text=bpf_text) + +# common file functions +b.attach_kprobe(event="zpl_read", fn_name="trace_entry") +b.attach_kprobe(event="zpl_write", fn_name="trace_entry") +b.attach_kprobe(event="zpl_open", fn_name="trace_entry") +b.attach_kprobe(event="zpl_fsync", fn_name="trace_entry") +b.attach_kretprobe(event="zpl_read", fn_name="trace_read_return") +b.attach_kretprobe(event="zpl_write", fn_name="trace_write_return") +b.attach_kretprobe(event="zpl_open", fn_name="trace_open_return") +b.attach_kretprobe(event="zpl_fsync", fn_name="trace_fsync_return") + +print("Tracing ZFS operation latency... Hit Ctrl-C to end.") + +# output +exiting = 0 +dist = b.get_table("dist") +while (1): + try: + if args.interval: + sleep(int(args.interval)) + else: + sleep(99999999) + except KeyboardInterrupt: + exiting = 1 + + print() + if args.interval and (not args.notimestamp): + print(strftime("%H:%M:%S:")) + + dist.print_log2_hist(label, "operation") + dist.clear() + + countdown -= 1 + if exiting or countdown == 0: + exit() diff --git a/tools/zfsdist_example.txt b/tools/zfsdist_example.txt new file mode 100644 index 0000000..a02d4dc --- /dev/null +++ b/tools/zfsdist_example.txt @@ -0,0 +1,183 @@ +Demonstrations of zfsdist, the Linux eBPF/bcc version. + + +zfsdist traces ZFS reads, writes, opens, and fsyncs, and summarizes their +latency as a power-of-2 histogram. It has been written to work on ZFS on Linux +(http://zfsonlinux.org). For example: + +# ./zfsdist +Tracing ZFS operation latency... Hit Ctrl-C to end. +^C + +operation = 'read' + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 4479 |****************************************| + 8 -> 15 : 1028 |********* | + 16 -> 31 : 14 | | + 32 -> 63 : 1 | | + 64 -> 127 : 2 | | + 128 -> 255 : 6 | | + 256 -> 511 : 1 | | + 512 -> 1023 : 1256 |*********** | + 1024 -> 2047 : 9 | | + 2048 -> 4095 : 1 | | + 4096 -> 8191 : 2 | | + +operation = 'write' + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 75 |****************************************| + 256 -> 511 : 11 |***** | + 512 -> 1023 : 0 | | + 1024 -> 2047 : 0 | | + 2048 -> 4095 : 0 | | + 4096 -> 8191 : 0 | | + 8192 -> 16383 : 0 | | + 16384 -> 32767 : 0 | | + 32768 -> 65535 : 0 | | + 65536 -> 131071 : 13 |****** | + 131072 -> 262143 : 1 | | + +operation = 'open' + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 2 |****************************************| + +This output shows a bimodal distribution for read latency, with a faster +mode of around 5 thousand reads that took between 4 and 15 microseconds, and a +slower mode of 1256 reads that took between 512 and 1023 microseconds. It's +likely that the faster mode was a hit from the in-memory file system cache, +and the slower mode is a read from a storage device (disk). + +The write latency is also bimodal, with a faster mode between 128 and 511 us, +and the slower mode between 65 and 131 ms. + +This "latency" is measured from when the operation was issued from the VFS +interface to the file system (via the ZFS POSIX layer), to when it completed. +This spans everything: block device I/O (disk I/O), file system CPU cycles, +file system locks, run queue latency, etc. This is a better measure of the +latency suffered by applications reading from the file system than measuring +this down at the block device interface. + +Note that this only traces the common file system operations previously +listed: other file system operations (eg, inode operations including +getattr()) are not traced. + + +An optional interval and a count can be provided, as well as -m to show the +distributions in milliseconds. For example: + +# ./zfsdist 1 5 +Tracing ZFS operation latency... Hit Ctrl-C to end. + +06:55:41: + +operation = 'read' + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 3976 |****************************************| + 8 -> 15 : 1181 |*********** | + 16 -> 31 : 18 | | + 32 -> 63 : 4 | | + 64 -> 127 : 17 | | + 128 -> 255 : 16 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 1275 |************ | + 1024 -> 2047 : 36 | | + 2048 -> 4095 : 3 | | + 4096 -> 8191 : 0 | | + 8192 -> 16383 : 1 | | + 16384 -> 32767 : 1 | | + +06:55:42: + +operation = 'read' + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 12751 |****************************************| + 8 -> 15 : 1190 |*** | + 16 -> 31 : 38 | | + 32 -> 63 : 7 | | + 64 -> 127 : 85 | | + 128 -> 255 : 47 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 1010 |*** | + 1024 -> 2047 : 49 | | + 2048 -> 4095 : 12 | | + +06:55:43: + +operation = 'read' + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 80925 |****************************************| + 8 -> 15 : 1645 | | + 16 -> 31 : 251 | | + 32 -> 63 : 24 | | + 64 -> 127 : 16 | | + 128 -> 255 : 12 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 80 | | + 1024 -> 2047 : 1 | | + +06:55:44: + +operation = 'read' + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 81207 |****************************************| + 8 -> 15 : 2075 |* | + 16 -> 31 : 2005 | | + 32 -> 63 : 177 | | + 64 -> 127 : 3 | | + +06:55:45: + +operation = 'read' + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 74364 |****************************************| + 8 -> 15 : 865 | | + 16 -> 31 : 4960 |** | + 32 -> 63 : 625 | | + 64 -> 127 : 2 | | + +This workload was randomly reading from a file that became cached. The slower +mode can be seen to disappear by the final summaries. + + +USAGE message: + +# ./zfsdist -h +usage: zfsdist [-h] [-T] [-m] [-p PID] [interval] [count] + +Summarize ZFS operation latency + +positional arguments: + interval output interval, in seconds + count number of outputs + +optional arguments: + -h, --help show this help message and exit + -T, --notimestamp don't include timestamp on interval output + -m, --milliseconds output in milliseconds + -p PID, --pid PID trace this PID only + +examples: + ./zfsdist # show operation latency as a histogram + ./zfsdist -p 181 # trace PID 181 only + ./zfsdist 1 10 # print 1 second summaries, 10 times + ./zfsdist -m 5 # 5s summaries, milliseconds -- 2.7.4