From: Brendan Gregg Date: Tue, 9 Feb 2016 08:36:43 +0000 (-0800) Subject: filetop.8 X-Git-Tag: v0.1.8~31^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=08c2981427457544ea991cd4463f35ceaf5bdd3e;p=platform%2Fupstream%2Fbcc.git filetop.8 --- diff --git a/README.md b/README.md index b9eb6cb..d357ced 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,7 @@ Tools: - tools/[execsnoop](tools/execsnoop.py): Trace new processes via exec() syscalls. [Examples](tools/execsnoop_example.txt). - tools/[filelife](tools/filelife.py): Trace the lifespan of short-lived files. [Examples](tools/filelife_example.txt). - tools/[fileslower](tools/fileslower.py): Trace slow synchronous file reads and writes. [Examples](tools/fileslower_example.txt). +- tools/[filetop](tools/filetop.py): File reads and writes by filename and process. Top for files. [Examples](tools/filetop_example.txt). - tools/[funccount](tools/funccount.py): Count kernel function calls. [Examples](tools/funccount_example.txt). - tools/[funclatency](tools/funclatency.py): Time kernel functions and show their latency distribution. [Examples](tools/funclatency_example.txt). - tools/[gethostlatency](tools/gethostlatency.py): Show latency for getaddrinfo/gethostbyname[2] calls. [Examples](tools/gethostlatency_example.txt). diff --git a/man/man8/filetop.8 b/man/man8/filetop.8 new file mode 100644 index 0000000..6a1c15c --- /dev/null +++ b/man/man8/filetop.8 @@ -0,0 +1,110 @@ +.TH filetop 8 "2016-02-08" "USER COMMANDS" +.SH NAME +filetop \- File reads and writes by filename and process. Top for files. +.SH SYNOPSIS +.B filetop [\-h] [\-C] [\-r MAXROWS] [\-p PID] [interval] [count] +.SH DESCRIPTION +This is top for files. + +This traces file reads and writes, and prints a per-file summary every +interval (by default, 1 second). The summary is sorted on the highest read +throughput (Kbytes). + +This uses in-kernel eBPF maps to store per process summaries for efficiency. + +This script works by tracing the __vfs_read() and __vfs_write() functions using +kernel dynamic tracing, which instruments explicit read and write calls. If +files are read or written using another means (eg, via mmap()), then they +will not be visible using this tool. Also, this tool will need updating to +match any code changes to those vfs functions. + +This should be useful for file system workload characterization when analyzing +the performance of applications. + +Note that tracing VFS level reads and writes can be a frequent activity, and +this tool can begin to cost measurable overhead at high I/O rates. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-C +Don't clear the screen. +.TP +\-r MAXROWS +Maximum number of rows to print. Default is 20. +.TP +\-p PID +Trace this PID only. +.TP +interval +Interval between updates, seconds. +.TP +count +Number of interval summaries. + +.SH EXAMPLES +.TP +Summarize block device I/O by process, 1 second screen refresh: +# +.B filetop +.TP +Don't clear the screen, and top 8 rows only: +# +.B filetop -Cr 8 +.TP +5 second summaries, 10 times only: +# +.B filetop 5 10 +.SH FIELDS +.TP +loadavg: +The contents of /proc/loadavg +.TP +PID +Process ID. +.TP +COMM +Process name. +.TP +READS +Count of reads during interval. +.TP +WRITES +Count of writes during interval. +.TP +R_Kb +Total read Kbytes during interval. +.TP +W_Kb +Total write Kbytes during interval. +.TP +T +Type of file: R == regular, S == socket, O == other (pipe, etc). +.SH OVERHEAD +Depending on the frequency of application reads and writes, overhead can become +significant, in the worst case slowing applications by over 50%. Hopefully for +real world workloads the overhead is much less -- test before use. The reason +for the high overhead is that VFS reads and writes can be a frequent event, and +despite the eBPF overhead being very small per event, if you multiply this +small overhead by a million events per second, it becomes a million times +worse. Literally. You can gauge the number of reads and writes using the +vfsstat(8) tool, also from bcc. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH INSPIRATION +top(1) by William LeFebvre +.SH SEE ALSO +vfsstat(8), vfscount(8), fileslower(8) diff --git a/tools/filetop.py b/tools/filetop.py new file mode 100755 index 0000000..047dd8c --- /dev/null +++ b/tools/filetop.py @@ -0,0 +1,188 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# filetop file reads and writes by process. +# For Linux, uses BCC, eBPF. +# +# USAGE: filetop.py [-h] [-C] [-r MAXROWS] [interval] [count] +# +# This uses in-kernel eBPF maps to store per process summaries for efficiency. +# +# Copyright 2016 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 06-Feb-2016 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +from time import sleep, strftime +import argparse +import signal +from subprocess import call + +# arguments +examples = """examples: + ./filetop # file I/O top, 1 second refresh + ./filetop -C # don't clear the screen + ./filetop -p 181 # PID 181 only + ./filetop 5 # 5 second summaries + ./filetop 5 10 # 5 second summaries, 10 times only +""" +parser = argparse.ArgumentParser( + description="File reads and writes by process", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-C", "--noclear", action="store_true", + help="don't clear the screen") +parser.add_argument("-r", "--maxrows", default=20, + help="maximum rows to print, default 20") +parser.add_argument("-p", "--pid", + help="trace this PID only") +parser.add_argument("interval", nargs="?", default=1, + help="output interval, in seconds") +parser.add_argument("count", nargs="?", default=99999999, + help="number of outputs") +args = parser.parse_args() +interval = int(args.interval) +countdown = int(args.count) +maxrows = int(args.maxrows) +clear = not int(args.noclear) +debug = 0 + +# linux stats +loadavg = "/proc/loadavg" + +# signal handler +def signal_ignore(signal, frame): + print() + +# define BPF program +bpf_text = """ +#include +#include + +#define MAX_FILE_LEN 32 + +// the key for the output summary +struct info_t { + u32 pid; + char name[TASK_COMM_LEN]; + char file[MAX_FILE_LEN]; + char type; +}; + +// the value of the output summary +struct val_t { + u64 reads; + u64 writes; + u64 rbytes; + u64 wbytes; +}; + +BPF_HASH(counts, struct info_t, struct val_t); + +static int do_entry(struct pt_regs *ctx, struct file *file, + char __user *buf, size_t count, int is_read) +{ + u32 pid; + + pid = bpf_get_current_pid_tgid(); + if (FILTER) + return 0; + + // skip I/O lacking a filename + struct dentry *de = file->f_path.dentry; + if (de->d_iname[0] == 0) + return 0; + + // store counts and sizes by pid & file + struct info_t info = {.pid = pid}; + bpf_get_current_comm(&info.name, sizeof(info.name)); + __builtin_memcpy(&info.file, de->d_iname, sizeof(info.file)); + int mode = file->f_inode->i_mode; + if (S_ISREG(mode)) { + info.type = 'R'; + } else if (S_ISSOCK(mode)) { + info.type = 'S'; + } else { + info.type = 'O'; + } + + struct val_t *valp, zero = {}; + valp = counts.lookup_or_init(&info, &zero); + if (is_read) { + valp->reads++; + valp->rbytes += count; + } else { + valp->writes++; + valp->wbytes += count; + } + + return 0; +} + +int trace_read_entry(struct pt_regs *ctx, struct file *file, + char __user *buf, size_t count) +{ + return do_entry(ctx, file, buf, count, 1); +} + +int trace_write_entry(struct pt_regs *ctx, struct file *file, + char __user *buf, size_t count) +{ + return do_entry(ctx, file, buf, count, 0); +} + +""" +if args.pid: + bpf_text = bpf_text.replace('FILTER', 'pid != %s' % args.pid) +else: + bpf_text = bpf_text.replace('FILTER', '0') +if debug: + print(bpf_text) + +# initialize BPF +b = BPF(text=bpf_text) +b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry") +b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry") + +print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval) + +# output +exiting = 0 +while 1: + try: + sleep(interval) + except KeyboardInterrupt: + exiting = 1 + + # header + if clear: + call("clear") + else: + print() + with open(loadavg) as stats: + print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read())) + print("%-6s %-16s %-6s %-6s %-7s %-7s %1s %s" % ("PID", "COMM", + "READS", "WRITES", "R_Kb", "W_Kb", "T", "FILE")) + + # by-PID output + counts = b.get_table("counts") + line = 0 + for k, v in reversed(sorted(counts.items(), + key=lambda counts: counts[1].rbytes)): + + # print line + print("%-6d %-16s %-6d %-6d %-7d %-7d %1s %s" % (k.pid, k.name, + v.reads, v.writes, v.rbytes / 1024, v.wbytes / 1024, k.type, + k.file)) + + line += 1 + if line >= maxrows: + break + counts.clear() + + countdown -= 1 + if exiting or countdown == 0: + print("Detaching...") + exit() diff --git a/tools/filetop_example.txt b/tools/filetop_example.txt new file mode 100644 index 0000000..1fa0b19 --- /dev/null +++ b/tools/filetop_example.txt @@ -0,0 +1,157 @@ +Demonstrations of filetop, the Linux eBPF/bcc version. + + +filetop shows reads and writes by file, with process details. For example: + +# ./filetop -C +Tracing... Output every 1 secs. Hit Ctrl-C to end + +08:00:23 loadavg: 0.91 0.33 0.23 3/286 26635 + +PID COMM READS WRITES R_Kb W_Kb T FILE +26628 ld 161 186 643 152 R built-in.o +26634 cc1 1 0 200 0 R autoconf.h +26618 cc1 1 0 200 0 R autoconf.h +26634 cc1 12 0 192 0 R tracepoint.h +26584 cc1 2 0 143 0 R mm.h +26634 cc1 2 0 143 0 R mm.h +26631 make 34 0 136 0 R auto.conf +26634 cc1 1 0 98 0 R fs.h +26584 cc1 1 0 98 0 R fs.h +26634 cc1 1 0 91 0 R sched.h +26634 cc1 1 0 78 0 R printk.c +26634 cc1 3 0 73 0 R mmzone.h +26628 ld 18 0 72 0 R hibernate.o +26628 ld 16 0 64 0 R suspend.o +26628 ld 16 0 64 0 R snapshot.o +26630 cat 1 0 64 0 O null +26628 ld 16 0 64 0 R qos.o +26628 ld 13 0 52 0 R main.o +26628 ld 12 0 52 0 R swap.o +12421 sshd 3 0 48 0 O ptmx +[...] + +This shows various files read and written during a Linux kernel build. The +output is sorted by the total read size in Kbytes (R_Kb). This is instrumenting +at the VFS interface, so this is reads and writes that may return entirely +from the file system cache (page cache). + +While not printed, the average read and write size can be calculated by +dividing R_Kb by READS, and the same for writes. + +The "T" column indicates the type of the file: "R" for regular files, "S" for +sockets, and "O" for other (including pipes). + +This script works by tracing the vfs_read() and vfs_write() functions using +kernel dynamic tracing, which instruments explicit read and write calls. If +files are read or written using another means (eg, via mmap()), then they +will not be visible using this tool. + +This should be useful for file system workload characterization when analyzing +the performance of applications. + +Note that tracing VFS level reads and writes can be a frequent activity, and +this tool can begin to cost measurable overhead at high I/O rates. + + +A -C option will stop clearing the screen, and -r with a number will restrict +the output to that many rows (20 by default). For example, not clearing +the screen and showing the top 5 only: + +# ./filetop -Cr 5 +Tracing... Output every 1 secs. Hit Ctrl-C to end + +08:05:11 loadavg: 0.75 0.35 0.25 3/285 822 + +PID COMM READS WRITES R_Kb W_Kb T FILE +32672 cksum 5006 0 320384 0 R data1 +12296 sshd 2 0 32 0 O ptmx +809 run 2 0 8 0 R nsswitch.conf +811 run 2 0 8 0 R nsswitch.conf +804 chown 2 0 8 0 R nsswitch.conf + +08:05:12 loadavg: 0.75 0.35 0.25 3/285 845 + +PID COMM READS WRITES R_Kb W_Kb T FILE +32672 cksum 4986 0 319104 0 R data1 +845 chown 2 0 8 0 R nsswitch.conf +828 run 2 0 8 0 R nsswitch.conf +835 run 2 0 8 0 R nsswitch.conf +830 run 2 0 8 0 R nsswitch.conf + +08:05:13 loadavg: 0.75 0.35 0.25 3/285 868 + +PID COMM READS WRITES R_Kb W_Kb T FILE +32672 cksum 4985 0 319040 0 R data1 +857 run 2 0 8 0 R nsswitch.conf +858 run 2 0 8 0 R nsswitch.conf +859 run 2 0 8 0 R nsswitch.conf +848 run 2 0 8 0 R nsswitch.conf +[...] + +This output shows a cksum command reading data1. Note that + + +An optional interval and optional count can also be added to the end of the +command line. For example, for 1 second interval, and 3 summaries in total: + +# ./filetop -Cr 5 1 3 +Tracing... Output every 1 secs. Hit Ctrl-C to end + +08:08:20 loadavg: 0.30 0.42 0.31 3/282 5187 + +PID COMM READS WRITES R_Kb W_Kb T FILE +12421 sshd 14101 0 225616 0 O ptmx +12296 sshd 4 0 64 0 O ptmx +12421 sshd 3 14104 48 778 S TCP +5178 run 2 0 8 0 R nsswitch.conf +5165 run 2 0 8 0 R nsswitch.conf + +08:08:21 loadavg: 0.30 0.42 0.31 5/282 5210 + +PID COMM READS WRITES R_Kb W_Kb T FILE +12421 sshd 9159 0 146544 0 O ptmx +12421 sshd 3 9161 48 534 S TCP +12296 sshd 1 0 16 0 S TCP +5188 run 2 0 8 0 R nsswitch.conf +5203 run 2 0 8 0 R nsswitch.conf + +08:08:22 loadavg: 0.30 0.42 0.31 2/282 5233 + +PID COMM READS WRITES R_Kb W_Kb T FILE +12421 sshd 26166 0 418656 0 O ptmx +12421 sshd 4 26171 64 1385 S TCP +12296 sshd 1 0 16 0 O ptmx +5214 run 2 0 8 0 R nsswitch.conf +5227 run 2 0 8 0 R nsswitch.conf +Detaching... + +This example has caught heavy socket I/O from an sshd process, showing up as +non-regular file types (the "O" for other, and "S" for socket, in the type +column: "T"). + + +USAGE message: + +# ./filetop -h +usage: filetop [-h] [-C] [-r MAXROWS] [-p PID] [interval] [count] + +File reads and writes by process + +positional arguments: + interval output interval, in seconds + count number of outputs + +optional arguments: + -h, --help show this help message and exit + -C, --noclear don't clear the screen + -r MAXROWS, --maxrows MAXROWS + maximum rows to print, default 20 + -p PID, --pid PID trace this PID only + +examples: + ./filetop # file I/O top, 1 second refresh + ./filetop -C # don't clear the screen + ./filetop -p 181 # PID 181 only + ./filetop 5 # 5 second summaries + ./filetop 5 10 # 5 second summaries, 10 times only