(void *) BPF_FUNC_perf_event_output;
static int (*bpf_skb_load_bytes)(void *ctx, int offset, void *to, u32 len) =
(void *) BPF_FUNC_skb_load_bytes;
+
+/* bpf_get_stackid will return a negative value in the case of an error
+ *
+ * BPF_STACK_TRACE(_name, _size) will allocate space for _size stack traces.
+ * -ENOMEM will be returned when this limit is reached.
+ */
static int (*bpf_get_stackid_)(void *ctx, void *map, u64 flags) =
(void *) BPF_FUNC_get_stackid;
static inline __attribute__((always_inline))
int bpf_get_stackid(uintptr_t map, void *ctx, u64 flags) {
return bpf_get_stackid_(ctx, (void *)map, flags);
}
+
static int (*bpf_csum_diff)(void *from, u64 from_size, void *to, u64 to_size, u64 seed) =
(void *) BPF_FUNC_csum_diff;
import argparse
import signal
+# arg validation
+def positive_int(val):
+ try:
+ ival = int(val)
+ except ValueError:
+ raise argparse.ArgumentTypeError("must be an integer")
+
+ if ival < 0:
+ raise argparse.ArgumentTypeError("must be positive")
+ return ival
+
+def positive_nonzero_int(val):
+ ival = positive_int(val)
+ if ival == 0:
+ raise argparse.ArgumentTypeError("must be nonzero")
+ return ival
+
# arguments
examples = """examples:
./offcputime # trace off-CPU stack time until Ctrl-C
./offcputime 5 # trace for 5 seconds only
./offcputime -f 5 # 5 seconds, and output in folded format
./offcputime -u # don't include kernel threads (user only)
- ./offcputime -p 185 # trace fo PID 185 only
+ ./offcputime -p 185 # trace for PID 185 only
"""
parser = argparse.ArgumentParser(
description="Summarize off-CPU time by kernel stack trace",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
-parser.add_argument("-u", "--useronly", action="store_true",
+thread_group = parser.add_mutually_exclusive_group()
+thread_group.add_argument("-u", "--useronly", action="store_true",
help="user threads only (no kernel threads)")
-parser.add_argument("-p", "--pid",
+thread_group.add_argument("-p", "--pid", type=positive_int,
help="trace this PID only")
parser.add_argument("-v", "--verbose", action="store_true",
help="show raw addresses")
parser.add_argument("-f", "--folded", action="store_true",
help="output folded format")
+parser.add_argument("--stack-storage-size", default=1024,
+ type=positive_nonzero_int,
+ help="the number of unique stack traces that can be stored and " \
+ "displayed (default 1024)")
parser.add_argument("duration", nargs="?", default=99999999,
+ type=positive_nonzero_int,
help="duration of trace, in seconds")
args = parser.parse_args()
folded = args.folded
duration = int(args.duration)
-debug = 0
-if args.pid and args.useronly:
- print("ERROR: use either -p or -u.")
- exit()
# signal handler
def signal_ignore(signal, frame):
};
BPF_HASH(counts, struct key_t);
BPF_HASH(start, u32);
-BPF_STACK_TRACE(stack_traces, 1024)
+BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE)
int oncpu(struct pt_regs *ctx, struct task_struct *prev) {
u32 pid;
return 0;
}
"""
-if args.pid:
+
+# set thread filter
+if args.pid is not None:
filter = 'pid == %s' % args.pid
elif args.useronly:
filter = '!(prev->flags & PF_KTHREAD)'
else:
filter = '1'
bpf_text = bpf_text.replace('FILTER', filter)
-if debug:
- print(bpf_text)
+
+# set stack storage size
+bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
# initialize BPF
b = BPF(text=bpf_text)
b.attach_kprobe(event="finish_task_switch", fn_name="oncpu")
matched = b.num_open_kprobes()
if matched == 0:
- print("0 functions traced. Exiting.")
- exit()
+ print("error: 0 functions traced. Exiting.", file=stderr)
+ exit(1)
# header
if not folded:
print()
missing_stacks = 0
+has_enomem = False
counts = b.get_table("counts")
stack_traces = b.get_table("stack_traces")
for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
- """
- bpf_get_stackid will return a negative value in the case of an error
-
- BPF_STACK_TRACE(_name, _size) will allocate space for _size stack traces
- on each CPU. -ENOMEM will be returned when this limit is reached within a
- single CPU.
- """
+    # handle get_stackid errors
if k.stack_id < 0:
missing_stacks += 1
+ # check for an ENOMEM error
+ if k.stack_id == -12:
+ has_enomem = True
continue
stack = stack_traces.walk(k.stack_id)
print(" %d\n" % v.value)
if missing_stacks > 0:
- print(("WARNING: %d stack traces could not be displayed. "
- "You may be running out of storage space for stack traces.") %
- missing_stacks,
+ enomem_str = "" if not has_enomem else \
+ " Consider increasing --stack-storage-size."
+ print("WARNING: %d stack traces could not be displayed.%s" %
+ (missing_stacks, enomem_str),
file=stderr)
stack trace looks like a page fault (do_page_fault() etc) from the "chmod"
command, and in total was off-CPU for 13 microseconds.
-# ./offcputime
+# ./offcputime
Tracing off-CPU time (us) by kernel stack... Hit Ctrl-C to end.
^C
schedule
Here, dd was blocked for 4.4 seconds out of 5. Or put differently, likely
on-CPU for about 12% of the time. Which matches the ratio seen by time(1):
-# time dd if=/dev/md0 iflag=direct of=/dev/null bs=1k
+# time dd if=/dev/md0 iflag=direct of=/dev/null bs=1k
^C108115+0 records in
108114+0 records out
110708736 bytes (111 MB) copied, 13.7565 s, 8.0 MB/s
USAGE message:
# ./offcputime -h
-usage: offcputime [-h] [-u] [-p PID] [-v] [-f] [duration]
+usage: offcputime.py [-h] [-u | -p PID] [-v] [-f]
+ [--stack-storage-size STACK_STORAGE_SIZE]
+ [duration]
Summarize off-CPU time by kernel stack trace
positional arguments:
- duration duration of trace, in seconds
+ duration duration of trace, in seconds
optional arguments:
- -h, --help show this help message and exit
- -u, --useronly user threads only (no kernel threads)
- -p PID, --pid PID trace this PID only
- -v, --verbose show raw addresses
- -f, --folded output folded format
+ -h, --help show this help message and exit
+ -u, --useronly user threads only (no kernel threads)
+ -p PID, --pid PID trace this PID only
+ -v, --verbose show raw addresses
+ -f, --folded output folded format
+ --stack-storage-size STACK_STORAGE_SIZE
+ the number of unique stack traces that can be stored
+ and displayed (default 1024)
examples:
./offcputime # trace off-CPU stack time until Ctrl-C