From: Andrew Birchall Date: Thu, 5 May 2016 17:56:40 +0000 (-0700) Subject: Add user space stack traces to offcputime X-Git-Tag: v0.2.0~78^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1f202e7b4899249c54a979ebe8882cb752de954a;p=platform%2Fupstream%2Fbcc.git Add user space stack traces to offcputime Summary: * generalize `KernelSymbolCache` into `SymbolCache` which supports user/kernel space symbols * create `BPF.sym()` for resolving user/kernel space symbols * `_ksym_cache` => `_sym_caches` and create `BPF._sym_cache()` to leverage the symbol caches * update `tools/offcputime.py` to print user space stack traces Test Plan: ``` dev[bcc](abirchall_next): ~/bcc_run_tool.sh offcputime -U -k -f 1 ERROR: Displaying user stacks for kernel threads doesn't make sense. devbig680[bcc](abirchall_next): ~/bcc_run_tool.sh offcputime -K -f 1 | grep python2 | head -n 1 python2.7;system_call_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule 19 dev[bcc](abirchall_next): ~/bcc_run_tool.sh offcputime -U -f 1 | grep python2 | head -n 1 python2.7;clone;start_thread;t_bootstrap;PyEval_CallObjectWithKeywords;PyObject_Call;instancemethod_call;PyObject_Call;function_call;PyEval_EvalCodeEx;PyEval_EvalFrameEx;PyObject_Call;function_call;PyEval_EvalCodeEx;PyEval_EvalFrameEx;time_sleep;PyEval_RestoreThread 5 devbig680[bcc](abirchall_next): ~/bcc_run_tool.sh offcputime -f 1 | grep python2 | head -n 1 python2.7;system_call_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;[unknown];__libc_start_main;Py_Main;RunModule;PyObject_Call;function_call;PyEval_EvalCodeEx;PyEval_EvalFrameEx;PyEval_EvalFrameEx;PyEval_EvalCodeEx;PyEval_EvalFrameEx;lock_PyThread_acquire_lock;PyEval_RestoreThread 39 ``` --- diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index 2570849..dbc797b 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -180,6 +180,12 @@ static int (*bpf_skb_load_bytes)(void *ctx, int offset, void *to, u32 len) = * 
* BPF_STACK_TRACE(_name, _size) will allocate space for _size stack traces. * -ENOMEM will be returned when this limit is reached. + * + * -EFAULT is typically returned when requesting user-space stack traces (using + * BPF_F_USER_STACK) for kernel threads. However, a valid stackid may be + * returned in some cases; consider a tracepoint or kprobe executing in the + * kernel context. Given this you can typically ignore -EFAULT errors when + * retrieving user-space stack traces. */ static int (*bpf_get_stackid_)(void *ctx, void *map, u64 flags) = (void *) BPF_FUNC_get_stackid; diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 3a7755a..804c481 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -63,9 +63,9 @@ def _check_probe_quota(num_new_probes): if len(open_kprobes) + len(open_uprobes) + num_new_probes > _kprobe_limit: raise Exception("Number of open probes would exceed quota") -class KernelSymbolCache(object): - def __init__(self): - self.cache = lib.bcc_symcache_new(-1) +class SymbolCache(object): + def __init__(self, pid): + self.cache = lib.bcc_symcache_new(pid) def resolve(self, addr): sym = bcc_symbol() @@ -87,14 +87,14 @@ class BPF(object): SCHED_ACT = 4 _probe_repl = re.compile("[^a-zA-Z0-9_]") - _ksym_cache = KernelSymbolCache() + _sym_caches = {} _auto_includes = { - "linux/time.h" : ["time"], - "linux/fs.h" : ["fs", "file"], - "linux/blkdev.h" : ["bio", "request"], - "linux/slab.h" : ["alloc"], - "linux/netdevice.h" : ["sk_buff", "net_device"] + "linux/time.h": ["time"], + "linux/fs.h": ["fs", "file"], + "linux/blkdev.h": ["bio", "request"], + "linux/slab.h": ["alloc"], + "linux/netdevice.h": ["sk_buff", "net_device"] } @classmethod @@ -635,14 +635,37 @@ class BPF(object): exit() @staticmethod + def _sym_cache(pid): + """_sym_cache(pid) + + Returns a symbol cache for the specified PID. + The kernel symbol cache is accessed by providing any PID less than zero. 
+ """ + if pid < 0 and pid != -1: + pid = -1 + if not pid in BPF._sym_caches: + BPF._sym_caches[pid] = SymbolCache(pid) + return BPF._sym_caches[pid] + + @staticmethod + def sym(addr, pid): + """sym(addr, pid) + + Translate a memory address into a function name for a pid, which is + returned. + A pid of less than zero will access the kernel symbol cache. + """ + name, _ = BPF._sym_cache(pid).resolve(addr) + return name + + @staticmethod def ksym(addr): """ksym(addr) Translate a kernel memory address into a kernel function name, which is returned. """ - name, _ = BPF._ksym_cache.resolve(addr) - return name + return BPF.sym(addr, -1) @staticmethod def ksymaddr(addr): @@ -652,7 +675,7 @@ class BPF(object): instruction offset as a hexidecimal number, which is returned as a string. """ - name, offset = BPF._ksym_cache.resolve(addr) + name, offset = BPF._sym_cache(-1).resolve(addr) return "%s+0x%x" % (name, offset) @staticmethod @@ -661,7 +684,7 @@ class BPF(object): Translate a kernel name into an address. This is the reverse of ksymaddr. Returns -1 when the function name is unknown.""" - return BPF._ksym_cache.resolve_name(name) + return BPF._sym_cache(-1).resolve_name(name) @staticmethod def num_open_kprobes(): diff --git a/tools/offcputime.py b/tools/offcputime.py index 751349f..675a802 100755 --- a/tools/offcputime.py +++ b/tools/offcputime.py @@ -1,9 +1,9 @@ #!/usr/bin/python # -# offcputime Summarize off-CPU time by kernel stack trace +# offcputime Summarize off-CPU time by stack trace # For Linux, uses BCC, eBPF. # -# USAGE: offcputime [-h] [-p PID | -u | -k] [-f] [duration] +# USAGE: offcputime [-h] [-p PID | -u | -k] [-U | -K] [-f] [duration] # # Copyright 2016 Netflix, Inc. 
# Licensed under the Apache License, Version 2.0 (the "License") @@ -54,6 +54,11 @@ thread_group.add_argument("-k", "--kernel-threads-only", action="store_true", help="kernel threads only (no user threads)") thread_group.add_argument("-u", "--user-threads-only", action="store_true", help="user threads only (no kernel threads)") +stack_group = parser.add_mutually_exclusive_group() +stack_group.add_argument("-U", "--user-stacks-only", action="store_true", + help="show stack from user space only (no kernel space stacks)") +stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true", + help="show stack from kernel space only (no user space stacks)") parser.add_argument("-f", "--folded", action="store_true", help="output folded format") parser.add_argument("--stack-storage-size", default=1024, @@ -79,8 +84,10 @@ bpf_text = """ #define MINBLOCK_US 1 struct key_t { + u32 pid; + int user_stack_id; + int kernel_stack_id; char name[TASK_COMM_LEN]; - int stack_id; }; BPF_HASH(counts, struct key_t); BPF_HASH(start, u32); @@ -97,23 +104,29 @@ int oncpu(struct pt_regs *ctx, struct task_struct *prev) { start.update(&pid, &ts); } - // calculate current thread's delta time + // get the current thread's start time pid = bpf_get_current_pid_tgid(); tsp = start.lookup(&pid); - if (tsp == 0) + if (tsp == 0) { return 0; // missed start or filtered + } + + // calculate current thread's delta time u64 delta = bpf_ktime_get_ns() - *tsp; start.delete(&pid); delta = delta / 1000; - if (delta < MINBLOCK_US) + if (delta < MINBLOCK_US) { return 0; + } // create map key u64 zero = 0, *val; struct key_t key = {}; + key.pid = pid; + key.user_stack_id = USER_STACK_GET; + key.kernel_stack_id = KERNEL_STACK_GET; bpf_get_current_comm(&key.name, sizeof(key.name)); - key.stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID); val = counts.lookup_or_init(&key, &zero); (*val) += delta; @@ -140,6 +153,29 @@ bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter) # set stack 
storage size bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size)) +# handle stack args +kernel_stack_get = "stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID)" +user_stack_get = \ + "stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID | BPF_F_USER_STACK)" +stack_context = "" +if args.user_stacks_only: + stack_context = "user" + kernel_stack_get = "-1" +elif args.kernel_stacks_only: + stack_context = "kernel" + user_stack_get = "-1" +else: + stack_context = "user + kernel" +bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get) +bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get) + +# check for an edge case; the code below will handle this case correctly +# but ultimately nothing will be displayed +if args.kernel_threads_only and args.user_stacks_only: + print("ERROR: Displaying user stacks for kernel threads " \ + "doesn't make sense.", file=stderr) + exit(1) + # initialize BPF b = BPF(text=bpf_text) b.attach_kprobe(event="finish_task_switch", fn_name="oncpu") @@ -150,8 +186,8 @@ if matched == 0: # header if not folded: - print("Tracing off-CPU time (us) of %s by kernel stack" % - thread_context, end="") + print("Tracing off-CPU time (us) of %s by %s stack" % + (thread_context, stack_context), end="") if duration < 99999999: print(" for %d secs." 
% duration) else: @@ -172,25 +208,35 @@ counts = b.get_table("counts") stack_traces = b.get_table("stack_traces") for k, v in sorted(counts.items(), key=lambda counts: counts[1].value): # handle get_stackid erorrs - if k.stack_id < 0: + if (not args.user_stacks_only and k.kernel_stack_id < 0) or \ + (not args.kernel_stacks_only and k.user_stack_id < 0 and \ + k.user_stack_id != -14): missing_stacks += 1 # check for an ENOMEM error - if k.stack_id == -12: + if k.kernel_stack_id == -12 or k.user_stack_id == -12: has_enomem = True continue - stack = stack_traces.walk(k.stack_id) + user_stack = [] if k.user_stack_id < 0 else \ + stack_traces.walk(k.user_stack_id) + kernel_stack = [] if k.kernel_stack_id < 0 else \ + stack_traces.walk(k.kernel_stack_id) if folded: # print folded stack output - stack = list(stack)[1:] - line = [k.name.decode()] + [b.ksym(addr) for addr in reversed(stack)] + user_stack = list(user_stack)[1:] + kernel_stack = list(kernel_stack)[1:] + line = [k.name.decode()] + \ + [b.ksym(addr) for addr in reversed(kernel_stack)] + \ + [b.sym(addr, k.pid) for addr in reversed(user_stack)] print("%s %d" % (";".join(line), v.value)) else: # print default multi-line stack output - for addr in stack: - print(" %-16x %s" % (addr, b.ksym(addr))) - print(" %-16s %s" % ("-", k.name)) + for addr in user_stack: + print(" %016x %s" % (addr, b.sym(addr, k.pid))) + for addr in kernel_stack: + print(" %016x %s" % (addr, b.ksym(addr))) + print(" %-16s %s (%d)" % ("-", k.name, k.pid)) print(" %d\n" % v.value) if missing_stacks > 0: diff --git a/tools/offcputime_example.txt b/tools/offcputime_example.txt index fb984c6..18ce780 100644 --- a/tools/offcputime_example.txt +++ b/tools/offcputime_example.txt @@ -11,7 +11,7 @@ Here is some example output. To explain what we are seeing: the very first stack trace looks like a page fault (do_page_fault() etc) from the "chmod" command, and in total was off-CPU for 13 microseconds. 
-# ./offcputime +# ./offcputime -K Tracing off-CPU time (us) by kernel stack... Hit Ctrl-C to end. ^C schedule @@ -599,7 +599,7 @@ the overhead will be measurable. A -p option can be used to filter (in-kernel) on a single process ID. For example, only matching PID 26651, which is a running "dd" command: -# ./offcputime -p 26651 +# ./offcputime -K -p 26651 Tracing off-CPU time (us) by kernel stack... Hit Ctrl-C to end. ^C schedule @@ -623,7 +623,7 @@ total of 2.4 seconds during tracing. A duration can be added, for example, tracing for 5 seconds only: -# ./offcputime -p 26651 5 +# ./offcputime -K -p 26651 5 Tracing off-CPU time (us) by kernel stack for 5 secs. schedule @@ -658,7 +658,7 @@ A -f option will emit output using the "folded stacks" format, which can be read directly by flamegraph.pl from the FlameGraph open source software (https://github.com/brendangregg/FlameGraph). Eg: -# ./offcputime -f 5 +# ./offcputime -K -f 5 bash;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;tty_read;n_tty_read;call_rwsem_down_read_failed;rwsem_down_read_failed;schedule 8 yes;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;tty_write;n_tty_write;call_rwsem_down_read_failed;rwsem_down_read_failed;schedule 14 run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;__do_fault;filemap_fault;__lock_page_or_retry;wait_on_page_bit_killable;__wait_on_bit;bit_wait_io;io_schedule_timeout;schedule_timeout;schedule 33 @@ -718,7 +718,7 @@ creating your "off-CPU time flame graphs". 
USAGE message: # ./offcputime -h -usage: offcputime.py [-h] [-p PID | -k | -u] [-f] +usage: offcputime.py [-h] [-p PID | -k | -u] [-U | -K] [-f] [--stack-storage-size STACK_STORAGE_SIZE] [duration] @@ -734,6 +734,12 @@ optional arguments: kernel threads only (no user threads) -u, --user-threads-only user threads only (no kernel threads) + -U, --user-stacks-only + show stack from user space only (no kernel space + stacks) + -K, --kernel-stacks-only + show stack from kernel space only (no user space + stacks) -f, --folded output folded format --stack-storage-size STACK_STORAGE_SIZE the number of unique stack traces that can be stored