stackcount: add -K, -U, -d, -D, and -f (#1335)
authorBrendan Gregg <brendan.d.gregg@gmail.com>
Sun, 3 Sep 2017 19:08:52 +0000 (12:08 -0700)
committerSasha Goldshtein <goldshtn@gmail.com>
Sun, 3 Sep 2017 19:08:52 +0000 (20:08 +0100)
* stackcount: add -K and -U

* stackcount: add -D for duration

* stackcount: add -f for folded output

man/man8/stackcount.8
tools/stackcount.py
tools/stackcount_example.txt

index 10999e2..d6ab993 100644 (file)
@@ -2,14 +2,14 @@
 .SH NAME
 stackcount \- Count function calls and their stack traces. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
-.B stackcount [\-h] [\-p PID] [\-i INTERVAL] [\-T] [\-r] [\-s]
-              [\-P] [\-v] [\-d] pattern
+.B stackcount [\-h] [\-p PID] [\-i INTERVAL] [\-D DURATION] [\-T] [\-r] [\-s]
+              [\-P] [\-K] [\-U] [\-v] [\-d] [\-f] pattern
 .SH DESCRIPTION
 stackcount traces functions and frequency counts them with their entire
-stack trace, summarized in-kernel for efficiency. This allows higher
-frequency events to be studied. The output consists of unique stack traces,
-and their occurrence counts. In addition to kernel and user functions, kernel
-tracepoints and USDT tracepoint are also supported.
+stack trace, kernel stack and user stack, summarized in-kernel for efficiency.
+This allows higher frequency events to be studied. The output consists of
+unique stack traces, and their occurrence counts. In addition to kernel and
+user functions, kernel tracepoints and USDT tracepoint are also supported.
 
 The pattern is a string with optional '*' wildcards, similar to file globbing.
 If you'd prefer to use regular expressions, use the \-r option.
@@ -38,11 +38,19 @@ Include a timestamp with interval output.
 Show raw addresses.
 .TP
 \-d
+Print a delimiter ("--") in-between the kernel and user stacks.
+.TP
+\-\-debug
 Print the source of the BPF program when loading it (for debugging purposes).
 .TP
 \-i interval
 Summary interval, in seconds.
 .TP
+\-D duration
+Total duration of trace, in seconds.
+\-f
+Folded output format.
+.TP
 \-p PID
 Trace this process ID only (filtered in-kernel).
 .TP
@@ -52,47 +60,59 @@ A function name, or a search pattern. Can include wildcards ("*"). If the
 \-r option is used, can include regular expressions.
 .SH EXAMPLES
 .TP
-Count kernel stack traces for submit_bio():
+Count kernel and user stack traces for submit_bio():
 #
 .B stackcount submit_bio
 .TP
-Count kernel stack traces for ip_output():
+Count stacks with a delimiter for submit_bio():
+#
+.B stackcount \-d submit_bio
+.TP
+Count kernel stack trace only for submit_bio():
+#
+.B stackcount \-K submit_bio
+.TP
+Count user stack trace only for submit_bio():
+#
+.B stackcount \-U submit_bio
+.TP
+Count stack traces for ip_output():
 #
 .B stackcount ip_output
 .TP
 Show symbol offsets:
 #
-.B stackcount -s ip_output
+.B stackcount \-s ip_output
 .TP
 Show offsets and raw addresses (verbose):
 #
-.B stackcount -sv ip_output
+.B stackcount \-sv ip_output
 .TP
-Count kernel stacks for kernel functions matching tcp_send*:
+Count stacks for kernel functions matching tcp_send*:
 #
 .B stackcount 'tcp_send*'
 .TP
 Same as previous, but using regular expressions:
 #
-.B stackcount -r '^tcp_send.*'
+.B stackcount \-r '^tcp_send.*'
 .TP
 Output every 5 seconds, with timestamps:
 #
-.B stackcount -Ti 5 ip_output
+.B stackcount \-Ti 5 ip_output
 .TP
 Only count stacks when PID 185 is on-CPU:
 #
-.B stackcount -p 185 ip_output
+.B stackcount \-p 185 ip_output
 .TP
 Count user stacks for dynamic heap allocations with malloc in PID 185:
 #
-.B stackcount -p 185 c:malloc
+.B stackcount \-p 185 c:malloc
 .TP
 Count user stacks for thread creation (USDT tracepoint) in PID 185:
 #
-.B stackcount -p 185 u:pthread:pthread_create
+.B stackcount \-p 185 u:pthread:pthread_create
 .TP
-Count kernel stacks for context switch events using a kernel tracepoint:
+Count stacks for context switch events using a kernel tracepoint:
 #
 .B stackcount t:sched:sched_switch
 .SH OVERHEAD
index 914e7ee..536deed 100755 (executable)
@@ -3,8 +3,8 @@
 # stackcount    Count events and their stack traces.
 #               For Linux, uses BCC, eBPF.
 #
-# USAGE: stackcount [-h] [-p PID] [-i INTERVAL] [-T] [-r] [-s]
-#                   [-P] [-v] pattern
+# USAGE: stackcount.py [-h] [-p PID] [-i INTERVAL] [-D DURATION] [-T] [-r] [-s]
+#                      [-P] [-K] [-U] [-v] [-d] [-f] [--debug]
 #
 # The pattern is a string with optional '*' wildcards, similar to file
 # globbing. If you'd prefer to use regular expressions, use the -r option.
@@ -27,7 +27,8 @@ import traceback
 debug = False
 
 class Probe(object):
-    def __init__(self, pattern, use_regex=False, pid=None, per_pid=False):
+    def __init__(self, pattern, kernel_stack, user_stack, use_regex=False,
+                 pid=None, per_pid=False):
         """Init a new probe.
 
         Init the probe from the pattern provided by the user. The supported
@@ -41,6 +42,8 @@ class Probe(object):
             t:cat:event     -- probe a kernel tracepoint
             u:lib:probe     -- probe a USDT tracepoint
         """
+        self.kernel_stack = kernel_stack
+        self.user_stack = user_stack
         parts = pattern.split(':')
         if len(parts) == 1:
             parts = ["p", "", parts[0]]
@@ -103,28 +106,51 @@ class Probe(object):
                             self.pattern)
 
     def load(self):
+        ctx_name = "ctx"
+        stack_trace = ""
+        if self.user_stack:
+                stack_trace += """
+                    key.user_stack_id = stack_traces.get_stackid(
+                      %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK
+                    );""" % (ctx_name)
+        else:
+                stack_trace += "key.user_stack_id = -1;"
+        if self.kernel_stack:
+                stack_trace += """
+                    key.kernel_stack_id = stack_traces.get_stackid(
+                      %s, BPF_F_REUSE_STACKID
+                    );""" % (ctx_name)
+        else:
+                stack_trace += "key.kernel_stack_id = -1;"
+
         trace_count_text = """
 int trace_count(void *ctx) {
     FILTER
     struct key_t key = {};
-    key.pid = GET_PID;
-    key.stackid = stack_traces.get_stackid(ctx, STACK_FLAGS);
+    key.tgid = GET_TGID;
+    STORE_COMM
+    %s
     u64 zero = 0;
     u64 *val = counts.lookup_or_init(&key, &zero);
     (*val)++;
     return 0;
 }
         """
+        trace_count_text = trace_count_text % (stack_trace)
+
         bpf_text = """#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
 
 struct key_t {
-    u32 pid;
-    int stackid;
+    // no pid (thread ID) so that we do not needlessly split this key
+    u32 tgid;
+    int kernel_stack_id;
+    int user_stack_id;
+    char name[TASK_COMM_LEN];
 };
 
 BPF_HASH(counts, struct key_t);
 BPF_STACK_TRACE(stack_traces, 1024);
-
         """
 
         # We really mean the tgid from the kernel's perspective, which is in
@@ -138,18 +164,18 @@ BPF_STACK_TRACE(stack_traces, 1024);
 
         # We need per-pid statistics when tracing a user-space process, because
         # the meaning of the symbols depends on the pid. We also need them if
-        # per-pid statistics were requested with -P.
-        if self.per_pid or not self.is_kernel_probe():
-            trace_count_text = trace_count_text.replace('GET_PID',
+        # per-pid statistics were requested with -P, or for user stacks.
+        if self.per_pid or not self.is_kernel_probe() or self.user_stack:
+            trace_count_text = trace_count_text.replace('GET_TGID',
                                         'bpf_get_current_pid_tgid() >> 32')
+            trace_count_text = trace_count_text.replace('STORE_COMM',
+                        'bpf_get_current_comm(&key.name, sizeof(key.name));')
         else:
+            # kernel stacks only. skip splitting on PID so these aggregate
+            # together, and don't store the process name.
             trace_count_text = trace_count_text.replace(
-                                    'GET_PID', '0xffffffff')
-
-        stack_flags = 'BPF_F_REUSE_STACKID'
-        if not self.is_kernel_probe():
-            stack_flags += '| BPF_F_USER_STACK'     # can't do both U *and* K
-        trace_count_text = trace_count_text.replace('STACK_FLAGS', stack_flags)
+                                    'GET_TGID', '0xffffffff')
+            trace_count_text = trace_count_text.replace('STORE_COMM', '')
 
         self.usdt = None
         if self.type == "u":
@@ -182,6 +208,7 @@ class Tool(object):
     def __init__(self):
         examples = """examples:
     ./stackcount submit_bio         # count kernel stack traces for submit_bio
+    ./stackcount -d ip_output       # include a user/kernel stack delimiter
     ./stackcount -s ip_output       # show symbol offsets
     ./stackcount -sv ip_output      # show offsets and raw addresses (verbose)
     ./stackcount 'tcp_send*'        # count stacks for funcs matching tcp_send*
@@ -191,6 +218,8 @@ class Tool(object):
     ./stackcount -p 185 c:malloc    # count stacks for malloc in PID 185
     ./stackcount t:sched:sched_fork # count stacks for sched_fork tracepoint
     ./stackcount -p 185 u:node:*    # count stacks for all USDT probes in node
+    ./stackcount -K t:sched:sched_switch   # kernel stacks only
+    ./stackcount -U t:sched:sched_switch   # user stacks only
         """
         parser = argparse.ArgumentParser(
             description="Count events and their stack traces",
@@ -198,8 +227,10 @@ class Tool(object):
             epilog=examples)
         parser.add_argument("-p", "--pid", type=int,
             help="trace this PID only")
-        parser.add_argument("-i", "--interval", default=99999999,
+        parser.add_argument("-i", "--interval",
             help="summary interval, seconds")
+        parser.add_argument("-D", "--duration",
+            help="total duration of trace, seconds")
         parser.add_argument("-T", "--timestamp", action="store_true",
             help="include timestamp on output")
         parser.add_argument("-r", "--regexp", action="store_true",
@@ -208,19 +239,56 @@ class Tool(object):
             help="show address offsets")
         parser.add_argument("-P", "--perpid", action="store_true",
             help="display stacks separately for each process")
+        parser.add_argument("-K", "--kernel-stacks-only",
+            action="store_true", help="kernel stack only", default=False)
+        parser.add_argument("-U", "--user-stacks-only",
+            action="store_true", help="user stack only", default=False)
         parser.add_argument("-v", "--verbose", action="store_true",
             help="show raw addresses")
-        parser.add_argument("-d", "--debug", action="store_true",
+        parser.add_argument("-d", "--delimited", action="store_true",
+            help="insert delimiter between kernel/user stacks")
+        parser.add_argument("-f", "--folded", action="store_true",
+            help="output folded format")
+        parser.add_argument("--debug", action="store_true",
             help="print BPF program before starting (for debugging purposes)")
         parser.add_argument("pattern",
             help="search expression for events")
         self.args = parser.parse_args()
         global debug
         debug = self.args.debug
-        self.probe = Probe(self.args.pattern, self.args.regexp,
-                           self.args.pid, self.args.perpid)
 
-    def _print_frame(self, addr, pid):
+        if self.args.duration and not self.args.interval:
+            self.args.interval = self.args.duration
+        if not self.args.interval:
+            self.args.interval = 99999999
+
+        if self.args.kernel_stacks_only and self.args.user_stacks_only:
+            print("ERROR: -K and -U are mutually exclusive. If you want " +
+                "both stacks, that is the default.")
+            exit()
+        if not self.args.kernel_stacks_only and not self.args.user_stacks_only:
+            self.kernel_stack = True
+            self.user_stack = True
+        else:
+            self.kernel_stack = self.args.kernel_stacks_only
+            self.user_stack = self.args.user_stacks_only
+
+        self.probe = Probe(self.args.pattern,
+                           self.kernel_stack, self.user_stack,
+                           self.args.regexp, self.args.pid, self.args.perpid)
+        self.need_delimiter = self.args.delimited and not (
+                    self.args.kernel_stacks_only or self.args.user_stacks_only)
+
+    def _print_kframe(self, addr):
+        print("  ", end="")
+        if self.args.verbose:
+            print("%-16x " % addr, end="")
+        if self.args.offset:
+            print("%s" % self.probe.bpf.ksym(addr, show_offset=True))
+        else:
+            print("%s" % self.probe.bpf.ksym(addr))
+
+    def _print_uframe(self, addr, pid):
         print("  ", end="")
         if self.args.verbose:
             print("%-16x " % addr, end="")
@@ -233,34 +301,31 @@ class Tool(object):
     def _signal_ignore(signal, frame):
         print()
 
-    def _comm_for_pid(self, pid):
-        if pid in self.comm_cache:
-            return self.comm_cache[pid]
-
-        try:
-            comm = "    %s [%d]" % (
-                    open("/proc/%d/comm" % pid).read().strip(),
-                    pid)
-            self.comm_cache[pid] = comm
-            return comm
-        except:
-            return "    unknown process [%d]" % pid
+    def _print_comm(self, comm, pid):
+        print("    %s [%d]" % (comm, pid))
 
     def run(self):
         self.probe.load()
         self.probe.attach()
-        print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
-              (self.probe.matched, self.args.pattern))
+        if not self.args.folded:
+            print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
+                  (self.probe.matched, self.args.pattern))
+        b = self.probe.bpf
         exiting = 0 if self.args.interval else 1
+        seconds = 0
         while True:
             try:
                 sleep(int(self.args.interval))
+                seconds += int(self.args.interval)
             except KeyboardInterrupt:
                 exiting = 1
                 # as cleanup can take many seconds, trap Ctrl-C:
                 signal.signal(signal.SIGINT, Tool._signal_ignore)
+            if self.args.duration and seconds >= int(self.args.duration):
+                exiting = 1
 
-            print()
+            if not self.args.folded:
+                print()
             if self.args.timestamp:
                 print("%-8s\n" % strftime("%H:%M:%S"), end="")
 
@@ -269,16 +334,37 @@ class Tool(object):
             self.comm_cache = {}
             for k, v in sorted(counts.items(),
                                key=lambda counts: counts[1].value):
-                for addr in stack_traces.walk(k.stackid):
-                    pid = -1 if self.probe.is_kernel_probe() else k.pid
-                    self._print_frame(addr, pid)
-                if not self.args.pid and k.pid != 0xffffffff:
-                    print(self._comm_for_pid(k.pid))
-                print("    %d\n" % v.value)
+                user_stack = [] if k.user_stack_id < 0 else \
+                    stack_traces.walk(k.user_stack_id)
+                kernel_stack = [] if k.kernel_stack_id < 0 else \
+                    stack_traces.walk(k.kernel_stack_id)
+
+                if self.args.folded:
+                    # print folded stack output
+                    user_stack = list(user_stack)
+                    kernel_stack = list(kernel_stack)
+                    line = [k.name.decode()] + \
+                        [b.sym(addr, k.tgid) for addr in
+                        reversed(user_stack)] + \
+                        (self.need_delimiter and ["-"] or []) + \
+                        [b.ksym(addr) for addr in reversed(kernel_stack)]
+                    print("%s %d" % (";".join(line), v.value))
+                else:
+                    # print multi-line stack output
+                    for addr in kernel_stack:
+                        self._print_kframe(addr)
+                    if self.need_delimiter:
+                        print("    --")
+                    for addr in user_stack:
+                        self._print_uframe(addr, k.tgid)
+                    if not self.args.pid and k.tgid != 0xffffffff:
+                        self._print_comm(k.name, k.tgid)
+                    print("    %d\n" % v.value)
             counts.clear()
 
             if exiting:
-                print("Detaching...")
+                if not self.args.folded:
+                    print("Detaching...")
                 exit()
 
 if __name__ == "__main__":
index d2d6448..92a77a8 100644 (file)
@@ -3,55 +3,118 @@ Demonstrations of stackcount, the Linux eBPF/bcc version.
 
 This program traces functions and frequency counts them with their entire
 stack trace, summarized in-kernel for efficiency. For example, counting
-stack traces that led to submit_bio(), which creates block device I/O:
+stack traces that led to the submit_bio() kernel function, which creates
+block device I/O:
 
 # ./stackcount submit_bio
 Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end.
 ^C
   submit_bio
+  ext4_writepages
+  do_writepages
+  __filemap_fdatawrite_range
+  filemap_flush
+  ext4_alloc_da_blocks
+  ext4_release_file
+  __fput
+  ____fput
+  task_work_run
+  exit_to_usermode_loop
+  syscall_return_slowpath
+  entry_SYSCALL_64_fastpath
+  [unknown]
+  [unknown]
+    tar [15069]
+    5
+
+  submit_bio
+  ext4_bio_write_page
+  mpage_submit_page
+  mpage_map_and_submit_buffers
+  ext4_writepages
+  do_writepages
+  __filemap_fdatawrite_range
+  filemap_flush
+  ext4_alloc_da_blocks
+  ext4_release_file
+  __fput
+  ____fput
+  task_work_run
+  exit_to_usermode_loop
+  syscall_return_slowpath
+  entry_SYSCALL_64_fastpath
+  [unknown]
+  [unknown]
+    tar [15069]
+    15
+
+  submit_bio
+  ext4_readpages
+  __do_page_cache_readahead
+  ondemand_readahead
+  page_cache_async_readahead
+  generic_file_read_iter
+  __vfs_read
+  vfs_read
+  sys_read
+  entry_SYSCALL_64_fastpath
+  [unknown]
+    tar [15069]
+    113
+
+Detaching...
+
+The output shows unique stack traces, in order from leaf (on-CPU) to root,
+followed by their occurrence count. The last stack trace in the above output
+shows syscall handling, sys_read(), vfs_read(), and then "readahead" functions:
+looks like an application issued file read has triggered read ahead. The
+application can be seen after the stack trace, in this case, "tar [15069]"
+for the "tar" command, PID 15069.
+
+The order of printed stack traces is from least to most frequent. The most
+frequent in this case, the ext4_rename() stack, was taken 113 times during
+tracing.
+
+The "[unknown]" frames are from user-level, since this simple workload is
+the tar command, which apparently has been compiled without frame pointers.
+It's a common compiler optimization, but it breaks frame pointer-based stack
+walkers. Similar broken stacks will be seen by other profilers and debuggers
+that use frame pointers. Hopefully your application preserves them so that
+the user-level stack trace is visible. So how does one get frame pointers, if
+your application doesn't have them to start with? For the current bcc (until
+it supports other stack walkers), you need to be running a application binaries
+that preserves frame pointers, eg, using gcc's -fno-omit-frame-pointer. That's
+about all I'll say here: this is a big topic that is not bcc/BPF specific.
+
+It can be useful to trace the path to submit_bio to explain unusual rates of
+disk IOPS. These could have in-kernel origins (eg, background scrub).
+
+
+Now adding the -d option to delimit kernel and user stacks:
+
+# ./stackcount -d submit_bio
+Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end.
+^C
+  submit_bio
   submit_bh
-  journal_submit_commit_record.isra.13
+  journal_submit_commit_record
   jbd2_journal_commit_transaction
   kjournald2
   kthread
   ret_from_fork
-  mb_cache_list
+    --
+    jbd2/xvda1-8 [405]
     1
 
   submit_bio
-  __block_write_full_page.constprop.39
-  block_write_full_page
-  blkdev_writepage
-  __writepage
-  write_cache_pages
-  generic_writepages
-  do_writepages
-  __writeback_single_inode
-  writeback_sb_inodes
-  __writeback_inodes_wb
-    2
-
-  submit_bio
-  __block_write_full_page.constprop.39
-  block_write_full_page
-  blkdev_writepage
-  __writepage
-  write_cache_pages
-  generic_writepages
-  do_writepages
-  __filemap_fdatawrite_range
-  filemap_fdatawrite
-  fdatawrite_one_bdev
-    36
-
-  submit_bio
   submit_bh
   jbd2_journal_commit_transaction
   kjournald2
   kthread
   ret_from_fork
-  mb_cache_list
-    38
+    --
+    jbd2/xvda1-8 [405]
+    2
 
   submit_bio
   ext4_writepages
@@ -59,138 +122,220 @@ Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end.
   __filemap_fdatawrite_range
   filemap_flush
   ext4_alloc_da_blocks
-  ext4_rename
-  ext4_rename2
-  vfs_rename
-  sys_rename
+  ext4_release_file
+  __fput
+  ____fput
+  task_work_run
+  exit_to_usermode_loop
+  syscall_return_slowpath
   entry_SYSCALL_64_fastpath
-    79
+    --
+  [unknown]
+  [unknown]
+    tar [15187]
+    5
 
-Detaching...
+  submit_bio
+  ext4_bio_write_page
+  mpage_submit_page
+  mpage_map_and_submit_buffers
+  ext4_writepages
+  do_writepages
+  __filemap_fdatawrite_range
+  filemap_flush
+  ext4_alloc_da_blocks
+  ext4_release_file
+  __fput
+  ____fput
+  task_work_run
+  exit_to_usermode_loop
+  syscall_return_slowpath
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+  [unknown]
+    tar [15187]
+    15
 
-The output shows unique stack traces, in order from leaf (on-CPU) to root,
-followed by their occurrence count. The last stack trace in the above output
-shows syscall handling, ext4_rename(), and filemap_flush(): looks like an
-application issued file rename has caused back end disk I/O due to ext4
-block allocation and a filemap_flush(). I'd have to browse the code to those
-functions to confirm!
+  submit_bio
+  ext4_readpages
+  __do_page_cache_readahead
+  ondemand_readahead
+  page_cache_async_readahead
+  generic_file_read_iter
+  __vfs_read
+  vfs_read
+  sys_read
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+  [unknown]
+  [unknown]
+    tar [15187]
+    171
 
-The order of printed stack traces is from least to most frequent. The most
-frequent in this case, the ext4_rename() stack, was taken 79 times during
-tracing.
+Detaching...
 
-It can be useful to trace the path to submit_bio to explain unusual rates of
-disk IOPS. These could have in-kernel origins (eg, background scrub).
+A "--" is printed between the kernel and user stacks.
 
 
-As another example, here are the code paths that led to ip_output(), which
-sends a packet at the IP level:
+As a different example, here is the kernel function hrtimer_init_sleeper():
 
-# ./stackcount ip_output
-Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
+# ./stackcount.py -d hrtimer_init_sleeper
+Tracing 1 functions for "hrtimer_init_sleeper"... Hit Ctrl-C to end.
 ^C
-  ip_output
-  ip_queue_xmit
-  tcp_transmit_skb
-  tcp_write_xmit
-  tcp_tsq_handler.part.32
-  tcp_tasklet_func
-  tasklet_action
-  __do_softirq
-  do_softirq_own_stack
-  do_softirq
-  __local_bh_enable_ip
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+    containerd [16020]
     1
 
-  ip_output
-  ip_queue_xmit
-  tcp_transmit_skb
-  tcp_send_ack
-  tcp_send_delayed_ack
-  __tcp_ack_snd_check
-  tcp_rcv_established
-  tcp_v4_do_rcv
-  tcp_v4_rcv
-  ip_local_deliver_finish
-  ip_local_deliver
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  __pthread_cond_timedwait
+  Monitor::IWait(Thread*, long)
+  Monitor::wait(bool, long, bool)
+  CompileQueue::get()
+  CompileBroker::compiler_thread_loop()
+  JavaThread::thread_main_inner()
+  JavaThread::run()
+  java_start(Thread*)
+  start_thread
+    java [4996]
     1
 
-  ip_output
-  ip_queue_xmit
-  tcp_transmit_skb
-  tcp_send_ack
-  tcp_send_delayed_ack
-  __tcp_ack_snd_check
-  tcp_rcv_established
-  tcp_v4_do_rcv
-  tcp_v4_rcv
-  ip_local_deliver_finish
-  ip_local_deliver
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+  [unknown]
+    containerd [16020]
     1
 
-  ip_output
-  ip_queue_xmit
-  tcp_transmit_skb
-  tcp_send_ack
-  tcp_delack_timer_handler
-  tcp_delack_timer
-  call_timer_fn
-  run_timer_softirq
-  __do_softirq
-  irq_exit
-  xen_evtchn_do_upcall
-    1
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  __pthread_cond_timedwait
+  VMThread::loop()
+  VMThread::run()
+  java_start(Thread*)
+  start_thread
+    java [4996]
+    3
+
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+    dockerd [16008]
+    4
+
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+  [unknown]
+    dockerd [16008]
+    4
+
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  __pthread_cond_timedwait
+  Lio/netty/util/ThreadDeathWatcher$Watcher;::run
+  Interpreter
+  Interpreter
+  call_stub
+  JavaCalls::call_helper(JavaValue*, methodHandle*, JavaCallArguments*, Thread*)
+  JavaCalls::call_virtual(JavaValue*, KlassHandle, Symbol*, Symbol*, JavaCallArguments*, Thread*)
+  JavaCalls::call_virtual(JavaValue*, Handle, KlassHandle, Symbol*, Symbol*, Thread*)
+  thread_entry(JavaThread*, Thread*)
+  JavaThread::thread_main_inner()
+  JavaThread::run()
+  java_start(Thread*)
+  start_thread
+    java [4996]
+    4
+
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  __pthread_cond_timedwait
+  clock_gettime
+  [unknown]
+    java [4996]
+    79
+
+Detaching...
+
+I was just trying to find a more interesting example. This output includes
+some Java stacks where user-level has been walked correctly (even includes a
+JIT symbol translation). dockerd and containerd don't have frame pointers
+(grumble), but Java does (which is running with -XX:+PreserveFramePointer).
 
+
+Here's another kernel function, ip_output():
+
+# ./stackcount.py -d ip_output
+Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
+^C
   ip_output
   ip_queue_xmit
   tcp_transmit_skb
   tcp_write_xmit
   __tcp_push_pending_frames
-  tcp_rcv_established
-  tcp_v4_do_rcv
-  release_sock
+  tcp_push
   tcp_sendmsg
   inet_sendmsg
   sock_sendmsg
-    3
-
-  ip_output
-  ip_queue_xmit
-  tcp_transmit_skb
-  tcp_write_xmit
-  tcp_tsq_handler.part.32
-  tcp_tasklet_func
-  tasklet_action
-  __do_softirq
-  run_ksoftirqd
-  smpboot_thread_fn
-  kthread
-    3
+  sock_write_iter
+  __vfs_write
+  vfs_write
+  SyS_write
+  entry_SYSCALL_64_fastpath
+    --
+  __write_nocancel
+  [unknown]
+    sshd [15015]
+    5
 
   ip_output
   ip_queue_xmit
   tcp_transmit_skb
   tcp_write_xmit
   __tcp_push_pending_frames
-  tcp_rcv_established
-  tcp_v4_do_rcv
-  tcp_v4_rcv
-  ip_local_deliver_finish
-  ip_local_deliver
-  ip_rcv_finish
-    4
-
-  ip_output
-  ip_queue_xmit
-  tcp_transmit_skb
-  tcp_send_ack
-  tcp_cleanup_rbuf
-  tcp_recvmsg
-  inet_recvmsg
-  sock_recvmsg
-  sock_read_iter
-  __vfs_read
-  vfs_read
+  tcp_push
+  tcp_sendmsg
+  inet_sendmsg
+  sock_sendmsg
+  sock_write_iter
+  __vfs_write
+  vfs_write
+  SyS_write
+  entry_SYSCALL_64_fastpath
+    --
+  __write_nocancel
+  [unknown]
+  [unknown]
+    sshd [8234]
     5
 
   ip_output
@@ -198,14 +343,30 @@ Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
   tcp_transmit_skb
   tcp_write_xmit
   __tcp_push_pending_frames
-  tcp_rcv_established
-  tcp_v4_do_rcv
-  release_sock
+  tcp_push
   tcp_sendmsg
   inet_sendmsg
   sock_sendmsg
-    9
+  sock_write_iter
+  __vfs_write
+  vfs_write
+  SyS_write
+  entry_SYSCALL_64_fastpath
+    --
+  __write_nocancel
+    sshd [15015]
+    7
+
+Detaching...
+
+This time just sshd is triggering ip_output() calls.
 
+
+Watch what happens if I filter on kernel stacks only (-K) for ip_output():
+
+# ./stackcount.py -K ip_output
+Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
+^C
   ip_output
   ip_queue_xmit
   tcp_transmit_skb
@@ -217,61 +378,45 @@ Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
   sock_sendmsg
   sock_write_iter
   __vfs_write
-    51
+  vfs_write
+  SyS_write
+  entry_SYSCALL_64_fastpath
+    13
 
-  ip_output
-  ip_queue_xmit
-  tcp_transmit_skb
-  tcp_write_xmit
-  __tcp_push_pending_frames
-  tcp_rcv_established
-  tcp_v4_do_rcv
-  tcp_v4_rcv
-  ip_local_deliver_finish
-  ip_local_deliver
-  ip_rcv_finish
-    171
+Detaching...
 
-  ip_output
-  ip_queue_xmit
-  tcp_transmit_skb
-  tcp_write_xmit
-  __tcp_push_pending_frames
-  tcp_rcv_established
-  tcp_v4_do_rcv
-  tcp_v4_rcv
-  ip_local_deliver_finish
-  ip_local_deliver
-  ip_rcv_finish
-    994
+They have grouped together as a single unique stack, since the kernel part
+was the same.
 
-  ip_output
-  ip_queue_xmit
-  tcp_transmit_skb
-  tcp_write_xmit
-  tcp_tsq_handler.part.32
-  tcp_tasklet_func
-  tasklet_action
-  __do_softirq
-  irq_exit
-  xen_evtchn_do_upcall
-  xen_do_hypervisor_callback
-    1002
 
-Detaching...
+Here is just the user stacks, fetched during the kernel function ip_output():
 
-The last two most frequent stack traces are via tcp_transmit_skb(). Note the
-send last begins with ip_rcv_finish(), for a local receive, which then becomes
-a transmit: likely pushing more frames when processing the newly received ones.
+# ./stackcount.py -U ip_output
+Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
+^C
+  [unknown]
+    snmpd [1645]
+    1
+
+  __write_nocancel
+  [unknown]
+  [unknown]
+    sshd [8234]
+    3
+
+  __write_nocancel
+    sshd [15015]
+    4
 
-As may be obvious, this is a great tool for quickly understanding kernel code
-flow.
+I should really run a custom sshd with frame pointers so we can see its
+stack trace...
 
 
 User-space functions can also be traced if a library name is provided. For
-example, to quickly identify code locations that allocate heap memory:
+example, to quickly identify code locations that allocate heap memory for
+PID 4902 (using -p), by tracing malloc from libc ("c:malloc"):
 
-# ./stackcount -l c -p 4902 malloc
+# ./stackcount -p 4902 c:malloc
 Tracing 1 functions for "malloc"... Hit Ctrl-C to end.
 ^C
   malloc
@@ -290,10 +435,12 @@ Tracing 1 functions for "malloc"... Hit Ctrl-C to end.
 
 Detaching...
 
+Kernel stacks are absent as this didn't enter kernel code.
+
 Note that user-space uses of stackcount can be somewhat more limited because
-a lot of user-space libraries and binaries are compiled without debuginfo, or
-with frame-pointer omission (-fomit-frame-pointer), which makes it impossible
-to reliably obtain the stack trace.
+a lot of user-space libraries and binaries are compiled without frame-pointers
+as discussed earlier (the -fomit-frame-pointer compiler default) or are used
+without debuginfo.
 
 
 In addition to kernel and user-space functions, kernel tracepoints and USDT
@@ -313,132 +460,275 @@ Tracing 1 functions for "u:pthread:pthread_create"... Hit Ctrl-C to end.
   [unknown]
     7
 
+You can use "readelf -n file" to see if it has USDT tracepoints.
+
+
 Similarly, to determine where context switching is happening in the kernel, 
 use the sched:sched_switch kernel tracepoint:
 
 # ./stackcount t:sched:sched_switch
-... (omitted for brevity)
+  __schedule
+  schedule
+  worker_thread
+  kthread
+  ret_from_fork
+    kworker/0:2 [25482]
+    1
 
   __schedule
   schedule
   schedule_hrtimeout_range_clock
   schedule_hrtimeout_range
-  poll_schedule_timeout
-  do_select
-  core_sys_select
-  SyS_select
+  ep_poll
+  SyS_epoll_wait
   entry_SYSCALL_64_fastpath
-    40
+  epoll_wait
+  Lsun/nio/ch/SelectorImpl;::lockAndDoSelect
+  Lsun/nio/ch/SelectorImpl;::select
+  Lio/netty/channel/nio/NioEventLoop;::select
+  Lio/netty/channel/nio/NioEventLoop;::run
+  Interpreter
+  Interpreter
+  call_stub
+  JavaCalls::call_helper(JavaValue*, methodHandle*, JavaCallArguments*, Thread*)
+  JavaCalls::call_virtual(JavaValue*, KlassHandle, Symbol*, Symbol*, JavaCallArguments*, Thread*)
+  JavaCalls::call_virtual(JavaValue*, Handle, KlassHandle, Symbol*, Symbol*, Thread*)
+  thread_entry(JavaThread*, Thread*)
+  JavaThread::thread_main_inner()
+  JavaThread::run()
+  java_start(Thread*)
+  start_thread
+    java [4996]
+    1
+
+... (omitted for brevity)
 
   __schedule
   schedule
   schedule_preempt_disabled
   cpu_startup_entry
-  start_secondary
-    85
+  xen_play_dead
+  arch_cpu_idle_dead
+  cpu_startup_entry
+  cpu_bringup_and_idle
+    swapper/1 [0]
+    289
 
 
 A -i option can be used to set an output interval, and -T to include a
 timestamp. For example:
 
-# ./stackcount -Ti 1 submit_bio
+# ./stackcount.py -Tdi 1 submit_bio
 Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end.
 
-01:11:37
+06:05:13
+
+06:05:14
   submit_bio
-  submit_bh
-  journal_submit_commit_record.isra.13
-  jbd2_journal_commit_transaction
-  kjournald2
+  xfs_do_writepage
+  write_cache_pages
+  xfs_vm_writepages
+  do_writepages
+  __writeback_single_inode
+  writeback_sb_inodes
+  __writeback_inodes_wb
+  wb_writeback
+  wb_workfn
+  process_one_work
+  worker_thread
   kthread
   ret_from_fork
-  mb_cache_list
+    --
+    kworker/u16:1 [15686]
     1
 
   submit_bio
-  ext4_writepages
-  do_writepages
-  __filemap_fdatawrite_range
-  filemap_flush
-  ext4_alloc_da_blocks
-  ext4_rename
-  ext4_rename2
-  vfs_rename
-  sys_rename
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/u16:0 [16007]
+    1
+
+  submit_bio
+  xfs_buf_submit
+  xlog_bdstrat
+  xlog_sync
+  xlog_state_release_iclog
+  _xfs_log_force
+  xfs_log_force
+  xfs_fs_sync_fs
+  sync_fs_one_sb
+  iterate_supers
+  sys_sync
   entry_SYSCALL_64_fastpath
-    20
+    --
+  [unknown]
+    sync [16039]
+    1
 
   submit_bio
   submit_bh
+  journal_submit_commit_record
   jbd2_journal_commit_transaction
   kjournald2
   kthread
   ret_from_fork
-  mb_cache_list
-    39
+    --
+    jbd2/xvda1-8 [405]
+    1
 
+  submit_bio
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/0:2 [25482]
+    2
 
-01:11:38
   submit_bio
   ext4_writepages
   do_writepages
-  __filemap_fdatawrite_range
-  filemap_flush
-  ext4_alloc_da_blocks
-  ext4_rename
-  ext4_rename2
-  vfs_rename
-  sys_rename
-  entry_SYSCALL_64_fastpath
-    20
+  __writeback_single_inode
+  writeback_sb_inodes
+  __writeback_inodes_wb
+  wb_writeback
+  wb_workfn
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/u16:0 [16007]
+    4
 
+  submit_bio
+  xfs_vm_writepages
+  do_writepages
+  __writeback_single_inode
+  writeback_sb_inodes
+  __writeback_inodes_wb
+  wb_writeback
+  wb_workfn
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/u16:1 [15686]
+    5
 
-01:11:39
   submit_bio
-  ext4_writepages
+  __block_write_full_page
+  block_write_full_page
+  blkdev_writepage
+  __writepage
+  write_cache_pages
+  generic_writepages
+  blkdev_writepages
   do_writepages
   __filemap_fdatawrite_range
-  filemap_flush
-  ext4_alloc_da_blocks
-  ext4_rename
-  ext4_rename2
-  vfs_rename
-  sys_rename
+  filemap_fdatawrite
+  fdatawrite_one_bdev
+  iterate_bdevs
+  sys_sync
   entry_SYSCALL_64_fastpath
-    20
+    --
+  [unknown]
+    sync [16039]
+    7
 
-^C
-01:11:39
   submit_bio
+  submit_bh
+  jbd2_journal_commit_transaction
+  kjournald2
+  kthread
+  ret_from_fork
+    --
+    jbd2/xvda1-8 [405]
+    8
+
+  submit_bio
+  ext4_bio_write_page
+  mpage_submit_page
+  mpage_map_and_submit_buffers
   ext4_writepages
   do_writepages
-  __filemap_fdatawrite_range
-  filemap_flush
-  ext4_alloc_da_blocks
-  ext4_rename
-  ext4_rename2
-  vfs_rename
-  sys_rename
-  entry_SYSCALL_64_fastpath
-    20
+  __writeback_single_inode
+  writeback_sb_inodes
+  __writeback_inodes_wb
+  wb_writeback
+  wb_workfn
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/u16:0 [16007]
+    8
+
+  submit_bio
+  __block_write_full_page
+  block_write_full_page
+  blkdev_writepage
+  __writepage
+  write_cache_pages
+  generic_writepages
+  blkdev_writepages
+  do_writepages
+  __writeback_single_inode
+  writeback_sb_inodes
+  __writeback_inodes_wb
+  wb_writeback
+  wb_workfn
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/u16:0 [16007]
+    60
+
+
+06:05:15
+
+06:05:16
 
 Detaching...
 
+This only included output for the 06:05:14 interval. The other internals
+did not span block device I/O.
+
 
 The -s output prints the return instruction offset for each function (aka
 symbol offset). Eg:
 
-# ./stackcount -s tcp_sendmsg
+# ./stackcount.py -s tcp_sendmsg
 Tracing 1 functions for "tcp_sendmsg"... Hit Ctrl-C to end.
 ^C
-  tcp_sendmsg0x1
-  sock_sendmsg0x38
-  sock_write_iter0x78
-  __vfs_write0xaa
-  vfs_write0xa9
-  sys_write0x46
-  entry_SYSCALL_64_fastpath0x16
-    29
+  tcp_sendmsg+0x1
+  sock_sendmsg+0x38
+  sock_write_iter+0x85
+  __vfs_write+0xe3
+  vfs_write+0xb8
+  SyS_write+0x55
+  entry_SYSCALL_64_fastpath+0x1e
+  __write_nocancel+0x7
+    sshd [15015]
+    3
+
+  tcp_sendmsg+0x1
+  sock_sendmsg+0x38
+  sock_write_iter+0x85
+  __vfs_write+0xe3
+  vfs_write+0xb8
+  SyS_write+0x55
+  entry_SYSCALL_64_fastpath+0x1e
+  __write_nocancel+0x7
+    sshd [8234]
+    3
 
 Detaching...
 
@@ -446,9 +736,40 @@ If it wasn't clear how one function called another, knowing the instruction
 offset can help you locate the lines of code from a disassembly dump.
 
 
-A wildcard can also be used. Eg, all functions beginning with "tcp_send":
+The -v output is verbose, and shows raw addresses:
 
-# ./stackcount -s 'tcp_send*'
+./stackcount.py -v tcp_sendmsg
+Tracing 1 functions for "tcp_sendmsg"... Hit Ctrl-C to end.
+^C
+  ffffffff817b05c1 tcp_sendmsg
+  ffffffff8173ea48 sock_sendmsg
+  ffffffff8173eae5 sock_write_iter
+  ffffffff81232b33 __vfs_write
+  ffffffff812331b8 vfs_write
+  ffffffff81234625 SyS_write
+  ffffffff818739bb entry_SYSCALL_64_fastpath
+  7f120511e6e0     __write_nocancel
+    sshd [8234]
+    3
+
+  ffffffff817b05c1 tcp_sendmsg
+  ffffffff8173ea48 sock_sendmsg
+  ffffffff8173eae5 sock_write_iter
+  ffffffff81232b33 __vfs_write
+  ffffffff812331b8 vfs_write
+  ffffffff81234625 SyS_write
+  ffffffff818739bb entry_SYSCALL_64_fastpath
+  7f919c5a26e0     __write_nocancel
+    sshd [15015]
+    11
+
+Detaching...
+
+
+A wildcard can also be used. Eg, all functions beginning with "tcp_send",
+kernel stacks only (-K) with offsets (-s):
+
+# ./stackcount -Ks 'tcp_send*'
 Tracing 14 functions for "tcp_send*"... Hit Ctrl-C to end.
 ^C
   tcp_send_delayed_ack0x1
@@ -501,39 +822,69 @@ Detaching...
 Use -r to allow regular expressions.
 
 
+The -f option will emit folded output, which can be used as input to other
+tools including flame graphs. For example, with delimiters as well:
+
+# ./stackcount.py -df t:sched:sched_switch
+^Csnmp-pass;[unknown];[unknown];[unknown];[unknown];[unknown];-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1
+kworker/7:0;-;ret_from_fork;kthread;worker_thread;schedule;__schedule 1
+watchdog/0;-;ret_from_fork;kthread;smpboot_thread_fn;schedule;__schedule 1
+snmp-pass;[unknown];[unknown];[unknown];[unknown];[unknown];-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1
+svscan;[unknown];-;entry_SYSCALL_64_fastpath;SyS_nanosleep;hrtimer_nanosleep;do_nanosleep;schedule;__schedule 1
+python;[unknown];__select_nocancel;-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1
+kworker/2:0;-;ret_from_fork;kthread;worker_thread;schedule;__schedule 1
+[...]
+
+Flame graphs visualize stack traces. For information about them and links
+to open source software, see http://www.brendangregg.com/flamegraphs.html .
+This folded output can be piped directly into flamegraph.pl (the Perl version).
+
+
 USAGE message:
 
 # ./stackcount -h
-usage: stackcount [-h] [-p PID] [-i INTERVAL] [-T] [-r] [-s]
-                  [-l LIBRARY] [-v] [-d] pattern
+usage: stackcount [-h] [-p PID] [-i INTERVAL] [-D DURATION] [-T] [-r] [-s]
+                  [-P] [-K] [-U] [-v] [-d] [-f] [--debug]
+                  pattern
 
-Count function calls and their stack traces
+Count events and their stack traces
 
 positional arguments:
-  pattern               search expression for functions
+  pattern               search expression for events
 
 optional arguments:
   -h, --help            show this help message and exit
   -p PID, --pid PID     trace this PID only
   -i INTERVAL, --interval INTERVAL
                         summary interval, seconds
+  -D DURATION, --duration DURATION
+                        total duration of trace, seconds
   -T, --timestamp       include timestamp on output
   -r, --regexp          use regular expressions. Default is "*" wildcards
                         only.
   -s, --offset          show address offsets
-  -l, --library         trace user-space functions from this library or executable
+  -P, --perpid          display stacks separately for each process
+  -K, --kernel-stacks-only
+                        kernel stack only
+  -U, --user-stacks-only
+                        user stack only
   -v, --verbose         show raw addresses
-  -d, --debug           print BPF program before starting (for debugging purposes)
+  -d, --delimited       insert delimiter between kernel/user stacks
+  -f, --folded          output folded format
+  --debug               print BPF program before starting (for debugging
+                        purposes)
 
 examples:
-    ./stackcount submit_bio          # count kernel stack traces for submit_bio
-    ./stackcount ip_output           # count kernel stack traces for ip_output
-    ./stackcount -s ip_output        # show symbol offsets
-    ./stackcount -sv ip_output       # show offsets and raw addresses (verbose)
-    ./stackcount 'tcp_send*'         # count stacks for funcs matching tcp_send*
-    ./stackcount -r '^tcp_send.*'    # same as above, using regular expressions
-    ./stackcount -Ti 5 ip_output     # output every 5 seconds, with timestamps
-    ./stackcount -p 185 ip_output    # count ip_output stacks for PID 185 only
-    ./stackcount -p 185 -l c malloc  # count stacks for malloc in PID 185
-    ./stackcount t:sched:sched_fork  # count stacks for the sched_fork tracepoint
-    ./stackcount -p 185 u:node:*     # count stacks for all USDT probes in node
+    ./stackcount submit_bio         # count kernel stack traces for submit_bio
+    ./stackcount -d ip_output       # include a user/kernel stack delimiter
+    ./stackcount -s ip_output       # show symbol offsets
+    ./stackcount -sv ip_output      # show offsets and raw addresses (verbose)
+    ./stackcount 'tcp_send*'        # count stacks for funcs matching tcp_send*
+    ./stackcount -r '^tcp_send.*'   # same as above, using regular expressions
+    ./stackcount -Ti 5 ip_output    # output every 5 seconds, with timestamps
+    ./stackcount -p 185 ip_output   # count ip_output stacks for PID 185 only
+    ./stackcount -p 185 c:malloc    # count stacks for malloc in PID 185
+    ./stackcount t:sched:sched_fork # count stacks for sched_fork tracepoint
+    ./stackcount -p 185 u:node:*    # count stacks for all USDT probes in node
+    ./stackcount -K t:sched:sched_switch   # kernel stacks only
+    ./stackcount -U t:sched:sched_switch   # user stacks only