From 047541c4b0274d68cd6837e7f19c133b1ce548ce Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Thu, 13 Jan 2022 19:56:32 +0800 Subject: [PATCH] tools/trace.py: aggregate trace events by '-A' option Orignally we can save the backtrace of a specified function into a text file, then try to analyze the messy trace to guess why the function is called. To make the work easier, aggregate amount of trace by -A with -M EVENTS. A typical example: 1, if we find that the sys CPU utilization is higher by 'top' command 2, then find that the timer interrupt is more normal by 'irqtop' command 3, to confirm kernel timer setting frequence by 'funccount -i 1 clockevents_program_event' 4, to trace timer setting by 'trace clockevents_program_event -K -A -M 1000' 1294576 1294584 CPU 0/KVM clockevents_program_event clockevents_program_event+0x1 [kernel] hrtimer_start_range_ns+0x209 [kernel] start_sw_timer+0x173 [kvm] restart_apic_timer+0x6c [kvm] kvm_set_msr_common+0x442 [kvm] __kvm_set_msr+0xa2 [kvm] kvm_emulate_wrmsr+0x36 [kvm] vcpu_enter_guest+0x326 [kvm] kvm_arch_vcpu_ioctl_run+0xcc [kvm] kvm_vcpu_ioctl+0x22f [kvm] do_vfs_ioctl+0xa1 [kernel] ksys_ioctl+0x60 [kernel] __x64_sys_ioctl+0x16 [kernel] do_syscall_64+0x59 [kernel] entry_SYSCALL_64_after_hwframe+0x44 [kernel] -->COUNT 271 ... Then we can know that 271 timer setting in recent 1000(~27%) occurs in KVM code path. To avoid too large size of a dictionary in python, -A works always with -M. Also fix several minor changes to keep 'trace -h' aligned with trace_example.txt. Signed-off-by: zhenwei pi --- man/man8/trace.8 | 5 +++- tools/trace.py | 64 ++++++++++++++++++++++++++++++----------- tools/trace_example.txt | 35 ++++++++++++++++++++-- 3 files changed, 83 insertions(+), 21 deletions(-) diff --git a/man/man8/trace.8 b/man/man8/trace.8 index 7afd2527..acfff58f 100644 --- a/man/man8/trace.8 +++ b/man/man8/trace.8 @@ -3,7 +3,7 @@ trace \- Trace a function and print its arguments or return value, optionally evaluating a filter. Uses Linux eBPF/bcc. .SH SYNOPSIS .B trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [--uid UID] [-v] [-Z STRING_SIZE] [-S] [-s SYM_FILE_LIST] - [-M MAX_EVENTS] [-t] [-u] [-T] [-C] [-K] [-U] [-a] [-I header] + [-M MAX_EVENTS] [-t] [-u] [-T] [-C] [-K] [-U] [-a] [-I header] [-A] probe [probe ...] .SH DESCRIPTION trace probes functions you specify and displays trace messages if a particular @@ -83,6 +83,9 @@ Additional header files to include in the BPF program. This is needed if your filter or print expressions use types or data structures that are not available in the standard headers. For example: 'linux/mm.h' .TP +\-A +Print aggregated amount of each trace. This should be used with -M/--max-events together. +.TP probe [probe ...] One or more probes that attach to functions, filter conditions, and print information. See PROBE SYNTAX below. diff --git a/tools/trace.py b/tools/trace.py index 0f6d90e8..b51cccff 100755 --- a/tools/trace.py +++ b/tools/trace.py @@ -5,6 +5,7 @@ # # usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S] [-c cgroup_path] # [-M MAX_EVENTS] [-s SYMBOLFILES] [-T] [-t] [-K] [-U] [-a] [-I header] +# [-A] # probe [probe ...] # # Licensed under the Apache License, Version 2.0 (the "License") @@ -40,6 +41,8 @@ class Probe(object): uid = -1 page_cnt = None build_id_enabled = False + aggregate = False + symcount = {} @classmethod def configure(cls, args): @@ -58,6 +61,10 @@ class Probe(object): cls.page_cnt = args.buffer_pages cls.bin_cmp = args.bin_cmp cls.build_id_enabled = args.sym_file_list is not None + cls.aggregate = args.aggregate + if cls.aggregate and cls.max_events is None: + raise ValueError("-M/--max-events should be specified" + " with -A/--aggregate") def __init__(self, probe, string_size, kernel_stack, user_stack, cgroup_map_name, name, msg_filter): @@ -584,18 +591,20 @@ BPF_PERF_OUTPUT(%s); else: # self.probe_type == 't' return self.tp_event - def print_stack(self, bpf, stack_id, tgid): + def _stack_to_string(self, bpf, stack_id, tgid): if stack_id < 0: - print(" %d" % stack_id) - return + return (" %d" % stack_id) + stackstr = '' stack = list(bpf.get_table(self.stacks_name).walk(stack_id)) for addr in stack: - print(" ", end="") + stackstr += ' ' if Probe.print_address: - print("%16x " % addr, end="") - print("%s" % (bpf.sym(addr, tgid, - show_module=True, show_offset=True))) + stackstr += ("%16x " % addr) + symstr = bpf.sym(addr, tgid, show_module=True, show_offset=True) + stackstr += ('%s\n' % (symstr.decode('utf-8'))) + + return stackstr def _format_message(self, bpf, tgid, values): # Replace each %K with kernel sym and %U with user sym in tgid @@ -610,6 +619,11 @@ BPF_PERF_OUTPUT(%s); show_module=True, show_offset=True) return self.python_format % tuple(values) + def print_aggregate_events(self): + for k, v in sorted(self.symcount.items(), key=lambda item: \ + item[1], reverse=True): + print("%s-->COUNT %d\n\n" % (k, v), end="") + def print_event(self, bpf, cpu, data, size): # Cast as the generated structure type and display # according to the format string in the probe. @@ -621,32 +635,43 @@ BPF_PERF_OUTPUT(%s); msg = self._format_message(bpf, event.tgid, values) if self.msg_filter and self.msg_filter not in msg: return + eventstr = '' if Probe.print_time: time = strftime("%H:%M:%S") if Probe.use_localtime else \ Probe._time_off_str(event.timestamp_ns) if Probe.print_unix_timestamp: - print("%-17s " % time[:17], end="") + eventstr += ("%-17s " % time[:17]) else: - print("%-8s " % time[:8], end="") + eventstr += ("%-8s " % time[:8]) if Probe.print_cpu: - print("%-3s " % event.cpu, end="") - print("%-7d %-7d %-15s %-16s %s" % + eventstr += ("%-3s " % event.cpu) + eventstr += ("%-7d %-7d %-15s %-16s %s\n" % (event.tgid, event.pid, event.comm.decode('utf-8', 'replace'), self._display_function(), msg)) if self.kernel_stack: - self.print_stack(bpf, event.kernel_stack_id, -1) + eventstr += self._stack_to_string(bpf, event.kernel_stack_id, -1) if self.user_stack: - self.print_stack(bpf, event.user_stack_id, event.tgid) - if self.user_stack or self.kernel_stack: + eventstr += self._stack_to_string(bpf, event.user_stack_id, event.tgid) + + if self.aggregate is False: + print(eventstr, end="") + if self.kernel_stack or self.user_stack: print("") + else: + if eventstr in self.symcount: + self.symcount[eventstr] += 1 + else: + self.symcount[eventstr] = 1 Probe.event_count += 1 if Probe.max_events is not None and \ Probe.event_count >= Probe.max_events: - exit() - sys.stdout.flush() + if self.aggregate: + self.print_aggregate_events() + sys.stdout.flush() + exit() def attach(self, bpf, verbose): if len(self.library) == 0: @@ -700,7 +725,7 @@ trace do_sys_open trace kfree_skb+0x12 Trace the kfree_skb kernel function after the instruction on the 0x12 offset trace 'do_sys_open "%s", arg2@user' - Trace the open syscall and print the filename. being opened @user is + Trace the open syscall and print the filename being opened @user is added to arg2 in kprobes to ensure that char * should be copied from the userspace stack to the bpf stack. If not specified, previous behaviour is expected. @@ -752,6 +777,9 @@ trace -I 'net/sock.h' \\ to 53 (DNS; 13568 in big endian order) trace -I 'linux/fs_struct.h' 'mntns_install "users = %d", $task->fs->users' Trace the number of users accessing the file system of the current task +trace -s /lib/x86_64-linux-gnu/libc.so.6,/bin/ping 'p:c:inet_pton' -U + Trace inet_pton system call and use the specified libraries/executables for + symbol resolution. """ def __init__(self): @@ -815,6 +843,8 @@ trace -I 'linux/fs_struct.h' 'mntns_install "users = %d", $task->fs->users' "as either full path, " "or relative to current working directory, " "or relative to default kernel header search path") + parser.add_argument("-A", "--aggregate", action="store_true", + help="aggregate amount of each trace") parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) self.args = parser.parse_args() diff --git a/tools/trace_example.txt b/tools/trace_example.txt index 36010d61..ccefdaa7 100644 --- a/tools/trace_example.txt +++ b/tools/trace_example.txt @@ -237,6 +237,32 @@ Remember to use the -I argument include the appropriate header file. We didn't need to do that here because `struct timespec` is used internally by the tool, so it always includes this header file. +To aggregate amount of trace, you need specify -A with -M EVENTS. A typical +example: +1, if we find that the sys CPU utilization is higher by 'top' command +2, then find that the timer interrupt is more normal by 'irqtop' command +3, to confirm kernel timer setting frequence by 'funccount -i 1 clockevents_program_event' +4, to trace timer setting by 'trace clockevents_program_event -K -A -M 1000' + +1294576 1294584 CPU 0/KVM clockevents_program_event + clockevents_program_event+0x1 [kernel] + hrtimer_start_range_ns+0x209 [kernel] + start_sw_timer+0x173 [kvm] + restart_apic_timer+0x6c [kvm] + kvm_set_msr_common+0x442 [kvm] + __kvm_set_msr+0xa2 [kvm] + kvm_emulate_wrmsr+0x36 [kvm] + vcpu_enter_guest+0x326 [kvm] + kvm_arch_vcpu_ioctl_run+0xcc [kvm] + kvm_vcpu_ioctl+0x22f [kvm] + do_vfs_ioctl+0xa1 [kernel] + ksys_ioctl+0x60 [kernel] + __x64_sys_ioctl+0x16 [kernel] + do_syscall_64+0x59 [kernel] + entry_SYSCALL_64_after_hwframe+0x44 [kernel] +-->COUNT 271 +... +So we can know that 271 timer setting in recent 1000(~27%). As a final example, let's trace open syscalls for a specific process. By default, tracing is system-wide, but the -p switch overrides this: @@ -384,6 +410,7 @@ optional arguments: as either full path, or relative to current working directory, or relative to default kernel header search path + -A, --aggregate aggregate amount of each trace EXAMPLES: @@ -392,10 +419,11 @@ trace do_sys_open trace kfree_skb+0x12 Trace the kfree_skb kernel function after the instruction on the 0x12 offset trace 'do_sys_open "%s", arg2@user' - Trace the open syscall and print the filename being opened. @user is + Trace the open syscall and print the filename being opened @user is added to arg2 in kprobes to ensure that char * should be copied from the userspace stack to the bpf stack. If not specified, previous behaviour is expected. + trace 'do_sys_open "%s", arg2@user' -n main Trace the open syscall and only print event that process names containing "main" trace 'do_sys_open "%s", arg2@user' --uid 1001 @@ -420,6 +448,8 @@ trace 't:block:block_rq_complete "sectors=%d", args->nr_sector' Trace the block_rq_complete kernel tracepoint and print # of tx sectors trace 'u:pthread:pthread_create (arg4 != 0)' Trace the USDT probe pthread_create when its 4th argument is non-zero +trace 'u:pthread:libpthread:pthread_create (arg4 != 0)' + Ditto, but the provider name "libpthread" is specified. trace 'p::SyS_nanosleep(struct timespec *ts) "sleep for %lld ns", ts->tv_nsec' Trace the nanosleep syscall and print the sleep duration in ns trace -c /sys/fs/cgroup/system.slice/workload.service '__x64_sys_nanosleep' '__x64_sys_clone' @@ -435,7 +465,7 @@ trace -I 'kernel/sched/sched.h' \ in kernel/sched/sched.h which is in kernel source tree and not in kernel-devel package. So this command needs to run at the kernel source tree root directory so that the added header file can be found by the compiler. -trace -I 'net/sock.h' \\ +trace -I 'net/sock.h' \ 'udpv6_sendmsg(struct sock *sk) (sk->sk_dport == 13568)' Trace udpv6 sendmsg calls only if socket's destination port is equal to 53 (DNS; 13568 in big endian order) @@ -444,4 +474,3 @@ trace -I 'linux/fs_struct.h' 'mntns_install "users = %d", $task->fs->users' trace -s /lib/x86_64-linux-gnu/libc.so.6,/bin/ping 'p:c:inet_pton' -U Trace inet_pton system call and use the specified libraries/executables for symbol resolution. -" -- 2.34.1