From 06d90d3d4b35815027b7b7a7fc48167d497d2de3 Mon Sep 17 00:00:00 2001 From: Sasha Goldshtein Date: Thu, 30 Jun 2016 07:39:27 -0700 Subject: [PATCH] cpudist: Use `finish_task_switch` kprobe instead of `sched_switch` tracepoint The `sched_switch` tracepoint approach requires storing the previous task's tgid in a map and fetching it from there, because it is not available as a tracepoint argument. Instead, placing a kprobe on the `finish_task_switch` function allows cleanly fetching the previous task's pid and tgid from the task_struct. --- man/man8/cpudist.8 | 5 ----- tools/cpudist.py | 42 ++++++++---------------------------------- 2 files changed, 8 insertions(+), 39 deletions(-) diff --git a/man/man8/cpudist.8 b/man/man8/cpudist.8 index 8d507aa..6ee1f3b 100644 --- a/man/man8/cpudist.8 +++ b/man/man8/cpudist.8 @@ -19,11 +19,6 @@ This tool uses in-kernel eBPF maps for storing timestamps and the histogram, for efficiency. Despite this, the overhead of this tool may become significant for some workloads: see the OVERHEAD section. -This tool uses the sched:sched_switch kernel tracepoint to determine when a -task is scheduled and descheduled. If the tracepoint arguments change in the -future, this tool will have to be updated. Still, it is more reliable than -using kprobes on the respective kernel functions directly. - Since this uses BPF, only the root user can use this tool. .SH REQUIREMENTS CONFIG_BPF and bcc. 
diff --git a/tools/cpudist.py b/tools/cpudist.py index 1807258..6a0e04b 100755 --- a/tools/cpudist.py +++ b/tools/cpudist.py @@ -48,12 +48,9 @@ args = parser.parse_args() countdown = int(args.count) debug = 0 -tp = Tracepoint.enable_tracepoint("sched", "sched_switch") -bpf_text = "#include <uapi/linux/ptrace.h>\n" -bpf_text += "#include <linux/sched.h>\n" -bpf_text += tp.generate_decl() -bpf_text += tp.generate_entry_probe() -bpf_text += tp.generate_struct() +bpf_text = """#include <uapi/linux/ptrace.h> +#include <linux/sched.h> +""" if not args.offcpu: bpf_text += "#define ONCPU\n" @@ -66,17 +63,8 @@ typedef struct pid_key { BPF_HASH(start, u32, u64); -BPF_HASH(tgid_for_pid, u32, u32); STORAGE -static inline u32 get_tgid(u32 pid) -{ - u32 *stored_tgid = tgid_for_pid.lookup(&pid); - if (stored_tgid != 0) - return *stored_tgid; - return 0xffffffff; -} - static inline void store_start(u32 tgid, u32 pid, u64 ts) { if (FILTER) @@ -99,32 +87,19 @@ static inline void update_hist(u32 tgid, u32 pid, u64 ts) STORE } -int sched_switch(struct pt_regs *ctx) +int sched_switch(struct pt_regs *ctx, struct task_struct *prev) { u64 ts = bpf_ktime_get_ns(); u64 pid_tgid = bpf_get_current_pid_tgid(); u32 tgid = pid_tgid >> 32, pid = pid_tgid; - // Keep a mapping of tgid for pid because when sched_switch hits, - // we only have the tgid information for the *current* pid, but not - // for the previous one. 
- tgid_for_pid.update(&pid, &tgid); - - u64 *di = __trace_di.lookup(&pid_tgid); - if (di == 0) - return 0; - - struct sched_switch_trace_entry args = {}; - bpf_probe_read(&args, sizeof(args), (void *)*di); #ifdef ONCPU - if (args.prev_state == TASK_RUNNING) { + if (prev->state == TASK_RUNNING) { #else if (1) { #endif - u32 prev_pid = args.prev_pid; - u32 prev_tgid = get_tgid(prev_pid); - if (prev_tgid == 0xffffffff) - goto BAIL; + u32 prev_pid = prev->pid; + u32 prev_tgid = prev->tgid; #ifdef ONCPU update_hist(prev_tgid, prev_pid, ts); #else @@ -173,8 +148,7 @@ if debug: print(bpf_text) b = BPF(text=bpf_text) -Tracepoint.attach(b) -b.attach_kprobe(event="perf_trace_sched_switch", fn_name="sched_switch") +b.attach_kprobe(event="finish_task_switch", fn_name="sched_switch") print("Tracing %s-CPU time... Hit Ctrl-C to end." % ("off" if args.offcpu else "on")) -- 2.7.4