Off-cpu profiling should honor the CPU filtering options (-a, -C) and the task filtering options (-p, -t).
Committer testing:
# perf record --off-cpu --cpu 1 perf bench sched messaging -l 1000
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run
Total time: 1.722 [sec]
[ perf record: Woken up 2 times to write data ]
[ perf record: Captured and wrote 1.446 MB perf.data (7248 samples) ]
#
# perf script | head -20
perf 97164 [001] 38287.696761: 1 cycles:
ffffffffb6070174 native_write_msr+0x4 (vmlinux)
perf 97164 [001] 38287.696764: 1 cycles:
ffffffffb6070174 native_write_msr+0x4 (vmlinux)
perf 97164 [001] 38287.696765: 9 cycles:
ffffffffb6070174 native_write_msr+0x4 (vmlinux)
perf 97164 [001] 38287.696767: 212 cycles:
ffffffffb6070176 native_write_msr+0x6 (vmlinux)
perf 97164 [001] 38287.696768: 5130 cycles:
ffffffffb6070176 native_write_msr+0x6 (vmlinux)
perf 97164 [001] 38287.696770: 123063 cycles:
ffffffffb6e0011e syscall_return_via_sysret+0x38 (vmlinux)
perf 97164 [001] 38287.696803: 2292748 cycles:
ffffffffb636c82d __fput+0xad (vmlinux)
swapper 0 [001] 38287.702852: 1927474 cycles:
ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
:97513 97513 [001] 38287.767207: 1172536 cycles:
ffffffffb612ff65 newidle_balance+0x5 (vmlinux)
swapper 0 [001] 38287.769567: 1073081 cycles:
ffffffffb618216d ktime_get_mono_fast_ns+0xd (vmlinux)
:97533 97533 [001] 38287.770962: 984460 cycles:
ffffffffb65b2900 selinux_socket_sendmsg+0x0 (vmlinux)
:97540 97540 [001] 38287.772242: 883462 cycles:
ffffffffb6d0bf59 irqentry_exit_to_user_mode+0x9 (vmlinux)
swapper 0 [001] 38287.773633: 741963 cycles:
ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
:97552 97552 [001] 38287.774539: 606680 cycles:
ffffffffb62eda0a page_add_file_rmap+0x7a (vmlinux)
:97556 97556 [001] 38287.775333: 502254 cycles:
ffffffffb634f964 get_obj_cgroup_from_current+0xc4 (vmlinux)
:97561 97561 [001] 38287.776163: 427891 cycles:
ffffffffb61b1522 cgroup_rstat_updated+0x22 (vmlinux)
swapper 0 [001] 38287.776854: 359030 cycles:
ffffffffb612fc5e load_balance+0x9ce (vmlinux)
:97567 97567 [001] 38287.777312: 330371 cycles:
ffffffffb6a8d8d0 skb_set_owner_w+0x0 (vmlinux)
:97566 97566 [001] 38287.777589: 311622 cycles:
ffffffffb614a7a8 native_queued_spin_lock_slowpath+0x148 (vmlinux)
:97512 97512 [001] 38287.777671: 307851 cycles:
ffffffffb62e0f35 find_vma+0x55 (vmlinux)
#
# perf record --off-cpu --cpu 4 perf bench sched messaging -l 1000
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run
Total time: 1.613 [sec]
[ perf record: Woken up 2 times to write data ]
[ perf record: Captured and wrote 1.415 MB perf.data (6729 samples) ]
# perf script | head -20
perf 97650 [004] 38323.728036: 1 cycles:
ffffffffb6070174 native_write_msr+0x4 (vmlinux)
perf 97650 [004] 38323.728040: 1 cycles:
ffffffffb6070174 native_write_msr+0x4 (vmlinux)
perf 97650 [004] 38323.728041: 9 cycles:
ffffffffb6070174 native_write_msr+0x4 (vmlinux)
perf 97650 [004] 38323.728042: 208 cycles:
ffffffffb6070176 native_write_msr+0x6 (vmlinux)
perf 97650 [004] 38323.728044: 5026 cycles:
ffffffffb6070176 native_write_msr+0x6 (vmlinux)
perf 97650 [004] 38323.728046: 119970 cycles:
ffffffffb6d0bebc syscall_exit_to_user_mode+0x1c (vmlinux)
perf 97650 [004] 38323.728078: 2190103 cycles:
54b756 perf_tool__process_synth_event+0x16 (/home/acme/bin/perf)
swapper 0 [004] 38323.783357: 1593139 cycles:
ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
swapper 0 [004] 38323.785352: 1593139 cycles:
ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
swapper 0 [004] 38323.797330: 1418936 cycles:
ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
swapper 0 [004] 38323.802350: 1418936 cycles:
ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
swapper 0 [004] 38323.806333: 1418936 cycles:
ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
:97996 97996 [004] 38323.807145: 1418936 cycles:
7f5db9be6917 [unknown] ([unknown])
:97959 97959 [004] 38323.807730: 1445074 cycles:
ffffffffb6329d36 memcg_slab_post_alloc_hook+0x146 (vmlinux)
:97959 97959 [004] 38323.808103: 1341584 cycles:
ffffffffb62fd90f get_page_from_freelist+0x112f (vmlinux)
:97959 97959 [004] 38323.808451: 1227537 cycles:
ffffffffb65b2905 selinux_socket_sendmsg+0x5 (vmlinux)
:97959 97959 [004] 38323.808768: 1184321 cycles:
ffffffffb6d1ba35 _raw_spin_lock_irqsave+0x15 (vmlinux)
:97959 97959 [004] 38323.809073: 1153017 cycles:
ffffffffb6a8d92d skb_set_owner_w+0x5d (vmlinux)
:97959 97959 [004] 38323.809402: 1126875 cycles:
ffffffffb6329c64 memcg_slab_post_alloc_hook+0x74 (vmlinux)
:97959 97959 [004] 38323.809695: 1073248 cycles:
ffffffffb6e0001d entry_SYSCALL_64+0x1d (vmlinux)
#
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Hao Luo <haoluo@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: bpf@vger.kernel.org
Link: https://lore.kernel.org/r/20220518224725.742882-4-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
static int record__config_off_cpu(struct record *rec)
{
- return off_cpu_prepare(rec->evlist);
+ return off_cpu_prepare(rec->evlist, &rec->opts.target); /* target lets off-cpu set up its cpu/task filters */
}
static bool record__kcore_readable(struct machine *machine)
#include "util/off_cpu.h"
#include "util/perf-hooks.h"
#include "util/session.h"
+#include "util/target.h"
+#include "util/cpumap.h"
+#include "util/thread_map.h"
#include <bpf/bpf.h>
#include "bpf_skel/off_cpu.skel.h"
return 0;
}
-static void off_cpu_start(void *arg __maybe_unused)
+static void off_cpu_start(void *arg)
{
+ struct evlist *evlist = arg;
+
+ /*
+ * "record_start" hook; arg is the evlist that off_cpu_prepare() passed
+ * to perf_hooks__set_hook().
+ *
+ * Update task filter for the given workload: has_cpu and has_task are
+ * both still 0, yet the first thread-map entry holds a real pid (not
+ * -1), so turn the task filter on and insert that single pid.
+ * task_filter keeps its default max_entries of 1 in this case, which
+ * is exactly what is needed here.
+ */
+ if (!skel->bss->has_cpu && !skel->bss->has_task &&
+ perf_thread_map__pid(evlist->core.threads, 0) != -1) {
+ int fd;
+ u32 pid;
+ u8 val = 1;
+
+ skel->bss->has_task = 1;
+ fd = bpf_map__fd(skel->maps.task_filter);
+ pid = perf_thread_map__pid(evlist->core.threads, 0);
+ bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
+ }
+ /* set the BPF program's 'enabled' global only after the filter update above */
skel->bss->enabled = 1;
}
off_cpu_bpf__destroy(skel);
}
-int off_cpu_prepare(struct evlist *evlist)
+int off_cpu_prepare(struct evlist *evlist, struct target *target)
{
- int err;
+ int err, fd, i;
+ int ncpus = 1, ntasks = 1;
if (off_cpu_config(evlist) < 0) {
pr_err("Failed to config off-cpu BPF event\n");
return -1;
}
- set_max_rlimit();
-
- skel = off_cpu_bpf__open_and_load();
+ skel = off_cpu_bpf__open();
if (!skel) {
pr_err("Failed to open off-cpu BPF skeleton\n");
return -1;
}
+ /* don't need to set cpu filter for system-wide mode */
+ if (target->cpu_list) {
+ ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
+ bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
+ }
+
+ if (target__has_task(target)) {
+ ntasks = perf_thread_map__nr(evlist->core.threads);
+ bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
+ }
+
+ set_max_rlimit();
+
+ err = off_cpu_bpf__load(skel);
+ if (err) {
+ pr_err("Failed to load off-cpu skeleton\n");
+ goto out;
+ }
+
+ if (target->cpu_list) {
+ u32 cpu;
+ u8 val = 1;
+
+ skel->bss->has_cpu = 1;
+ fd = bpf_map__fd(skel->maps.cpu_filter);
+
+ for (i = 0; i < ncpus; i++) {
+ cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
+ bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
+ }
+ }
+
+ if (target__has_task(target)) {
+ u32 pid;
+ u8 val = 1;
+
+ skel->bss->has_task = 1;
+ fd = bpf_map__fd(skel->maps.task_filter);
+
+ for (i = 0; i < ntasks; i++) {
+ pid = perf_thread_map__pid(evlist->core.threads, i);
+ bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
+ }
+ }
+
err = off_cpu_bpf__attach(skel);
if (err) {
pr_err("Failed to attach off-cpu BPF skeleton\n");
goto out;
}
- if (perf_hooks__set_hook("record_start", off_cpu_start, NULL) ||
- perf_hooks__set_hook("record_end", off_cpu_finish, NULL)) {
+ if (perf_hooks__set_hook("record_start", off_cpu_start, evlist) ||
+ perf_hooks__set_hook("record_end", off_cpu_finish, evlist)) {
pr_err("Failed to attach off-cpu skeleton\n");
goto out;
}
__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");
+struct { /* CPUs to record on: can_record() only checks that the current cpu number is present as a key */
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1); /* placeholder; off_cpu_prepare() resizes it before load when target->cpu_list is set */
+} cpu_filter SEC(".maps");
+ /* tasks to record: can_record() only checks that t->pid is present as a key */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1); /* placeholder; off_cpu_prepare() resizes it before load when target__has_task() is true */
+} task_filter SEC(".maps");
+
/* old kernel task_struct definition */
struct task_struct___old {
long state;
} __attribute__((preserve_access_index));
int enabled = 0;
+int has_cpu = 0;
+int has_task = 0;
/*
* Old kernel used to call it task_struct->state and now it's '__state'.
return BPF_CORE_READ(t_old, state);
}
+static inline int can_record(struct task_struct *t, int state)
+{
+ /*
+ * Returns 1 when task @t, observed with task state @state, passes
+ * every check below and should be recorded; returns 0 otherwise.
+ * on_switch() calls this for the 'prev' task.
+ *
+ * Kernel threads don't have a user stack, so reject them first.
+ */
+ if (t->flags & PF_KTHREAD)
+ return 0;
+ /* record only when the state is TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE */
+ if (state != TASK_INTERRUPTIBLE &&
+ state != TASK_UNINTERRUPTIBLE)
+ return 0;
+ /* cpu filter enabled by user space: the current cpu must be a key in cpu_filter */
+ if (has_cpu) {
+ __u32 cpu = bpf_get_smp_processor_id();
+ __u8 *ok;
+
+ ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
+ if (!ok)
+ return 0;
+ }
+ /* task filter enabled by user space: t->pid must be a key in task_filter */
+ if (has_task) {
+ __u8 *ok;
+ __u32 pid = t->pid;
+
+ ok = bpf_map_lookup_elem(&task_filter, &pid);
+ if (!ok)
+ return 0;
+ }
+ /* passed every active filter */
+ return 1;
+}
+
SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
ts = bpf_ktime_get_ns();
- if (prev->flags & PF_KTHREAD)
- goto next;
- if (state != TASK_INTERRUPTIBLE &&
- state != TASK_UNINTERRUPTIBLE)
+ if (!can_record(prev, state))
goto next;
stack_id = bpf_get_stackid(ctx, &stacks,
#define PERF_UTIL_OFF_CPU_H
struct evlist;
+struct target;
struct perf_session;
#define OFFCPU_EVENT "offcpu-time"
#ifdef HAVE_BPF_SKEL
-int off_cpu_prepare(struct evlist *evlist);
+int off_cpu_prepare(struct evlist *evlist, struct target *target);
int off_cpu_write(struct perf_session *session);
#else
-static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused)
+static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused,
+ struct target *target __maybe_unused)
{
return -1; /* stub used when HAVE_BPF_SKEL is not defined: always reports failure */
}