perf record: Implement basic filtering for off-cpu
author Namhyung Kim <namhyung@kernel.org>
Wed, 18 May 2022 22:47:22 +0000 (15:47 -0700)
committer Arnaldo Carvalho de Melo <acme@redhat.com>
Thu, 26 May 2022 15:36:57 +0000 (12:36 -0300)
Off-cpu profiling should honor the CPU filtering requested with the -a or -C
options and the task filtering requested with the -p or -t options.

Committer testing:

  # perf record --off-cpu --cpu 1 perf bench sched messaging -l 1000
  # Running 'sched/messaging' benchmark:
  # 20 sender and receiver processes per group
  # 10 groups == 400 processes run

       Total time: 1.722 [sec]
  [ perf record: Woken up 2 times to write data ]
  [ perf record: Captured and wrote 1.446 MB perf.data (7248 samples) ]
  #
  # perf script | head -20
              perf 97164 [001] 38287.696761:          1      cycles:  ffffffffb6070174 native_write_msr+0x4 (vmlinux)
              perf 97164 [001] 38287.696764:          1      cycles:  ffffffffb6070174 native_write_msr+0x4 (vmlinux)
              perf 97164 [001] 38287.696765:          9      cycles:  ffffffffb6070174 native_write_msr+0x4 (vmlinux)
              perf 97164 [001] 38287.696767:        212      cycles:  ffffffffb6070176 native_write_msr+0x6 (vmlinux)
              perf 97164 [001] 38287.696768:       5130      cycles:  ffffffffb6070176 native_write_msr+0x6 (vmlinux)
              perf 97164 [001] 38287.696770:     123063      cycles:  ffffffffb6e0011e syscall_return_via_sysret+0x38 (vmlinux)
              perf 97164 [001] 38287.696803:    2292748      cycles:  ffffffffb636c82d __fput+0xad (vmlinux)
           swapper     0 [001] 38287.702852:    1927474      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
            :97513 97513 [001] 38287.767207:    1172536      cycles:  ffffffffb612ff65 newidle_balance+0x5 (vmlinux)
           swapper     0 [001] 38287.769567:    1073081      cycles:  ffffffffb618216d ktime_get_mono_fast_ns+0xd (vmlinux)
            :97533 97533 [001] 38287.770962:     984460      cycles:  ffffffffb65b2900 selinux_socket_sendmsg+0x0 (vmlinux)
            :97540 97540 [001] 38287.772242:     883462      cycles:  ffffffffb6d0bf59 irqentry_exit_to_user_mode+0x9 (vmlinux)
           swapper     0 [001] 38287.773633:     741963      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
            :97552 97552 [001] 38287.774539:     606680      cycles:  ffffffffb62eda0a page_add_file_rmap+0x7a (vmlinux)
            :97556 97556 [001] 38287.775333:     502254      cycles:  ffffffffb634f964 get_obj_cgroup_from_current+0xc4 (vmlinux)
            :97561 97561 [001] 38287.776163:     427891      cycles:  ffffffffb61b1522 cgroup_rstat_updated+0x22 (vmlinux)
           swapper     0 [001] 38287.776854:     359030      cycles:  ffffffffb612fc5e load_balance+0x9ce (vmlinux)
            :97567 97567 [001] 38287.777312:     330371      cycles:  ffffffffb6a8d8d0 skb_set_owner_w+0x0 (vmlinux)
            :97566 97566 [001] 38287.777589:     311622      cycles:  ffffffffb614a7a8 native_queued_spin_lock_slowpath+0x148 (vmlinux)
            :97512 97512 [001] 38287.777671:     307851      cycles:  ffffffffb62e0f35 find_vma+0x55 (vmlinux)
  #
  # perf record --off-cpu --cpu 4 perf bench sched messaging -l 1000
  # Running 'sched/messaging' benchmark:
  # 20 sender and receiver processes per group
  # 10 groups == 400 processes run

       Total time: 1.613 [sec]
  [ perf record: Woken up 2 times to write data ]
  [ perf record: Captured and wrote 1.415 MB perf.data (6729 samples) ]
  # perf script | head -20
              perf 97650 [004] 38323.728036:          1      cycles:  ffffffffb6070174 native_write_msr+0x4 (vmlinux)
              perf 97650 [004] 38323.728040:          1      cycles:  ffffffffb6070174 native_write_msr+0x4 (vmlinux)
              perf 97650 [004] 38323.728041:          9      cycles:  ffffffffb6070174 native_write_msr+0x4 (vmlinux)
              perf 97650 [004] 38323.728042:        208      cycles:  ffffffffb6070176 native_write_msr+0x6 (vmlinux)
              perf 97650 [004] 38323.728044:       5026      cycles:  ffffffffb6070176 native_write_msr+0x6 (vmlinux)
              perf 97650 [004] 38323.728046:     119970      cycles:  ffffffffb6d0bebc syscall_exit_to_user_mode+0x1c (vmlinux)
              perf 97650 [004] 38323.728078:    2190103      cycles:            54b756 perf_tool__process_synth_event+0x16 (/home/acme/bin/perf)
           swapper     0 [004] 38323.783357:    1593139      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
           swapper     0 [004] 38323.785352:    1593139      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
           swapper     0 [004] 38323.797330:    1418936      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
           swapper     0 [004] 38323.802350:    1418936      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
           swapper     0 [004] 38323.806333:    1418936      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
            :97996 97996 [004] 38323.807145:    1418936      cycles:      7f5db9be6917 [unknown] ([unknown])
            :97959 97959 [004] 38323.807730:    1445074      cycles:  ffffffffb6329d36 memcg_slab_post_alloc_hook+0x146 (vmlinux)
            :97959 97959 [004] 38323.808103:    1341584      cycles:  ffffffffb62fd90f get_page_from_freelist+0x112f (vmlinux)
            :97959 97959 [004] 38323.808451:    1227537      cycles:  ffffffffb65b2905 selinux_socket_sendmsg+0x5 (vmlinux)
            :97959 97959 [004] 38323.808768:    1184321      cycles:  ffffffffb6d1ba35 _raw_spin_lock_irqsave+0x15 (vmlinux)
            :97959 97959 [004] 38323.809073:    1153017      cycles:  ffffffffb6a8d92d skb_set_owner_w+0x5d (vmlinux)
            :97959 97959 [004] 38323.809402:    1126875      cycles:  ffffffffb6329c64 memcg_slab_post_alloc_hook+0x74 (vmlinux)
            :97959 97959 [004] 38323.809695:    1073248      cycles:  ffffffffb6e0001d entry_SYSCALL_64+0x1d (vmlinux)
  #

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Hao Luo <haoluo@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: bpf@vger.kernel.org
Link: https://lore.kernel.org/r/20220518224725.742882-4-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/builtin-record.c
tools/perf/util/bpf_off_cpu.c
tools/perf/util/bpf_skel/off_cpu.bpf.c
tools/perf/util/off_cpu.h

index b76f57e..9601438 100644 (file)
@@ -892,7 +892,7 @@ static int record__config_text_poke(struct evlist *evlist)
 
 static int record__config_off_cpu(struct record *rec)
 {
-       return off_cpu_prepare(rec->evlist);
+       return off_cpu_prepare(rec->evlist, &rec->opts.target);
 }
 
 static bool record__kcore_readable(struct machine *machine)
index 9ed7aca..b5e2d03 100644 (file)
@@ -6,6 +6,9 @@
 #include "util/off_cpu.h"
 #include "util/perf-hooks.h"
 #include "util/session.h"
+#include "util/target.h"
+#include "util/cpumap.h"
+#include "util/thread_map.h"
 #include <bpf/bpf.h>
 
 #include "bpf_skel/off_cpu.skel.h"
@@ -60,8 +63,23 @@ static int off_cpu_config(struct evlist *evlist)
        return 0;
 }
 
-static void off_cpu_start(void *arg __maybe_unused)
+static void off_cpu_start(void *arg)
 {
+       struct evlist *evlist = arg;
+
+       /* update task filter for the given workload */
+       if (!skel->bss->has_cpu && !skel->bss->has_task &&
+           perf_thread_map__pid(evlist->core.threads, 0) != -1) {
+               int fd;
+               u32 pid;
+               u8 val = 1;
+
+               skel->bss->has_task = 1;
+               fd = bpf_map__fd(skel->maps.task_filter);
+               pid = perf_thread_map__pid(evlist->core.threads, 0);
+               bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
+       }
+
        skel->bss->enabled = 1;
 }
 
@@ -71,31 +89,75 @@ static void off_cpu_finish(void *arg __maybe_unused)
        off_cpu_bpf__destroy(skel);
 }
 
-int off_cpu_prepare(struct evlist *evlist)
+int off_cpu_prepare(struct evlist *evlist, struct target *target)
 {
-       int err;
+       int err, fd, i;
+       int ncpus = 1, ntasks = 1;
 
        if (off_cpu_config(evlist) < 0) {
                pr_err("Failed to config off-cpu BPF event\n");
                return -1;
        }
 
-       set_max_rlimit();
-
-       skel = off_cpu_bpf__open_and_load();
+       skel = off_cpu_bpf__open();
        if (!skel) {
                pr_err("Failed to open off-cpu BPF skeleton\n");
                return -1;
        }
 
+       /* don't need to set cpu filter for system-wide mode */
+       if (target->cpu_list) {
+               ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
+               bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
+       }
+
+       if (target__has_task(target)) {
+               ntasks = perf_thread_map__nr(evlist->core.threads);
+               bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
+       }
+
+       set_max_rlimit();
+
+       err = off_cpu_bpf__load(skel);
+       if (err) {
+               pr_err("Failed to load off-cpu skeleton\n");
+               goto out;
+       }
+
+       if (target->cpu_list) {
+               u32 cpu;
+               u8 val = 1;
+
+               skel->bss->has_cpu = 1;
+               fd = bpf_map__fd(skel->maps.cpu_filter);
+
+               for (i = 0; i < ncpus; i++) {
+                       cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
+                       bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
+               }
+       }
+
+       if (target__has_task(target)) {
+               u32 pid;
+               u8 val = 1;
+
+               skel->bss->has_task = 1;
+               fd = bpf_map__fd(skel->maps.task_filter);
+
+               for (i = 0; i < ntasks; i++) {
+                       pid = perf_thread_map__pid(evlist->core.threads, i);
+                       bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
+               }
+       }
+
        err = off_cpu_bpf__attach(skel);
        if (err) {
                pr_err("Failed to attach off-cpu BPF skeleton\n");
                goto out;
        }
 
-       if (perf_hooks__set_hook("record_start", off_cpu_start, NULL) ||
-           perf_hooks__set_hook("record_end", off_cpu_finish, NULL)) {
+       if (perf_hooks__set_hook("record_start", off_cpu_start, evlist) ||
+           perf_hooks__set_hook("record_end", off_cpu_finish, evlist)) {
                pr_err("Failed to attach off-cpu skeleton\n");
                goto out;
        }
index 5173ed8..78cdcc8 100644 (file)
@@ -49,12 +49,28 @@ struct {
        __uint(max_entries, MAX_ENTRIES);
 } off_cpu SEC(".maps");
 
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __uint(key_size, sizeof(__u32));
+       __uint(value_size, sizeof(__u8));
+       __uint(max_entries, 1);
+} cpu_filter SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __uint(key_size, sizeof(__u32));
+       __uint(value_size, sizeof(__u8));
+       __uint(max_entries, 1);
+} task_filter SEC(".maps");
+
 /* old kernel task_struct definition */
 struct task_struct___old {
        long state;
 } __attribute__((preserve_access_index));
 
 int enabled = 0;
+int has_cpu = 0;
+int has_task = 0;
 
 /*
  * Old kernel used to call it task_struct->state and now it's '__state'.
@@ -74,6 +90,37 @@ static inline int get_task_state(struct task_struct *t)
        return BPF_CORE_READ(t_old, state);
 }
 
+static inline int can_record(struct task_struct *t, int state)
+{
+       /* kernel threads don't have user stack */
+       if (t->flags & PF_KTHREAD)
+               return 0;
+
+       if (state != TASK_INTERRUPTIBLE &&
+           state != TASK_UNINTERRUPTIBLE)
+               return 0;
+
+       if (has_cpu) {
+               __u32 cpu = bpf_get_smp_processor_id();
+               __u8 *ok;
+
+               ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
+               if (!ok)
+                       return 0;
+       }
+
+       if (has_task) {
+               __u8 *ok;
+               __u32 pid = t->pid;
+
+               ok = bpf_map_lookup_elem(&task_filter, &pid);
+               if (!ok)
+                       return 0;
+       }
+
+       return 1;
+}
+
 SEC("tp_btf/sched_switch")
 int on_switch(u64 *ctx)
 {
@@ -92,10 +139,7 @@ int on_switch(u64 *ctx)
 
        ts = bpf_ktime_get_ns();
 
-       if (prev->flags & PF_KTHREAD)
-               goto next;
-       if (state != TASK_INTERRUPTIBLE &&
-           state != TASK_UNINTERRUPTIBLE)
+       if (!can_record(prev, state))
                goto next;
 
        stack_id = bpf_get_stackid(ctx, &stacks,
index 375d03c..f47af02 100644 (file)
@@ -2,15 +2,17 @@
 #define PERF_UTIL_OFF_CPU_H
 
 struct evlist;
+struct target;
 struct perf_session;
 
 #define OFFCPU_EVENT  "offcpu-time"
 
 #ifdef HAVE_BPF_SKEL
-int off_cpu_prepare(struct evlist *evlist);
+int off_cpu_prepare(struct evlist *evlist, struct target *target);
 int off_cpu_write(struct perf_session *session);
 #else
-static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused)
+static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused,
+                                 struct target *target __maybe_unused)
 {
        return -1;
 }