perf/x86/intel/ds: Fix the conversion from TSC to perf time
author: Kan Liang <kan.liang@linux.intel.com>
Wed, 25 Jan 2023 20:49:25 +0000 (12:49 -0800)
committer: Peter Zijlstra <peterz@infradead.org>
Sat, 11 Feb 2023 10:18:12 +0000 (11:18 +0100)
The time order is incorrect when the TSC in a PEBS record is used.

 $ perf record -e cycles:upp dd if=/dev/zero of=/dev/null count=10000
 $ perf script --show-task-events
       perf-exec     0     0.000000: PERF_RECORD_COMM: perf-exec:915/915
              dd   915   106.479872: PERF_RECORD_COMM exec: dd:915/915
              dd   915   106.483270: PERF_RECORD_EXIT(915:915):(914:914)
              dd   915   106.512429:          1 cycles:upp:
 ffffffff96c011b7 [unknown] ([unknown])
 ... ...

The perf time is from sched_clock_cpu(). The current PEBS code
unconditionally converts the TSC to native_sched_clock() time. There is
a shift between the two clocks. If the TSC is stable, the shift is
constant: __sched_clock_offset. If the TSC is unstable, the shift has
to be calculated at runtime.

This patch doesn't support the conversion when the TSC is unstable. The
TSC unstable case is a corner case and very unlikely to happen. If it
happens, the TSC in a PEBS record is dropped and the sample time falls
back to perf_event_clock().

Fixes: 47a3aeb39e8d ("perf/x86/intel/pebs: Fix PEBS timestamps overwritten")
Reported-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/CAM9d7cgWDVAq8-11RbJ2uGfwkKD6fA-OMwOKDrNUrU_=8MgEjg@mail.gmail.com/
arch/x86/events/intel/ds.c

index 183efa9..b0354dc 100644 (file)
@@ -2,12 +2,14 @@
 #include <linux/bitops.h>
 #include <linux/types.h>
 #include <linux/slab.h>
+#include <linux/sched/clock.h>
 
 #include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
 #include <asm/tlbflush.h>
 #include <asm/insn.h>
 #include <asm/io.h>
+#include <asm/timer.h>
 
 #include "../perf_event.h"
 
@@ -1568,6 +1570,27 @@ static u64 get_data_src(struct perf_event *event, u64 aux)
        return val;
 }
 
+static void setup_pebs_time(struct perf_event *event,
+                           struct perf_sample_data *data,
+                           u64 tsc)
+{
+       /* Converting to a user-defined clock is not supported yet. */
+       if (event->attr.use_clockid != 0)
+               return;
+
+       /*
+        * The conversion is not supported when the TSC is unstable.
+        * That is a corner case and very unlikely to happen. If it
+        * does, return without touching data->time: the PEBS TSC is
+        * dropped and perf falls back to perf_event_clock().
+        */
+       if (!using_native_sched_clock() || !sched_clock_stable())
+               return;
+       /* Perf time is native sched clock plus __sched_clock_offset. */
+       data->time = native_sched_clock_from_tsc(tsc) + __sched_clock_offset;
+       data->sample_flags |= PERF_SAMPLE_TIME;
+}
+
 #define PERF_SAMPLE_ADDR_TYPE  (PERF_SAMPLE_ADDR |             \
                                 PERF_SAMPLE_PHYS_ADDR |        \
                                 PERF_SAMPLE_DATA_PAGE_SIZE)
@@ -1715,11 +1738,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
         *
         * We can only do this for the default trace clock.
         */
-       if (x86_pmu.intel_cap.pebs_format >= 3 &&
-               event->attr.use_clockid == 0) {
-               data->time = native_sched_clock_from_tsc(pebs->tsc);
-               data->sample_flags |= PERF_SAMPLE_TIME;
-       }
+       if (x86_pmu.intel_cap.pebs_format >= 3)
+               setup_pebs_time(event, data, pebs->tsc);
 
        if (has_branch_stack(event))
                perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
@@ -1781,10 +1801,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
        perf_sample_data_init(data, 0, event->hw.last_period);
        data->period = event->hw.last_period;
 
-       if (event->attr.use_clockid == 0) {
-               data->time = native_sched_clock_from_tsc(basic->tsc);
-               data->sample_flags |= PERF_SAMPLE_TIME;
-       }
+       setup_pebs_time(event, data, basic->tsc);
 
        /*
         * We must however always use iregs for the unwinder to stay sane; the