perf report: Support Retire Latency
author: Kan Liang <kan.liang@linux.intel.com>
Wed, 4 Jan 2023 20:13:48 +0000 (12:13 -0800)
committer: Arnaldo Carvalho de Melo <acme@redhat.com>
Fri, 3 Feb 2023 20:24:02 +0000 (17:24 -0300)
The Retire Latency field is added in the var3_w of the
PERF_SAMPLE_WEIGHT_STRUCT. The Retire Latency reports the pipeline stall
of this instruction relative to the previous instruction, in cycles. It
is quite useful to display this information with perf mem report.

The p_stage_cyc for Power also comes from var3_w. Put p_stage_cyc and
retire_lat in a union so that they share the code.

Implement X86-specific code to display the X86-specific header.

Add a new sort key retire_lat for the Retire Latency.

Reviewed-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/20230104201349.1451191-8-kan.liang@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/Documentation/perf-report.txt
tools/perf/arch/x86/util/event.c
tools/perf/util/sample.h
tools/perf/util/sort.c
tools/perf/util/sort.h

index 9b0c0db..c242e8d 100644 (file)
@@ -115,6 +115,8 @@ OPTIONS
        - p_stage_cyc: On powerpc, this presents the number of cycles spent in a
          pipeline stage. And currently supported only on powerpc.
        - addr: (Full) virtual address of the sampled instruction
+       - retire_lat: On X86, this reports the pipeline stall of this instruction
+         compared to the previous instruction, in cycles. Currently supported only on X86.
 
        By default, comm, dso and symbol keys are used.
        (i.e. --sort comm,dso,symbol)
index a3acefe..37b3feb 100644 (file)
@@ -89,6 +89,7 @@ void arch_perf_parse_sample_weight(struct perf_sample *data,
        else {
                data->weight = weight.var1_dw;
                data->ins_lat = weight.var2_w;
+               data->retire_lat = weight.var3_w;
        }
 }
 
@@ -102,3 +103,22 @@ void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
                *array |= ((u64)data->ins_lat << 32);
        }
 }
+/*
+ * Translate a sort-entry header string into the name used by this x86
+ * implementation: "Local Pipeline Stage Cycle" becomes
+ * "Local Retire Latency" and "Pipeline Stage Cycle" becomes
+ * "Retire Latency". Any other header is returned unchanged (the same
+ * pointer that was passed in).
+ */
+const char *arch_perf_header_entry(const char *se_header)
+{
+       if (!strcmp(se_header, "Local Pipeline Stage Cycle"))
+               return "Local Retire Latency";
+       else if (!strcmp(se_header, "Pipeline Stage Cycle"))
+               return "Retire Latency";
+
+       return se_header;
+}
+/*
+ * Report whether a sort key is accepted by this x86 implementation:
+ * returns 1 for "p_stage_cyc" and "local_p_stage_cyc", 0 for any other
+ * key.
+ *
+ * NOTE(review): "retire_lat" and "local_retire_lat" are not matched
+ * here; presumably callers only consult this function for the
+ * p_stage_cyc keys -- confirm against the caller in sort.c.
+ */
+int arch_support_sort_key(const char *sort_key)
+{
+       if (!strcmp(sort_key, "p_stage_cyc"))
+               return 1;
+       if (!strcmp(sort_key, "local_p_stage_cyc"))
+               return 1;
+       return 0;
+}
index 60ec79d..33b08e0 100644 (file)
@@ -92,7 +92,10 @@ struct perf_sample {
        u8  cpumode;
        u16 misc;
        u16 ins_lat;
-       u16 p_stage_cyc;
+       union {
+               u16 p_stage_cyc;
+               u16 retire_lat;
+       };
        bool no_hw_idx;         /* No hw_idx collected in branch_stack */
        char insn[MAX_INSN];
        void *raw_data;
index d7d0f99..4a64823 100644 (file)
@@ -2133,6 +2133,8 @@ static struct sort_dimension common_sort_dimensions[] = {
        DIM(SORT_LOCAL_PIPELINE_STAGE_CYC, "local_p_stage_cyc", sort_local_p_stage_cyc),
        DIM(SORT_GLOBAL_PIPELINE_STAGE_CYC, "p_stage_cyc", sort_global_p_stage_cyc),
        DIM(SORT_ADDR, "addr", sort_addr),
+       DIM(SORT_LOCAL_RETIRE_LAT, "local_retire_lat", sort_local_p_stage_cyc),
+       DIM(SORT_GLOBAL_RETIRE_LAT, "retire_lat", sort_global_p_stage_cyc),
 };
 
 #undef DIM
index 921715e..9a91d0d 100644 (file)
@@ -237,6 +237,8 @@ enum sort_type {
        SORT_LOCAL_PIPELINE_STAGE_CYC,
        SORT_GLOBAL_PIPELINE_STAGE_CYC,
        SORT_ADDR,
+       SORT_LOCAL_RETIRE_LAT,
+       SORT_GLOBAL_RETIRE_LAT,
 
        /* branch stack specific sort keys */
        __SORT_BRANCH_STACK,