static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_total_slots[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
static bool have_frontend_stalled;
struct stats walltime_nsecs_stats;
sizeof(runtime_transaction_stats));
memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
+ memset(runtime_topdown_total_slots, 0, sizeof(runtime_topdown_total_slots));
+ memset(runtime_topdown_slots_retired, 0, sizeof(runtime_topdown_slots_retired));
+ memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
+ memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
+ memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
}
update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
else if (perf_stat_evsel__is(counter, ELISION_START))
update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
+ else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
+ update_stats(&runtime_topdown_total_slots[ctx][cpu], count[0]);
+ else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
+ update_stats(&runtime_topdown_slots_issued[ctx][cpu], count[0]);
+ else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
+ update_stats(&runtime_topdown_slots_retired[ctx][cpu], count[0]);
+ else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
+ update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu], count[0]);
+ else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
+ update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
}
+/*
+ * High level "TopDown" CPU core pipeline bottleneck breakdown.
+ *
+ * Basic concept following
+ * Yasin, "A Top-Down Method for Performance Analysis and Counters
+ * Architecture", ISPASS 2014.
+ *
+ * The CPU pipeline is divided into 4 areas that can be bottlenecks:
+ *
+ * Frontend -> Backend -> Retiring
+ *
+ * BadSpeculation is the fourth area: execution that is speculated out
+ * of order and later thrown away (for example on branch mispredictions).
+ * Frontend is instruction fetching and decoding.
+ * Backend is execution, like computation and accessing data in memory.
+ * Retiring is good execution that is not directly bottlenecked.
+ *
+ * The formulas are computed in slots.
+ * A slot is one pipeline entry per cycle per unit of pipeline width
+ * (for example, a 4-wide pipeline has 4 slots in each cycle).
+ *
+ * Formulas:
+ * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
+ * TotalSlots
+ * Retiring = SlotsRetired / TotalSlots
+ * FrontendBound = FetchBubbles / TotalSlots
+ * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
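+ *
+ * For example, with hypothetical counts of TotalSlots = 1000,
+ * SlotsIssued = 600, SlotsRetired = 500, RecoveryBubbles = 50 and
+ * FetchBubbles = 200:
+ *
+ * BadSpeculation = (600 - 500 + 50) / 1000 = 0.15
+ * Retiring = 500 / 1000 = 0.50
+ * FrontendBound = 200 / 1000 = 0.20
+ * BackendBound = 1.0 - 0.15 - 0.50 - 0.20 = 0.15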
+ *
+ * The kernel provides the mapping to the low level CPU events and any scaling
+ * needed for the CPU pipeline width, for example:
+ *
+ * TotalSlots = Cycles * 4
+ *
+ * The scaling factor is communicated in the sysfs unit.
+ *
+ * In some cases the CPU may not be able to measure all the formulas
+ * due to missing events. In this case multiple areas are combined into
+ * a single metric where possible, such as reporting "backend bound/bad
+ * spec" together when recovery bubbles cannot be measured.
+ *
+ * Full TopDown supports more levels to sub-divide each area, for
+ * example BackendBound into compute bound and memory bound. For now we
+ * only support Level 1 TopDown.
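+ *
+ * The Level 1 metrics are typically requested with the --topdown
+ * option of perf stat.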
+ */
+
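+/*
+ * Individual counters can be slightly skewed against each other, which
+ * can make a derived metric come out marginally negative. Clamp small
+ * negative values (down to -2%) to zero; anything more negative likely
+ * indicates a real problem and is passed through unchanged.
+ */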
+static double sanitize_val(double x)
+{
+ if (x < 0 && x >= -0.02)
+ return 0.0;
+ return x;
+}
+
+static double td_total_slots(int ctx, int cpu)
+{
+ return avg_stats(&runtime_topdown_total_slots[ctx][cpu]);
+}
+
+static double td_bad_spec(int ctx, int cpu)
+{
+ double bad_spec = 0;
+ double total_slots;
+ double total;
+
+ total = avg_stats(&runtime_topdown_slots_issued[ctx][cpu]) -
+ avg_stats(&runtime_topdown_slots_retired[ctx][cpu]) +
+ avg_stats(&runtime_topdown_recovery_bubbles[ctx][cpu]);
+ total_slots = td_total_slots(ctx, cpu);
+ if (total_slots)
+ bad_spec = total / total_slots;
+ return sanitize_val(bad_spec);
+}
+
+static double td_retiring(int ctx, int cpu)
+{
+ double retiring = 0;
+ double total_slots = td_total_slots(ctx, cpu);
+ double ret_slots = avg_stats(&runtime_topdown_slots_retired[ctx][cpu]);
+
+ if (total_slots)
+ retiring = ret_slots / total_slots;
+ return retiring;
+}
+
+static double td_fe_bound(int ctx, int cpu)
+{
+ double fe_bound = 0;
+ double total_slots = td_total_slots(ctx, cpu);
+ double fetch_bub = avg_stats(&runtime_topdown_fetch_bubbles[ctx][cpu]);
+
+ if (total_slots)
+ fe_bound = fetch_bub / total_slots;
+ return fe_bound;
+}
+
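+/*
+ * BackendBound is computed as the residual of the other three areas.
+ * A sum of exactly zero likely means no topdown events were measured,
+ * so report zero instead of a misleading 100% backend bound.
+ */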
+static double td_be_bound(int ctx, int cpu)
+{
+ double sum = (td_fe_bound(ctx, cpu) +
+ td_bad_spec(ctx, cpu) +
+ td_retiring(ctx, cpu));
+ if (sum == 0)
+ return 0;
+ return sanitize_val(1.0 - sum);
+}
+
void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
double avg, int cpu,
struct perf_stat_output_ctx *out)
{
	void *ctxp = out->ctx;
print_metric_t print_metric = out->print_metric;
double total, ratio = 0.0, total2;
+ const char *color = NULL;
int ctx = evsel_context(evsel);
if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
avg / ratio);
else
print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
+ } else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
+ double fe_bound = td_fe_bound(ctx, cpu);
+
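+ /* Color red when more than 20% of all slots are frontend bound */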
+ if (fe_bound > 0.2)
+ color = PERF_COLOR_RED;
+ print_metric(ctxp, color, "%8.1f%%", "frontend bound",
+ fe_bound * 100.);
+ } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
+ double retiring = td_retiring(ctx, cpu);
+
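+ /* More than 70% of slots retiring is good: color green */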
+ if (retiring > 0.7)
+ color = PERF_COLOR_GREEN;
+ print_metric(ctxp, color, "%8.1f%%", "retiring",
+ retiring * 100.);
+ } else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
+ double bad_spec = td_bad_spec(ctx, cpu);
+
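+ /* Color red when more than 10% of slots are lost to bad speculation */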
+ if (bad_spec > 0.1)
+ color = PERF_COLOR_RED;
+ print_metric(ctxp, color, "%8.1f%%", "bad speculation",
+ bad_spec * 100.);
+ } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
+ double be_bound = td_be_bound(ctx, cpu);
+ const char *name = "backend bound";
+ static int have_recovery_bubbles = -1;
+
+ /* In case the CPU does not support topdown-recovery-bubbles */
+ if (have_recovery_bubbles < 0)
+ have_recovery_bubbles = pmu_have_event("cpu",
+ "topdown-recovery-bubbles");
+ if (!have_recovery_bubbles)
+ name = "backend bound/bad spec";
+
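+ /* Color red when more than 20% of all slots are backend bound */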
+ if (be_bound > 0.2)
+ color = PERF_COLOR_RED;
+ if (td_total_slots(ctx, cpu) > 0)
+ print_metric(ctxp, color, "%8.1f%%", name,
+ be_bound * 100.);
+ else
+ print_metric(ctxp, NULL, NULL, name, 0);
} else if (runtime_nsecs_stats[cpu].n != 0) {
char unit = 'M';
char unit_buf[10];