[llvm-mca] Improved report generated by the SchedulerStatistics view.
authorAndrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Mon, 27 Aug 2018 14:52:52 +0000 (14:52 +0000)
committerAndrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Mon, 27 Aug 2018 14:52:52 +0000 (14:52 +0000)
Before this patch, the SchedulerStatistics only printed the maximum number of
buffer entries consumed in each scheduler's queue at a given point of the
simulation.

This patch restructures the reported table, and adds an extra field named
"Average number of used buffer entries" to it.
This patch also uses different colors to help identify bottlenecks caused by
high scheduler buffer pressure.

llvm-svn: 340746

llvm/docs/CommandGuide/llvm-mca.rst
llvm/test/tools/llvm-mca/X86/BtVer2/scheduler-queue-usage.s
llvm/test/tools/llvm-mca/X86/option-all-stats-1.s
llvm/test/tools/llvm-mca/X86/option-all-stats-2.s
llvm/test/tools/llvm-mca/X86/option-all-views-1.s
llvm/test/tools/llvm-mca/X86/option-all-views-2.s
llvm/test/tools/llvm-mca/X86/scheduler-queue-usage.s
llvm/tools/llvm-mca/Views/SchedulerStatistics.cpp
llvm/tools/llvm-mca/Views/SchedulerStatistics.h

index 955adc0..5dcd97f 100644 (file)
@@ -458,7 +458,8 @@ counters for the dispatch logic, the reorder buffer, the retire control unit,
 and the register file.
 
 Below is an example of ``-all-stats`` output generated by  :program:`llvm-mca`
-for the dot-product example discussed in the previous sections.
+for 300 iterations of the dot-product example discussed in the previous
+sections.
 
 .. code-block:: none
 
@@ -484,11 +485,16 @@ for the dot-product example discussed in the previous sections.
    1,          306  (50.2%)
    2,          297  (48.7%)
 
-
   Scheduler's queue usage:
-  JALU01,  0/20
-  JFPU01,  18/18
-  JLSAGU,  0/12
+  [1] Resource name.
+  [2] Average number of used buffer entries.
+  [3] Maximum number of used buffer entries.
+  [4] Total number of buffer entries.
+
+   [1]            [2]        [3]        [4]
+  JALU01           0          0          20
+  JFPU01           17         18         18
+  JLSAGU           0          0          12
 
 
   Retire Control Unit - number of cycles where we saw N instructions retired:
@@ -528,8 +534,8 @@ representing the number of instructions issued on some number of cycles.  In
 this case, of the 610 simulated cycles, single instructions were issued 306
 times (50.2%) and there were 7 cycles where no instructions were issued.
 
-The *Scheduler's queue usage* table shows that the maximum number of buffer
-entries (i.e., scheduler queue entries) used at runtime.  Resource JFPU01
+The *Scheduler's queue usage* table shows the average and maximum number of
+buffer entries (i.e., scheduler queue entries) used at runtime.  Resource JFPU01
 reached its maximum (18 of 18 queue entries). Note that AMD Jaguar implements
 three schedulers:
 
index d02ed8b..32b73f3 100644 (file)
@@ -29,9 +29,15 @@ add  %rsi, %rsi
 # CHECK-NEXT:  2,          1  (10.0%)
 
 # CHECK:      Scheduler's queue usage:
-# CHECK-NEXT: JALU01,  1/20
-# CHECK-NEXT: JFPU01,  1/18
-# CHECK-NEXT: JLSAGU,  1/12
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: JALU01           0          1          20
+# CHECK-NEXT: JFPU01           0          1          18
+# CHECK-NEXT: JLSAGU           0          1          12
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - JALU0
index b6b0378..072eec8 100644 (file)
@@ -44,9 +44,15 @@ add %eax, %eax
 # FULLREPORT-NEXT:  1,          100  (97.1%)
 
 # FULLREPORT:      Scheduler's queue usage:
-# FULLREPORT-NEXT: JALU01,  20/20
-# FULLREPORT-NEXT: JFPU01,  0/18
-# FULLREPORT-NEXT: JLSAGU,  0/12
+# FULLREPORT-NEXT: [1] Resource name.
+# FULLREPORT-NEXT: [2] Average number of used buffer entries.
+# FULLREPORT-NEXT: [3] Maximum number of used buffer entries.
+# FULLREPORT-NEXT: [4] Total number of buffer entries.
+
+# FULLREPORT:       [1]            [2]        [3]        [4]
+# FULLREPORT-NEXT: JALU01           15         20         20
+# FULLREPORT-NEXT: JFPU01           0          0          18
+# FULLREPORT-NEXT: JLSAGU           0          0          12
 
 # FULLREPORT:      Retire Control Unit - number of cycles where we saw N instructions retired:
 # FULLREPORT-NEXT: [# retired], [# cycles]
index d40cdf0..d5cc3c8 100644 (file)
@@ -45,9 +45,15 @@ add %eax, %eax
 # FULL-NEXT:  1,          100  (97.1%)
 
 # FULL:      Scheduler's queue usage:
-# FULL-NEXT: JALU01,  20/20
-# FULL-NEXT: JFPU01,  0/18
-# FULL-NEXT: JLSAGU,  0/12
+# FULL-NEXT: [1] Resource name.
+# FULL-NEXT: [2] Average number of used buffer entries.
+# FULL-NEXT: [3] Maximum number of used buffer entries.
+# FULL-NEXT: [4] Total number of buffer entries.
+
+# FULL:       [1]            [2]        [3]        [4]
+# FULL-NEXT: JALU01           15         20         20
+# FULL-NEXT: JFPU01           0          0          18
+# FULL-NEXT: JLSAGU           0          0          12
 
 # FULL:      Retire Control Unit - number of cycles where we saw N instructions retired:
 # FULL-NEXT: [# retired], [# cycles]
index fb2ab14..aa9561e 100644 (file)
@@ -46,9 +46,15 @@ add %eax, %eax
 # FULLREPORT-NEXT:     1,          100  (97.1%)
 
 # FULLREPORT:         Scheduler's queue usage:
-# FULLREPORT-NEXT:    JALU01,  20/20
-# FULLREPORT-NEXT:    JFPU01,  0/18
-# FULLREPORT-NEXT:    JLSAGU,  0/12
+# FULLREPORT-NEXT:    [1] Resource name.
+# FULLREPORT-NEXT:    [2] Average number of used buffer entries.
+# FULLREPORT-NEXT:    [3] Maximum number of used buffer entries.
+# FULLREPORT-NEXT:    [4] Total number of buffer entries.
+
+# FULLREPORT:          [1]            [2]        [3]        [4]
+# FULLREPORT-NEXT:    JALU01           15         20         20
+# FULLREPORT-NEXT:    JFPU01           0          0          18
+# FULLREPORT-NEXT:    JLSAGU           0          0          12
 
 # FULLREPORT:         Retire Control Unit - number of cycles where we saw N instructions retired:
 # FULLREPORT-NEXT:    [# retired], [# cycles]
index 6014061..076c30a 100644 (file)
@@ -45,9 +45,15 @@ add %eax, %eax
 # ALL-NEXT:         1,          100  (97.1%)
 
 # ALL:             Scheduler's queue usage:
-# ALL-NEXT:        JALU01,  20/20
-# ALL-NEXT:        JFPU01,  0/18
-# ALL-NEXT:        JLSAGU,  0/12
+# ALL-NEXT:        [1] Resource name.
+# ALL-NEXT:        [2] Average number of used buffer entries.
+# ALL-NEXT:        [3] Maximum number of used buffer entries.
+# ALL-NEXT:        [4] Total number of buffer entries.
+
+# ALL:              [1]            [2]        [3]        [4]
+# ALL-NEXT:        JALU01           15         20         20
+# ALL-NEXT:        JFPU01           0          0          18
+# ALL-NEXT:        JLSAGU           0          0          12
 
 # ALL:             Retire Control Unit - number of cycles where we saw N instructions retired:
 # ALL-NEXT:        [# retired], [# cycles]
index 8448960..ea06ad0 100644 (file)
@@ -17,36 +17,90 @@ xor %eax, %ebx
 # ALL-NEXT:         0,          3  (75.0%)
 # ALL-NEXT:         1,          1  (25.0%)
 
-# BDW:             Scheduler's queue usage:
-# BDW-NEXT:        BWPortAny,  1/60
-
-# HSW:             Scheduler's queue usage:
-# HSW-NEXT:        HWPortAny,  1/60
+# SLM:             Scheduler's queue usage:
+# SLM-NEXT:        No scheduler resources used.
 
-# KNL:             Scheduler's queue usage:
-# KNL-NEXT:        HWPortAny,  1/60
+# BDW:             Scheduler's queue usage:
+# BDW-NEXT:        [1] Resource name.
+# BDW-NEXT:        [2] Average number of used buffer entries.
+# BDW-NEXT:        [3] Maximum number of used buffer entries.
+# BDW-NEXT:        [4] Total number of buffer entries.
 
 # BTVER2:          Scheduler's queue usage:
-# BTVER2-NEXT:     JALU01,  1/20
-# BTVER2-NEXT:     JFPU01,  0/18
-# BTVER2-NEXT:     JLSAGU,  0/12
+# BTVER2-NEXT:     [1] Resource name.
+# BTVER2-NEXT:     [2] Average number of used buffer entries.
+# BTVER2-NEXT:     [3] Maximum number of used buffer entries.
+# BTVER2-NEXT:     [4] Total number of buffer entries.
 
-# SLM:             Scheduler's queue usage:
-# SLM-NEXT:        No scheduler resources used.
+# HSW:             Scheduler's queue usage:
+# HSW-NEXT:        [1] Resource name.
+# HSW-NEXT:        [2] Average number of used buffer entries.
+# HSW-NEXT:        [3] Maximum number of used buffer entries.
+# HSW-NEXT:        [4] Total number of buffer entries.
 
 # IVB:             Scheduler's queue usage:
-# IVB-NEXT:        SBPortAny,  1/54
+# IVB-NEXT:        [1] Resource name.
+# IVB-NEXT:        [2] Average number of used buffer entries.
+# IVB-NEXT:        [3] Maximum number of used buffer entries.
+# IVB-NEXT:        [4] Total number of buffer entries.
 
-# SNB:             Scheduler's queue usage:
-# SNB-NEXT:        SBPortAny,  1/54
+# KNL:             Scheduler's queue usage:
+# KNL-NEXT:        [1] Resource name.
+# KNL-NEXT:        [2] Average number of used buffer entries.
+# KNL-NEXT:        [3] Maximum number of used buffer entries.
+# KNL-NEXT:        [4] Total number of buffer entries.
 
 # SKX:             Scheduler's queue usage:
-# SKX-NEXT:        SKLPortAny,  1/60
+# SKX-NEXT:        [1] Resource name.
+# SKX-NEXT:        [2] Average number of used buffer entries.
+# SKX-NEXT:        [3] Maximum number of used buffer entries.
+# SKX-NEXT:        [4] Total number of buffer entries.
 
 # SKX-AVX512:      Scheduler's queue usage:
-# SKX-AVX512-NEXT: SKXPortAny,  1/60
+# SKX-AVX512-NEXT: [1] Resource name.
+# SKX-AVX512-NEXT: [2] Average number of used buffer entries.
+# SKX-AVX512-NEXT: [3] Maximum number of used buffer entries.
+# SKX-AVX512-NEXT: [4] Total number of buffer entries.
+
+# SNB:             Scheduler's queue usage:
+# SNB-NEXT:        [1] Resource name.
+# SNB-NEXT:        [2] Average number of used buffer entries.
+# SNB-NEXT:        [3] Maximum number of used buffer entries.
+# SNB-NEXT:        [4] Total number of buffer entries.
 
 # ZNVER1:          Scheduler's queue usage:
-# ZNVER1-NEXT:     ZnAGU,  0/28
-# ZNVER1-NEXT:     ZnALU,  1/56
-# ZNVER1-NEXT:     ZnFPU,  0/36
+# ZNVER1-NEXT:     [1] Resource name.
+# ZNVER1-NEXT:     [2] Average number of used buffer entries.
+# ZNVER1-NEXT:     [3] Maximum number of used buffer entries.
+# ZNVER1-NEXT:     [4] Total number of buffer entries.
+
+# BDW:              [1]            [2]        [3]        [4]
+# BDW-NEXT:        BWPortAny        0          1          60
+
+# HSW:              [1]            [2]        [3]        [4]
+# HSW-NEXT:        HWPortAny        0          1          60
+
+# KNL:              [1]            [2]        [3]        [4]
+# KNL-NEXT:        HWPortAny        0          1          60
+
+# BTVER2:           [1]            [2]        [3]        [4]
+# BTVER2-NEXT:     JALU01           0          1          20
+# BTVER2-NEXT:     JFPU01           0          0          18
+# BTVER2-NEXT:     JLSAGU           0          0          12
+
+# IVB:              [1]            [2]        [3]        [4]
+# IVB-NEXT:        SBPortAny        0          1          54
+
+# SNB:              [1]            [2]        [3]        [4]
+# SNB-NEXT:        SBPortAny        0          1          54
+
+# SKX:              [1]            [2]        [3]        [4]
+# SKX-NEXT:        SKLPortAny       0          1          60
+
+# SKX-AVX512:       [1]            [2]        [3]        [4]
+# SKX-AVX512-NEXT: SKXPortAny       0          1          60
+
+# ZNVER1:           [1]            [2]        [3]        [4]
+# ZNVER1-NEXT:     ZnAGU            0          0          28
+# ZNVER1-NEXT:     ZnALU            0          1          56
+# ZNVER1-NEXT:     ZnFPU            0          0          36
index f5e4c89..4c00512 100644 (file)
@@ -14,6 +14,7 @@
 
 #include "Views/SchedulerStatistics.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
 
 using namespace llvm;
 
@@ -26,69 +27,101 @@ void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) {
 
 void SchedulerStatistics::onReservedBuffers(ArrayRef<unsigned> Buffers) {
   for (const unsigned Buffer : Buffers) {
-    if (BufferedResources.find(Buffer) != BufferedResources.end()) {
-      BufferUsage &BU = BufferedResources[Buffer];
-      BU.SlotsInUse++;
-      BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
-      continue;
-    }
-
-    BufferedResources.insert(
-        std::pair<unsigned, BufferUsage>(Buffer, {1U, 1U}));
+    BufferUsage &BU = Usage[Buffer];
+    BU.SlotsInUse++;
+    BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
   }
 }
 
 void SchedulerStatistics::onReleasedBuffers(ArrayRef<unsigned> Buffers) {
-  for (const unsigned Buffer : Buffers) {
-    assert(BufferedResources.find(Buffer) != BufferedResources.end() &&
-           "Buffered resource not in map?");
-    BufferUsage &BU = BufferedResources[Buffer];
-    BU.SlotsInUse--;
-  }
+  for (const unsigned Buffer : Buffers)
+    Usage[Buffer].SlotsInUse--;
 }
 
-void SchedulerStatistics::printSchedulerStatistics(
-    llvm::raw_ostream &OS) const {
-  std::string Buffer;
-  raw_string_ostream TempStream(Buffer);
-  TempStream << "\n\nSchedulers - number of cycles where we saw N instructions "
-                "issued:\n";
-  TempStream << "[# issued], [# cycles]\n";
-  for (const std::pair<unsigned, unsigned> &Entry : IssuedPerCycle) {
-    TempStream << " " << Entry.first << ",          " << Entry.second << "  ("
-               << format("%.1f", ((double)Entry.second / NumCycles) * 100)
-               << "%)\n";
-  }
+void SchedulerStatistics::updateHistograms() {
+  for (BufferUsage &BU : Usage)
+    BU.CumulativeNumUsedSlots += BU.SlotsInUse;
+  IssuedPerCycle[NumIssued]++;
+  NumIssued = 0;
+}
+
+void SchedulerStatistics::printSchedulerStats(raw_ostream &OS) const {
+  OS << "\n\nSchedulers - "
+     << "number of cycles where we saw N instructions issued:\n";
+  OS << "[# issued], [# cycles]\n";
+
+  const auto It =
+      std::max_element(IssuedPerCycle.begin(), IssuedPerCycle.end());
+  unsigned Index = std::distance(IssuedPerCycle.begin(), It);
+
+  bool HasColors = OS.has_colors();
+  for (unsigned I = 0, E = IssuedPerCycle.size(); I < E; ++I) {
+    unsigned IPC = IssuedPerCycle[I];
+    if (!IPC)
+      continue;
+
+    if (I == Index && HasColors)
+      OS.changeColor(raw_ostream::SAVEDCOLOR, true, false);
 
-  TempStream.flush();
-  OS << Buffer;
+    OS << " " << I << ",          " << IPC << "  ("
+       << format("%.1f", ((double)IPC / NumCycles) * 100) << "%)\n";
+    if (HasColors)
+      OS.resetColor();
+  }
 }
 
 void SchedulerStatistics::printSchedulerUsage(raw_ostream &OS) const {
-  std::string Buffer;
-  raw_string_ostream TempStream(Buffer);
-  TempStream << "\n\nScheduler's queue usage:\n";
-  // Early exit if no buffered resources were consumed.
-  if (BufferedResources.empty()) {
-    TempStream << "No scheduler resources used.\n";
-    TempStream.flush();
-    OS << Buffer;
+  assert(NumCycles && "Unexpected number of cycles!");
+
+  OS << "\nScheduler's queue usage:\n";
+  if (all_of(Usage, [](const BufferUsage &BU) { return !BU.MaxUsedSlots; })) {
+    OS << "No scheduler resources used.\n";
     return;
   }
 
+  OS << "[1] Resource name.\n"
+     << "[2] Average number of used buffer entries.\n"
+     << "[3] Maximum number of used buffer entries.\n"
+     << "[4] Total number of buffer entries.\n\n"
+     << " [1]            [2]        [3]        [4]\n";
+
+  formatted_raw_ostream FOS(OS);
+  bool HasColors = FOS.has_colors();
   for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
     const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
     if (ProcResource.BufferSize <= 0)
       continue;
 
-    const auto It = BufferedResources.find(I);
-    unsigned MaxUsedSlots =
-        It == BufferedResources.end() ? 0 : It->second.MaxUsedSlots;
-    TempStream << ProcResource.Name << ",  " << MaxUsedSlots << '/'
-               << ProcResource.BufferSize << '\n';
+    const BufferUsage &BU = Usage[I];
+    double AvgUsage = (double)BU.CumulativeNumUsedSlots / NumCycles;
+    double AlmostFullThreshold = (double)(ProcResource.BufferSize * 4) / 5;
+    unsigned NormalizedAvg = floor((AvgUsage * 10) + 0.5) / 10;
+    unsigned NormalizedThreshold = floor((AlmostFullThreshold * 10) + 0.5) / 10;
+
+    FOS << ProcResource.Name;
+    FOS.PadToColumn(17);
+    if (HasColors && NormalizedAvg >= NormalizedThreshold)
+      FOS.changeColor(raw_ostream::YELLOW, true, false);
+    FOS << NormalizedAvg;
+    if (HasColors)
+      FOS.resetColor();
+    FOS.PadToColumn(28);
+    if (HasColors &&
+        BU.MaxUsedSlots == static_cast<unsigned>(ProcResource.BufferSize))
+      FOS.changeColor(raw_ostream::RED, true, false);
+    FOS << BU.MaxUsedSlots;
+    if (HasColors)
+      FOS.resetColor();
+    FOS.PadToColumn(39);
+    FOS << ProcResource.BufferSize << '\n';
   }
 
-  TempStream.flush();
-  OS << Buffer;
+  FOS.flush();
+}
+
+void SchedulerStatistics::printView(llvm::raw_ostream &OS) const {
+  printSchedulerStats(OS);
+  printSchedulerUsage(OS);
 }
+
 } // namespace mca
index 3857c0e..a3f45c2 100644 (file)
 ///
 /// Schedulers - number of cycles where we saw N instructions issued:
 /// [# issued], [# cycles]
-///  0,          7  (5.4%)
-///  1,          4  (3.1%)
-///  2,          8  (6.2%)
+///  0,          6  (2.9%)
+///  1,          106  (50.7%)
+///  2,          97  (46.4%)
 ///
 /// Scheduler's queue usage:
-/// JALU01,  0/20
-/// JFPU01,  18/18
-/// JLSAGU,  0/12
+/// [1] Resource name.
+/// [2] Average number of used buffer entries.
+/// [3] Maximum number of used buffer entries.
+/// [4] Total number of buffer entries.
 ///
+///  [1]            [2]        [3]        [4]
+/// JALU01           0          0          20
+/// JFPU01           15         18         18
+/// JLSAGU           0          0          12
+//
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_TOOLS_LLVM_MCA_SCHEDULERSTATISTICS_H
 
 namespace mca {
 
-class SchedulerStatistics : public View {
+class SchedulerStatistics final : public View {
   const llvm::MCSchedModel &SM;
-
-  using Histogram = std::map<unsigned, unsigned>;
-  Histogram IssuedPerCycle;
-
   unsigned NumIssued;
   unsigned NumCycles;
 
@@ -51,21 +53,21 @@ class SchedulerStatistics : public View {
   struct BufferUsage {
     unsigned SlotsInUse;
     unsigned MaxUsedSlots;
+    uint64_t CumulativeNumUsedSlots;
   };
 
-  std::map<unsigned, BufferUsage> BufferedResources;
-
-  void updateHistograms() {
-    IssuedPerCycle[NumIssued]++;
-    NumIssued = 0;
-  }
+  std::vector<unsigned> IssuedPerCycle;
+  std::vector<BufferUsage> Usage;
 
-  void printSchedulerStatistics(llvm::raw_ostream &OS) const;
+  void updateHistograms();
+  void printSchedulerStats(llvm::raw_ostream &OS) const;
   void printSchedulerUsage(llvm::raw_ostream &OS) const;
 
 public:
   SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
-      : SM(STI.getSchedModel()), NumIssued(0), NumCycles(0) {}
+      : SM(STI.getSchedModel()), NumIssued(0), NumCycles(0),
+        IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
+        Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {}
 
   void onEvent(const HWInstructionEvent &Event) override;
 
@@ -81,10 +83,7 @@ public:
   // buffered resource in the Buffers set.
   void onReleasedBuffers(llvm::ArrayRef<unsigned> Buffers) override;
 
-  void printView(llvm::raw_ostream &OS) const override {
-    printSchedulerStatistics(OS);
-    printSchedulerUsage(OS);
-  }
+  void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca