the theoretical uniform distribution of resource pressure for every
instruction in sequence.
+.. option:: -bottleneck-analysis
+
+ Print information about bottlenecks that affect the throughput. This analysis
+ can be expensive, and it is disabled by default. Bottlenecks are highlighted
+ in the summary view.
+
EXIT STATUS
-----------
/// the pre-built "default" out-of-order pipeline.
struct PipelineOptions {
PipelineOptions(unsigned DW, unsigned RFS, unsigned LQS, unsigned SQS,
- bool NoAlias)
+ bool NoAlias, bool ShouldEnableBottleneckAnalysis = false)
: DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
- StoreQueueSize(SQS), AssumeNoAlias(NoAlias) {}
+ StoreQueueSize(SQS), AssumeNoAlias(NoAlias),
+ EnableBottleneckAnalysis(ShouldEnableBottleneckAnalysis) {}
unsigned DispatchWidth;
unsigned RegisterFileSize;
unsigned LoadQueueSize;
unsigned StoreQueueSize;
bool AssumeNoAlias;
+ bool EnableBottleneckAnalysis;
};
class Context {
const InstRef &IR;
};
+// A HWPressureEvent describes an increase in backend pressure caused by
+// the presence of data dependencies or unavailability of pipeline resources.
+class HWPressureEvent {
+public:
+ enum GenericReason {
+ INVALID = 0,
+ // Scheduler was unable to issue all the ready instructions because some
+ // pipeline resources were unavailable.
+ RESOURCES,
+ // Instructions could not be issued because of register data dependencies.
+ REGISTER_DEPS,
+ // Instructions could not be issued because of memory dependencies.
+ MEMORY_DEPS
+ };
+
+ // Mask defaults to zero because only RESOURCES events carry a meaningful
+ // mask of busy resource units; dependency events leave it unset.
+ HWPressureEvent(GenericReason reason, ArrayRef<InstRef> Insts,
+ uint64_t Mask = 0)
+ : Reason(reason), AffectedInstructions(Insts), ResourceMask(Mask) {}
+
+ // Reason for this increase in backend pressure.
+ GenericReason Reason;
+
+ // Instructions affected (i.e. delayed) by this increase in backend pressure.
+ // Note: this is a non-owning view; the event must not outlive the storage
+ // it was constructed from.
+ ArrayRef<InstRef> AffectedInstructions;
+
+ // A mask of unavailable processor resources.
+ const uint64_t ResourceMask;
+};
+
class HWEventListener {
public:
// Generic events generated by the pipeline.
virtual void onEvent(const HWInstructionEvent &Event) {}
virtual void onEvent(const HWStallEvent &Event) {}
+ virtual void onEvent(const HWPressureEvent &Event) {}
using ResourceRef = std::pair<uint64_t, uint64_t>;
virtual void onResourceAvailable(const ResourceRef &RRef) {}
/// Check if the instruction in 'IR' can be dispatched during this cycle.
/// Return SC_AVAILABLE if both scheduler and LS resources are available.
///
- /// This method internally sets field HadTokenStall based on the Scheduler
- /// Status value.
+ /// This method is also responsible for setting field HadTokenStall if
+ /// IR cannot be dispatched to the Scheduler due to unavailable resources.
Status isAvailable(const InstRef &IR);
/// Reserves buffer and LSUnit queue resources that are necessary to issue
/// resources are not available.
InstRef select();
- /// Returns a mask of busy resources. Each bit of the mask identifies a unique
- /// processor resource unit. In the absence of bottlenecks caused by resource
- /// pressure, the mask value returned by this method is always zero.
- uint64_t getBusyResourceUnits() const { return BusyResourceUnits; }
bool arePipelinesFullyUsed() const {
return !Resources->getAvailableProcResUnits();
}
bool isReadySetEmpty() const { return ReadySet.empty(); }
bool isWaitSetEmpty() const { return WaitSet.empty(); }
+ /// This method is called by the ExecuteStage at the end of each cycle to
+ /// identify bottlenecks caused by data dependencies. Vector RegDeps is
+ /// populated by instructions that were not issued because of unsolved
+ /// register dependencies. Vector MemDeps is populated by instructions that
+ /// were not issued because of unsolved memory dependencies.
+ void analyzeDataDependencies(SmallVectorImpl<InstRef> &RegDeps,
+ SmallVectorImpl<InstRef> &MemDeps);
+
+ /// Returns a mask of busy resources, and populates vector Insts with
+ /// instructions that could not be issued to the underlying pipelines because
+ /// not all pipeline resources were available.
+ uint64_t analyzeResourcePressure(SmallVectorImpl<InstRef> &Insts);
+
// Returns true if the dispatch logic couldn't dispatch a full group due to
// unavailable scheduler and/or LS resources.
bool hadTokenStall() const { return HadTokenStall; }
// Retire Unit token ID for this instruction.
unsigned RCUTokenID;
+ // A bitmask of busy processor resource units.
+ // This field is set to zero only if execution is not delayed during this
+ // cycle because of unavailable pipeline resources.
uint64_t CriticalResourceMask;
+
+ // An instruction identifier. This field is only set if execution is delayed
+ // by a memory dependency.
unsigned CriticalMemDep;
public:
Stage = IS_RETIRED;
}
- void updateCriticalResourceMask(uint64_t BusyResourceUnits) {
- CriticalResourceMask |= BusyResourceUnits;
- }
uint64_t getCriticalResourceMask() const { return CriticalResourceMask; }
- void setCriticalMemDep(unsigned IID) { CriticalMemDep = IID; }
unsigned getCriticalMemDep() const { return CriticalMemDep; }
+ void setCriticalResourceMask(uint64_t ResourceMask) {
+ CriticalResourceMask = ResourceMask;
+ }
+ void setCriticalMemDep(unsigned IID) { CriticalMemDep = IID; }
void cycleEvent();
};
class ExecuteStage final : public Stage {
Scheduler &HWS;
+ unsigned NumDispatchedOpcodes;
+ unsigned NumIssuedOpcodes;
+
+ // True if this stage should notify listeners of HWPressureEvents.
+ bool EnablePressureEvents;
+
Error issueInstruction(InstRef &IR);
// Called at the beginning of each cycle to issue already dispatched
ExecuteStage &operator=(const ExecuteStage &Other) = delete;
public:
- ExecuteStage(Scheduler &S) : Stage(), HWS(S) {}
+ ExecuteStage(Scheduler &S) : ExecuteStage(S, false) {}
+ ExecuteStage(Scheduler &S, bool ShouldPerformBottleneckAnalysis)
+ : Stage(), HWS(S), NumDispatchedOpcodes(0), NumIssuedOpcodes(0),
+ EnablePressureEvents(ShouldPerformBottleneckAnalysis) {}
// This stage works under the assumption that the Pipeline will eventually
// execute a retire stage. We don't need to check if pipelines and/or
// Instructions that transitioned to the 'Executed' state are automatically
// moved to the next stage (i.e. RetireStage).
Error cycleStart() override;
+ Error cycleEnd() override;
Error execute(InstRef &IR) override;
void notifyInstructionIssued(
auto Fetch = llvm::make_unique<EntryStage>(SrcMgr);
auto Dispatch = llvm::make_unique<DispatchStage>(STI, MRI, Opts.DispatchWidth,
*RCU, *PRF);
- auto Execute = llvm::make_unique<ExecuteStage>(*HWS);
+ auto Execute =
+ llvm::make_unique<ExecuteStage>(*HWS, Opts.EnableBottleneckAnalysis);
auto Retire = llvm::make_unique<RetireStage>(*RCU, *PRF);
// Pass the ownership of all the hardware units to this Context.
InstRef &IR = ReadySet[I];
if (QueueIndex == ReadySet.size() ||
Strategy->compare(IR, ReadySet[QueueIndex])) {
- const InstrDesc &D = IR.getInstruction()->getDesc();
- uint64_t BusyResourceMask = Resources->checkAvailability(D);
- IR.getInstruction()->updateCriticalResourceMask(BusyResourceMask);
+ Instruction &IS = *IR.getInstruction();
+ uint64_t BusyResourceMask = Resources->checkAvailability(IS.getDesc());
+ IS.setCriticalResourceMask(BusyResourceMask);
BusyResourceUnits |= BusyResourceMask;
if (!BusyResourceMask)
QueueIndex = I;
IssuedSet.resize(IssuedSet.size() - RemovedElements);
}
+// Appends to Insts every instruction currently sitting in the ReadySet (i.e.
+// instructions whose issue is delayed even though their operands are ready),
+// and returns the mask of busy resource units observed during this cycle.
+uint64_t Scheduler::analyzeResourcePressure(SmallVectorImpl<InstRef> &Insts) {
+ Insts.insert(Insts.end(), ReadySet.begin(), ReadySet.end());
+ return BusyResourceUnits;
+}
+
+// Classifies instructions from the PendingSet that are delayed by unsolved
+// data dependencies: register dependencies go into RegDeps, memory
+// dependencies into MemDeps.
+void Scheduler::analyzeDataDependencies(SmallVectorImpl<InstRef> &RegDeps,
+ SmallVectorImpl<InstRef> &MemDeps) {
+ // Exclude the instructions that were dispatched to the PendingSet during
+ // this same cycle; they have not been delayed yet.
+ const auto EndIt = PendingSet.end() - NumDispatchedToThePendingSet;
+ for (InstRef &IR : make_range(PendingSet.begin(), EndIt)) {
+ Instruction &IS = *IR.getInstruction();
+ // Skip instructions that are (also) blocked on busy pipeline resources;
+ // those are reported by analyzeResourcePressure() instead.
+ if (Resources->checkAvailability(IS.getDesc()))
+ continue;
+
+ // A pending instruction whose register operands are all ready, or a
+ // memory operation whose LSUnit critical dependency is a different
+ // instruction, is stalled by memory; everything else is stalled by an
+ // unsolved register dependency.
+ // NOTE(review): assumes LSU.isReady(IR) returns the index of the
+ // instruction IR depends on (its own index when unblocked) — confirm
+ // against the LSUnit definition.
+ if (IS.isReady() ||
+ (IS.isMemOp() && LSU.isReady(IR) != IR.getSourceIndex())) {
+ MemDeps.emplace_back(IR);
+ } else {
+ RegDeps.emplace_back(IR);
+ }
+ }
+}
+
void Scheduler::cycleEvent(SmallVectorImpl<ResourceRef> &Freed,
SmallVectorImpl<InstRef> &Executed,
SmallVectorImpl<InstRef> &Ready) {
SmallVector<std::pair<ResourceRef, ResourceCycles>, 4> Used;
SmallVector<InstRef, 4> Ready;
HWS.issueInstruction(IR, Used, Ready);
+ NumIssuedOpcodes += IR.getInstruction()->getDesc().NumMicroOps;
notifyReservedOrReleasedBuffers(IR, /* Reserved */ false);
SmallVector<InstRef, 4> Ready;
HWS.cycleEvent(Freed, Executed, Ready);
+ NumDispatchedOpcodes = 0;
+ NumIssuedOpcodes = 0;
for (const ResourceRef &RR : Freed)
notifyResourceAvailable(RR);
return issueReadyInstructions();
}
+// At the end of each cycle, checks whether backend pressure increased and, if
+// so, notifies listeners with a HWPressureEvent describing the cause
+// (unavailable pipeline resources, register dependencies, or memory
+// dependencies). Only runs when bottleneck analysis is enabled.
+Error ExecuteStage::cycleEnd() {
+ if (!EnablePressureEvents)
+ return ErrorSuccess();
+
+ // Backend pressure is considered increased when the dispatch logic stalled
+ // for lack of scheduler/LS tokens, or when fewer micro opcodes were issued
+ // than were dispatched this cycle. Token stalls are always conservatively
+ // reported, even if issue kept up with dispatch.
+ if (!HWS.hadTokenStall() && NumDispatchedOpcodes <= NumIssuedOpcodes)
+ return ErrorSuccess();
+
+ // Resource pressure takes precedence: if any resource unit was busy, report
+ // a RESOURCES event and do not analyze data dependencies for this cycle.
+ SmallVector<InstRef, 8> Insts;
+ uint64_t Mask = HWS.analyzeResourcePressure(Insts);
+ if (Mask) {
+ LLVM_DEBUG(dbgs() << "[E] Backpressure increased because of unavailable "
+ "pipeline resources: "
+ << format_hex(Mask, 16) << '\n');
+ HWPressureEvent Ev(HWPressureEvent::RESOURCES, Insts, Mask);
+ notifyEvent(Ev);
+ return ErrorSuccess();
+ }
+
+ // No resource pressure: attribute the slowdown to data dependencies. Both
+ // a register and a memory event may be reported in the same cycle.
+ SmallVector<InstRef, 8> RegDeps;
+ SmallVector<InstRef, 8> MemDeps;
+ HWS.analyzeDataDependencies(RegDeps, MemDeps);
+ if (RegDeps.size()) {
+ LLVM_DEBUG(
+ dbgs() << "[E] Backpressure increased by register dependencies\n");
+ HWPressureEvent Ev(HWPressureEvent::REGISTER_DEPS, RegDeps);
+ notifyEvent(Ev);
+ }
+
+ if (MemDeps.size()) {
+ LLVM_DEBUG(dbgs() << "[E] Backpressure increased by memory dependencies\n");
+ HWPressureEvent Ev(HWPressureEvent::MEMORY_DEPS, MemDeps);
+ notifyEvent(Ev);
+ }
+
+ return ErrorSuccess();
+}
+
#ifndef NDEBUG
static void verifyInstructionEliminated(const InstRef &IR) {
const Instruction &Inst = *IR.getInstruction();
// be released after MCIS is issued, and all the ResourceCycles for those
// units have been consumed.
bool IsReadyInstruction = HWS.dispatch(IR);
+ NumDispatchedOpcodes += IR.getInstruction()->getDesc().NumMicroOps;
notifyReservedOrReleasedBuffers(IR, /* Reserved */ true);
if (!IsReadyInstruction)
return ErrorSuccess();
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s
+
+add %eax, %ebx
+add %ebx, %ecx
+add %ecx, %edx
+add %edx, %eax
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 403
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 2
+# CHECK-NEXT: uOps Per Cycle: 0.99
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Cycles with backend pressure increase [ 94.04% ]
+# CHECK-NEXT: Throughput Bottlenecks:
+# CHECK-NEXT: Resource Pressure [ 0.00% ]
+# CHECK-NEXT: Data Dependencies: [ 94.04% ]
+# CHECK-NEXT: - Register Dependencies [ 94.04% ]
+# CHECK-NEXT: - Memory Dependencies [ 0.00% ]
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 1 0.50 addl %eax, %ebx
+# CHECK-NEXT: 1 1 0.50 addl %ebx, %ecx
+# CHECK-NEXT: 1 1 0.50 addl %ecx, %edx
+# CHECK-NEXT: 1 1 0.50 addl %edx, %eax
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - addl %eax, %ebx
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - addl %ebx, %ecx
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - addl %ecx, %edx
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - addl %edx, %eax
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 0123456
+
+# CHECK: [0,0] DeER .. addl %eax, %ebx
+# CHECK-NEXT: [0,1] D=eER.. addl %ebx, %ecx
+# CHECK-NEXT: [0,2] .D=eER. addl %ecx, %edx
+# CHECK-NEXT: [0,3] .D==eER addl %edx, %eax
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 addl %eax, %ebx
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 addl %ebx, %ecx
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 addl %ecx, %edx
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 addl %edx, %eax
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s
+
+vhaddps %xmm0, %xmm0, %xmm1
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 100
+# CHECK-NEXT: Total Cycles: 106
+# CHECK-NEXT: Total uOps: 100
+
+# CHECK: Dispatch Width: 2
+# CHECK-NEXT: uOps Per Cycle: 0.94
+# CHECK-NEXT: IPC: 0.94
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK: Cycles with backend pressure increase [ 76.42% ]
+# CHECK-NEXT: Throughput Bottlenecks:
+# CHECK-NEXT: Resource Pressure [ 76.42% ]
+# CHECK-NEXT: - JFPA [ 76.42% ]
+# CHECK-NEXT: - JFPU0 [ 76.42% ]
+# CHECK-NEXT: Data Dependencies: [ 0.00% ]
+# CHECK-NEXT: - Register Dependencies [ 0.00% ]
+# CHECK-NEXT: - Memory Dependencies [ 0.00% ]
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 4 1.00 vhaddps %xmm0, %xmm0, %xmm1
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vhaddps %xmm0, %xmm0, %xmm1
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 0123456
+
+# CHECK: [0,0] DeeeeER vhaddps %xmm0, %xmm0, %xmm1
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vhaddps %xmm0, %xmm0, %xmm1
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -noalias=false -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s
+
+vmovaps (%rsi), %xmm0
+vmovaps %xmm0, (%rdi)
+vmovaps 16(%rsi), %xmm0
+vmovaps %xmm0, 16(%rdi)
+vmovaps 32(%rsi), %xmm0
+vmovaps %xmm0, 32(%rdi)
+vmovaps 48(%rsi), %xmm0
+vmovaps %xmm0, 48(%rdi)
+
+# CHECK: Iterations: 1500
+# CHECK-NEXT: Instructions: 12000
+# CHECK-NEXT: Total Cycles: 36003
+# CHECK-NEXT: Total uOps: 12000
+
+# CHECK: Dispatch Width: 2
+# CHECK-NEXT: uOps Per Cycle: 0.33
+# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Cycles with backend pressure increase [ 99.89% ]
+# CHECK-NEXT: Throughput Bottlenecks:
+# CHECK-NEXT: Resource Pressure [ 0.00% ]
+# CHECK-NEXT: Data Dependencies: [ 99.89% ]
+# CHECK-NEXT: - Register Dependencies [ 0.00% ]
+# CHECK-NEXT: - Memory Dependencies [ 99.89% ]
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 5 1.00 * vmovaps (%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: 1 5 1.00 * vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: 1 5 1.00 * vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: 1 5 1.00 * vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 48(%rdi)
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - 2.00 2.00 4.00 4.00 4.00 - 4.00 4.00 - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 1.00 1.00 - 1.00 - - - - - - vmovaps (%rsi), %xmm0
+# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: - - - - 1.00 1.00 - 1.00 - - - - - - vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 48(%rdi)
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 0123456
+
+# CHECK: [0,0] DeeeeeER . . . .. vmovaps (%rsi), %xmm0
+# CHECK-NEXT: [0,1] D=====eER . . . .. vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: [0,2] .D=====eeeeeER . . .. vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: [0,3] .D==========eER. . .. vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: [0,4] . D==========eeeeeER. .. vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: [0,5] . D===============eER .. vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: [0,6] . D===============eeeeeER. vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: [0,7] . D====================eER vmovaps %xmm0, 48(%rdi)
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rsi), %xmm0
+# CHECK-NEXT: 1. 1 6.0 0.0 0.0 vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: 2. 1 6.0 0.0 0.0 vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: 3. 1 11.0 0.0 0.0 vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: 4. 1 11.0 0.0 0.0 vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: 5. 1 16.0 0.0 0.0 vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: 6. 1 16.0 0.0 0.0 vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: 7. 1 21.0 0.0 0.0 vmovaps %xmm0, 48(%rdi)
SummaryView::SummaryView(const MCSchedModel &Model, ArrayRef<MCInst> S,
unsigned Width)
: SM(Model), Source(S), DispatchWidth(Width), LastInstructionIdx(0),
- TotalCycles(0), NumMicroOps(0),
+ TotalCycles(0), NumMicroOps(0), BPI({0, 0, 0, 0}),
+ ResourcePressureDistribution(Model.getNumProcResourceKinds(), 0),
ProcResourceUsage(Model.getNumProcResourceKinds(), 0),
ProcResourceMasks(Model.getNumProcResourceKinds()),
- ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0) {
+ ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0),
+ PressureIncreasedBecauseOfResources(false),
+ PressureIncreasedBecauseOfDataDependencies(false),
+ SeenStallCycles(false) {
computeProcResourceMasks(SM, ProcResourceMasks);
for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
unsigned Index = getResourceStateIndex(ProcResourceMasks[I]);
}
}
+// Accumulates per-cycle bottleneck statistics from pressure events. Resource
+// events update the per-resource pressure distribution; dependency events
+// bump the matching cycle counters. The per-cycle boolean flags are folded
+// into BPI (and reset) by onCycleEnd().
+void SummaryView::onEvent(const HWPressureEvent &Event) {
+ assert(Event.Reason != HWPressureEvent::INVALID &&
+ "Unexpected invalid event!");
+
+ switch (Event.Reason) {
+ default:
+ break;
+
+ case HWPressureEvent::RESOURCES: {
+ PressureIncreasedBecauseOfResources = true;
+ ++BPI.ResourcePressureCycles;
+ uint64_t ResourceMask = Event.ResourceMask;
+ // Walk the mask one set bit at a time (Current isolates the lowest bit).
+ while (ResourceMask) {
+ uint64_t Current = ResourceMask & (-ResourceMask);
+ unsigned Index = getResourceStateIndex(Current);
+ unsigned ProcResID = ResIdx2ProcResID[Index];
+ const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
+ // A simple resource unit: charge the pressure to it directly.
+ if (!PRDesc.SubUnitsIdxBegin) {
+ ResourcePressureDistribution[Index]++;
+ ResourceMask ^= Current;
+ continue;
+ }
+
+ // A resource group: spread the pressure across all of its sub-units so
+ // the report names concrete pipelines rather than the group.
+ for (unsigned I = 0, E = PRDesc.NumUnits; I < E; ++I) {
+ unsigned OtherProcResID = PRDesc.SubUnitsIdxBegin[I];
+ unsigned OtherMask = ProcResourceMasks[OtherProcResID];
+ ResourcePressureDistribution[getResourceStateIndex(OtherMask)]++;
+ }
+
+ ResourceMask ^= Current;
+ }
+ }
+
+ break;
+ case HWPressureEvent::REGISTER_DEPS:
+ PressureIncreasedBecauseOfDataDependencies = true;
+ ++BPI.RegisterDependencyCycles;
+ break;
+ case HWPressureEvent::MEMORY_DEPS:
+ PressureIncreasedBecauseOfDataDependencies = true;
+ ++BPI.MemoryDependencyCycles;
+ break;
+ }
+}
+
+// Prints the bottleneck report: the percentage of cycles where backend
+// pressure increased, broken down by resource pressure (with the per-resource
+// distribution) and by register/memory data dependencies. Nothing is printed
+// unless at least one dispatch stall AND one pressure increase were observed.
+void SummaryView::printBottleneckHints(raw_ostream &OS) const {
+ if (!SeenStallCycles || !BPI.PressureIncreaseCycles)
+ return;
+
+ // All percentages below are fractions of the total simulated cycles.
+ double PressurePerCycle =
+ (double)BPI.PressureIncreaseCycles * 100 / TotalCycles;
+ double ResourcePressurePerCycle =
+ (double)BPI.ResourcePressureCycles * 100 / TotalCycles;
+ double DDPerCycle = (double)BPI.DataDependencyCycles * 100 / TotalCycles;
+ double RegDepPressurePerCycle =
+ (double)BPI.RegisterDependencyCycles * 100 / TotalCycles;
+ double MemDepPressurePerCycle =
+ (double)BPI.MemoryDependencyCycles * 100 / TotalCycles;
+
+ // floor(x * 100 + 0.5) / 100 rounds to two decimal places half-up, so the
+ // printed value matches what the autogenerated tests expect.
+ OS << "\nCycles with backend pressure increase [ "
+ << format("%.2f", floor((PressurePerCycle * 100) + 0.5) / 100) << "% ]";
+
+ OS << "\nThroughput Bottlenecks: "
+ << "\n Resource Pressure [ "
+ << format("%.2f", floor((ResourcePressurePerCycle * 100) + 0.5) / 100)
+ << "% ]";
+
+ // NOTE(review): this guard is redundant — the early return above already
+ // guarantees BPI.PressureIncreaseCycles is nonzero here.
+ if (BPI.PressureIncreaseCycles) {
+ // Distribution is indexed by resource-state index; for single-unit masks
+ // getResourceStateIndex(1ULL << I) maps straight back to I.
+ for (unsigned I = 0, E = ResourcePressureDistribution.size(); I < E; ++I) {
+ if (ResourcePressureDistribution[I]) {
+ double Frequency =
+ (double)ResourcePressureDistribution[I] * 100 / TotalCycles;
+ unsigned Index = ResIdx2ProcResID[getResourceStateIndex(1ULL << I)];
+ const MCProcResourceDesc &PRDesc = *SM.getProcResource(Index);
+ OS << "\n - " << PRDesc.Name << " [ "
+ << format("%.2f", floor((Frequency * 100) + 0.5) / 100) << "% ]";
+ }
+ }
+ }
+
+ OS << "\n Data Dependencies: [ "
+ << format("%.2f", floor((DDPerCycle * 100) + 0.5) / 100) << "% ]";
+
+ OS << "\n - Register Dependencies [ "
+ << format("%.2f", floor((RegDepPressurePerCycle * 100) + 0.5) / 100)
+ << "% ]";
+
+ OS << "\n - Memory Dependencies [ "
+ << format("%.2f", floor((MemDepPressurePerCycle * 100) + 0.5) / 100)
+ << "% ]\n\n";
+}
+
void SummaryView::printView(raw_ostream &OS) const {
unsigned Instructions = Source.size();
unsigned Iterations = (LastInstructionIdx / Instructions) + 1;
TempStream << "\nBlock RThroughput: "
<< format("%.1f", floor((BlockRThroughput * 10) + 0.5) / 10)
<< '\n';
+
+ printBottleneckHints(TempStream);
TempStream.flush();
OS << Buffer;
}
unsigned TotalCycles;
// The total number of micro opcodes contributed by a block of instructions.
unsigned NumMicroOps;
+
+ struct BackPressureInfo {
+ // Cycles where backpressure increased.
+ unsigned PressureIncreaseCycles;
+ // Cycles where backpressure increased because of pipeline pressure.
+ unsigned ResourcePressureCycles;
+ // Cycles where backpressure increased because of data dependencies.
+ unsigned DataDependencyCycles;
+ // Cycles where backpressure increased because of register dependencies.
+ unsigned RegisterDependencyCycles;
+ // Cycles where backpressure increased because of memory dependencies.
+ unsigned MemoryDependencyCycles;
+ };
+ BackPressureInfo BPI;
+
+ // Resource pressure distribution. There is an element for every processor
+ // resource declared by the scheduling model. Quantities are number of cycles.
+ llvm::SmallVector<unsigned, 8> ResourcePressureDistribution;
+
// For each processor resource, this vector stores the cumulative number of
// resource cycles consumed by the analyzed code block.
llvm::SmallVector<unsigned, 8> ProcResourceUsage;
// Used to map resource indices to actual processor resource IDs.
llvm::SmallVector<unsigned, 8> ResIdx2ProcResID;
+ // True if resource pressure events were notified during this cycle.
+ bool PressureIncreasedBecauseOfResources;
+ bool PressureIncreasedBecauseOfDataDependencies;
+
+ // True if throughput was affected by dispatch stalls.
+ bool SeenStallCycles;
+
// Compute the reciprocal throughput for the analyzed code block.
// The reciprocal block throughput is computed as the MAX between:
// - NumMicroOps / DispatchWidth
// - Total Resource Cycles / #Units (for every resource consumed).
double getBlockRThroughput() const;
+ // Prints a bottleneck message to OS.
+ void printBottleneckHints(llvm::raw_ostream &OS) const;
+
public:
SummaryView(const llvm::MCSchedModel &Model, llvm::ArrayRef<llvm::MCInst> S,
unsigned Width);
- void onCycleEnd() override { ++TotalCycles; }
+ void onCycleEnd() override {
+ ++TotalCycles;
+ if (PressureIncreasedBecauseOfResources ||
+ PressureIncreasedBecauseOfDataDependencies) {
+ ++BPI.PressureIncreaseCycles;
+ if (PressureIncreasedBecauseOfDataDependencies)
+ ++BPI.DataDependencyCycles;
+ PressureIncreasedBecauseOfResources = false;
+ PressureIncreasedBecauseOfDataDependencies = false;
+ }
+ }
void onEvent(const HWInstructionEvent &Event) override;
+ void onEvent(const HWStallEvent &Event) override {
+ SeenStallCycles = true;
+ }
+
+ void onEvent(const HWPressureEvent &Event) override;
void printView(llvm::raw_ostream &OS) const override;
};
cl::desc("Print all views including hardware statistics"),
cl::cat(ViewOptions), cl::init(false));
+static cl::opt<bool> EnableBottleneckAnalysis(
+ "bottleneck-analysis",
+ cl::desc("Enable bottleneck analysis (disabled by default)"),
+ cl::cat(ViewOptions), cl::init(false));
+
namespace {
const Target *getTarget(const char *ProgName) {
mca::Context MCA(*MRI, *STI);
mca::PipelineOptions PO(Width, RegisterFileSize, LoadQueueSize,
- StoreQueueSize, AssumeNoAlias);
+ StoreQueueSize, AssumeNoAlias,
+ EnableBottleneckAnalysis);
// Number each region in the sequence.
unsigned RegionIdx = 0;