From be3281a281e36c416df469ed81a4e398132da953 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio
Date: Mon, 4 Mar 2019 11:52:34 +0000
Subject: [PATCH] [MCA] Highlight kernel bottlenecks in the summary view.

This patch adds a new flag named -bottleneck-analysis to print out information
about throughput bottlenecks.

MCA already knows how to identify and classify dynamic dispatch stalls.
However, it doesn't know how to analyze and highlight kernel bottlenecks. The
goal of this patch is to teach MCA how to correlate increases in backend
pressure with backend stalls (and therefore with the loss of throughput).

From a Scheduler point of view, backend pressure is a function of the scheduler
buffer usage (i.e. how the number of uOps in the scheduler buffers changes over
time). Backend pressure increases (or decreases) when there is a mismatch
between the number of opcodes dispatched and the number of opcodes issued in
the same cycle. Since buffer resources are limited, continuous increases in
backend pressure eventually lead to dispatch stalls. So there is a strong
correlation between dispatch stalls and how backpressure changes over time.

This patch teaches MCA how to identify situations where backend pressure
increases due to:
 - unavailable pipeline resources.
 - data dependencies.

Data dependencies may delay the execution of instructions and therefore
increase the time that uOps have to spend in the scheduler buffers. That often
translates to an increase in backend pressure, which may eventually lead to a
bottleneck. Contention on pipeline resources may also delay the execution of
instructions and lead to a temporary increase in backend pressure.

Internally, the Scheduler classifies instructions based on whether their
register / memory operands are available or not. An instruction is marked as
"ready to execute" only if its data dependencies are fully resolved. Every
cycle, the Scheduler attempts to execute all instructions that are ready to
execute. If an instruction cannot execute because of unavailable pipeline
resources, then the Scheduler internally updates a BusyResourceUnits mask with
the ID of each unavailable resource.

ExecuteStage is responsible for tracking changes in backend pressure. If
backend pressure increases during a cycle because of contention on pipeline
resources, then ExecuteStage sends a "backend pressure" event to the listeners.
That event contains information about the instructions delayed by resource
pressure, as well as the BusyResourceUnits mask. Note that ExecuteStage also
knows how to identify situations where backpressure increased because of delays
introduced by data dependencies.

The SummaryView observes "backend pressure" events and prints out a "bottleneck
report". Example of a bottleneck report:

```
Cycles with backend pressure increase [ 99.89% ]
Throughput Bottlenecks:
  Resource Pressure       [ 0.00% ]
  Data Dependencies:      [ 99.89% ]
  - Register Dependencies [ 0.00% ]
  - Memory Dependencies   [ 99.89% ]
```

A bottleneck report is printed out only if increases in backend pressure
eventually caused backend stalls.

About the time complexity: the analysis is linear in the number of instructions
in the Scheduler::PendingSet. The average slowdown tends to be in the range of
~5-6%. For memory-intensive kernels, the slowdown can be significant if flag
-noalias=false is specified. In the worst-case scenario I have observed a
slowdown of ~30% when flag -noalias=false was specified. We can definitely
recover part of that slowdown if we optimize class LSUnit (by doing extra
bookkeeping to speed up queries).
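The per-cycle heuristic described above is simple enough to sketch in
isolation. Below is a minimal, self-contained C++ illustration of the idea;
the struct and function names are invented for this example, while the real
logic lives in ExecuteStage::cycleEnd(), which queries the Scheduler via
analyzeResourcePressure() and analyzeDataDependencies():

```
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical, simplified per-cycle state. The patch tracks the same
// quantities inside ExecuteStage (NumDispatchedOpcodes, NumIssuedOpcodes)
// and asks the Scheduler for the busy-resource mask and delayed instructions.
struct CycleState {
  unsigned DispatchedOpcodes;     // uOps moved into the scheduler buffers.
  unsigned IssuedOpcodes;         // uOps issued to the pipelines this cycle.
  bool HadTokenStall;             // Dispatch stalled on scheduler/LS resources.
  uint64_t BusyResourceUnits;     // Non-zero if ready instructions were blocked
                                  // by unavailable pipeline resources.
  unsigned MemDepDelayedOps;      // Pending uOps stalled on memory dependencies.
  unsigned RegDepDelayedOps;      // Pending uOps stalled on register dependencies.
};

enum class PressureReason { None, Resources, RegisterDeps, MemoryDeps };

// Mirrors the decision made at the end of each cycle: pressure is assumed to
// have increased only when dispatch outpaced issue (or dispatch was stalled).
// Resource contention is reported first; remaining delayed instructions are
// attributed to data dependencies.
std::vector<PressureReason> classifyCycle(const CycleState &C) {
  std::vector<PressureReason> Reasons;
  if (!C.HadTokenStall && C.DispatchedOpcodes <= C.IssuedOpcodes)
    return Reasons;  // No pressure increase during this cycle.

  if (C.BusyResourceUnits) {
    Reasons.push_back(PressureReason::Resources);
    return Reasons;  // The patch also returns early in this case.
  }
  if (C.RegDepDelayedOps)
    Reasons.push_back(PressureReason::RegisterDeps);
  if (C.MemDepDelayedOps)
    Reasons.push_back(PressureReason::MemoryDeps);
  return Reasons;
}

int main() {
  // A cycle where two uOps were dispatched but only one issued, and the
  // stalled uOp is waiting on an older store: a memory dependency bottleneck.
  CycleState C{/*Dispatched=*/2, /*Issued=*/1, /*TokenStall=*/false,
               /*BusyResourceUnits=*/0, /*MemDeps=*/1, /*RegDeps=*/0};
  for (PressureReason R : classifyCycle(C))
    std::cout << static_cast<int>(R) << '\n';  // Prints 3 (MemoryDeps).
  return 0;
}
```

Keeping the resource-pressure case as an early exit is what bounds the extra
per-cycle work to a scan of the ReadySet and PendingSet, which is where the
~5-6% average slowdown mentioned below comes from.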
For now, this new analysis is disabled by default, and it can be enabled via flag -bottleneck-analysis. Users of MCA as a library can enable the generation of pressure events through the constructor of ExecuteStage. This patch partially addresses https://bugs.llvm.org/show_bug.cgi?id=37494 Differential Revision: https://reviews.llvm.org/D58728 llvm-svn: 355308 --- llvm/docs/CommandGuide/llvm-mca.rst | 6 ++ llvm/include/llvm/MCA/Context.h | 6 +- llvm/include/llvm/MCA/HWEventListener.h | 30 ++++++ llvm/include/llvm/MCA/HardwareUnits/Scheduler.h | 21 ++-- llvm/include/llvm/MCA/Instruction.h | 14 ++- llvm/include/llvm/MCA/Stages/ExecuteStage.h | 12 ++- llvm/lib/MCA/Context.cpp | 3 +- llvm/lib/MCA/HardwareUnits/Scheduler.cpp | 28 +++++- llvm/lib/MCA/Stages/ExecuteStage.cpp | 43 +++++++++ .../tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s | 85 +++++++++++++++++ .../tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s | 72 ++++++++++++++ .../tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s | 106 +++++++++++++++++++++ llvm/tools/llvm-mca/Views/SummaryView.cpp | 102 +++++++++++++++++++- llvm/tools/llvm-mca/Views/SummaryView.h | 46 ++++++++- llvm/tools/llvm-mca/llvm-mca.cpp | 8 +- 15 files changed, 561 insertions(+), 21 deletions(-) create mode 100644 llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s create mode 100644 llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s create mode 100644 llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s diff --git a/llvm/docs/CommandGuide/llvm-mca.rst b/llvm/docs/CommandGuide/llvm-mca.rst index bc50794..a0eeb08 100644 --- a/llvm/docs/CommandGuide/llvm-mca.rst +++ b/llvm/docs/CommandGuide/llvm-mca.rst @@ -169,6 +169,12 @@ option specifies "``-``", then the output will also be sent to standard output. the theoretical uniform distribution of resource pressure for every instruction in sequence. +.. option:: -bottleneck-analysis + + Print information about bottlenecks that affect the throughput. This analysis + can be expensive, and it is disabled by default. Bottlenecks are highlighted + in the summary view. + EXIT STATUS ----------- diff --git a/llvm/include/llvm/MCA/Context.h b/llvm/include/llvm/MCA/Context.h index a9f3e05..e02b033 100644 --- a/llvm/include/llvm/MCA/Context.h +++ b/llvm/include/llvm/MCA/Context.h @@ -32,14 +32,16 @@ namespace mca { /// the pre-built "default" out-of-order pipeline. struct PipelineOptions { PipelineOptions(unsigned DW, unsigned RFS, unsigned LQS, unsigned SQS, - bool NoAlias) + bool NoAlias, bool ShouldEnableBottleneckAnalysis = false) : DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS), - StoreQueueSize(SQS), AssumeNoAlias(NoAlias) {} + StoreQueueSize(SQS), AssumeNoAlias(NoAlias), + EnableBottleneckAnalysis(ShouldEnableBottleneckAnalysis) {} unsigned DispatchWidth; unsigned RegisterFileSize; unsigned LoadQueueSize; unsigned StoreQueueSize; bool AssumeNoAlias; + bool EnableBottleneckAnalysis; }; class Context { diff --git a/llvm/include/llvm/MCA/HWEventListener.h b/llvm/include/llvm/MCA/HWEventListener.h index 1857ad2..2f81b87 100644 --- a/llvm/include/llvm/MCA/HWEventListener.h +++ b/llvm/include/llvm/MCA/HWEventListener.h @@ -125,6 +125,35 @@ public: const InstRef &IR; }; +// A HWPressureEvent describes an increase in backend pressure caused by +// the presence of data dependencies or unavailability of pipeline resources. +class HWPressureEvent { +public: + enum GenericReason { + INVALID = 0, + // Scheduler was unable to issue all the ready instructions because some + // pipeline resources were unavailable. 
+ RESOURCES, + // Instructions could not be issued because of register data dependencies. + REGISTER_DEPS, + // Instructions could not be issued because of memory dependencies. + MEMORY_DEPS + }; + + HWPressureEvent(GenericReason reason, ArrayRef Insts, + uint64_t Mask = 0) + : Reason(reason), AffectedInstructions(Insts), ResourceMask(Mask) {} + + // Reason for this increase in backend pressure. + GenericReason Reason; + + // Instructions affected (i.e. delayed) by this increase in backend pressure. + ArrayRef AffectedInstructions; + + // A mask of unavailable processor resources. + const uint64_t ResourceMask; +}; + class HWEventListener { public: // Generic events generated by the pipeline. @@ -133,6 +162,7 @@ public: virtual void onEvent(const HWInstructionEvent &Event) {} virtual void onEvent(const HWStallEvent &Event) {} + virtual void onEvent(const HWPressureEvent &Event) {} using ResourceRef = std::pair; virtual void onResourceAvailable(const ResourceRef &RRef) {} diff --git a/llvm/include/llvm/MCA/HardwareUnits/Scheduler.h b/llvm/include/llvm/MCA/HardwareUnits/Scheduler.h index 429d656..fc618a7 100644 --- a/llvm/include/llvm/MCA/HardwareUnits/Scheduler.h +++ b/llvm/include/llvm/MCA/HardwareUnits/Scheduler.h @@ -180,8 +180,8 @@ public: /// Check if the instruction in 'IR' can be dispatched during this cycle. /// Return SC_AVAILABLE if both scheduler and LS resources are available. /// - /// This method internally sets field HadTokenStall based on the Scheduler - /// Status value. + /// This method is also responsible for setting field HadTokenStall if + /// IR cannot be dispatched to the Scheduler due to unavailable resources. Status isAvailable(const InstRef &IR); /// Reserves buffer and LSUnit queue resources that are necessary to issue @@ -225,16 +225,25 @@ public: /// resources are not available. InstRef select(); - /// Returns a mask of busy resources. Each bit of the mask identifies a unique - /// processor resource unit. In the absence of bottlenecks caused by resource - /// pressure, the mask value returned by this method is always zero. - uint64_t getBusyResourceUnits() const { return BusyResourceUnits; } bool arePipelinesFullyUsed() const { return !Resources->getAvailableProcResUnits(); } bool isReadySetEmpty() const { return ReadySet.empty(); } bool isWaitSetEmpty() const { return WaitSet.empty(); } + /// This method is called by the ExecuteStage at the end of each cycle to + /// identify bottlenecks caused by data dependencies. Vector RegDeps is + /// populated by instructions that were not issued because of unsolved + /// register dependencies. Vector MemDeps is populated by instructions that + /// were not issued because of unsolved memory dependencies. + void analyzeDataDependencies(SmallVectorImpl &RegDeps, + SmallVectorImpl &MemDeps); + + /// Returns a mask of busy resources, and populates vector Insts with + /// instructions that could not be issued to the underlying pipelines because + /// not all pipeline resources were available. + uint64_t analyzeResourcePressure(SmallVectorImpl &Insts); + // Returns true if the dispatch logic couldn't dispatch a full group due to // unavailable scheduler and/or LS resources. 
bool hadTokenStall() const { return HadTokenStall; } diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h index e6ee783..62453d3 100644 --- a/llvm/include/llvm/MCA/Instruction.h +++ b/llvm/include/llvm/MCA/Instruction.h @@ -448,7 +448,13 @@ class Instruction : public InstructionBase { // Retire Unit token ID for this instruction. unsigned RCUTokenID; + // A bitmask of busy processor resource units. + // This field is set to zero only if execution is not delayed during this + // cycle because of unavailable pipeline resources. uint64_t CriticalResourceMask; + + // An instruction identifier. This field is only set if execution is delayed + // by a memory dependency. unsigned CriticalMemDep; public: @@ -499,12 +505,12 @@ public: Stage = IS_RETIRED; } - void updateCriticalResourceMask(uint64_t BusyResourceUnits) { - CriticalResourceMask |= BusyResourceUnits; - } uint64_t getCriticalResourceMask() const { return CriticalResourceMask; } - void setCriticalMemDep(unsigned IID) { CriticalMemDep = IID; } unsigned getCriticalMemDep() const { return CriticalMemDep; } + void setCriticalResourceMask(uint64_t ResourceMask) { + CriticalResourceMask = ResourceMask; + } + void setCriticalMemDep(unsigned IID) { CriticalMemDep = IID; } void cycleEvent(); }; diff --git a/llvm/include/llvm/MCA/Stages/ExecuteStage.h b/llvm/include/llvm/MCA/Stages/ExecuteStage.h index ec9eae0..c20173b 100644 --- a/llvm/include/llvm/MCA/Stages/ExecuteStage.h +++ b/llvm/include/llvm/MCA/Stages/ExecuteStage.h @@ -28,6 +28,12 @@ namespace mca { class ExecuteStage final : public Stage { Scheduler &HWS; + unsigned NumDispatchedOpcodes; + unsigned NumIssuedOpcodes; + + // True if this stage should notify listeners of HWPressureEvents. + bool EnablePressureEvents; + Error issueInstruction(InstRef &IR); // Called at the beginning of each cycle to issue already dispatched @@ -41,7 +47,10 @@ class ExecuteStage final : public Stage { ExecuteStage &operator=(const ExecuteStage &Other) = delete; public: - ExecuteStage(Scheduler &S) : Stage(), HWS(S) {} + ExecuteStage(Scheduler &S) : ExecuteStage(S, false) {} + ExecuteStage(Scheduler &S, bool ShouldPerformBottleneckAnalysis) + : Stage(), HWS(S), NumDispatchedOpcodes(0), NumIssuedOpcodes(0), + EnablePressureEvents(ShouldPerformBottleneckAnalysis) {} // This stage works under the assumption that the Pipeline will eventually // execute a retire stage. We don't need to check if pipelines and/or @@ -60,6 +69,7 @@ public: // Instructions that transitioned to the 'Executed' state are automatically // moved to the next stage (i.e. RetireStage). Error cycleStart() override; + Error cycleEnd() override; Error execute(InstRef &IR) override; void notifyInstructionIssued( diff --git a/llvm/lib/MCA/Context.cpp b/llvm/lib/MCA/Context.cpp index 18489cc..8de6753 100644 --- a/llvm/lib/MCA/Context.cpp +++ b/llvm/lib/MCA/Context.cpp @@ -42,7 +42,8 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB, auto Fetch = llvm::make_unique(SrcMgr); auto Dispatch = llvm::make_unique(STI, MRI, Opts.DispatchWidth, *RCU, *PRF); - auto Execute = llvm::make_unique(*HWS); + auto Execute = + llvm::make_unique(*HWS, Opts.EnableBottleneckAnalysis); auto Retire = llvm::make_unique(*RCU, *PRF); // Pass the ownership of all the hardware units to this Context. 
diff --git a/llvm/lib/MCA/HardwareUnits/Scheduler.cpp b/llvm/lib/MCA/HardwareUnits/Scheduler.cpp index 1a428ac..5b2527b 100644 --- a/llvm/lib/MCA/HardwareUnits/Scheduler.cpp +++ b/llvm/lib/MCA/HardwareUnits/Scheduler.cpp @@ -183,9 +183,9 @@ InstRef Scheduler::select() { InstRef &IR = ReadySet[I]; if (QueueIndex == ReadySet.size() || Strategy->compare(IR, ReadySet[QueueIndex])) { - const InstrDesc &D = IR.getInstruction()->getDesc(); - uint64_t BusyResourceMask = Resources->checkAvailability(D); - IR.getInstruction()->updateCriticalResourceMask(BusyResourceMask); + Instruction &IS = *IR.getInstruction(); + uint64_t BusyResourceMask = Resources->checkAvailability(IS.getDesc()); + IS.setCriticalResourceMask(BusyResourceMask); BusyResourceUnits |= BusyResourceMask; if (!BusyResourceMask) QueueIndex = I; @@ -227,6 +227,28 @@ void Scheduler::updateIssuedSet(SmallVectorImpl &Executed) { IssuedSet.resize(IssuedSet.size() - RemovedElements); } +uint64_t Scheduler::analyzeResourcePressure(SmallVectorImpl &Insts) { + Insts.insert(Insts.end(), ReadySet.begin(), ReadySet.end()); + return BusyResourceUnits; +} + +void Scheduler::analyzeDataDependencies(SmallVectorImpl &RegDeps, + SmallVectorImpl &MemDeps) { + const auto EndIt = PendingSet.end() - NumDispatchedToThePendingSet; + for (InstRef &IR : make_range(PendingSet.begin(), EndIt)) { + Instruction &IS = *IR.getInstruction(); + if (Resources->checkAvailability(IS.getDesc())) + continue; + + if (IS.isReady() || + (IS.isMemOp() && LSU.isReady(IR) != IR.getSourceIndex())) { + MemDeps.emplace_back(IR); + } else { + RegDeps.emplace_back(IR); + } + } +} + void Scheduler::cycleEvent(SmallVectorImpl &Freed, SmallVectorImpl &Executed, SmallVectorImpl &Ready) { diff --git a/llvm/lib/MCA/Stages/ExecuteStage.cpp b/llvm/lib/MCA/Stages/ExecuteStage.cpp index 27ac8f8..49210e0 100644 --- a/llvm/lib/MCA/Stages/ExecuteStage.cpp +++ b/llvm/lib/MCA/Stages/ExecuteStage.cpp @@ -54,6 +54,7 @@ Error ExecuteStage::issueInstruction(InstRef &IR) { SmallVector, 4> Used; SmallVector Ready; HWS.issueInstruction(IR, Used, Ready); + NumIssuedOpcodes += IR.getInstruction()->getDesc().NumMicroOps; notifyReservedOrReleasedBuffers(IR, /* Reserved */ false); @@ -89,6 +90,8 @@ Error ExecuteStage::cycleStart() { SmallVector Ready; HWS.cycleEvent(Freed, Executed, Ready); + NumDispatchedOpcodes = 0; + NumIssuedOpcodes = 0; for (const ResourceRef &RR : Freed) notifyResourceAvailable(RR); @@ -106,6 +109,45 @@ Error ExecuteStage::cycleStart() { return issueReadyInstructions(); } +Error ExecuteStage::cycleEnd() { + if (!EnablePressureEvents) + return ErrorSuccess(); + + // Always conservatively report any backpressure events if the dispatch logic + // was stalled due to unavailable scheduler resources. 
+ if (!HWS.hadTokenStall() && NumDispatchedOpcodes <= NumIssuedOpcodes) + return ErrorSuccess(); + + SmallVector Insts; + uint64_t Mask = HWS.analyzeResourcePressure(Insts); + if (Mask) { + LLVM_DEBUG(dbgs() << "[E] Backpressure increased because of unavailable " + "pipeline resources: " + << format_hex(Mask, 16) << '\n'); + HWPressureEvent Ev(HWPressureEvent::RESOURCES, Insts, Mask); + notifyEvent(Ev); + return ErrorSuccess(); + } + + SmallVector RegDeps; + SmallVector MemDeps; + HWS.analyzeDataDependencies(RegDeps, MemDeps); + if (RegDeps.size()) { + LLVM_DEBUG( + dbgs() << "[E] Backpressure increased by register dependencies\n"); + HWPressureEvent Ev(HWPressureEvent::REGISTER_DEPS, RegDeps); + notifyEvent(Ev); + } + + if (MemDeps.size()) { + LLVM_DEBUG(dbgs() << "[E] Backpressure increased by memory dependencies\n"); + HWPressureEvent Ev(HWPressureEvent::MEMORY_DEPS, MemDeps); + notifyEvent(Ev); + } + + return ErrorSuccess(); +} + #ifndef NDEBUG static void verifyInstructionEliminated(const InstRef &IR) { const Instruction &Inst = *IR.getInstruction(); @@ -147,6 +189,7 @@ Error ExecuteStage::execute(InstRef &IR) { // be released after MCIS is issued, and all the ResourceCycles for those // units have been consumed. bool IsReadyInstruction = HWS.dispatch(IR); + NumDispatchedOpcodes += IR.getInstruction()->getDesc().NumMicroOps; notifyReservedOrReleasedBuffers(IR, /* Reserved */ true); if (!IsReadyInstruction) return ErrorSuccess(); diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s new file mode 100644 index 0000000..16577cf --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s @@ -0,0 +1,85 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s + +add %eax, %ebx +add %ebx, %ecx +add %ecx, %edx +add %edx, %eax + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 403 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Cycles with backend pressure increase [ 94.04% ] +# CHECK-NEXT: Throughput Bottlenecks: +# CHECK-NEXT: Resource Pressure [ 0.00% ] +# CHECK-NEXT: Data Dependencies: [ 94.04% ] +# CHECK-NEXT: - Register Dependencies [ 94.04% ] +# CHECK-NEXT: - Memory Dependencies [ 0.00% ] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.50 addl %eax, %ebx +# CHECK-NEXT: 1 1 0.50 addl %ebx, %ecx +# CHECK-NEXT: 1 1 0.50 addl %ecx, %edx +# CHECK-NEXT: 1 1 0.50 addl %edx, %eax + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - - + +# CHECK: 
Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - addl %eax, %ebx +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - addl %ebx, %ecx +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - addl %ecx, %edx +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - addl %edx, %eax + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456 + +# CHECK: [0,0] DeER .. addl %eax, %ebx +# CHECK-NEXT: [0,1] D=eER.. addl %ebx, %ecx +# CHECK-NEXT: [0,2] .D=eER. addl %ecx, %edx +# CHECK-NEXT: [0,3] .D==eER addl %edx, %eax + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 addl %eax, %ebx +# CHECK-NEXT: 1. 1 2.0 0.0 0.0 addl %ebx, %ecx +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 addl %ecx, %edx +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 addl %edx, %eax diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s new file mode 100644 index 0000000..83444d4 --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s @@ -0,0 +1,72 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s + +vhaddps %xmm0, %xmm0, %xmm1 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 100 +# CHECK-NEXT: Total Cycles: 106 +# CHECK-NEXT: Total uOps: 100 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.94 +# CHECK-NEXT: IPC: 0.94 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Cycles with backend pressure increase [ 76.42% ] +# CHECK-NEXT: Throughput Bottlenecks: +# CHECK-NEXT: Resource Pressure [ 76.42% ] +# CHECK-NEXT: - JFPA [ 76.42% ] +# CHECK-NEXT: - JFPU0 [ 76.42% ] +# CHECK-NEXT: Data Dependencies: [ 0.00% ] +# CHECK-NEXT: - Register Dependencies [ 0.00% ] +# CHECK-NEXT: - Memory Dependencies [ 0.00% ] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 1.00 vhaddps %xmm0, %xmm0, %xmm1 + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vhaddps %xmm0, %xmm0, %xmm1 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456 + +# CHECK: [0,0] DeeeeER vhaddps %xmm0, %xmm0, %xmm1 + +# CHECK: Average Wait times (based on the timeline view): +# 
CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vhaddps %xmm0, %xmm0, %xmm1 diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s new file mode 100644 index 0000000..6cd613a --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s @@ -0,0 +1,106 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -noalias=false -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s + +vmovaps (%rsi), %xmm0 +vmovaps %xmm0, (%rdi) +vmovaps 16(%rsi), %xmm0 +vmovaps %xmm0, 16(%rdi) +vmovaps 32(%rsi), %xmm0 +vmovaps %xmm0, 32(%rdi) +vmovaps 48(%rsi), %xmm0 +vmovaps %xmm0, 48(%rdi) + +# CHECK: Iterations: 1500 +# CHECK-NEXT: Instructions: 12000 +# CHECK-NEXT: Total Cycles: 36003 +# CHECK-NEXT: Total uOps: 12000 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.33 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 4.0 + +# CHECK: Cycles with backend pressure increase [ 99.89% ] +# CHECK-NEXT: Throughput Bottlenecks: +# CHECK-NEXT: Resource Pressure [ 0.00% ] +# CHECK-NEXT: Data Dependencies: [ 99.89% ] +# CHECK-NEXT: - Register Dependencies [ 0.00% ] +# CHECK-NEXT: - Memory Dependencies [ 99.89% ] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 5 1.00 * vmovaps (%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, (%rdi) +# CHECK-NEXT: 1 5 1.00 * vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: 1 5 1.00 * vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: 1 5 1.00 * vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 48(%rdi) + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - 2.00 2.00 4.00 4.00 4.00 - 4.00 4.00 - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - - 1.00 1.00 - 1.00 - - - - - - vmovaps (%rsi), %xmm0 +# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, (%rdi) +# CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: - - - - 1.00 1.00 - 1.00 - - - - - - vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - 
- - vmovaps %xmm0, 48(%rdi) + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. vmovaps (%rsi), %xmm0 +# CHECK-NEXT: [0,1] D=====eER . . . .. vmovaps %xmm0, (%rdi) +# CHECK-NEXT: [0,2] .D=====eeeeeER . . .. vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: [0,3] .D==========eER. . .. vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: [0,4] . D==========eeeeeER. .. vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: [0,5] . D===============eER .. vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: [0,6] . D===============eeeeeER. vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: [0,7] . D====================eER vmovaps %xmm0, 48(%rdi) + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rsi), %xmm0 +# CHECK-NEXT: 1. 1 6.0 0.0 0.0 vmovaps %xmm0, (%rdi) +# CHECK-NEXT: 2. 1 6.0 0.0 0.0 vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: 3. 1 11.0 0.0 0.0 vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: 4. 1 11.0 0.0 0.0 vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: 5. 1 16.0 0.0 0.0 vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: 6. 1 16.0 0.0 0.0 vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: 7. 1 21.0 0.0 0.0 vmovaps %xmm0, 48(%rdi) diff --git a/llvm/tools/llvm-mca/Views/SummaryView.cpp b/llvm/tools/llvm-mca/Views/SummaryView.cpp index 1f14f3d..5942160 100644 --- a/llvm/tools/llvm-mca/Views/SummaryView.cpp +++ b/llvm/tools/llvm-mca/Views/SummaryView.cpp @@ -25,10 +25,14 @@ namespace mca { SummaryView::SummaryView(const MCSchedModel &Model, ArrayRef S, unsigned Width) : SM(Model), Source(S), DispatchWidth(Width), LastInstructionIdx(0), - TotalCycles(0), NumMicroOps(0), + TotalCycles(0), NumMicroOps(0), BPI({0, 0, 0, 0}), + ResourcePressureDistribution(Model.getNumProcResourceKinds(), 0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0), ProcResourceMasks(Model.getNumProcResourceKinds()), - ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0) { + ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0), + PressureIncreasedBecauseOfResources(false), + PressureIncreasedBecauseOfDataDependencies(false), + SeenStallCycles(false) { computeProcResourceMasks(SM, ProcResourceMasks); for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) { unsigned Index = getResourceStateIndex(ProcResourceMasks[I]); @@ -61,6 +65,98 @@ void SummaryView::onEvent(const HWInstructionEvent &Event) { } } +void SummaryView::onEvent(const HWPressureEvent &Event) { + assert(Event.Reason != HWPressureEvent::INVALID && + "Unexpected invalid event!"); + + switch (Event.Reason) { + default: + break; + + case HWPressureEvent::RESOURCES: { + PressureIncreasedBecauseOfResources = true; + ++BPI.ResourcePressureCycles; + uint64_t ResourceMask = Event.ResourceMask; + while (ResourceMask) { + uint64_t Current = ResourceMask & (-ResourceMask); + unsigned Index = getResourceStateIndex(Current); + unsigned ProcResID = ResIdx2ProcResID[Index]; + const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID); + if (!PRDesc.SubUnitsIdxBegin) { + ResourcePressureDistribution[Index]++; + ResourceMask ^= Current; + continue; + } + + for (unsigned I = 0, E = PRDesc.NumUnits; I < E; ++I) { + unsigned OtherProcResID = PRDesc.SubUnitsIdxBegin[I]; + unsigned OtherMask = ProcResourceMasks[OtherProcResID]; + 
ResourcePressureDistribution[getResourceStateIndex(OtherMask)]++; + } + + ResourceMask ^= Current; + } + } + + break; + case HWPressureEvent::REGISTER_DEPS: + PressureIncreasedBecauseOfDataDependencies = true; + ++BPI.RegisterDependencyCycles; + break; + case HWPressureEvent::MEMORY_DEPS: + PressureIncreasedBecauseOfDataDependencies = true; + ++BPI.MemoryDependencyCycles; + break; + } +} + +void SummaryView::printBottleneckHints(raw_ostream &OS) const { + if (!SeenStallCycles || !BPI.PressureIncreaseCycles) + return; + + double PressurePerCycle = + (double)BPI.PressureIncreaseCycles * 100 / TotalCycles; + double ResourcePressurePerCycle = + (double)BPI.ResourcePressureCycles * 100 / TotalCycles; + double DDPerCycle = (double)BPI.DataDependencyCycles * 100 / TotalCycles; + double RegDepPressurePerCycle = + (double)BPI.RegisterDependencyCycles * 100 / TotalCycles; + double MemDepPressurePerCycle = + (double)BPI.MemoryDependencyCycles * 100 / TotalCycles; + + OS << "\nCycles with backend pressure increase [ " + << format("%.2f", floor((PressurePerCycle * 100) + 0.5) / 100) << "% ]"; + + OS << "\nThroughput Bottlenecks: " + << "\n Resource Pressure [ " + << format("%.2f", floor((ResourcePressurePerCycle * 100) + 0.5) / 100) + << "% ]"; + + if (BPI.PressureIncreaseCycles) { + for (unsigned I = 0, E = ResourcePressureDistribution.size(); I < E; ++I) { + if (ResourcePressureDistribution[I]) { + double Frequency = + (double)ResourcePressureDistribution[I] * 100 / TotalCycles; + unsigned Index = ResIdx2ProcResID[getResourceStateIndex(1ULL << I)]; + const MCProcResourceDesc &PRDesc = *SM.getProcResource(Index); + OS << "\n - " << PRDesc.Name << " [ " + << format("%.2f", floor((Frequency * 100) + 0.5) / 100) << "% ]"; + } + } + } + + OS << "\n Data Dependencies: [ " + << format("%.2f", floor((DDPerCycle * 100) + 0.5) / 100) << "% ]"; + + OS << "\n - Register Dependencies [ " + << format("%.2f", floor((RegDepPressurePerCycle * 100) + 0.5) / 100) + << "% ]"; + + OS << "\n - Memory Dependencies [ " + << format("%.2f", floor((MemDepPressurePerCycle * 100) + 0.5) / 100) + << "% ]\n\n"; +} + void SummaryView::printView(raw_ostream &OS) const { unsigned Instructions = Source.size(); unsigned Iterations = (LastInstructionIdx / Instructions) + 1; @@ -85,6 +181,8 @@ void SummaryView::printView(raw_ostream &OS) const { TempStream << "\nBlock RThroughput: " << format("%.1f", floor((BlockRThroughput * 10) + 0.5) / 10) << '\n'; + + printBottleneckHints(TempStream); TempStream.flush(); OS << Buffer; } diff --git a/llvm/tools/llvm-mca/Views/SummaryView.h b/llvm/tools/llvm-mca/Views/SummaryView.h index 631e409..dbccdd3 100644 --- a/llvm/tools/llvm-mca/Views/SummaryView.h +++ b/llvm/tools/llvm-mca/Views/SummaryView.h @@ -45,6 +45,25 @@ class SummaryView : public View { unsigned TotalCycles; // The total number of micro opcodes contributed by a block of instructions. unsigned NumMicroOps; + + struct BackPressureInfo { + // Cycles where backpressure increased. + unsigned PressureIncreaseCycles; + // Cycles where backpressure increased because of pipeline pressure. + unsigned ResourcePressureCycles; + // Cycles where backpressure increased because of data dependencies. + unsigned DataDependencyCycles; + // Cycles where backpressure increased because of register dependencies. + unsigned RegisterDependencyCycles; + // Cycles where backpressure increased because of memory dependencies. + unsigned MemoryDependencyCycles; + }; + BackPressureInfo BPI; + + // Resource pressure distribution. 
There is an element for every processor + // resource declared by the scheduling model. Quantities are number of cycles. + llvm::SmallVector ResourcePressureDistribution; + // For each processor resource, this vector stores the cumulative number of // resource cycles consumed by the analyzed code block. llvm::SmallVector ProcResourceUsage; @@ -58,18 +77,43 @@ class SummaryView : public View { // Used to map resource indices to actual processor resource IDs. llvm::SmallVector ResIdx2ProcResID; + // True if resource pressure events were notified during this cycle. + bool PressureIncreasedBecauseOfResources; + bool PressureIncreasedBecauseOfDataDependencies; + + // True if throughput was affected by dispatch stalls. + bool SeenStallCycles; + // Compute the reciprocal throughput for the analyzed code block. // The reciprocal block throughput is computed as the MAX between: // - NumMicroOps / DispatchWidth // - Total Resource Cycles / #Units (for every resource consumed). double getBlockRThroughput() const; + // Prints a bottleneck message to OS. + void printBottleneckHints(llvm::raw_ostream &OS) const; + public: SummaryView(const llvm::MCSchedModel &Model, llvm::ArrayRef S, unsigned Width); - void onCycleEnd() override { ++TotalCycles; } + void onCycleEnd() override { + ++TotalCycles; + if (PressureIncreasedBecauseOfResources || + PressureIncreasedBecauseOfDataDependencies) { + ++BPI.PressureIncreaseCycles; + if (PressureIncreasedBecauseOfDataDependencies) + ++BPI.DataDependencyCycles; + PressureIncreasedBecauseOfResources = false; + PressureIncreasedBecauseOfDataDependencies = false; + } + } void onEvent(const HWInstructionEvent &Event) override; + void onEvent(const HWStallEvent &Event) override { + SeenStallCycles = true; + } + + void onEvent(const HWPressureEvent &Event) override; void printView(llvm::raw_ostream &OS) const override; }; diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp index 704a7b4..c7c1a4f 100644 --- a/llvm/tools/llvm-mca/llvm-mca.cpp +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -175,6 +175,11 @@ static cl::opt cl::desc("Print all views including hardware statistics"), cl::cat(ViewOptions), cl::init(false)); +static cl::opt EnableBottleneckAnalysis( + "bottleneck-analysis", + cl::desc("Enable bottleneck analysis (disabled by default)"), + cl::cat(ViewOptions), cl::init(false)); + namespace { const Target *getTarget(const char *ProgName) { @@ -387,7 +392,8 @@ int main(int argc, char **argv) { mca::Context MCA(*MRI, *STI); mca::PipelineOptions PO(Width, RegisterFileSize, LoadQueueSize, - StoreQueueSize, AssumeNoAlias); + StoreQueueSize, AssumeNoAlias, + EnableBottleneckAnalysis); // Number each region in the sequence. unsigned RegionIdx = 0; -- 2.7.4