the theoretical uniform distribution of resource pressure for every
instruction in sequence.
+.. option:: -bottleneck-analysis
+
+ Print information about bottlenecks that affect the throughput. This analysis
+ can be expensive, and it is disabled by default. Bottlenecks are highlighted
+ in the summary view.
+
EXIT STATUS
-----------
/// the pre-built "default" out-of-order pipeline.
struct PipelineOptions {
PipelineOptions(unsigned DW, unsigned RFS, unsigned LQS, unsigned SQS,
- bool NoAlias)
+ bool NoAlias, bool ShouldEnableBottleneckAnalysis = false)
: DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
- StoreQueueSize(SQS), AssumeNoAlias(NoAlias) {}
+ StoreQueueSize(SQS), AssumeNoAlias(NoAlias),
+ EnableBottleneckAnalysis(ShouldEnableBottleneckAnalysis) {}
unsigned DispatchWidth;
unsigned RegisterFileSize;
unsigned LoadQueueSize;
unsigned StoreQueueSize;
bool AssumeNoAlias;
+ bool EnableBottleneckAnalysis;
};
class Context {
const InstRef &IR;
};
+// A HWPressureEvent describes an increase in backend pressure caused by
+// the presence of data dependencies or unavailability of pipeline resources.
+class HWPressureEvent {
+public:
+ enum GenericReason {
+ INVALID = 0,
+ // Scheduler was unable to issue all the ready instructions because some
+ // pipeline resources were unavailable.
+ RESOURCES,
+ // Instructions could not be issued because of register data dependencies.
+ REGISTER_DEPS,
+ // Instructions could not be issued because of memory dependencies.
+ MEMORY_DEPS
+ };
+
+ // Mask defaults to zero because only RESOURCES events carry a meaningful
+ // mask of busy resource units; dependency events leave it unset.
+ HWPressureEvent(GenericReason reason, ArrayRef<InstRef> Insts,
+ uint64_t Mask = 0)
+ : Reason(reason), AffectedInstructions(Insts), ResourceMask(Mask) {}
+
+ // Reason for this increase in backend pressure.
+ GenericReason Reason;
+
+ // Instructions affected (i.e. delayed) by this increase in backend pressure.
+ // Note: this is a non-owning view; the event must not outlive the storage
+ // it was constructed from.
+ ArrayRef<InstRef> AffectedInstructions;
+
+ // A mask of unavailable processor resources.
+ const uint64_t ResourceMask;
+};
+
class HWEventListener {
public:
// Generic events generated by the pipeline.
virtual void onEvent(const HWInstructionEvent &Event) {}
virtual void onEvent(const HWStallEvent &Event) {}
+ virtual void onEvent(const HWPressureEvent &Event) {}
using ResourceRef = std::pair<uint64_t, uint64_t>;
virtual void onResourceAvailable(const ResourceRef &RRef) {}
/// Check if the instruction in 'IR' can be dispatched during this cycle.
/// Return SC_AVAILABLE if both scheduler and LS resources are available.
///
- /// This method internally sets field HadTokenStall based on the Scheduler
- /// Status value.
+ /// This method is also responsible for setting field HadTokenStall if
+ /// IR cannot be dispatched to the Scheduler due to unavailable resources.
Status isAvailable(const InstRef &IR);
/// Reserves buffer and LSUnit queue resources that are necessary to issue
/// resources are not available.
InstRef select();
- /// Returns a mask of busy resources. Each bit of the mask identifies a unique
- /// processor resource unit. In the absence of bottlenecks caused by resource
- /// pressure, the mask value returned by this method is always zero.
- uint64_t getBusyResourceUnits() const { return BusyResourceUnits; }
bool arePipelinesFullyUsed() const {
return !Resources->getAvailableProcResUnits();
}
bool isReadySetEmpty() const { return ReadySet.empty(); }
bool isWaitSetEmpty() const { return WaitSet.empty(); }
+ /// This method is called by the ExecuteStage at the end of each cycle to
+ /// identify bottlenecks caused by data dependencies. Vector RegDeps is
+ /// populated by instructions that were not issued because of unsolved
+ /// register dependencies. Vector MemDeps is populated by instructions that
+ /// were not issued because of unsolved memory dependencies.
+ void analyzeDataDependencies(SmallVectorImpl<InstRef> &RegDeps,
+ SmallVectorImpl<InstRef> &MemDeps);
+
+ /// Returns a mask of busy resources, and populates vector Insts with
+ /// instructions that could not be issued to the underlying pipelines because
+ /// not all pipeline resources were available.
+ uint64_t analyzeResourcePressure(SmallVectorImpl<InstRef> &Insts);
+
// Returns true if the dispatch logic couldn't dispatch a full group due to
// unavailable scheduler and/or LS resources.
bool hadTokenStall() const { return HadTokenStall; }
// Retire Unit token ID for this instruction.
unsigned RCUTokenID;
+ // A bitmask of busy processor resource units.
+ // This field is set to zero only if execution is not delayed during this
+ // cycle because of unavailable pipeline resources.
uint64_t CriticalResourceMask;
+
+ // An instruction identifier. This field is only set if execution is delayed
+ // by a memory dependency.
unsigned CriticalMemDep;
public:
Stage = IS_RETIRED;
}
- void updateCriticalResourceMask(uint64_t BusyResourceUnits) {
- CriticalResourceMask |= BusyResourceUnits;
- }
uint64_t getCriticalResourceMask() const { return CriticalResourceMask; }
- void setCriticalMemDep(unsigned IID) { CriticalMemDep = IID; }
unsigned getCriticalMemDep() const { return CriticalMemDep; }
+ void setCriticalResourceMask(uint64_t ResourceMask) {
+ CriticalResourceMask = ResourceMask;
+ }
+ void setCriticalMemDep(unsigned IID) { CriticalMemDep = IID; }
void cycleEvent();
};
class ExecuteStage final : public Stage {
Scheduler &HWS;
+ unsigned NumDispatchedOpcodes;
+ unsigned NumIssuedOpcodes;
+
+ // True if this stage should notify listeners of HWPressureEvents.
+ bool EnablePressureEvents;
+
Error issueInstruction(InstRef &IR);
// Called at the beginning of each cycle to issue already dispatched
ExecuteStage &operator=(const ExecuteStage &Other) = delete;
public:
- ExecuteStage(Scheduler &S) : Stage(), HWS(S) {}
+ ExecuteStage(Scheduler &S) : ExecuteStage(S, false) {}
+ ExecuteStage(Scheduler &S, bool ShouldPerformBottleneckAnalysis)
+ : Stage(), HWS(S), NumDispatchedOpcodes(0), NumIssuedOpcodes(0),
+ EnablePressureEvents(ShouldPerformBottleneckAnalysis) {}
// This stage works under the assumption that the Pipeline will eventually
// execute a retire stage. We don't need to check if pipelines and/or
// Instructions that transitioned to the 'Executed' state are automatically
// moved to the next stage (i.e. RetireStage).
Error cycleStart() override;
+ Error cycleEnd() override;
Error execute(InstRef &IR) override;
void notifyInstructionIssued(
auto Fetch = llvm::make_unique<EntryStage>(SrcMgr);
auto Dispatch = llvm::make_unique<DispatchStage>(STI, MRI, Opts.DispatchWidth,
*RCU, *PRF);
- auto Execute = llvm::make_unique<ExecuteStage>(*HWS);
+ auto Execute =
+ llvm::make_unique<ExecuteStage>(*HWS, Opts.EnableBottleneckAnalysis);
auto Retire = llvm::make_unique<RetireStage>(*RCU, *PRF);
// Pass the ownership of all the hardware units to this Context.
InstRef &IR = ReadySet[I];
if (QueueIndex == ReadySet.size() ||
Strategy->compare(IR, ReadySet[QueueIndex])) {
- const InstrDesc &D = IR.getInstruction()->getDesc();
- uint64_t BusyResourceMask = Resources->checkAvailability(D);
- IR.getInstruction()->updateCriticalResourceMask(BusyResourceMask);
+ Instruction &IS = *IR.getInstruction();
+ uint64_t BusyResourceMask = Resources->checkAvailability(IS.getDesc());
+ IS.setCriticalResourceMask(BusyResourceMask);
BusyResourceUnits |= BusyResourceMask;
if (!BusyResourceMask)
QueueIndex = I;
IssuedSet.resize(IssuedSet.size() - RemovedElements);
}
+// Appends to Insts every instruction currently sitting in the ReadySet (i.e.
+// instructions whose issue is delayed even though their operands are ready),
+// and returns the mask of busy resource units observed during this cycle.
+uint64_t Scheduler::analyzeResourcePressure(SmallVectorImpl<InstRef> &Insts) {
+ Insts.insert(Insts.end(), ReadySet.begin(), ReadySet.end());
+ return BusyResourceUnits;
+}
+
+// Classifies instructions from the PendingSet that are delayed by unsolved
+// data dependencies: register dependencies go into RegDeps, memory
+// dependencies into MemDeps.
+void Scheduler::analyzeDataDependencies(SmallVectorImpl<InstRef> &RegDeps,
+ SmallVectorImpl<InstRef> &MemDeps) {
+ // Exclude the instructions that were dispatched to the PendingSet during
+ // this same cycle; they have not been delayed yet.
+ const auto EndIt = PendingSet.end() - NumDispatchedToThePendingSet;
+ for (InstRef &IR : make_range(PendingSet.begin(), EndIt)) {
+ Instruction &IS = *IR.getInstruction();
+ // Skip instructions that are (also) blocked on busy pipeline resources;
+ // those are reported by analyzeResourcePressure() instead.
+ if (Resources->checkAvailability(IS.getDesc()))
+ continue;
+
+ // A pending instruction whose register operands are all ready, or a
+ // memory operation whose LSUnit critical dependency is a different
+ // instruction, is stalled by memory; everything else is stalled by an
+ // unsolved register dependency.
+ // NOTE(review): assumes LSU.isReady(IR) returns the index of the
+ // instruction IR depends on (its own index when unblocked) — confirm
+ // against the LSUnit definition.
+ if (IS.isReady() ||
+ (IS.isMemOp() && LSU.isReady(IR) != IR.getSourceIndex())) {
+ MemDeps.emplace_back(IR);
+ } else {
+ RegDeps.emplace_back(IR);
+ }
+ }
+}
+
void Scheduler::cycleEvent(SmallVectorImpl<ResourceRef> &Freed,
SmallVectorImpl<InstRef> &Executed,
SmallVectorImpl<InstRef> &Ready) {
SmallVector<std::pair<ResourceRef, ResourceCycles>, 4> Used;
SmallVector<InstRef, 4> Ready;
HWS.issueInstruction(IR, Used, Ready);
+ NumIssuedOpcodes += IR.getInstruction()->getDesc().NumMicroOps;
notifyReservedOrReleasedBuffers(IR, /* Reserved */ false);
SmallVector<InstRef, 4> Ready;
HWS.cycleEvent(Freed, Executed, Ready);
+ NumDispatchedOpcodes = 0;
+ NumIssuedOpcodes = 0;
for (const ResourceRef &RR : Freed)
notifyResourceAvailable(RR);
return issueReadyInstructions();
}
+// At the end of each cycle, checks whether backend pressure increased and, if
+// so, notifies listeners with a HWPressureEvent describing the cause
+// (unavailable pipeline resources, register dependencies, or memory
+// dependencies). Only runs when bottleneck analysis is enabled.
+Error ExecuteStage::cycleEnd() {
+ if (!EnablePressureEvents)
+ return ErrorSuccess();
+
+ // Backend pressure is considered increased when the dispatch logic stalled
+ // for lack of scheduler/LS tokens, or when fewer micro opcodes were issued
+ // than were dispatched this cycle. Token stalls are always conservatively
+ // reported, even if issue kept up with dispatch.
+ if (!HWS.hadTokenStall() && NumDispatchedOpcodes <= NumIssuedOpcodes)
+ return ErrorSuccess();
+
+ // Resource pressure takes precedence: if any resource unit was busy, report
+ // a RESOURCES event and do not analyze data dependencies for this cycle.
+ SmallVector<InstRef, 8> Insts;
+ uint64_t Mask = HWS.analyzeResourcePressure(Insts);
+ if (Mask) {
+ LLVM_DEBUG(dbgs() << "[E] Backpressure increased because of unavailable "
+ "pipeline resources: "
+ << format_hex(Mask, 16) << '\n');
+ HWPressureEvent Ev(HWPressureEvent::RESOURCES, Insts, Mask);
+ notifyEvent(Ev);
+ return ErrorSuccess();
+ }
+
+ // No resource pressure: attribute the slowdown to data dependencies. Both
+ // a register and a memory event may be reported in the same cycle.
+ SmallVector<InstRef, 8> RegDeps;
+ SmallVector<InstRef, 8> MemDeps;
+ HWS.analyzeDataDependencies(RegDeps, MemDeps);
+ if (RegDeps.size()) {
+ LLVM_DEBUG(
+ dbgs() << "[E] Backpressure increased by register dependencies\n");
+ HWPressureEvent Ev(HWPressureEvent::REGISTER_DEPS, RegDeps);
+ notifyEvent(Ev);
+ }
+
+ if (MemDeps.size()) {
+ LLVM_DEBUG(dbgs() << "[E] Backpressure increased by memory dependencies\n");
+ HWPressureEvent Ev(HWPressureEvent::MEMORY_DEPS, MemDeps);
+ notifyEvent(Ev);
+ }
+
+ return ErrorSuccess();
+}
+
#ifndef NDEBUG
static void verifyInstructionEliminated(const InstRef &IR) {
const Instruction &Inst = *IR.getInstruction();
// be released after MCIS is issued, and all the ResourceCycles for those
// units have been consumed.
bool IsReadyInstruction = HWS.dispatch(IR);
+ NumDispatchedOpcodes += IR.getInstruction()->getDesc().NumMicroOps;
notifyReservedOrReleasedBuffers(IR, /* Reserved */ true);
if (!IsReadyInstruction)
return ErrorSuccess();
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s
+
+add %eax, %ebx
+add %ebx, %ecx
+add %ecx, %edx
+add %edx, %eax
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 403
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 2
+# CHECK-NEXT: uOps Per Cycle: 0.99
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Cycles with backend pressure increase [ 94.04% ]
+# CHECK-NEXT: Throughput Bottlenecks:
+# CHECK-NEXT: Resource Pressure [ 0.00% ]
+# CHECK-NEXT: Data Dependencies: [ 94.04% ]
+# CHECK-NEXT: - Register Dependencies [ 94.04% ]
+# CHECK-NEXT: - Memory Dependencies [ 0.00% ]
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 1 0.50 addl %eax, %ebx
+# CHECK-NEXT: 1 1 0.50 addl %ebx, %ecx
+# CHECK-NEXT: 1 1 0.50 addl %ecx, %edx
+# CHECK-NEXT: 1 1 0.50 addl %edx, %eax
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - addl %eax, %ebx
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - addl %ebx, %ecx
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - addl %ecx, %edx
+# CHECK-NEXT: 1.00 - - - - - - - - - - - - - addl %edx, %eax
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 0123456
+
+# CHECK: [0,0] DeER .. addl %eax, %ebx
+# CHECK-NEXT: [0,1] D=eER.. addl %ebx, %ecx
+# CHECK-NEXT: [0,2] .D=eER. addl %ecx, %edx
+# CHECK-NEXT: [0,3] .D==eER addl %edx, %eax
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 addl %eax, %ebx
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 addl %ebx, %ecx
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 addl %ecx, %edx
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 addl %edx, %eax
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s
+
+vhaddps %xmm0, %xmm0, %xmm1
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 100
+# CHECK-NEXT: Total Cycles: 106
+# CHECK-NEXT: Total uOps: 100
+
+# CHECK: Dispatch Width: 2
+# CHECK-NEXT: uOps Per Cycle: 0.94
+# CHECK-NEXT: IPC: 0.94
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK: Cycles with backend pressure increase [ 76.42% ]
+# CHECK-NEXT: Throughput Bottlenecks:
+# CHECK-NEXT: Resource Pressure [ 76.42% ]
+# CHECK-NEXT: - JFPA [ 76.42% ]
+# CHECK-NEXT: - JFPU0 [ 76.42% ]
+# CHECK-NEXT: Data Dependencies: [ 0.00% ]
+# CHECK-NEXT: - Register Dependencies [ 0.00% ]
+# CHECK-NEXT: - Memory Dependencies [ 0.00% ]
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 4 1.00 vhaddps %xmm0, %xmm0, %xmm1
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vhaddps %xmm0, %xmm0, %xmm1
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 0123456
+
+# CHECK: [0,0] DeeeeER vhaddps %xmm0, %xmm0, %xmm1
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vhaddps %xmm0, %xmm0, %xmm1
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -noalias=false -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s
+
+vmovaps (%rsi), %xmm0
+vmovaps %xmm0, (%rdi)
+vmovaps 16(%rsi), %xmm0
+vmovaps %xmm0, 16(%rdi)
+vmovaps 32(%rsi), %xmm0
+vmovaps %xmm0, 32(%rdi)
+vmovaps 48(%rsi), %xmm0
+vmovaps %xmm0, 48(%rdi)
+
+# CHECK: Iterations: 1500
+# CHECK-NEXT: Instructions: 12000
+# CHECK-NEXT: Total Cycles: 36003
+# CHECK-NEXT: Total uOps: 12000
+
+# CHECK: Dispatch Width: 2
+# CHECK-NEXT: uOps Per Cycle: 0.33
+# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Cycles with backend pressure increase [ 99.89% ]
+# CHECK-NEXT: Throughput Bottlenecks:
+# CHECK-NEXT: Resource Pressure [ 0.00% ]
+# CHECK-NEXT: Data Dependencies: [ 99.89% ]
+# CHECK-NEXT: - Register Dependencies [ 0.00% ]
+# CHECK-NEXT: - Memory Dependencies [ 99.89% ]
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 5 1.00 * vmovaps (%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: 1 5 1.00 * vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: 1 5 1.00 * vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: 1 5 1.00 * vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 48(%rdi)
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - 2.00 2.00 4.00 4.00 4.00 - 4.00 4.00 - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 1.00 1.00 - 1.00 - - - - - - vmovaps (%rsi), %xmm0
+# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: - - - - 1.00 1.00 - 1.00 - - - - - - vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 48(%rdi)
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 0123456
+
+# CHECK: [0,0] DeeeeeER . . . .. vmovaps (%rsi), %xmm0
+# CHECK-NEXT: [0,1] D=====eER . . . .. vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: [0,2] .D=====eeeeeER . . .. vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: [0,3] .D==========eER. . .. vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: [0,4] . D==========eeeeeER. .. vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: [0,5] . D===============eER .. vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: [0,6] . D===============eeeeeER. vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: [0,7] . D====================eER vmovaps %xmm0, 48(%rdi)
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rsi), %xmm0
+# CHECK-NEXT: 1. 1 6.0 0.0 0.0 vmovaps %xmm0, (%rdi)
+# CHECK-NEXT: 2. 1 6.0 0.0 0.0 vmovaps 16(%rsi), %xmm0
+# CHECK-NEXT: 3. 1 11.0 0.0 0.0 vmovaps %xmm0, 16(%rdi)
+# CHECK-NEXT: 4. 1 11.0 0.0 0.0 vmovaps 32(%rsi), %xmm0
+# CHECK-NEXT: 5. 1 16.0 0.0 0.0 vmovaps %xmm0, 32(%rdi)
+# CHECK-NEXT: 6. 1 16.0 0.0 0.0 vmovaps 48(%rsi), %xmm0
+# CHECK-NEXT: 7. 1 21.0 0.0 0.0 vmovaps %xmm0, 48(%rdi)
SummaryView::SummaryView(const MCSchedModel &Model, ArrayRef<MCInst> S,
unsigned Width)
: SM(Model), Source(S), DispatchWidth(Width), LastInstructionIdx(0),
- TotalCycles(0), NumMicroOps(0),
+ TotalCycles(0), NumMicroOps(0), BPI({0, 0, 0, 0}),
+ ResourcePressureDistribution(Model.getNumProcResourceKinds(), 0),
ProcResourceUsage(Model.getNumProcResourceKinds(), 0),
ProcResourceMasks(Model.getNumProcResourceKinds()),
- ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0) {
+ ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0),
+ PressureIncreasedBecauseOfResources(false),
+ PressureIncreasedBecauseOfDataDependencies(false),
+ SeenStallCycles(false) {
computeProcResourceMasks(SM, ProcResourceMasks);
for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
unsigned Index = getResourceStateIndex(ProcResourceMasks[I]);
}
}
+// Accumulates per-cycle bottleneck statistics from pressure events. Resource
+// events update the per-resource pressure distribution; dependency events
+// bump the matching cycle counters. The per-cycle boolean flags are folded
+// into BPI (and reset) by onCycleEnd().
+void SummaryView::onEvent(const HWPressureEvent &Event) {
+ assert(Event.Reason != HWPressureEvent::INVALID &&
+ "Unexpected invalid event!");
+
+ switch (Event.Reason) {
+ default:
+ break;
+
+ case HWPressureEvent::RESOURCES: {
+ PressureIncreasedBecauseOfResources = true;
+ ++BPI.ResourcePressureCycles;
+ uint64_t ResourceMask = Event.ResourceMask;
+ // Walk the mask one set bit at a time (Current isolates the lowest bit).
+ while (ResourceMask) {
+ uint64_t Current = ResourceMask & (-ResourceMask);
+ unsigned Index = getResourceStateIndex(Current);
+ unsigned ProcResID = ResIdx2ProcResID[Index];
+ const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
+ // A simple resource unit: charge the pressure to it directly.
+ if (!PRDesc.SubUnitsIdxBegin) {
+ ResourcePressureDistribution[Index]++;
+ ResourceMask ^= Current;
+ continue;
+ }
+
+ // A resource group: spread the pressure across all of its sub-units so
+ // the report names concrete pipelines rather than the group.
+ for (unsigned I = 0, E = PRDesc.NumUnits; I < E; ++I) {
+ unsigned OtherProcResID = PRDesc.SubUnitsIdxBegin[I];
+ unsigned OtherMask = ProcResourceMasks[OtherProcResID];
+ ResourcePressureDistribution[getResourceStateIndex(OtherMask)]++;
+ }
+
+ ResourceMask ^= Current;
+ }
+ }
+
+ break;
+ case HWPressureEvent::REGISTER_DEPS:
+ PressureIncreasedBecauseOfDataDependencies = true;
+ ++BPI.RegisterDependencyCycles;
+ break;
+ case HWPressureEvent::MEMORY_DEPS:
+ PressureIncreasedBecauseOfDataDependencies = true;
+ ++BPI.MemoryDependencyCycles;
+ break;
+ }
+}
+
+// Prints the bottleneck report: the percentage of cycles where backend
+// pressure increased, broken down by resource pressure (with the per-resource
+// distribution) and by register/memory data dependencies. Nothing is printed
+// unless at least one dispatch stall AND one pressure increase were observed.
+void SummaryView::printBottleneckHints(raw_ostream &OS) const {
+ if (!SeenStallCycles || !BPI.PressureIncreaseCycles)
+ return;
+
+ // All percentages below are fractions of the total simulated cycles.
+ double PressurePerCycle =
+ (double)BPI.PressureIncreaseCycles * 100 / TotalCycles;
+ double ResourcePressurePerCycle =
+ (double)BPI.ResourcePressureCycles * 100 / TotalCycles;
+ double DDPerCycle = (double)BPI.DataDependencyCycles * 100 / TotalCycles;
+ double RegDepPressurePerCycle =
+ (double)BPI.RegisterDependencyCycles * 100 / TotalCycles;
+ double MemDepPressurePerCycle =
+ (double)BPI.MemoryDependencyCycles * 100 / TotalCycles;
+
+ // floor(x * 100 + 0.5) / 100 rounds to two decimal places half-up, so the
+ // printed value matches what the autogenerated tests expect.
+ OS << "\nCycles with backend pressure increase [ "
+ << format("%.2f", floor((PressurePerCycle * 100) + 0.5) / 100) << "% ]";
+
+ OS << "\nThroughput Bottlenecks: "
+ << "\n Resource Pressure [ "
+ << format("%.2f", floor((ResourcePressurePerCycle * 100) + 0.5) / 100)
+ << "% ]";
+
+ // NOTE(review): this guard is redundant — the early return above already
+ // guarantees BPI.PressureIncreaseCycles is nonzero here.
+ if (BPI.PressureIncreaseCycles) {
+ // Distribution is indexed by resource-state index; for single-unit masks
+ // getResourceStateIndex(1ULL << I) maps straight back to I.
+ for (unsigned I = 0, E = ResourcePressureDistribution.size(); I < E; ++I) {
+ if (ResourcePressureDistribution[I]) {
+ double Frequency =
+ (double)ResourcePressureDistribution[I] * 100 / TotalCycles;
+ unsigned Index = ResIdx2ProcResID[getResourceStateIndex(1ULL << I)];
+ const MCProcResourceDesc &PRDesc = *SM.getProcResource(Index);
+ OS << "\n - " << PRDesc.Name << " [ "
+ << format("%.2f", floor((Frequency * 100) + 0.5) / 100) << "% ]";
+ }
+ }
+ }
+
+ OS << "\n Data Dependencies: [ "
+ << format("%.2f", floor((DDPerCycle * 100) + 0.5) / 100) << "% ]";
+
+ OS << "\n - Register Dependencies [ "
+ << format("%.2f", floor((RegDepPressurePerCycle * 100) + 0.5) / 100)
+ << "% ]";
+
+ OS << "\n - Memory Dependencies [ "
+ << format("%.2f", floor((MemDepPressurePerCycle * 100) + 0.5) / 100)
+ << "% ]\n\n";
+}
+
void SummaryView::printView(raw_ostream &OS) const {
unsigned Instructions = Source.size();
unsigned Iterations = (LastInstructionIdx / Instructions) + 1;
TempStream << "\nBlock RThroughput: "
<< format("%.1f", floor((BlockRThroughput * 10) + 0.5) / 10)
<< '\n';
+
+ printBottleneckHints(TempStream);
TempStream.flush();
OS << Buffer;
}
unsigned TotalCycles;
// The total number of micro opcodes contributed by a block of instructions.
unsigned NumMicroOps;
+
+ struct BackPressureInfo {
+ // Cycles where backpressure increased.
+ unsigned PressureIncreaseCycles;
+ // Cycles where backpressure increased because of pipeline pressure.
+ unsigned ResourcePressureCycles;
+ // Cycles where backpressure increased because of data dependencies.
+ unsigned DataDependencyCycles;
+ // Cycles where backpressure increased because of register dependencies.
+ unsigned RegisterDependencyCycles;
+ // Cycles where backpressure increased because of memory dependencies.
+ unsigned MemoryDependencyCycles;
+ };
+ BackPressureInfo BPI;
+
+ // Resource pressure distribution. There is an element for every processor
+ // resource declared by the scheduling model. Quantities are number of cycles.
+ llvm::SmallVector<unsigned, 8> ResourcePressureDistribution;
+
// For each processor resource, this vector stores the cumulative number of
// resource cycles consumed by the analyzed code block.
llvm::SmallVector<unsigned, 8> ProcResourceUsage;
// Used to map resource indices to actual processor resource IDs.
llvm::SmallVector<unsigned, 8> ResIdx2ProcResID;
+ // True if resource pressure events were notified during this cycle.
+ bool PressureIncreasedBecauseOfResources;
+ bool PressureIncreasedBecauseOfDataDependencies;
+
+ // True if throughput was affected by dispatch stalls.
+ bool SeenStallCycles;
+
// Compute the reciprocal throughput for the analyzed code block.
// The reciprocal block throughput is computed as the MAX between:
// - NumMicroOps / DispatchWidth
// - Total Resource Cycles / #Units (for every resource consumed).
double getBlockRThroughput() const;
+ // Prints a bottleneck message to OS.
+ void printBottleneckHints(llvm::raw_ostream &OS) const;
+
public:
SummaryView(const llvm::MCSchedModel &Model, llvm::ArrayRef<llvm::MCInst> S,
unsigned Width);
- void onCycleEnd() override { ++TotalCycles; }
+ void onCycleEnd() override {
+ ++TotalCycles;
+ if (PressureIncreasedBecauseOfResources ||
+ PressureIncreasedBecauseOfDataDependencies) {
+ ++BPI.PressureIncreaseCycles;
+ if (PressureIncreasedBecauseOfDataDependencies)
+ ++BPI.DataDependencyCycles;
+ PressureIncreasedBecauseOfResources = false;
+ PressureIncreasedBecauseOfDataDependencies = false;
+ }
+ }
void onEvent(const HWInstructionEvent &Event) override;
+ void onEvent(const HWStallEvent &Event) override {
+ SeenStallCycles = true;
+ }
+
+ void onEvent(const HWPressureEvent &Event) override;
void printView(llvm::raw_ostream &OS) const override;
};
cl::desc("Print all views including hardware statistics"),
cl::cat(ViewOptions), cl::init(false));
+static cl::opt<bool> EnableBottleneckAnalysis(
+ "bottleneck-analysis",
+ cl::desc("Enable bottleneck analysis (disabled by default)"),
+ cl::cat(ViewOptions), cl::init(false));
+
namespace {
const Target *getTarget(const char *ProgName) {
mca::Context MCA(*MRI, *STI);
mca::PipelineOptions PO(Width, RegisterFileSize, LoadQueueSize,
- StoreQueueSize, AssumeNoAlias);
+ StoreQueueSize, AssumeNoAlias,
+ EnableBottleneckAnalysis);
// Number each region in the sequence.
unsigned RegionIdx = 0;