This patch adds an experimental stage named MicroOpQueueStage.
MicroOpQueueStage can be used to simulate a hardware micro-op queue (basically,
a decoupling queue between 'decode' and 'dispatch'). Users can specify a queue
size, as well as an optional MaxIPC (which - in the absence of a "Decoders" stage
- can be used to simulate a different throughput from the decoders).
This stage is added to the default pipeline between the EntryStage and the
DispatchStage only if PipelineOption::MicroOpQueue is different from zero. By
default, llvm-mca sets PipelineOption::MicroOpQueue to the value of hidden flag
-micro-op-queue-size.
Throughput from the decoder can be simulated via another hidden flag named
-decoder-throughput. That flag allows us to quickly experiment with different
frontend throughputs. For targets that declare a loop buffer, flag
-decoder-throughput allows users to do multiple runs, each time simulating a
different throughput from the decoders.
This stage can/will be extended in the future. For example, we could add a
"buffer full" event to notify bottlenecks caused by backpressure. Flag
-decoder-throughput would probably go away if in the future we delegate to
another stage (DecoderStage?) the simulation of a (potentially variable)
throughput from the decoders. For now, flag -decoder-throughput is "good
enough" to run some simple experiments.
Differential Revision: https://reviews.llvm.org/D59928
llvm-svn: 357248
/// This is a convenience struct to hold the parameters necessary for creating
/// the pre-built "default" out-of-order pipeline.
struct PipelineOptions {
- PipelineOptions(unsigned DW, unsigned RFS, unsigned LQS, unsigned SQS,
- bool NoAlias, bool ShouldEnableBottleneckAnalysis = false)
- : DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
+ PipelineOptions(unsigned UOPQSize, unsigned DecThr, unsigned DW, unsigned RFS,
+ unsigned LQS, unsigned SQS, bool NoAlias,
+ bool ShouldEnableBottleneckAnalysis = false)
+ : MicroOpQueueSize(UOPQSize), DecodersThroughput(DecThr),
+ DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
StoreQueueSize(SQS), AssumeNoAlias(NoAlias),
EnableBottleneckAnalysis(ShouldEnableBottleneckAnalysis) {}
+ unsigned MicroOpQueueSize;
+ unsigned DecodersThroughput; // Instructions per cycle.
unsigned DispatchWidth;
unsigned RegisterFileSize;
unsigned LoadQueueSize;
--- /dev/null
+//===---------------------- MicroOpQueueStage.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a stage that implements a queue of micro opcodes.
+/// It can be used to simulate a hardware micro-op queue that serves opcodes to
+/// the out of order backend.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
+#define LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/Stages/Stage.h"
+
+namespace llvm {
+namespace mca {
+
+/// A stage that simulates a queue of instruction opcodes.
+class MicroOpQueueStage : public Stage {
+ SmallVector<InstRef, 8> Buffer;
+ unsigned NextAvailableSlotIdx;
+ unsigned CurrentInstructionSlotIdx;
+
+ // Limits the number of instructions that can be written to this buffer every
+ // cycle. A value of zero means that there is no limit to the instruction
+ // throughput in input.
+ const unsigned MaxIPC;
+ unsigned CurrentIPC;
+
+ // Number of entries that are available during this cycle.
+ unsigned AvailableEntries;
+
+ // True if instructions dispatched to this stage don't need to wait for the
+ // next cycle before moving to the next stage.
+ // False if this buffer acts as a one cycle delay in the execution pipeline.
+ bool IsZeroLatencyStage;
+
+ MicroOpQueueStage(const MicroOpQueueStage &Other) = delete;
+ MicroOpQueueStage &operator=(const MicroOpQueueStage &Other) = delete;
+
+ // By default, an instruction consumes a number of buffer entries equal to its
+ // number of micro opcodes (see field `InstrDesc::NumMicroOpcodes`). The
+ // number of entries consumed by an instruction is normalized to the
+ // minimum value between NumMicroOpcodes and the buffer size. This is to avoid
+ // problems with (microcoded) instructions that generate a number of micro
+ // opcodes than doesn't fit in the buffer.
+ unsigned getNormalizedOpcodes(const InstRef &IR) const {
+ unsigned NormalizedOpcodes =
+ std::min(static_cast<unsigned>(Buffer.size()),
+ IR.getInstruction()->getDesc().NumMicroOps);
+ return NormalizedOpcodes ? NormalizedOpcodes : 1U;
+ }
+
+ Error moveInstructions();
+
+public:
+ MicroOpQueueStage(unsigned Size, unsigned IPC = 0,
+ bool ZeroLatencyStage = true);
+
+ bool isAvailable(const InstRef &IR) const override {
+ if (MaxIPC && CurrentIPC == MaxIPC)
+ return false;
+ unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
+ if (NormalizedOpcodes > AvailableEntries)
+ return false;
+ return true;
+ }
+
+ bool hasWorkToComplete() const override {
+ return AvailableEntries != Buffer.size();
+ }
+
+ Error execute(InstRef &IR) override;
+ Error cycleStart() override;
+ Error cycleEnd() override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif // LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
Stages/EntryStage.cpp
Stages/ExecuteStage.cpp
Stages/InstructionTables.cpp
+ Stages/MicroOpQueueStage.cpp
Stages/RetireStage.cpp
Stages/Stage.cpp
Support.cpp
#include "llvm/MCA/Stages/DispatchStage.h"
#include "llvm/MCA/Stages/EntryStage.h"
#include "llvm/MCA/Stages/ExecuteStage.h"
+#include "llvm/MCA/Stages/MicroOpQueueStage.h"
#include "llvm/MCA/Stages/RetireStage.h"
namespace llvm {
// Build the pipeline.
auto StagePipeline = llvm::make_unique<Pipeline>();
StagePipeline->appendStage(std::move(Fetch));
+ if (Opts.MicroOpQueueSize)
+ StagePipeline->appendStage(llvm::make_unique<MicroOpQueueStage>(
+ Opts.MicroOpQueueSize, Opts.DecodersThroughput));
StagePipeline->appendStage(std::move(Dispatch));
StagePipeline->appendStage(std::move(Execute));
StagePipeline->appendStage(std::move(Retire));
--- /dev/null
+//===---------------------- MicroOpQueueStage.cpp ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the MicroOpQueueStage.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MCA/Stages/MicroOpQueueStage.h"
+
+namespace llvm {
+namespace mca {
+
+#define DEBUG_TYPE "llvm-mca"
+
+Error MicroOpQueueStage::moveInstructions() {
+ InstRef IR = Buffer[CurrentInstructionSlotIdx];
+ while (IR && checkNextStage(IR)) {
+ if (llvm::Error Val = moveToTheNextStage(IR))
+ return Val;
+
+ Buffer[CurrentInstructionSlotIdx].invalidate();
+ unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
+ CurrentInstructionSlotIdx += NormalizedOpcodes;
+ CurrentInstructionSlotIdx %= Buffer.size();
+ AvailableEntries += NormalizedOpcodes;
+ IR = Buffer[CurrentInstructionSlotIdx];
+ }
+
+ return llvm::ErrorSuccess();
+}
+
+MicroOpQueueStage::MicroOpQueueStage(unsigned Size, unsigned IPC,
+ bool ZeroLatencyStage)
+ : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0), MaxIPC(IPC),
+ CurrentIPC(0), IsZeroLatencyStage(ZeroLatencyStage) {
+ Buffer.resize(Size ? Size : 1);
+ AvailableEntries = Buffer.size();
+}
+
+Error MicroOpQueueStage::execute(InstRef &IR) {
+ Buffer[NextAvailableSlotIdx] = IR;
+ unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
+ NextAvailableSlotIdx += NormalizedOpcodes;
+ NextAvailableSlotIdx %= Buffer.size();
+ AvailableEntries -= NormalizedOpcodes;
+ ++CurrentIPC;
+ return llvm::ErrorSuccess();
+}
+
+Error MicroOpQueueStage::cycleStart() {
+ CurrentIPC = 0;
+ if (!IsZeroLatencyStage)
+ return moveInstructions();
+ return llvm::ErrorSuccess();
+}
+
+Error MicroOpQueueStage::cycleEnd() {
+ if (IsZeroLatencyStage)
+ return moveInstructions();
+ return llvm::ErrorSuccess();
+}
+
+} // namespace mca
+} // namespace llvm
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-1
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-2
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=3 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-3
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-4
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-DEC-2
+
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-1
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-2
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-DEC-1
+
+add %eax, %eax
+add %ebx, %ebx
+add %ecx, %ecx
+add %edx, %edx
+
+# BTVER2-DEC-2: Iterations: 1500
+# BTVER2-DEC-2-NEXT: Instructions: 6000
+# BTVER2-DEC-2-NEXT: Total Cycles: 3003
+# BTVER2-DEC-2-NEXT: Total uOps: 6000
+
+# BTVER2-DEC-2: Dispatch Width: 2
+# BTVER2-DEC-2-NEXT: uOps Per Cycle: 2.00
+# BTVER2-DEC-2-NEXT: IPC: 2.00
+# BTVER2-DEC-2-NEXT: Block RThroughput: 2.0
+
+# BTVER2-DEC-1: Iterations: 1500
+# BTVER2-DEC-1-NEXT: Instructions: 6000
+# BTVER2-DEC-1-NEXT: Total Cycles: 6003
+# BTVER2-DEC-1-NEXT: Total uOps: 6000
+
+# BTVER2-UOPQ-1: Iterations: 1500
+# BTVER2-UOPQ-1-NEXT: Instructions: 6000
+# BTVER2-UOPQ-1-NEXT: Total Cycles: 6003
+# BTVER2-UOPQ-1-NEXT: Total uOps: 6000
+
+# BTVER2-UOPQ-2: Iterations: 1500
+# BTVER2-UOPQ-2-NEXT: Instructions: 6000
+# BTVER2-UOPQ-2-NEXT: Total Cycles: 3003
+# BTVER2-UOPQ-2-NEXT: Total uOps: 6000
+
+# HASWELL-DEC-2: Iterations: 1500
+# HASWELL-DEC-2-NEXT: Instructions: 6000
+# HASWELL-DEC-2-NEXT: Total Cycles: 3003
+# HASWELL-DEC-2-NEXT: Total uOps: 6000
+
+# HASWELL-UOPQ-1: Iterations: 1500
+# HASWELL-UOPQ-1-NEXT: Instructions: 6000
+# HASWELL-UOPQ-1-NEXT: Total Cycles: 6003
+# HASWELL-UOPQ-1-NEXT: Total uOps: 6000
+
+# HASWELL-UOPQ-2: Iterations: 1500
+# HASWELL-UOPQ-2-NEXT: Instructions: 6000
+# HASWELL-UOPQ-2-NEXT: Total Cycles: 3003
+# HASWELL-UOPQ-2-NEXT: Total uOps: 6000
+
+# HASWELL-UOPQ-3: Iterations: 1500
+# HASWELL-UOPQ-3-NEXT: Instructions: 6000
+# HASWELL-UOPQ-3-NEXT: Total Cycles: 2003
+# HASWELL-UOPQ-3-NEXT: Total uOps: 6000
+
+# HASWELL-UOPQ-4: Iterations: 1500
+# HASWELL-UOPQ-4-NEXT: Instructions: 6000
+# HASWELL-UOPQ-4-NEXT: Total Cycles: 1503
+# HASWELL-UOPQ-4-NEXT: Total uOps: 6000
+
+# BTVER2-DEC-1: Dispatch Width: 2
+# BTVER2-DEC-1-NEXT: uOps Per Cycle: 1.00
+# BTVER2-DEC-1-NEXT: IPC: 1.00
+# BTVER2-DEC-1-NEXT: Block RThroughput: 2.0
+
+# BTVER2-UOPQ-1: Dispatch Width: 2
+# BTVER2-UOPQ-1-NEXT: uOps Per Cycle: 1.00
+# BTVER2-UOPQ-1-NEXT: IPC: 1.00
+# BTVER2-UOPQ-1-NEXT: Block RThroughput: 2.0
+
+# BTVER2-UOPQ-2: Dispatch Width: 2
+# BTVER2-UOPQ-2-NEXT: uOps Per Cycle: 2.00
+# BTVER2-UOPQ-2-NEXT: IPC: 2.00
+# BTVER2-UOPQ-2-NEXT: Block RThroughput: 2.0
+
+# HASWELL-DEC-2: Dispatch Width: 4
+# HASWELL-DEC-2-NEXT: uOps Per Cycle: 2.00
+# HASWELL-DEC-2-NEXT: IPC: 2.00
+# HASWELL-DEC-2-NEXT: Block RThroughput: 1.0
+
+# HASWELL-UOPQ-1: Dispatch Width: 4
+# HASWELL-UOPQ-1-NEXT: uOps Per Cycle: 1.00
+# HASWELL-UOPQ-1-NEXT: IPC: 1.00
+# HASWELL-UOPQ-1-NEXT: Block RThroughput: 1.0
+
+# HASWELL-UOPQ-2: Dispatch Width: 4
+# HASWELL-UOPQ-2-NEXT: uOps Per Cycle: 2.00
+# HASWELL-UOPQ-2-NEXT: IPC: 2.00
+# HASWELL-UOPQ-2-NEXT: Block RThroughput: 1.0
+
+# HASWELL-UOPQ-3: Dispatch Width: 4
+# HASWELL-UOPQ-3-NEXT: uOps Per Cycle: 3.00
+# HASWELL-UOPQ-3-NEXT: IPC: 3.00
+# HASWELL-UOPQ-3-NEXT: Block RThroughput: 1.0
+
+# HASWELL-UOPQ-4: Dispatch Width: 4
+# HASWELL-UOPQ-4-NEXT: uOps Per Cycle: 3.99
+# HASWELL-UOPQ-4-NEXT: IPC: 3.99
+# HASWELL-UOPQ-4-NEXT: Block RThroughput: 1.0
"be used for register mappings"),
cl::cat(ToolOptions), cl::init(0));
+static cl::opt<unsigned>
+ MicroOpQueue("micro-op-queue-size", cl::Hidden,
+ cl::desc("Number of entries in the micro-op queue"),
+ cl::cat(ToolOptions), cl::init(0));
+
+static cl::opt<unsigned>
+ DecoderThroughput("decoder-throughput", cl::Hidden,
+ cl::desc("Maximum throughput from the decoders "
+ "(instructions per cycle)"),
+ cl::cat(ToolOptions), cl::init(0));
+
static cl::opt<bool>
PrintRegisterFileStats("register-file-stats",
cl::desc("Print register file statistics"),
// Create a context to control ownership of the pipeline hardware.
mca::Context MCA(*MRI, *STI);
- mca::PipelineOptions PO(DispatchWidth, RegisterFileSize, LoadQueueSize,
- StoreQueueSize, AssumeNoAlias,
- EnableBottleneckAnalysis);
+ mca::PipelineOptions PO(MicroOpQueue, DecoderThroughput, DispatchWidth,
+ RegisterFileSize, LoadQueueSize, StoreQueueSize,
+ AssumeNoAlias, EnableBottleneckAnalysis);
// Number each region in the sequence.
unsigned RegionIdx = 0;