This patch adds an experimental stage named MicroOpQueueStage.
MicroOpQueueStage can be used to simulate a hardware micro-op queue (basically,
a decoupling queue between 'decode' and 'dispatch'). Users can specify a queue
size, as well as an optional MaxIPC (which - in the absence of a "Decoders" stage
- can be used to simulate a different throughput from the decoders).
This stage is added to the default pipeline between the EntryStage and the
DispatchStage only if PipelineOption::MicroOpQueue is different from zero. By
default, llvm-mca sets PipelineOption::MicroOpQueue to the value of hidden flag
-micro-op-queue-size.
Throughput from the decoder can be simulated via another hidden flag named
-decoder-throughput. That flag allows us to quickly experiment with different
frontend throughputs. For targets that declare a loop buffer, flag
-decoder-throughput allows users to do multiple runs, each time simulating a
different throughput from the decoders.
This stage can/will be extended in the future. For example, we could add a
"buffer full" event to notify bottlenecks caused by backpressure. Flag
-decoder-throughput would probably go away if in the future we delegate to
another stage (DecoderStage?) the simulation of a (potentially variable)
throughput from the decoders. For now, flag -decoder-throughput is "good
enough" to run some simple experiments.
Differential Revision: https://reviews.llvm.org/D59928
llvm-svn: 357248
/// This is a convenience struct to hold the parameters necessary for creating
/// the pre-built "default" out-of-order pipeline.
struct PipelineOptions {
- PipelineOptions(unsigned DW, unsigned RFS, unsigned LQS, unsigned SQS,
- bool NoAlias, bool ShouldEnableBottleneckAnalysis = false)
- : DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
+ PipelineOptions(unsigned UOPQSize, unsigned DecThr, unsigned DW, unsigned RFS,
+ unsigned LQS, unsigned SQS, bool NoAlias,
+ bool ShouldEnableBottleneckAnalysis = false)
+ : MicroOpQueueSize(UOPQSize), DecodersThroughput(DecThr),
+ DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
StoreQueueSize(SQS), AssumeNoAlias(NoAlias),
EnableBottleneckAnalysis(ShouldEnableBottleneckAnalysis) {}
+ unsigned MicroOpQueueSize;
+ unsigned DecodersThroughput; // Instructions per cycle.
unsigned DispatchWidth;
unsigned RegisterFileSize;
unsigned LoadQueueSize;
--- /dev/null
+//===---------------------- MicroOpQueueStage.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a stage that implements a queue of micro opcodes.
+/// It can be used to simulate a hardware micro-op queue that serves opcodes to
+/// the out of order backend.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
+#define LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/Stages/Stage.h"
+
+namespace llvm {
+namespace mca {
+
+/// A stage that simulates a queue of instruction opcodes.
+class MicroOpQueueStage : public Stage {
+ SmallVector<InstRef, 8> Buffer;
+ unsigned NextAvailableSlotIdx;
+ unsigned CurrentInstructionSlotIdx;
+
+ // Limits the number of instructions that can be written to this buffer every
+ // cycle. A value of zero means that there is no limit to the instruction
+ // throughput in input.
+ const unsigned MaxIPC;
+ unsigned CurrentIPC;
+
+ // Number of entries that are available during this cycle.
+ unsigned AvailableEntries;
+
+ // True if instructions dispatched to this stage don't need to wait for the
+ // next cycle before moving to the next stage.
+ // False if this buffer acts as a one cycle delay in the execution pipeline.
+ bool IsZeroLatencyStage;
+
+ MicroOpQueueStage(const MicroOpQueueStage &Other) = delete;
+ MicroOpQueueStage &operator=(const MicroOpQueueStage &Other) = delete;
+
+ // By default, an instruction consumes a number of buffer entries equal to its
+ // number of micro opcodes (see field `InstrDesc::NumMicroOpcodes`). The
+ // number of entries consumed by an instruction is normalized to the
+ // minimum value between NumMicroOpcodes and the buffer size. This is to avoid
+ // problems with (microcoded) instructions that generate a number of micro
+ // opcodes than doesn't fit in the buffer.
+ unsigned getNormalizedOpcodes(const InstRef &IR) const {
+ unsigned NormalizedOpcodes =
+ std::min(static_cast<unsigned>(Buffer.size()),
+ IR.getInstruction()->getDesc().NumMicroOps);
+ return NormalizedOpcodes ? NormalizedOpcodes : 1U;
+ }
+
+ Error moveInstructions();
+
+public:
+ MicroOpQueueStage(unsigned Size, unsigned IPC = 0,
+ bool ZeroLatencyStage = true);
+
+ bool isAvailable(const InstRef &IR) const override {
+ if (MaxIPC && CurrentIPC == MaxIPC)
+ return false;
+ unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
+ if (NormalizedOpcodes > AvailableEntries)
+ return false;
+ return true;
+ }
+
+ bool hasWorkToComplete() const override {
+ return AvailableEntries != Buffer.size();
+ }
+
+ Error execute(InstRef &IR) override;
+ Error cycleStart() override;
+ Error cycleEnd() override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif // LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
Stages/EntryStage.cpp
Stages/ExecuteStage.cpp
Stages/InstructionTables.cpp
+ Stages/MicroOpQueueStage.cpp
Stages/RetireStage.cpp
Stages/Stage.cpp
Support.cpp
#include "llvm/MCA/Stages/DispatchStage.h"
#include "llvm/MCA/Stages/EntryStage.h"
#include "llvm/MCA/Stages/ExecuteStage.h"
+#include "llvm/MCA/Stages/MicroOpQueueStage.h"
#include "llvm/MCA/Stages/RetireStage.h"
namespace llvm {
// Build the pipeline.
auto StagePipeline = llvm::make_unique<Pipeline>();
StagePipeline->appendStage(std::move(Fetch));
+ if (Opts.MicroOpQueueSize)
+ StagePipeline->appendStage(llvm::make_unique<MicroOpQueueStage>(
+ Opts.MicroOpQueueSize, Opts.DecodersThroughput));
StagePipeline->appendStage(std::move(Dispatch));
StagePipeline->appendStage(std::move(Execute));
StagePipeline->appendStage(std::move(Retire));
--- /dev/null
+//===---------------------- MicroOpQueueStage.cpp ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the MicroOpQueueStage.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MCA/Stages/MicroOpQueueStage.h"
+
+namespace llvm {
+namespace mca {
+
+#define DEBUG_TYPE "llvm-mca"
+
+Error MicroOpQueueStage::moveInstructions() {
+ InstRef IR = Buffer[CurrentInstructionSlotIdx];
+ while (IR && checkNextStage(IR)) {
+ if (llvm::Error Val = moveToTheNextStage(IR))
+ return Val;
+
+ Buffer[CurrentInstructionSlotIdx].invalidate();
+ unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
+ CurrentInstructionSlotIdx += NormalizedOpcodes;
+ CurrentInstructionSlotIdx %= Buffer.size();
+ AvailableEntries += NormalizedOpcodes;
+ IR = Buffer[CurrentInstructionSlotIdx];
+ }
+
+ return llvm::ErrorSuccess();
+}
+
+MicroOpQueueStage::MicroOpQueueStage(unsigned Size, unsigned IPC,
+ bool ZeroLatencyStage)
+ : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0), MaxIPC(IPC),
+ CurrentIPC(0), IsZeroLatencyStage(ZeroLatencyStage) {
+ Buffer.resize(Size ? Size : 1);
+ AvailableEntries = Buffer.size();
+}
+
+Error MicroOpQueueStage::execute(InstRef &IR) {
+ Buffer[NextAvailableSlotIdx] = IR;
+ unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
+ NextAvailableSlotIdx += NormalizedOpcodes;
+ NextAvailableSlotIdx %= Buffer.size();
+ AvailableEntries -= NormalizedOpcodes;
+ ++CurrentIPC;
+ return llvm::ErrorSuccess();
+}
+
+Error MicroOpQueueStage::cycleStart() {
+ CurrentIPC = 0;
+ if (!IsZeroLatencyStage)
+ return moveInstructions();
+ return llvm::ErrorSuccess();
+}
+
+Error MicroOpQueueStage::cycleEnd() {
+ if (IsZeroLatencyStage)
+ return moveInstructions();
+ return llvm::ErrorSuccess();
+}
+
+} // namespace mca
+} // namespace llvm
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-1
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-2
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=3 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-3
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-4
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-DEC-2
+
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-1
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-2
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-DEC-1
+
+add %eax, %eax
+add %ebx, %ebx
+add %ecx, %ecx
+add %edx, %edx
+
+# BTVER2-DEC-2: Iterations: 1500
+# BTVER2-DEC-2-NEXT: Instructions: 6000
+# BTVER2-DEC-2-NEXT: Total Cycles: 3003
+# BTVER2-DEC-2-NEXT: Total uOps: 6000
+
+# BTVER2-DEC-2: Dispatch Width: 2
+# BTVER2-DEC-2-NEXT: uOps Per Cycle: 2.00
+# BTVER2-DEC-2-NEXT: IPC: 2.00
+# BTVER2-DEC-2-NEXT: Block RThroughput: 2.0
+
+# BTVER2-DEC-1: Iterations: 1500
+# BTVER2-DEC-1-NEXT: Instructions: 6000
+# BTVER2-DEC-1-NEXT: Total Cycles: 6003
+# BTVER2-DEC-1-NEXT: Total uOps: 6000
+
+# BTVER2-UOPQ-1: Iterations: 1500
+# BTVER2-UOPQ-1-NEXT: Instructions: 6000
+# BTVER2-UOPQ-1-NEXT: Total Cycles: 6003
+# BTVER2-UOPQ-1-NEXT: Total uOps: 6000
+
+# BTVER2-UOPQ-2: Iterations: 1500
+# BTVER2-UOPQ-2-NEXT: Instructions: 6000
+# BTVER2-UOPQ-2-NEXT: Total Cycles: 3003
+# BTVER2-UOPQ-2-NEXT: Total uOps: 6000
+
+# HASWELL-DEC-2: Iterations: 1500
+# HASWELL-DEC-2-NEXT: Instructions: 6000
+# HASWELL-DEC-2-NEXT: Total Cycles: 3003
+# HASWELL-DEC-2-NEXT: Total uOps: 6000
+
+# HASWELL-UOPQ-1: Iterations: 1500
+# HASWELL-UOPQ-1-NEXT: Instructions: 6000
+# HASWELL-UOPQ-1-NEXT: Total Cycles: 6003
+# HASWELL-UOPQ-1-NEXT: Total uOps: 6000
+
+# HASWELL-UOPQ-2: Iterations: 1500
+# HASWELL-UOPQ-2-NEXT: Instructions: 6000
+# HASWELL-UOPQ-2-NEXT: Total Cycles: 3003
+# HASWELL-UOPQ-2-NEXT: Total uOps: 6000
+
+# HASWELL-UOPQ-3: Iterations: 1500
+# HASWELL-UOPQ-3-NEXT: Instructions: 6000
+# HASWELL-UOPQ-3-NEXT: Total Cycles: 2003
+# HASWELL-UOPQ-3-NEXT: Total uOps: 6000
+
+# HASWELL-UOPQ-4: Iterations: 1500
+# HASWELL-UOPQ-4-NEXT: Instructions: 6000
+# HASWELL-UOPQ-4-NEXT: Total Cycles: 1503
+# HASWELL-UOPQ-4-NEXT: Total uOps: 6000
+
+# BTVER2-DEC-1: Dispatch Width: 2
+# BTVER2-DEC-1-NEXT: uOps Per Cycle: 1.00
+# BTVER2-DEC-1-NEXT: IPC: 1.00
+# BTVER2-DEC-1-NEXT: Block RThroughput: 2.0
+
+# BTVER2-UOPQ-1: Dispatch Width: 2
+# BTVER2-UOPQ-1-NEXT: uOps Per Cycle: 1.00
+# BTVER2-UOPQ-1-NEXT: IPC: 1.00
+# BTVER2-UOPQ-1-NEXT: Block RThroughput: 2.0
+
+# BTVER2-UOPQ-2: Dispatch Width: 2
+# BTVER2-UOPQ-2-NEXT: uOps Per Cycle: 2.00
+# BTVER2-UOPQ-2-NEXT: IPC: 2.00
+# BTVER2-UOPQ-2-NEXT: Block RThroughput: 2.0
+
+# HASWELL-DEC-2: Dispatch Width: 4
+# HASWELL-DEC-2-NEXT: uOps Per Cycle: 2.00
+# HASWELL-DEC-2-NEXT: IPC: 2.00
+# HASWELL-DEC-2-NEXT: Block RThroughput: 1.0
+
+# HASWELL-UOPQ-1: Dispatch Width: 4
+# HASWELL-UOPQ-1-NEXT: uOps Per Cycle: 1.00
+# HASWELL-UOPQ-1-NEXT: IPC: 1.00
+# HASWELL-UOPQ-1-NEXT: Block RThroughput: 1.0
+
+# HASWELL-UOPQ-2: Dispatch Width: 4
+# HASWELL-UOPQ-2-NEXT: uOps Per Cycle: 2.00
+# HASWELL-UOPQ-2-NEXT: IPC: 2.00
+# HASWELL-UOPQ-2-NEXT: Block RThroughput: 1.0
+
+# HASWELL-UOPQ-3: Dispatch Width: 4
+# HASWELL-UOPQ-3-NEXT: uOps Per Cycle: 3.00
+# HASWELL-UOPQ-3-NEXT: IPC: 3.00
+# HASWELL-UOPQ-3-NEXT: Block RThroughput: 1.0
+
+# HASWELL-UOPQ-4: Dispatch Width: 4
+# HASWELL-UOPQ-4-NEXT: uOps Per Cycle: 3.99
+# HASWELL-UOPQ-4-NEXT: IPC: 3.99
+# HASWELL-UOPQ-4-NEXT: Block RThroughput: 1.0
"be used for register mappings"),
cl::cat(ToolOptions), cl::init(0));
+static cl::opt<unsigned>
+ MicroOpQueue("micro-op-queue-size", cl::Hidden,
+ cl::desc("Number of entries in the micro-op queue"),
+ cl::cat(ToolOptions), cl::init(0));
+
+static cl::opt<unsigned>
+ DecoderThroughput("decoder-throughput", cl::Hidden,
+ cl::desc("Maximum throughput from the decoders "
+ "(instructions per cycle)"),
+ cl::cat(ToolOptions), cl::init(0));
+
static cl::opt<bool>
PrintRegisterFileStats("register-file-stats",
cl::desc("Print register file statistics"),
// Create a context to control ownership of the pipeline hardware.
mca::Context MCA(*MRI, *STI);
- mca::PipelineOptions PO(DispatchWidth, RegisterFileSize, LoadQueueSize,
- StoreQueueSize, AssumeNoAlias,
- EnableBottleneckAnalysis);
+ mca::PipelineOptions PO(MicroOpQueue, DecoderThroughput, DispatchWidth,
+ RegisterFileSize, LoadQueueSize, StoreQueueSize,
+ AssumeNoAlias, EnableBottleneckAnalysis);
// Number each region in the sequence.
unsigned RegionIdx = 0;