From 5578ec32f9c4fef46adce52a2e3d22bf409b3d2c Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Mon, 4 May 2020 18:23:04 +0100 Subject: [PATCH] [MCA] Fixed a bug where loads and stores were sometimes incorrectly marked as dependent. Fixes PR45793. This fixes a regression introduced by a very old commit 280ac1fd1dc35 (was llvm-svn 361950). Commit 280ac1fd1dc35 redesigned the logic in the LSUnit with the goal of speeding up isReady() queries, and stabilising the LSUnit API (while also making the load store unit more customisable). The concept of MemoryGroup (effectively an alias set) was added by that commit to better describe and track dependencies between memory operations. However, that concept was not just used for alias dependencies, but it was also used for describing memory "order" dependencies (enforced by the memory consistency model). Instructions of the same memory group were considered "equivalent" as in: independent operations that can potentially execute in parallel. The problem was that the cost of a dependency (in terms of number of cycles) should have been different for "order" dependencies. Instructions in an order dependency simply have to wait until their predecessors are "issued" to an underlying pipeline (rather than having to wait until predecessors have been fully executed). For simple "order" dependencies, this was effectively introducing an artificial delay on the "issue" of independent loads and stores. This patch fixes the issue and adds a new test named 'independent-load-stores.s' to a bunch of x86 targets. That test contains the reproducer posted by Fabian Ritter on PR45793. I had to rerun the update-mca-tests script on several files. To avoid expected regressions on some Exynos tests, I have added a -noalias=false flag (to match the old strict behavior on latencies). Some tests for processor Barcelona are improved/fixed by this change and they now show better results. 
In a few tests we were incorrectly counting the time spent by instructions in a scheduler queue. In one case in particular we now correctly see a store executed out of order. That test was affected by the same underlying issue reported as PR45793. Reviewers: mattd Differential Revision: https://reviews.llvm.org/D79351 --- llvm/include/llvm/MCA/HardwareUnits/LSUnit.h | 49 +++-- llvm/lib/MCA/HardwareUnits/LSUnit.cpp | 84 ++++++-- .../test/tools/llvm-mca/AArch64/Exynos/asimd-st1.s | 6 +- .../test/tools/llvm-mca/AArch64/Exynos/asimd-st2.s | 6 +- .../test/tools/llvm-mca/AArch64/Exynos/asimd-st3.s | 6 +- .../test/tools/llvm-mca/AArch64/Exynos/asimd-st4.s | 6 +- .../tools/llvm-mca/AArch64/Exynos/float-store.s | 6 +- llvm/test/tools/llvm-mca/AArch64/Exynos/store.s | 6 +- .../llvm-mca/X86/Barcelona/load-store-throughput.s | 221 ++++++++++----------- .../llvm-mca/X86/Barcelona/store-throughput.s | 40 ++-- .../llvm-mca/X86/BdVer2/load-store-throughput.s | 215 ++++++++++---------- .../tools/llvm-mca/X86/BdVer2/memcpy-like-test.s | 6 +- .../tools/llvm-mca/X86/BdVer2/store-throughput.s | 48 ++--- .../llvm-mca/X86/BtVer2/independent-load-stores.s | 146 ++++++++++++++ llvm/test/tools/llvm-mca/X86/BtVer2/xadd.s | 42 ++-- .../llvm-mca/X86/Haswell/independent-load-stores.s | 142 +++++++++++++ .../X86/SkylakeClient/independent-load-stores.s | 142 +++++++++++++ .../X86/SkylakeServer/independent-load-stores.s | 142 +++++++++++++ 18 files changed, 974 insertions(+), 339 deletions(-) create mode 100644 llvm/test/tools/llvm-mca/X86/BtVer2/independent-load-stores.s create mode 100644 llvm/test/tools/llvm-mca/X86/Haswell/independent-load-stores.s create mode 100644 llvm/test/tools/llvm-mca/X86/SkylakeClient/independent-load-stores.s create mode 100644 llvm/test/tools/llvm-mca/X86/SkylakeServer/independent-load-stores.s diff --git a/llvm/include/llvm/MCA/HardwareUnits/LSUnit.h b/llvm/include/llvm/MCA/HardwareUnits/LSUnit.h index 9143adf..2f9b4ba 100644 --- 
a/llvm/include/llvm/MCA/HardwareUnits/LSUnit.h +++ b/llvm/include/llvm/MCA/HardwareUnits/LSUnit.h @@ -40,7 +40,10 @@ class MemoryGroup { unsigned NumInstructions; unsigned NumExecuting; unsigned NumExecuted; - SmallVector Succ; + // Successors that are in a order dependency with this group. + SmallVector OrderSucc; + // Successors that are in a data dependency with this group. + SmallVector DataSucc; CriticalDependency CriticalPredecessor; InstRef CriticalMemoryInstruction; @@ -55,8 +58,9 @@ public: NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {} MemoryGroup(MemoryGroup &&) = default; - ArrayRef getSuccessors() const { return Succ; } - unsigned getNumSuccessors() const { return Succ.size(); } + size_t getNumSuccessors() const { + return OrderSucc.size() + DataSucc.size(); + } unsigned getNumPredecessors() const { return NumPredecessors; } unsigned getNumExecutingPredecessors() const { return NumExecutingPredecessors; @@ -75,12 +79,22 @@ public: return CriticalPredecessor; } - void addSuccessor(MemoryGroup *Group) { + void addSuccessor(MemoryGroup *Group, bool IsDataDependent) { + // Do not need to add a dependency if there is no data + // dependency and all instructions from this group have been + // issued already. 
+ if (!IsDataDependent && isExecuting()) + return; + Group->NumPredecessors++; assert(!isExecuted() && "Should have been removed!"); if (isExecuting()) - Group->onGroupIssued(CriticalMemoryInstruction); - Succ.emplace_back(Group); + Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent); + + if (IsDataDependent) + DataSucc.emplace_back(Group); + else + OrderSucc.emplace_back(Group); } bool isWaiting() const { @@ -98,10 +112,13 @@ public: } bool isExecuted() const { return NumInstructions == NumExecuted; } - void onGroupIssued(const InstRef &IR) { + void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) { assert(!isReady() && "Unexpected group-start event!"); NumExecutingPredecessors++; + if (!ShouldUpdateCriticalDep) + return; + unsigned Cycles = IR.getInstruction()->getCyclesLeft(); if (CriticalPredecessor.Cycles < Cycles) { CriticalPredecessor.IID = IR.getSourceIndex(); @@ -133,8 +150,14 @@ public: return; // Notify successors that this group started execution. - for (MemoryGroup *MG : Succ) - MG->onGroupIssued(CriticalMemoryInstruction); + for (MemoryGroup *MG : OrderSucc) { + MG->onGroupIssued(CriticalMemoryInstruction, false); + // Release the order dependency with this group. + MG->onGroupExecuted(); + } + + for (MemoryGroup *MG : DataSucc) + MG->onGroupIssued(CriticalMemoryInstruction, true); } void onInstructionExecuted() { @@ -145,8 +168,8 @@ public: if (!isExecuted()) return; - // Notify successors that this group has finished execution. - for (MemoryGroup *MG : Succ) + // Notify data dependent successors that this group has finished execution. 
+ for (MemoryGroup *MG : DataSucc) MG->onGroupExecuted(); } @@ -412,6 +435,7 @@ class LSUnit : public LSUnitBase { unsigned CurrentLoadGroupID; unsigned CurrentLoadBarrierGroupID; unsigned CurrentStoreGroupID; + unsigned CurrentStoreBarrierGroupID; public: LSUnit(const MCSchedModel &SM) @@ -420,7 +444,8 @@ public: : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {} LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias) : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0), - CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0) {} + CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0), + CurrentStoreBarrierGroupID(0) {} /// Returns LSU_AVAILABLE if there are enough load/store queue entries to /// accomodate instruction IR. diff --git a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp index 0ee084c..e945e8c 100644 --- a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp +++ b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp @@ -77,9 +77,6 @@ unsigned LSUnit::dispatch(const InstRef &IR) { acquireSQSlot(); if (Desc.MayStore) { - // Always create a new group for store operations. - - // A store may not pass a previous store or store barrier. unsigned NewGID = createMemoryGroup(); MemoryGroup &NewGroup = getGroup(NewGID); NewGroup.addInstruction(); @@ -91,16 +88,32 @@ unsigned LSUnit::dispatch(const InstRef &IR) { MemoryGroup &IDom = getGroup(ImmediateLoadDominator); LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << ImmediateLoadDominator << ") --> (" << NewGID << ")\n"); - IDom.addSuccessor(&NewGroup); + IDom.addSuccessor(&NewGroup, !assumeNoAlias()); + } + + // A store may not pass a previous store barrier. + if (CurrentStoreBarrierGroupID) { + MemoryGroup &StoreGroup = getGroup(CurrentStoreBarrierGroupID); + LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" + << CurrentStoreBarrierGroupID + << ") --> (" << NewGID << ")\n"); + StoreGroup.addSuccessor(&NewGroup, true); } - if (CurrentStoreGroupID) { + + // A store may not pass a previous store. 
+ if (CurrentStoreGroupID && + (CurrentStoreGroupID != CurrentStoreBarrierGroupID)) { MemoryGroup &StoreGroup = getGroup(CurrentStoreGroupID); LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << CurrentStoreGroupID << ") --> (" << NewGID << ")\n"); - StoreGroup.addSuccessor(&NewGroup); + StoreGroup.addSuccessor(&NewGroup, !assumeNoAlias()); } + CurrentStoreGroupID = NewGID; + if (IsMemBarrier) + CurrentStoreBarrierGroupID = NewGID; + if (Desc.MayLoad) { CurrentLoadGroupID = NewGID; if (IsMemBarrier) @@ -112,31 +125,59 @@ unsigned LSUnit::dispatch(const InstRef &IR) { assert(Desc.MayLoad && "Expected a load!"); - // Always create a new memory group if this is the first load of the sequence. + unsigned ImmediateLoadDominator = + std::max(CurrentLoadGroupID, CurrentLoadBarrierGroupID); + + // A new load group is created if we are in one of the following situations: + // 1) This is a load barrier (by construction, a load barrier is always + // assigned to a different memory group). + // 2) There is no load in flight (by construction we always keep loads and + // stores into separate memory groups). + // 3) There is a load barrier in flight. This load depends on it. + // 4) There is an intervening store between the last load dispatched to the + // LSU and this load. We always create a new group even if this load + // does not alias the last dispatched store. + // 5) There is no intervening store and there is an active load group. + // However that group has already started execution, so we cannot add + // this load to it. + bool ShouldCreateANewGroup = + IsMemBarrier || !ImmediateLoadDominator || + CurrentLoadBarrierGroupID == ImmediateLoadDominator || + ImmediateLoadDominator <= CurrentStoreGroupID || + getGroup(ImmediateLoadDominator).isExecuting(); - // A load may not pass a previous store unless flag 'NoAlias' is set. - // A load may pass a previous load. - // A younger load cannot pass a older load barrier. - // A load barrier cannot pass a older load. 
- bool ShouldCreateANewGroup = !CurrentLoadGroupID || IsMemBarrier || - CurrentLoadGroupID <= CurrentStoreGroupID || - CurrentLoadGroupID <= CurrentLoadBarrierGroupID; if (ShouldCreateANewGroup) { unsigned NewGID = createMemoryGroup(); MemoryGroup &NewGroup = getGroup(NewGID); NewGroup.addInstruction(); + // A load may not pass a previous store or store barrier + // unless flag 'NoAlias' is set. if (!assumeNoAlias() && CurrentStoreGroupID) { - MemoryGroup &StGroup = getGroup(CurrentStoreGroupID); + MemoryGroup &StoreGroup = getGroup(CurrentStoreGroupID); LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << CurrentStoreGroupID << ") --> (" << NewGID << ")\n"); - StGroup.addSuccessor(&NewGroup); + StoreGroup.addSuccessor(&NewGroup, true); } - if (CurrentLoadBarrierGroupID) { - MemoryGroup &LdGroup = getGroup(CurrentLoadBarrierGroupID); - LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << CurrentLoadBarrierGroupID - << ") --> (" << NewGID << ")\n"); - LdGroup.addSuccessor(&NewGroup); + + // A load barrier may not pass a previous load or load barrier. + if (IsMemBarrier) { + if (ImmediateLoadDominator) { + MemoryGroup &LoadGroup = getGroup(ImmediateLoadDominator); + LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" + << ImmediateLoadDominator + << ") --> (" << NewGID << ")\n"); + LoadGroup.addSuccessor(&NewGroup, true); + } + } else { + // A younger load cannot pass a older load barrier. + if (CurrentLoadBarrierGroupID) { + MemoryGroup &LoadGroup = getGroup(CurrentLoadBarrierGroupID); + LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" + << CurrentLoadBarrierGroupID + << ") --> (" << NewGID << ")\n"); + LoadGroup.addSuccessor(&NewGroup, true); + } } CurrentLoadGroupID = NewGID; @@ -145,6 +186,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) { return NewGID; } + // A load may pass a previous load. 
MemoryGroup &Group = getGroup(CurrentLoadGroupID); Group.addInstruction(); return CurrentLoadGroupID; diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st1.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st1.s index 0cd5b6e..cf0e1bf 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st1.s +++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st1.s @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py -# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 -# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 -# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5 st1 {v0.s}[0], [sp] st1 {v0.2s}, [sp] diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st2.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st2.s index 94ac16d..b4d2b58 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st2.s +++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st2.s @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py -# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 -# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 -# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 
-resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5 st2 {v0.s, v1.s}[0], [sp] st2 {v0.2s, v1.2s}, [sp] diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st3.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st3.s index 564e408..29f8079 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st3.s +++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st3.s @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py -# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 -# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 -# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5 st3 {v0.s, v1.s, v2.s}[0], [sp] st3 {v0.2s, v1.2s, v2.2s}, [sp] diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st4.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st4.s index 37283f9..7aa69b0 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st4.s +++ 
b/llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st4.s @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py -# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 -# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 -# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5 st4 {v0.s, v1.s, v2.s, v3.s}[0], [sp] st4 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp] diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/float-store.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/float-store.s index 55d1d60..5b7004b 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Exynos/float-store.s +++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/float-store.s @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py -# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 -# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 -# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s 
-check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5 stur d0, [sp, #2] stur q0, [sp, #16] diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/store.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/store.s index b86cdac..3c7d412 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Exynos/store.s +++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/store.s @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py -# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3 -# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4 -# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4 +# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5 stur x0, [sp, #8] strb w0, [sp], #1 diff --git a/llvm/test/tools/llvm-mca/X86/Barcelona/load-store-throughput.s b/llvm/test/tools/llvm-mca/X86/Barcelona/load-store-throughput.s index adf6c10d..b600e3874 100644 --- a/llvm/test/tools/llvm-mca/X86/Barcelona/load-store-throughput.s +++ b/llvm/test/tools/llvm-mca/X86/Barcelona/load-store-throughput.s @@ -47,12 +47,12 @@ movaps %xmm3, (%rbx) # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 208 +# CHECK-NEXT: Total Cycles: 207 # CHECK-NEXT: Total uOps: 400 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 1.92 -# CHECK-NEXT: IPC: 1.92 +# CHECK-NEXT: uOps Per Cycle: 1.93 +# CHECK-NEXT: IPC: 1.93 # CHECK-NEXT: Block RThroughput: 
2.0 # CHECK: Instruction Info: @@ -72,22 +72,21 @@ movaps %xmm3, (%rbx) # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 147 (70.7%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 147 (71.0%) # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 34 (16.3%) -# CHECK-NEXT: 2, 148 (71.2%) -# CHECK-NEXT: 4, 26 (12.5%) +# CHECK-NEXT: 0, 33 (15.9%) +# CHECK-NEXT: 2, 148 (71.5%) +# CHECK-NEXT: 4, 26 (12.6%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 3 (1.4%) -# CHECK-NEXT: 1, 10 (4.8%) -# CHECK-NEXT: 2, 195 (93.8%) +# CHECK-NEXT: 0, 7 (3.4%) +# CHECK-NEXT: 2, 200 (96.6%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. @@ -116,16 +115,16 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: # CHECK-NEXT: - - - - 1.00 - - 1.00 movb %spl, (%rax) # CHECK-NEXT: - - - - - - 1.00 - movb (%rcx), %bpl -# CHECK-NEXT: - - - - - - 0.95 0.05 movb (%rdx), %sil -# CHECK-NEXT: - - - - 1.00 - 0.05 0.95 movb %dil, (%rbx) +# CHECK-NEXT: - - - - - - - 1.00 movb (%rdx), %sil +# CHECK-NEXT: - - - - 1.00 - 1.00 - movb %dil, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeER . . movb %spl, (%rax) -# CHECK-NEXT: [0,1] DeeeeeER . movb (%rcx), %bpl -# CHECK-NEXT: [0,2] D=eeeeeER. movb (%rdx), %sil -# CHECK-NEXT: [0,3] D======eER movb %dil, (%rbx) +# CHECK: [0,0] DeER . . movb %spl, (%rax) +# CHECK-NEXT: [0,1] DeeeeeER. 
movb (%rcx), %bpl +# CHECK-NEXT: [0,2] D=eeeeeER movb (%rdx), %sil +# CHECK-NEXT: [0,3] D=eE----R movb %dil, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -137,19 +136,19 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movb %spl, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movb (%rcx), %bpl # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movb (%rdx), %sil -# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movb %dil, (%rbx) -# CHECK-NEXT: 1 2.8 1.0 0.0 +# CHECK-NEXT: 3. 1 2.0 0.0 4.0 movb %dil, (%rbx) +# CHECK-NEXT: 1 1.5 1.0 1.0 # CHECK: [1] Code Region # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 208 +# CHECK-NEXT: Total Cycles: 207 # CHECK-NEXT: Total uOps: 400 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 1.92 -# CHECK-NEXT: IPC: 1.92 +# CHECK-NEXT: uOps Per Cycle: 1.93 +# CHECK-NEXT: IPC: 1.93 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -169,22 +168,21 @@ movaps %xmm3, (%rbx) # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 147 (70.7%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 147 (71.0%) # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 34 (16.3%) -# CHECK-NEXT: 2, 148 (71.2%) -# CHECK-NEXT: 4, 26 (12.5%) +# CHECK-NEXT: 0, 33 (15.9%) +# CHECK-NEXT: 2, 148 (71.5%) +# CHECK-NEXT: 4, 26 (12.6%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 3 (1.4%) -# CHECK-NEXT: 1, 10 (4.8%) -# CHECK-NEXT: 2, 195 (93.8%) +# CHECK-NEXT: 0, 7 (3.4%) +# CHECK-NEXT: 2, 200 (96.6%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. 
@@ -213,16 +211,16 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: # CHECK-NEXT: - - - - 1.00 - - 1.00 movw %sp, (%rax) # CHECK-NEXT: - - - - - - 1.00 - movw (%rcx), %bp -# CHECK-NEXT: - - - - - - 0.95 0.05 movw (%rdx), %si -# CHECK-NEXT: - - - - 1.00 - 0.05 0.95 movw %di, (%rbx) +# CHECK-NEXT: - - - - - - - 1.00 movw (%rdx), %si +# CHECK-NEXT: - - - - 1.00 - 1.00 - movw %di, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeER . . movw %sp, (%rax) -# CHECK-NEXT: [0,1] DeeeeeER . movw (%rcx), %bp -# CHECK-NEXT: [0,2] D=eeeeeER. movw (%rdx), %si -# CHECK-NEXT: [0,3] D======eER movw %di, (%rbx) +# CHECK: [0,0] DeER . . movw %sp, (%rax) +# CHECK-NEXT: [0,1] DeeeeeER. movw (%rcx), %bp +# CHECK-NEXT: [0,2] D=eeeeeER movw (%rdx), %si +# CHECK-NEXT: [0,3] D=eE----R movw %di, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -234,19 +232,19 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movw %sp, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movw (%rcx), %bp # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movw (%rdx), %si -# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movw %di, (%rbx) -# CHECK-NEXT: 1 2.8 1.0 0.0 +# CHECK-NEXT: 3. 
1 2.0 0.0 4.0 movw %di, (%rbx) +# CHECK-NEXT: 1 1.5 1.0 1.0 # CHECK: [2] Code Region # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 208 +# CHECK-NEXT: Total Cycles: 207 # CHECK-NEXT: Total uOps: 400 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 1.92 -# CHECK-NEXT: IPC: 1.92 +# CHECK-NEXT: uOps Per Cycle: 1.93 +# CHECK-NEXT: IPC: 1.93 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -266,22 +264,21 @@ movaps %xmm3, (%rbx) # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 147 (70.7%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 147 (71.0%) # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 34 (16.3%) -# CHECK-NEXT: 2, 148 (71.2%) -# CHECK-NEXT: 4, 26 (12.5%) +# CHECK-NEXT: 0, 33 (15.9%) +# CHECK-NEXT: 2, 148 (71.5%) +# CHECK-NEXT: 4, 26 (12.6%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 3 (1.4%) -# CHECK-NEXT: 1, 10 (4.8%) -# CHECK-NEXT: 2, 195 (93.8%) +# CHECK-NEXT: 0, 7 (3.4%) +# CHECK-NEXT: 2, 200 (96.6%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. 
@@ -310,16 +307,16 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: # CHECK-NEXT: - - - - 1.00 - - 1.00 movl %esp, (%rax) # CHECK-NEXT: - - - - - - 1.00 - movl (%rcx), %ebp -# CHECK-NEXT: - - - - - - 0.95 0.05 movl (%rdx), %esi -# CHECK-NEXT: - - - - 1.00 - 0.05 0.95 movl %edi, (%rbx) +# CHECK-NEXT: - - - - - - - 1.00 movl (%rdx), %esi +# CHECK-NEXT: - - - - 1.00 - 1.00 - movl %edi, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeER . . movl %esp, (%rax) -# CHECK-NEXT: [0,1] DeeeeeER . movl (%rcx), %ebp -# CHECK-NEXT: [0,2] D=eeeeeER. movl (%rdx), %esi -# CHECK-NEXT: [0,3] D======eER movl %edi, (%rbx) +# CHECK: [0,0] DeER . . movl %esp, (%rax) +# CHECK-NEXT: [0,1] DeeeeeER. movl (%rcx), %ebp +# CHECK-NEXT: [0,2] D=eeeeeER movl (%rdx), %esi +# CHECK-NEXT: [0,3] D=eE----R movl %edi, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -331,19 +328,19 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movl %esp, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movl (%rcx), %ebp # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movl (%rdx), %esi -# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movl %edi, (%rbx) -# CHECK-NEXT: 1 2.8 1.0 0.0 +# CHECK-NEXT: 3. 
1 2.0 0.0 4.0 movl %edi, (%rbx) +# CHECK-NEXT: 1 1.5 1.0 1.0 # CHECK: [3] Code Region # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 208 +# CHECK-NEXT: Total Cycles: 207 # CHECK-NEXT: Total uOps: 400 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 1.92 -# CHECK-NEXT: IPC: 1.92 +# CHECK-NEXT: uOps Per Cycle: 1.93 +# CHECK-NEXT: IPC: 1.93 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -363,22 +360,21 @@ movaps %xmm3, (%rbx) # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 147 (70.7%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 147 (71.0%) # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 34 (16.3%) -# CHECK-NEXT: 2, 148 (71.2%) -# CHECK-NEXT: 4, 26 (12.5%) +# CHECK-NEXT: 0, 33 (15.9%) +# CHECK-NEXT: 2, 148 (71.5%) +# CHECK-NEXT: 4, 26 (12.6%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 3 (1.4%) -# CHECK-NEXT: 1, 10 (4.8%) -# CHECK-NEXT: 2, 195 (93.8%) +# CHECK-NEXT: 0, 7 (3.4%) +# CHECK-NEXT: 2, 200 (96.6%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. 
@@ -407,16 +403,16 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: # CHECK-NEXT: - - - - 1.00 - - 1.00 movq %rsp, (%rax) # CHECK-NEXT: - - - - - - 1.00 - movq (%rcx), %rbp -# CHECK-NEXT: - - - - - - 0.95 0.05 movq (%rdx), %rsi -# CHECK-NEXT: - - - - 1.00 - 0.05 0.95 movq %rdi, (%rbx) +# CHECK-NEXT: - - - - - - - 1.00 movq (%rdx), %rsi +# CHECK-NEXT: - - - - 1.00 - 1.00 - movq %rdi, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeER . . movq %rsp, (%rax) -# CHECK-NEXT: [0,1] DeeeeeER . movq (%rcx), %rbp -# CHECK-NEXT: [0,2] D=eeeeeER. movq (%rdx), %rsi -# CHECK-NEXT: [0,3] D======eER movq %rdi, (%rbx) +# CHECK: [0,0] DeER . . movq %rsp, (%rax) +# CHECK-NEXT: [0,1] DeeeeeER. movq (%rcx), %rbp +# CHECK-NEXT: [0,2] D=eeeeeER movq (%rdx), %rsi +# CHECK-NEXT: [0,3] D=eE----R movq %rdi, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -428,19 +424,19 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movq %rsp, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movq (%rcx), %rbp # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movq (%rdx), %rsi -# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movq %rdi, (%rbx) -# CHECK-NEXT: 1 2.8 1.0 0.0 +# CHECK-NEXT: 3. 
1 2.0 0.0 4.0 movq %rdi, (%rbx) +# CHECK-NEXT: 1 1.5 1.0 1.0 # CHECK: [4] Code Region # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 208 +# CHECK-NEXT: Total Cycles: 207 # CHECK-NEXT: Total uOps: 400 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 1.92 -# CHECK-NEXT: IPC: 1.92 +# CHECK-NEXT: uOps Per Cycle: 1.93 +# CHECK-NEXT: IPC: 1.93 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -460,22 +456,21 @@ movaps %xmm3, (%rbx) # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 147 (70.7%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 147 (71.0%) # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 34 (16.3%) -# CHECK-NEXT: 2, 148 (71.2%) -# CHECK-NEXT: 4, 26 (12.5%) +# CHECK-NEXT: 0, 33 (15.9%) +# CHECK-NEXT: 2, 148 (71.5%) +# CHECK-NEXT: 4, 26 (12.6%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 3 (1.4%) -# CHECK-NEXT: 1, 10 (4.8%) -# CHECK-NEXT: 2, 195 (93.8%) +# CHECK-NEXT: 0, 7 (3.4%) +# CHECK-NEXT: 2, 200 (96.6%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. 
@@ -504,16 +499,16 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: # CHECK-NEXT: - - - - 1.00 - - 1.00 movd %mm0, (%rax) # CHECK-NEXT: - - - - - - 1.00 - movd (%rcx), %mm1 -# CHECK-NEXT: - - - - - - 0.95 0.05 movd (%rdx), %mm2 -# CHECK-NEXT: - - - - 1.00 - 0.05 0.95 movd %mm3, (%rbx) +# CHECK-NEXT: - - - - - - - 1.00 movd (%rdx), %mm2 +# CHECK-NEXT: - - - - 1.00 - 1.00 - movd %mm3, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeER . . movd %mm0, (%rax) -# CHECK-NEXT: [0,1] DeeeeeER . movd (%rcx), %mm1 -# CHECK-NEXT: [0,2] D=eeeeeER. movd (%rdx), %mm2 -# CHECK-NEXT: [0,3] D======eER movd %mm3, (%rbx) +# CHECK: [0,0] DeER . . movd %mm0, (%rax) +# CHECK-NEXT: [0,1] DeeeeeER. movd (%rcx), %mm1 +# CHECK-NEXT: [0,2] D=eeeeeER movd (%rdx), %mm2 +# CHECK-NEXT: [0,3] D=eE----R movd %mm3, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -525,19 +520,19 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movd %mm0, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movd (%rcx), %mm1 # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movd (%rdx), %mm2 -# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movd %mm3, (%rbx) -# CHECK-NEXT: 1 2.8 1.0 0.0 +# CHECK-NEXT: 3. 
1 2.0 0.0 4.0 movd %mm3, (%rbx) +# CHECK-NEXT: 1 1.5 1.0 1.0 # CHECK: [5] Code Region # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 209 +# CHECK-NEXT: Total Cycles: 208 # CHECK-NEXT: Total uOps: 400 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 1.91 -# CHECK-NEXT: IPC: 1.91 +# CHECK-NEXT: uOps Per Cycle: 1.92 +# CHECK-NEXT: IPC: 1.92 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -557,22 +552,21 @@ movaps %xmm3, (%rbx) # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 147 (70.3%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 147 (70.7%) # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 35 (16.7%) -# CHECK-NEXT: 2, 148 (70.8%) -# CHECK-NEXT: 4, 26 (12.4%) +# CHECK-NEXT: 0, 34 (16.3%) +# CHECK-NEXT: 2, 148 (71.2%) +# CHECK-NEXT: 4, 26 (12.5%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 3 (1.4%) -# CHECK-NEXT: 1, 12 (5.7%) -# CHECK-NEXT: 2, 194 (92.8%) +# CHECK-NEXT: 0, 8 (3.8%) +# CHECK-NEXT: 2, 200 (96.2%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. 
@@ -601,17 +595,16 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: # CHECK-NEXT: - - - - 1.00 - - 1.00 movaps %xmm0, (%rax) # CHECK-NEXT: - - - - - - 1.00 - movaps (%rcx), %xmm1 -# CHECK-NEXT: - - - - - - 0.94 0.06 movaps (%rdx), %xmm2 -# CHECK-NEXT: - - - - 1.00 - 0.06 0.94 movaps %xmm3, (%rbx) +# CHECK-NEXT: - - - - - - - 1.00 movaps (%rdx), %xmm2 +# CHECK-NEXT: - - - - 1.00 - 1.00 - movaps %xmm3, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: 0 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeER . . movaps %xmm0, (%rax) -# CHECK-NEXT: [0,1] DeeeeeeER . movaps (%rcx), %xmm1 -# CHECK-NEXT: [0,2] D=eeeeeeER. movaps (%rdx), %xmm2 -# CHECK-NEXT: [0,3] D=======eER movaps %xmm3, (%rbx) +# CHECK: [0,0] DeER . . movaps %xmm0, (%rax) +# CHECK-NEXT: [0,1] DeeeeeeER. movaps (%rcx), %xmm1 +# CHECK-NEXT: [0,2] D=eeeeeeER movaps (%rdx), %xmm2 +# CHECK-NEXT: [0,3] D=eE-----R movaps %xmm3, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -623,5 +616,5 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movaps %xmm0, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movaps (%rcx), %xmm1 # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movaps (%rdx), %xmm2 -# CHECK-NEXT: 3. 1 8.0 0.0 0.0 movaps %xmm3, (%rbx) -# CHECK-NEXT: 1 3.0 1.0 0.0 +# CHECK-NEXT: 3. 1 2.0 0.0 5.0 movaps %xmm3, (%rbx) +# CHECK-NEXT: 1 1.5 1.0 1.3 diff --git a/llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s b/llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s index 08a9c47..7d1fb6c 100644 --- a/llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s +++ b/llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s @@ -135,10 +135,10 @@ movaps %xmm3, (%rbx) # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movb %spl, (%rax) -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movb %bpl, (%rcx) -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movb %sil, (%rdx) -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movb %dil, (%rbx) -# CHECK-NEXT: 1 2.5 0.3 0.0 +# CHECK-NEXT: 1. 
1 2.0 1.0 0.0 movb %bpl, (%rcx) +# CHECK-NEXT: 2. 1 3.0 1.0 0.0 movb %sil, (%rdx) +# CHECK-NEXT: 3. 1 4.0 1.0 0.0 movb %dil, (%rbx) +# CHECK-NEXT: 1 2.5 1.0 0.0 # CHECK: [1] Code Region @@ -232,10 +232,10 @@ movaps %xmm3, (%rbx) # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movw %sp, (%rax) -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movw %bp, (%rcx) -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movw %si, (%rdx) -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movw %di, (%rbx) -# CHECK-NEXT: 1 2.5 0.3 0.0 +# CHECK-NEXT: 1. 1 2.0 1.0 0.0 movw %bp, (%rcx) +# CHECK-NEXT: 2. 1 3.0 1.0 0.0 movw %si, (%rdx) +# CHECK-NEXT: 3. 1 4.0 1.0 0.0 movw %di, (%rbx) +# CHECK-NEXT: 1 2.5 1.0 0.0 # CHECK: [2] Code Region @@ -329,10 +329,10 @@ movaps %xmm3, (%rbx) # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movl %esp, (%rax) -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movl %ebp, (%rcx) -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movl %esi, (%rdx) -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movl %edi, (%rbx) -# CHECK-NEXT: 1 2.5 0.3 0.0 +# CHECK-NEXT: 1. 1 2.0 1.0 0.0 movl %ebp, (%rcx) +# CHECK-NEXT: 2. 1 3.0 1.0 0.0 movl %esi, (%rdx) +# CHECK-NEXT: 3. 1 4.0 1.0 0.0 movl %edi, (%rbx) +# CHECK-NEXT: 1 2.5 1.0 0.0 # CHECK: [3] Code Region @@ -426,10 +426,10 @@ movaps %xmm3, (%rbx) # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movq %rsp, (%rax) -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movq %rbp, (%rcx) -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movq %rsi, (%rdx) -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movq %rdi, (%rbx) -# CHECK-NEXT: 1 2.5 0.3 0.0 +# CHECK-NEXT: 1. 1 2.0 1.0 0.0 movq %rbp, (%rcx) +# CHECK-NEXT: 2. 1 3.0 1.0 0.0 movq %rsi, (%rdx) +# CHECK-NEXT: 3. 1 4.0 1.0 0.0 movq %rdi, (%rbx) +# CHECK-NEXT: 1 2.5 1.0 0.0 # CHECK: [4] Code Region @@ -620,7 +620,7 @@ movaps %xmm3, (%rbx) # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movaps %xmm0, (%rax) -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movaps %xmm1, (%rcx) -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movaps %xmm2, (%rdx) -# CHECK-NEXT: 3. 
1 4.0 0.0 0.0 movaps %xmm3, (%rbx) -# CHECK-NEXT: 1 2.5 0.3 0.0 +# CHECK-NEXT: 1. 1 2.0 1.0 0.0 movaps %xmm1, (%rcx) +# CHECK-NEXT: 2. 1 3.0 1.0 0.0 movaps %xmm2, (%rdx) +# CHECK-NEXT: 3. 1 4.0 1.0 0.0 movaps %xmm3, (%rbx) +# CHECK-NEXT: 1 2.5 1.0 0.0 diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s b/llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s index f326028..4b8f9e7 100644 --- a/llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s @@ -72,23 +72,24 @@ movaps %xmm3, (%rbx) # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 257 (84.0%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 256 (83.7%) # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 34 (11.1%) -# CHECK-NEXT: 1, 172 (56.2%) -# CHECK-NEXT: 2, 86 (28.1%) +# CHECK-NEXT: 0, 35 (11.4%) +# CHECK-NEXT: 1, 171 (55.9%) +# CHECK-NEXT: 2, 85 (27.8%) +# CHECK-NEXT: 3, 1 (0.3%) # CHECK-NEXT: 4, 14 (4.6%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 5 (1.6%) -# CHECK-NEXT: 1, 202 (66.0%) -# CHECK-NEXT: 2, 99 (32.4%) +# CHECK-NEXT: 0, 6 (2.0%) +# CHECK-NEXT: 1, 200 (65.4%) +# CHECK-NEXT: 2, 100 (32.7%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. 
@@ -99,8 +100,8 @@ movaps %xmm3, (%rbx) # CHECK: [1] [2] [3] [4] # CHECK-NEXT: PdEX 36 40 40 # CHECK-NEXT: PdFPU 0 0 64 -# CHECK-NEXT: PdLoad 19 22 40 -# CHECK-NEXT: PdStore 20 23 24 +# CHECK-NEXT: PdLoad 21 24 40 +# CHECK-NEXT: PdStore 18 21 24 # CHECK: Resources: # CHECK-NEXT: [0.0] - PdAGLU01 @@ -133,18 +134,18 @@ movaps %xmm3, (%rbx) # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16.0] [16.1] [17] [18] Instructions: -# CHECK-NEXT: 0.96 0.04 - - - - - - - - - - - - - - - - - - - - 1.00 movb %spl, (%rax) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - - - 1.00 movb %spl, (%rax) # CHECK-NEXT: 2.00 - - - - - - - - - - - - - - - - - - - 2.00 - - movb (%rcx), %bpl # CHECK-NEXT: - 2.00 - - - - - - - - - - - - - - - - - 2.00 - - - movb (%rdx), %sil -# CHECK-NEXT: 0.04 0.96 - - - - - - - - - - - - - - - - - - - - 1.00 movb %dil, (%rbx) +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - - - - - - 1.00 movb %dil, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeER . . movb %spl, (%rax) -# CHECK-NEXT: [0,1] DeeeeeER . movb (%rcx), %bpl -# CHECK-NEXT: [0,2] D=eeeeeER. movb (%rdx), %sil -# CHECK-NEXT: [0,3] D======eER movb %dil, (%rbx) +# CHECK: [0,0] DeER . . movb %spl, (%rax) +# CHECK-NEXT: [0,1] DeeeeeER. movb (%rcx), %bpl +# CHECK-NEXT: [0,2] D=eeeeeER movb (%rdx), %sil +# CHECK-NEXT: [0,3] D==eE---R movb %dil, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -156,8 +157,8 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movb %spl, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movb (%rcx), %bpl # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movb (%rdx), %sil -# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movb %dil, (%rbx) -# CHECK-NEXT: 1 2.8 1.0 0.0 +# CHECK-NEXT: 3. 
1 3.0 1.0 3.0 movb %dil, (%rbx) +# CHECK-NEXT: 1 1.8 1.3 0.8 # CHECK: [1] Code Region @@ -188,23 +189,24 @@ movaps %xmm3, (%rbx) # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 257 (84.0%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 256 (83.7%) # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 34 (11.1%) -# CHECK-NEXT: 1, 172 (56.2%) -# CHECK-NEXT: 2, 86 (28.1%) +# CHECK-NEXT: 0, 35 (11.4%) +# CHECK-NEXT: 1, 171 (55.9%) +# CHECK-NEXT: 2, 85 (27.8%) +# CHECK-NEXT: 3, 1 (0.3%) # CHECK-NEXT: 4, 14 (4.6%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 5 (1.6%) -# CHECK-NEXT: 1, 202 (66.0%) -# CHECK-NEXT: 2, 99 (32.4%) +# CHECK-NEXT: 0, 6 (2.0%) +# CHECK-NEXT: 1, 200 (65.4%) +# CHECK-NEXT: 2, 100 (32.7%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. 
@@ -215,8 +217,8 @@ movaps %xmm3, (%rbx) # CHECK: [1] [2] [3] [4] # CHECK-NEXT: PdEX 36 40 40 # CHECK-NEXT: PdFPU 0 0 64 -# CHECK-NEXT: PdLoad 19 22 40 -# CHECK-NEXT: PdStore 20 23 24 +# CHECK-NEXT: PdLoad 21 24 40 +# CHECK-NEXT: PdStore 18 21 24 # CHECK: Resources: # CHECK-NEXT: [0.0] - PdAGLU01 @@ -249,18 +251,18 @@ movaps %xmm3, (%rbx) # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16.0] [16.1] [17] [18] Instructions: -# CHECK-NEXT: 0.96 0.04 - - - - - - - - - - - - - - - - - - - - 1.00 movw %sp, (%rax) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - - - 1.00 movw %sp, (%rax) # CHECK-NEXT: 2.00 - - - - - - - - - - - - - - - - - - - 2.00 - - movw (%rcx), %bp # CHECK-NEXT: - 2.00 - - - - - - - - - - - - - - - - - 2.00 - - - movw (%rdx), %si -# CHECK-NEXT: 0.04 0.96 - - - - - - - - - - - - - - - - - - - - 1.00 movw %di, (%rbx) +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - - - - - - 1.00 movw %di, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeER . . movw %sp, (%rax) -# CHECK-NEXT: [0,1] DeeeeeER . movw (%rcx), %bp -# CHECK-NEXT: [0,2] D=eeeeeER. movw (%rdx), %si -# CHECK-NEXT: [0,3] D======eER movw %di, (%rbx) +# CHECK: [0,0] DeER . . movw %sp, (%rax) +# CHECK-NEXT: [0,1] DeeeeeER. movw (%rcx), %bp +# CHECK-NEXT: [0,2] D=eeeeeER movw (%rdx), %si +# CHECK-NEXT: [0,3] D==eE---R movw %di, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -272,8 +274,8 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movw %sp, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movw (%rcx), %bp # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movw (%rdx), %si -# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movw %di, (%rbx) -# CHECK-NEXT: 1 2.8 1.0 0.0 +# CHECK-NEXT: 3. 
1 3.0 1.0 3.0 movw %di, (%rbx) +# CHECK-NEXT: 1 1.8 1.3 0.8 # CHECK: [2] Code Region @@ -304,23 +306,24 @@ movaps %xmm3, (%rbx) # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 257 (84.0%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 256 (83.7%) # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 34 (11.1%) -# CHECK-NEXT: 1, 172 (56.2%) -# CHECK-NEXT: 2, 86 (28.1%) +# CHECK-NEXT: 0, 35 (11.4%) +# CHECK-NEXT: 1, 171 (55.9%) +# CHECK-NEXT: 2, 85 (27.8%) +# CHECK-NEXT: 3, 1 (0.3%) # CHECK-NEXT: 4, 14 (4.6%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 5 (1.6%) -# CHECK-NEXT: 1, 202 (66.0%) -# CHECK-NEXT: 2, 99 (32.4%) +# CHECK-NEXT: 0, 6 (2.0%) +# CHECK-NEXT: 1, 200 (65.4%) +# CHECK-NEXT: 2, 100 (32.7%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. 
@@ -331,8 +334,8 @@ movaps %xmm3, (%rbx) # CHECK: [1] [2] [3] [4] # CHECK-NEXT: PdEX 36 40 40 # CHECK-NEXT: PdFPU 0 0 64 -# CHECK-NEXT: PdLoad 19 22 40 -# CHECK-NEXT: PdStore 20 23 24 +# CHECK-NEXT: PdLoad 21 24 40 +# CHECK-NEXT: PdStore 18 21 24 # CHECK: Resources: # CHECK-NEXT: [0.0] - PdAGLU01 @@ -365,18 +368,18 @@ movaps %xmm3, (%rbx) # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16.0] [16.1] [17] [18] Instructions: -# CHECK-NEXT: 0.96 0.04 - - - - - - - - - - - - - - - - - - - - 1.00 movl %esp, (%rax) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - - - 1.00 movl %esp, (%rax) # CHECK-NEXT: 2.00 - - - - - - - - - - - - - - - - - - - 2.00 - - movl (%rcx), %ebp # CHECK-NEXT: - 2.00 - - - - - - - - - - - - - - - - - 2.00 - - - movl (%rdx), %esi -# CHECK-NEXT: 0.04 0.96 - - - - - - - - - - - - - - - - - - - - 1.00 movl %edi, (%rbx) +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - - - - - - 1.00 movl %edi, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeER . . movl %esp, (%rax) -# CHECK-NEXT: [0,1] DeeeeeER . movl (%rcx), %ebp -# CHECK-NEXT: [0,2] D=eeeeeER. movl (%rdx), %esi -# CHECK-NEXT: [0,3] D======eER movl %edi, (%rbx) +# CHECK: [0,0] DeER . . movl %esp, (%rax) +# CHECK-NEXT: [0,1] DeeeeeER. movl (%rcx), %ebp +# CHECK-NEXT: [0,2] D=eeeeeER movl (%rdx), %esi +# CHECK-NEXT: [0,3] D==eE---R movl %edi, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -388,8 +391,8 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movl %esp, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movl (%rcx), %ebp # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movl (%rdx), %esi -# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movl %edi, (%rbx) -# CHECK-NEXT: 1 2.8 1.0 0.0 +# CHECK-NEXT: 3. 
1 3.0 1.0 3.0 movl %edi, (%rbx) +# CHECK-NEXT: 1 1.8 1.3 0.8 # CHECK: [3] Code Region @@ -420,23 +423,24 @@ movaps %xmm3, (%rbx) # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 257 (84.0%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 256 (83.7%) # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 34 (11.1%) -# CHECK-NEXT: 1, 172 (56.2%) -# CHECK-NEXT: 2, 86 (28.1%) +# CHECK-NEXT: 0, 35 (11.4%) +# CHECK-NEXT: 1, 171 (55.9%) +# CHECK-NEXT: 2, 85 (27.8%) +# CHECK-NEXT: 3, 1 (0.3%) # CHECK-NEXT: 4, 14 (4.6%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 5 (1.6%) -# CHECK-NEXT: 1, 202 (66.0%) -# CHECK-NEXT: 2, 99 (32.4%) +# CHECK-NEXT: 0, 6 (2.0%) +# CHECK-NEXT: 1, 200 (65.4%) +# CHECK-NEXT: 2, 100 (32.7%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. 
@@ -447,8 +451,8 @@ movaps %xmm3, (%rbx) # CHECK: [1] [2] [3] [4] # CHECK-NEXT: PdEX 36 40 40 # CHECK-NEXT: PdFPU 0 0 64 -# CHECK-NEXT: PdLoad 19 22 40 -# CHECK-NEXT: PdStore 20 23 24 +# CHECK-NEXT: PdLoad 21 24 40 +# CHECK-NEXT: PdStore 18 21 24 # CHECK: Resources: # CHECK-NEXT: [0.0] - PdAGLU01 @@ -481,18 +485,18 @@ movaps %xmm3, (%rbx) # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16.0] [16.1] [17] [18] Instructions: -# CHECK-NEXT: 0.96 0.04 - - - - - - - - - - - - - - - - - - - - 1.00 movq %rsp, (%rax) +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - - - - - - - - - 1.00 movq %rsp, (%rax) # CHECK-NEXT: 2.00 - - - - - - - - - - - - - - - - - - - 2.00 - - movq (%rcx), %rbp # CHECK-NEXT: - 2.00 - - - - - - - - - - - - - - - - - 2.00 - - - movq (%rdx), %rsi -# CHECK-NEXT: 0.04 0.96 - - - - - - - - - - - - - - - - - - - - 1.00 movq %rdi, (%rbx) +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - - - - - - - - - 1.00 movq %rdi, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeER . . movq %rsp, (%rax) -# CHECK-NEXT: [0,1] DeeeeeER . movq (%rcx), %rbp -# CHECK-NEXT: [0,2] D=eeeeeER. movq (%rdx), %rsi -# CHECK-NEXT: [0,3] D======eER movq %rdi, (%rbx) +# CHECK: [0,0] DeER . . movq %rsp, (%rax) +# CHECK-NEXT: [0,1] DeeeeeER. movq (%rcx), %rbp +# CHECK-NEXT: [0,2] D=eeeeeER movq (%rdx), %rsi +# CHECK-NEXT: [0,3] D==eE---R movq %rdi, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -504,14 +508,14 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movq %rsp, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movq (%rcx), %rbp # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movq (%rdx), %rsi -# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movq %rdi, (%rbx) -# CHECK-NEXT: 1 2.8 1.0 0.0 +# CHECK-NEXT: 3. 
1 3.0 1.0 3.0 movq %rdi, (%rbx) +# CHECK-NEXT: 1 1.8 1.3 0.8 # CHECK: [4] Code Region # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 554 +# CHECK-NEXT: Total Cycles: 553 # CHECK-NEXT: Total uOps: 400 # CHECK: Dispatch Width: 4 @@ -536,24 +540,24 @@ movaps %xmm3, (%rbx) # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 55 (9.9%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 57 (10.3%) # CHECK-NEXT: LQ - Load queue full: 0 -# CHECK-NEXT: SQ - Store queue full: 437 (78.9%) +# CHECK-NEXT: SQ - Store queue full: 432 (78.1%) # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 365 (65.9%) +# CHECK-NEXT: 0, 364 (65.8%) # CHECK-NEXT: 1, 88 (15.9%) -# CHECK-NEXT: 2, 3 (0.5%) -# CHECK-NEXT: 3, 86 (15.5%) -# CHECK-NEXT: 4, 12 (2.2%) +# CHECK-NEXT: 2, 4 (0.7%) +# CHECK-NEXT: 3, 84 (15.2%) +# CHECK-NEXT: 4, 13 (2.4%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 253 (45.7%) -# CHECK-NEXT: 1, 202 (36.5%) -# CHECK-NEXT: 2, 99 (17.9%) +# CHECK-NEXT: 0, 253 (45.8%) +# CHECK-NEXT: 1, 200 (36.2%) +# CHECK-NEXT: 2, 100 (18.1%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. 
@@ -599,18 +603,17 @@ movaps %xmm3, (%rbx) # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16.0] [16.1] [17] [18] Instructions: # CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - - - 3.00 - - - - 1.00 movd %mm0, (%rax) -# CHECK-NEXT: 1.53 1.47 - - - - - - - - - 3.00 - - - 1.00 - - - - 3.00 - - movd (%rcx), %mm1 -# CHECK-NEXT: 1.47 1.53 - - - - - - - - 3.00 - - - 1.00 - - - - 3.00 - - - movd (%rdx), %mm2 +# CHECK-NEXT: 1.50 1.50 - - - - - - - - - 3.00 - - - 1.00 - - - - 3.00 - - movd (%rcx), %mm1 +# CHECK-NEXT: 1.50 1.50 - - - - - - - - 3.00 - - - 1.00 - - - - 3.00 - - - movd (%rdx), %mm2 # CHECK-NEXT: 1.00 - - - - - - - - - - - - 1.00 - - 3.00 - - - - - 1.00 movd %mm3, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: 0 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeeER. . movd %mm0, (%rax) -# CHECK-NEXT: [0,1] DeeeeeER . movd (%rcx), %mm1 -# CHECK-NEXT: [0,2] D=eeeeeER . movd (%rdx), %mm2 -# CHECK-NEXT: [0,3] D======eeER movd %mm3, (%rbx) +# CHECK: [0,0] DeeER. . movd %mm0, (%rax) +# CHECK-NEXT: [0,1] DeeeeeER. movd (%rcx), %mm1 +# CHECK-NEXT: [0,2] D=eeeeeER movd (%rdx), %mm2 +# CHECK-NEXT: [0,3] D===eeE-R movd %mm3, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -622,8 +625,8 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movd %mm0, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movd (%rcx), %mm1 # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movd (%rdx), %mm2 -# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movd %mm3, (%rbx) -# CHECK-NEXT: 1 2.8 1.0 0.0 +# CHECK-NEXT: 3. 
1 4.0 1.0 1.0 movd %mm3, (%rbx) +# CHECK-NEXT: 1 2.0 1.3 0.3 # CHECK: [5] Code Region @@ -668,9 +671,9 @@ movaps %xmm3, (%rbx) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 104 (25.7%) -# CHECK-NEXT: 1, 202 (49.9%) -# CHECK-NEXT: 2, 99 (24.4%) +# CHECK-NEXT: 0, 105 (25.9%) +# CHECK-NEXT: 1, 200 (49.4%) +# CHECK-NEXT: 2, 100 (24.7%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. @@ -679,10 +682,10 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: [4] Total number of buffer entries. # CHECK: [1] [2] [3] [4] -# CHECK-NEXT: PdEX 37 40 40 -# CHECK-NEXT: PdFPU 37 40 64 -# CHECK-NEXT: PdLoad 19 22 40 -# CHECK-NEXT: PdStore 20 22 24 +# CHECK-NEXT: PdEX 36 40 40 +# CHECK-NEXT: PdFPU 36 40 64 +# CHECK-NEXT: PdLoad 20 23 40 +# CHECK-NEXT: PdStore 19 21 24 # CHECK: Resources: # CHECK-NEXT: [0.0] - PdAGLU01 @@ -721,12 +724,12 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 1.00 - - - - - - - - - - - - 1.00 - - 3.00 - - - - - 1.00 movaps %xmm3, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeER . . movaps %xmm0, (%rax) -# CHECK-NEXT: [0,1] DeeeeeER . movaps (%rcx), %xmm1 -# CHECK-NEXT: [0,2] D=eeeeeER. movaps (%rdx), %xmm2 -# CHECK-NEXT: [0,3] D======eER movaps %xmm3, (%rbx) +# CHECK: [0,0] DeER . . movaps %xmm0, (%rax) +# CHECK-NEXT: [0,1] DeeeeeER. movaps (%rcx), %xmm1 +# CHECK-NEXT: [0,2] D=eeeeeER movaps (%rdx), %xmm2 +# CHECK-NEXT: [0,3] D===eE--R movaps %xmm3, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -738,5 +741,5 @@ movaps %xmm3, (%rbx) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movaps %xmm0, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movaps (%rcx), %xmm1 # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movaps (%rdx), %xmm2 -# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movaps %xmm3, (%rbx) -# CHECK-NEXT: 1 2.8 1.0 0.0 +# CHECK-NEXT: 3. 
1 4.0 2.0 2.0 movaps %xmm3, (%rbx) +# CHECK-NEXT: 1 2.0 1.5 0.5 diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s b/llvm/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s index fb96ce5..e175378 100644 --- a/llvm/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s @@ -101,9 +101,9 @@ vmovaps %xmm0, 48(%rdi) # CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rsi), %xmm0 # CHECK-NEXT: 1. 1 7.0 1.0 0.0 vmovaps %xmm0, (%rdi) # CHECK-NEXT: 2. 1 1.0 1.0 2.0 vmovaps 16(%rsi), %xmm0 -# CHECK-NEXT: 3. 1 8.0 0.0 0.0 vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: 3. 1 8.0 1.0 0.0 vmovaps %xmm0, 16(%rdi) # CHECK-NEXT: 4. 1 3.0 3.0 0.0 vmovaps 32(%rsi), %xmm0 # CHECK-NEXT: 5. 1 9.0 1.0 0.0 vmovaps %xmm0, 32(%rdi) # CHECK-NEXT: 6. 1 3.0 3.0 2.0 vmovaps 48(%rsi), %xmm0 -# CHECK-NEXT: 7. 1 10.0 0.0 0.0 vmovaps %xmm0, 48(%rdi) -# CHECK-NEXT: 1 5.3 1.3 0.5 +# CHECK-NEXT: 7. 1 10.0 1.0 0.0 vmovaps %xmm0, 48(%rdi) +# CHECK-NEXT: 1 5.3 1.5 0.5 diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s b/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s index 067301b..c00b1c9 100644 --- a/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s @@ -159,10 +159,10 @@ vmovaps %ymm3, (%rbx) # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movb %spl, (%rax) -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movb %bpl, (%rcx) -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movb %sil, (%rdx) -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movb %dil, (%rbx) -# CHECK-NEXT: 1 2.5 0.3 0.0 +# CHECK-NEXT: 1. 1 2.0 1.0 0.0 movb %bpl, (%rcx) +# CHECK-NEXT: 2. 1 3.0 1.0 0.0 movb %sil, (%rdx) +# CHECK-NEXT: 3. 1 4.0 1.0 0.0 movb %dil, (%rbx) +# CHECK-NEXT: 1 2.5 1.0 0.0 # CHECK: [1] Code Region @@ -273,10 +273,10 @@ vmovaps %ymm3, (%rbx) # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movw %sp, (%rax) -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movw %bp, (%rcx) -# CHECK-NEXT: 2. 
1 3.0 0.0 0.0 movw %si, (%rdx) -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movw %di, (%rbx) -# CHECK-NEXT: 1 2.5 0.3 0.0 +# CHECK-NEXT: 1. 1 2.0 1.0 0.0 movw %bp, (%rcx) +# CHECK-NEXT: 2. 1 3.0 1.0 0.0 movw %si, (%rdx) +# CHECK-NEXT: 3. 1 4.0 1.0 0.0 movw %di, (%rbx) +# CHECK-NEXT: 1 2.5 1.0 0.0 # CHECK: [2] Code Region @@ -387,10 +387,10 @@ vmovaps %ymm3, (%rbx) # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movl %esp, (%rax) -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movl %ebp, (%rcx) -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movl %esi, (%rdx) -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movl %edi, (%rbx) -# CHECK-NEXT: 1 2.5 0.3 0.0 +# CHECK-NEXT: 1. 1 2.0 1.0 0.0 movl %ebp, (%rcx) +# CHECK-NEXT: 2. 1 3.0 1.0 0.0 movl %esi, (%rdx) +# CHECK-NEXT: 3. 1 4.0 1.0 0.0 movl %edi, (%rbx) +# CHECK-NEXT: 1 2.5 1.0 0.0 # CHECK: [3] Code Region @@ -501,10 +501,10 @@ vmovaps %ymm3, (%rbx) # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movq %rsp, (%rax) -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movq %rbp, (%rcx) -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movq %rsi, (%rdx) -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movq %rdi, (%rbx) -# CHECK-NEXT: 1 2.5 0.3 0.0 +# CHECK-NEXT: 1. 1 2.0 1.0 0.0 movq %rbp, (%rcx) +# CHECK-NEXT: 2. 1 3.0 1.0 0.0 movq %rsi, (%rdx) +# CHECK-NEXT: 3. 1 4.0 1.0 0.0 movq %rdi, (%rbx) +# CHECK-NEXT: 1 2.5 1.0 0.0 # CHECK: [4] Code Region @@ -732,10 +732,10 @@ vmovaps %ymm3, (%rbx) # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movaps %xmm0, (%rax) -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movaps %xmm1, (%rcx) -# CHECK-NEXT: 2. 1 4.0 1.0 0.0 movaps %xmm2, (%rdx) -# CHECK-NEXT: 3. 1 5.0 0.0 0.0 movaps %xmm3, (%rbx) -# CHECK-NEXT: 1 3.0 0.5 0.0 +# CHECK-NEXT: 1. 1 2.0 1.0 0.0 movaps %xmm1, (%rcx) +# CHECK-NEXT: 2. 1 4.0 2.0 0.0 movaps %xmm2, (%rdx) +# CHECK-NEXT: 3. 1 5.0 1.0 0.0 movaps %xmm3, (%rbx) +# CHECK-NEXT: 1 3.0 1.3 0.0 # CHECK: [6] Code Region @@ -846,7 +846,7 @@ vmovaps %ymm3, (%rbx) # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps %ymm0, (%rax) -# CHECK-NEXT: 1. 
1 2.0 1.0 0.0 vmovaps %ymm1, (%rcx) -# CHECK-NEXT: 2. 1 35.0 33.0 0.0 vmovaps %ymm2, (%rdx) -# CHECK-NEXT: 3. 1 36.0 1.0 0.0 vmovaps %ymm3, (%rbx) -# CHECK-NEXT: 1 18.5 9.0 0.0 +# CHECK-NEXT: 1. 1 2.0 2.0 0.0 vmovaps %ymm1, (%rcx) +# CHECK-NEXT: 2. 1 35.0 34.0 0.0 vmovaps %ymm2, (%rdx) +# CHECK-NEXT: 3. 1 36.0 2.0 0.0 vmovaps %ymm3, (%rbx) +# CHECK-NEXT: 1 18.5 9.8 0.0 diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/independent-load-stores.s b/llvm/test/tools/llvm-mca/X86/BtVer2/independent-load-stores.s new file mode 100644 index 0000000..bd202b6 --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/independent-load-stores.s @@ -0,0 +1,146 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -timeline-max-iterations=1 < %s | FileCheck %s -check-prefixes=ALL,NOALIAS +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -timeline-max-iterations=1 -noalias=false < %s | FileCheck %s -check-prefixes=ALL,YESALIAS + + addq $44, 64(%r14) + addq $44, 128(%r14) + addq $44, 192(%r14) + addq $44, 256(%r14) + addq $44, 320(%r14) + addq $44, 384(%r14) + addq $44, 448(%r14) + addq $44, 512(%r14) + addq $44, 576(%r14) + addq $44, 640(%r14) + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1000 + +# NOALIAS-NEXT: Total Cycles: 1008 +# YESALIAS-NEXT: Total Cycles: 6003 + +# ALL-NEXT: Total uOps: 1000 + +# ALL: Dispatch Width: 2 + +# NOALIAS-NEXT: uOps Per Cycle: 0.99 +# NOALIAS-NEXT: IPC: 0.99 + +# YESALIAS-NEXT: uOps Per Cycle: 0.17 +# YESALIAS-NEXT: IPC: 0.17 + +# ALL-NEXT: Block RThroughput: 10.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: +# ALL-NEXT: 1 6 1.00 * * addq $44, 64(%r14) +# ALL-NEXT: 1 6 1.00 * * addq $44, 128(%r14) +# ALL-NEXT: 1 6 1.00 * * 
addq $44, 192(%r14) +# ALL-NEXT: 1 6 1.00 * * addq $44, 256(%r14) +# ALL-NEXT: 1 6 1.00 * * addq $44, 320(%r14) +# ALL-NEXT: 1 6 1.00 * * addq $44, 384(%r14) +# ALL-NEXT: 1 6 1.00 * * addq $44, 448(%r14) +# ALL-NEXT: 1 6 1.00 * * addq $44, 512(%r14) +# ALL-NEXT: 1 6 1.00 * * addq $44, 576(%r14) +# ALL-NEXT: 1 6 1.00 * * addq $44, 640(%r14) + +# ALL: Resources: +# ALL-NEXT: [0] - JALU0 +# ALL-NEXT: [1] - JALU1 +# ALL-NEXT: [2] - JDiv +# ALL-NEXT: [3] - JFPA +# ALL-NEXT: [4] - JFPM +# ALL-NEXT: [5] - JFPU0 +# ALL-NEXT: [6] - JFPU1 +# ALL-NEXT: [7] - JLAGU +# ALL-NEXT: [8] - JMul +# ALL-NEXT: [9] - JSAGU +# ALL-NEXT: [10] - JSTC +# ALL-NEXT: [11] - JVALU0 +# ALL-NEXT: [12] - JVALU1 +# ALL-NEXT: [13] - JVIMUL + +# ALL: Resource pressure per iteration: +# ALL-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# ALL-NEXT: 5.00 5.00 - - - - - 10.00 - 10.00 - - - - + +# ALL: Resource pressure by instruction: +# ALL-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# ALL-NEXT: - 1.00 - - - - - 1.00 - 1.00 - - - - addq $44, 64(%r14) +# ALL-NEXT: 1.00 - - - - - - 1.00 - 1.00 - - - - addq $44, 128(%r14) +# ALL-NEXT: - 1.00 - - - - - 1.00 - 1.00 - - - - addq $44, 192(%r14) +# ALL-NEXT: 1.00 - - - - - - 1.00 - 1.00 - - - - addq $44, 256(%r14) +# ALL-NEXT: - 1.00 - - - - - 1.00 - 1.00 - - - - addq $44, 320(%r14) +# ALL-NEXT: 1.00 - - - - - - 1.00 - 1.00 - - - - addq $44, 384(%r14) +# ALL-NEXT: - 1.00 - - - - - 1.00 - 1.00 - - - - addq $44, 448(%r14) +# ALL-NEXT: 1.00 - - - - - - 1.00 - 1.00 - - - - addq $44, 512(%r14) +# ALL-NEXT: - 1.00 - - - - - 1.00 - 1.00 - - - - addq $44, 576(%r14) +# ALL-NEXT: 1.00 - - - - - - 1.00 - 1.00 - - - - addq $44, 640(%r14) + +# ALL: Timeline view: + +# NOALIAS-NEXT: 01234567 +# NOALIAS-NEXT: Index 0123456789 + +# YESALIAS-NEXT: 0123456789 0123456789 0123456789 +# YESALIAS-NEXT: Index 0123456789 0123456789 0123456789 012 + +# NOALIAS: [0,0] DeeeeeeER . . . 
addq $44, 64(%r14) +# NOALIAS-NEXT: [0,1] D=eeeeeeER. . . addq $44, 128(%r14) +# NOALIAS-NEXT: [0,2] .D=eeeeeeER . . addq $44, 192(%r14) +# NOALIAS-NEXT: [0,3] .D==eeeeeeER . . addq $44, 256(%r14) +# NOALIAS-NEXT: [0,4] . D==eeeeeeER . . addq $44, 320(%r14) +# NOALIAS-NEXT: [0,5] . D===eeeeeeER . . addq $44, 384(%r14) +# NOALIAS-NEXT: [0,6] . D===eeeeeeER. . addq $44, 448(%r14) +# NOALIAS-NEXT: [0,7] . D====eeeeeeER . addq $44, 512(%r14) +# NOALIAS-NEXT: [0,8] . D====eeeeeeER. addq $44, 576(%r14) +# NOALIAS-NEXT: [0,9] . D=====eeeeeeER addq $44, 640(%r14) + +# YESALIAS: [0,0] DeeeeeeER . . . . . . . . . . . . addq $44, 64(%r14) +# YESALIAS-NEXT: [0,1] D======eeeeeeER. . . . . . . . . . . addq $44, 128(%r14) +# YESALIAS-NEXT: [0,2] .D===========eeeeeeER . . . . . . . . . addq $44, 192(%r14) +# YESALIAS-NEXT: [0,3] .D=================eeeeeeER . . . . . . . . addq $44, 256(%r14) +# YESALIAS-NEXT: [0,4] . D======================eeeeeeER . . . . . . . addq $44, 320(%r14) +# YESALIAS-NEXT: [0,5] . D============================eeeeeeER . . . . . . addq $44, 384(%r14) +# YESALIAS-NEXT: [0,6] . D=================================eeeeeeER. . . . . addq $44, 448(%r14) +# YESALIAS-NEXT: [0,7] . D=======================================eeeeeeER . . . addq $44, 512(%r14) +# YESALIAS-NEXT: [0,8] . D============================================eeeeeeER . . addq $44, 576(%r14) +# YESALIAS-NEXT: [0,9] . D==================================================eeeeeeER addq $44, 640(%r14) + +# ALL: Average Wait times (based on the timeline view): +# ALL-NEXT: [0]: Executions +# ALL-NEXT: [1]: Average time spent waiting in a scheduler's queue +# ALL-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# ALL-NEXT: [3]: Average time elapsed from WB until retire stage + +# ALL: [0] [1] [2] [3] +# ALL-NEXT: 0. 1 1.0 1.0 0.0 addq $44, 64(%r14) + +# NOALIAS-NEXT: 1. 1 2.0 1.0 0.0 addq $44, 128(%r14) +# NOALIAS-NEXT: 2. 1 2.0 1.0 0.0 addq $44, 192(%r14) +# NOALIAS-NEXT: 3. 
1 3.0 1.0 0.0 addq $44, 256(%r14) +# NOALIAS-NEXT: 4. 1 3.0 1.0 0.0 addq $44, 320(%r14) +# NOALIAS-NEXT: 5. 1 4.0 1.0 0.0 addq $44, 384(%r14) +# NOALIAS-NEXT: 6. 1 4.0 1.0 0.0 addq $44, 448(%r14) +# NOALIAS-NEXT: 7. 1 5.0 1.0 0.0 addq $44, 512(%r14) +# NOALIAS-NEXT: 8. 1 5.0 1.0 0.0 addq $44, 576(%r14) +# NOALIAS-NEXT: 9. 1 6.0 1.0 0.0 addq $44, 640(%r14) +# NOALIAS-NEXT: 1 3.5 1.0 0.0 + +# YESALIAS-NEXT: 1. 1 7.0 0.0 0.0 addq $44, 128(%r14) +# YESALIAS-NEXT: 2. 1 12.0 0.0 0.0 addq $44, 192(%r14) +# YESALIAS-NEXT: 3. 1 18.0 0.0 0.0 addq $44, 256(%r14) +# YESALIAS-NEXT: 4. 1 23.0 0.0 0.0 addq $44, 320(%r14) +# YESALIAS-NEXT: 5. 1 29.0 0.0 0.0 addq $44, 384(%r14) +# YESALIAS-NEXT: 6. 1 34.0 0.0 0.0 addq $44, 448(%r14) +# YESALIAS-NEXT: 7. 1 40.0 0.0 0.0 addq $44, 512(%r14) +# YESALIAS-NEXT: 8. 1 45.0 0.0 0.0 addq $44, 576(%r14) +# YESALIAS-NEXT: 9. 1 51.0 0.0 0.0 addq $44, 640(%r14) +# YESALIAS-NEXT: 1 26.0 0.1 0.0 diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/xadd.s b/llvm/test/tools/llvm-mca/X86/BtVer2/xadd.s index 64b6490..691f530 100644 --- a/llvm/test/tools/llvm-mca/X86/BtVer2/xadd.s +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/xadd.s @@ -21,12 +21,12 @@ imul %ecx, %ecx # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 10 -# CHECK-NEXT: Total Cycles: 27 +# CHECK-NEXT: Total Cycles: 24 # CHECK-NEXT: Total uOps: 16 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.59 -# CHECK-NEXT: IPC: 0.37 +# CHECK-NEXT: uOps Per Cycle: 0.67 +# CHECK-NEXT: IPC: 0.42 # CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Instruction Info: @@ -74,18 +74,18 @@ imul %ecx, %ecx # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 0123456 - -# CHECK: [0,0] DeeeeeeeeeeeER . . .. xaddl %ecx, (%rsp) -# CHECK-NEXT: [0,1] . D=eE-------R . . .. addl %ecx, %ecx -# CHECK-NEXT: [0,2] . D==eE-------R. . .. addl %ecx, %ecx -# CHECK-NEXT: [0,3] . D==eeeE----R. . .. imull %ecx, %ecx -# CHECK-NEXT: [0,4] . D=====eeeE--R . .. imull %ecx, %ecx -# CHECK-NEXT: [1,0] . 
D=======eeeeeeeeeeeER.. xaddl %ecx, (%rsp) -# CHECK-NEXT: [1,1] . .D========eE-------R.. addl %ecx, %ecx -# CHECK-NEXT: [1,2] . .D=========eE-------R. addl %ecx, %ecx -# CHECK-NEXT: [1,3] . . D=========eeeE----R. imull %ecx, %ecx -# CHECK-NEXT: [1,4] . . D============eeeE--R imull %ecx, %ecx +# CHECK-NEXT: Index 0123456789 0123 + +# CHECK: [0,0] DeeeeeeeeeeeER . . . xaddl %ecx, (%rsp) +# CHECK-NEXT: [0,1] . D=eE-------R . . . addl %ecx, %ecx +# CHECK-NEXT: [0,2] . D==eE-------R. . . addl %ecx, %ecx +# CHECK-NEXT: [0,3] . D==eeeE----R. . . imull %ecx, %ecx +# CHECK-NEXT: [0,4] . D=====eeeE--R . . imull %ecx, %ecx +# CHECK-NEXT: [1,0] . D====eeeeeeeeeeeER . xaddl %ecx, (%rsp) +# CHECK-NEXT: [1,1] . .D=====eE-------R . addl %ecx, %ecx +# CHECK-NEXT: [1,2] . .D======eE-------R. addl %ecx, %ecx +# CHECK-NEXT: [1,3] . . D======eeeE----R. imull %ecx, %ecx +# CHECK-NEXT: [1,4] . . D=========eeeE--R imull %ecx, %ecx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -94,12 +94,12 @@ imul %ecx, %ecx # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 4.5 0.5 0.0 xaddl %ecx, (%rsp) -# CHECK-NEXT: 1. 2 5.5 0.0 7.0 addl %ecx, %ecx -# CHECK-NEXT: 2. 2 6.5 0.0 7.0 addl %ecx, %ecx -# CHECK-NEXT: 3. 2 6.5 0.0 4.0 imull %ecx, %ecx -# CHECK-NEXT: 4. 2 9.5 0.0 2.0 imull %ecx, %ecx -# CHECK-NEXT: 2 6.5 0.1 4.0 +# CHECK-NEXT: 0. 2 3.0 0.5 0.0 xaddl %ecx, (%rsp) +# CHECK-NEXT: 1. 2 4.0 0.0 7.0 addl %ecx, %ecx +# CHECK-NEXT: 2. 2 5.0 0.0 7.0 addl %ecx, %ecx +# CHECK-NEXT: 3. 2 5.0 0.0 4.0 imull %ecx, %ecx +# CHECK-NEXT: 4. 
2 8.0 0.0 2.0 imull %ecx, %ecx +# CHECK-NEXT: 2 5.0 0.1 4.0 # CHECK: [1] Code Region diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/independent-load-stores.s b/llvm/test/tools/llvm-mca/X86/Haswell/independent-load-stores.s new file mode 100644 index 0000000..3988ce8 --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/Haswell/independent-load-stores.s @@ -0,0 +1,142 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mcpu=haswell -timeline -timeline-max-iterations=1 < %s | FileCheck %s -check-prefixes=ALL,NOALIAS +# RUN: llvm-mca -mcpu=haswell -timeline -timeline-max-iterations=1 -noalias=false < %s | FileCheck %s -check-prefixes=ALL,YESALIAS + + addq $44, 64(%r14) + addq $44, 128(%r14) + addq $44, 192(%r14) + addq $44, 256(%r14) + addq $44, 320(%r14) + addq $44, 384(%r14) + addq $44, 448(%r14) + addq $44, 512(%r14) + addq $44, 576(%r14) + addq $44, 640(%r14) + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1000 + +# NOALIAS-NEXT: Total Cycles: 1009 +# YESALIAS-NEXT: Total Cycles: 7003 + +# ALL-NEXT: Total uOps: 3000 + +# ALL: Dispatch Width: 4 + +# NOALIAS-NEXT: uOps Per Cycle: 2.97 +# NOALIAS-NEXT: IPC: 0.99 + +# YESALIAS-NEXT: uOps Per Cycle: 0.43 +# YESALIAS-NEXT: IPC: 0.14 + +# ALL-NEXT: Block RThroughput: 10.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: +# ALL-NEXT: 3 7 1.00 * * addq $44, 64(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 128(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 192(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 256(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 320(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 384(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 448(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 512(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 576(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 640(%r14) + +# 
ALL: Resources: +# ALL-NEXT: [0] - HWDivider +# ALL-NEXT: [1] - HWFPDivider +# ALL-NEXT: [2] - HWPort0 +# ALL-NEXT: [3] - HWPort1 +# ALL-NEXT: [4] - HWPort2 +# ALL-NEXT: [5] - HWPort3 +# ALL-NEXT: [6] - HWPort4 +# ALL-NEXT: [7] - HWPort5 +# ALL-NEXT: [8] - HWPort6 +# ALL-NEXT: [9] - HWPort7 + +# ALL: Resource pressure per iteration: +# ALL-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +# ALL-NEXT: - - 2.50 2.50 6.66 6.67 10.00 2.50 2.50 6.67 + +# ALL: Resource pressure by instruction: +# ALL-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +# ALL-NEXT: - - - 0.50 0.66 0.67 1.00 - 0.50 0.67 addq $44, 64(%r14) +# ALL-NEXT: - - 0.50 - 0.67 0.66 1.00 0.50 - 0.67 addq $44, 128(%r14) +# ALL-NEXT: - - - 0.50 0.67 0.67 1.00 - 0.50 0.66 addq $44, 192(%r14) +# ALL-NEXT: - - 0.50 - 0.66 0.67 1.00 0.50 - 0.67 addq $44, 256(%r14) +# ALL-NEXT: - - - 0.50 0.67 0.66 1.00 - 0.50 0.67 addq $44, 320(%r14) +# ALL-NEXT: - - 0.50 - 0.67 0.67 1.00 0.50 - 0.66 addq $44, 384(%r14) +# ALL-NEXT: - - - 0.50 0.66 0.67 1.00 - 0.50 0.67 addq $44, 448(%r14) +# ALL-NEXT: - - 0.50 - 0.67 0.66 1.00 0.50 - 0.67 addq $44, 512(%r14) +# ALL-NEXT: - - - 0.50 0.67 0.67 1.00 - 0.50 0.66 addq $44, 576(%r14) +# ALL-NEXT: - - 0.50 - 0.66 0.67 1.00 0.50 - 0.67 addq $44, 640(%r14) + +# ALL: Timeline view: + +# NOALIAS-NEXT: 012345678 +# NOALIAS-NEXT: Index 0123456789 + +# YESALIAS-NEXT: 0123456789 0123456789 0123456789 012 +# YESALIAS-NEXT: Index 0123456789 0123456789 0123456789 0123456789 + +# NOALIAS: [0,0] DeeeeeeeER. . . addq $44, 64(%r14) +# NOALIAS-NEXT: [0,1] .DeeeeeeeER . . addq $44, 128(%r14) +# NOALIAS-NEXT: [0,2] . DeeeeeeeER . . addq $44, 192(%r14) +# NOALIAS-NEXT: [0,3] . DeeeeeeeER . . addq $44, 256(%r14) +# NOALIAS-NEXT: [0,4] . DeeeeeeeER . . addq $44, 320(%r14) +# NOALIAS-NEXT: [0,5] . DeeeeeeeER. . addq $44, 384(%r14) +# NOALIAS-NEXT: [0,6] . .DeeeeeeeER . addq $44, 448(%r14) +# NOALIAS-NEXT: [0,7] . . DeeeeeeeER . addq $44, 512(%r14) +# NOALIAS-NEXT: [0,8] . . DeeeeeeeER. 
addq $44, 576(%r14) +# NOALIAS-NEXT: [0,9] . . DeeeeeeeER addq $44, 640(%r14) + +# YESALIAS: [0,0] DeeeeeeeER. . . . . . . . . . . . . . addq $44, 64(%r14) +# YESALIAS-NEXT: [0,1] .D======eeeeeeeER . . . . . . . . . . . . addq $44, 128(%r14) +# YESALIAS-NEXT: [0,2] . D============eeeeeeeER . . . . . . . . . . . addq $44, 192(%r14) +# YESALIAS-NEXT: [0,3] . D==================eeeeeeeER . . . . . . . . . addq $44, 256(%r14) +# YESALIAS-NEXT: [0,4] . D========================eeeeeeeER . . . . . . . . addq $44, 320(%r14) +# YESALIAS-NEXT: [0,5] . D==============================eeeeeeeER. . . . . . . addq $44, 384(%r14) +# YESALIAS-NEXT: [0,6] . .D====================================eeeeeeeER . . . . . addq $44, 448(%r14) +# YESALIAS-NEXT: [0,7] . . D==========================================eeeeeeeER . . . . addq $44, 512(%r14) +# YESALIAS-NEXT: [0,8] . . D================================================eeeeeeeER . . addq $44, 576(%r14) +# YESALIAS-NEXT: [0,9] . . D======================================================eeeeeeeER addq $44, 640(%r14) + +# ALL: Average Wait times (based on the timeline view): +# ALL-NEXT: [0]: Executions +# ALL-NEXT: [1]: Average time spent waiting in a scheduler's queue +# ALL-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# ALL-NEXT: [3]: Average time elapsed from WB until retire stage + +# ALL: [0] [1] [2] [3] +# ALL-NEXT: 0. 1 1.0 1.0 0.0 addq $44, 64(%r14) + +# NOALIAS-NEXT: 1. 1 1.0 1.0 0.0 addq $44, 128(%r14) +# NOALIAS-NEXT: 2. 1 1.0 1.0 0.0 addq $44, 192(%r14) +# NOALIAS-NEXT: 3. 1 1.0 1.0 0.0 addq $44, 256(%r14) +# NOALIAS-NEXT: 4. 1 1.0 1.0 0.0 addq $44, 320(%r14) +# NOALIAS-NEXT: 5. 1 1.0 1.0 0.0 addq $44, 384(%r14) +# NOALIAS-NEXT: 6. 1 1.0 1.0 0.0 addq $44, 448(%r14) +# NOALIAS-NEXT: 7. 1 1.0 1.0 0.0 addq $44, 512(%r14) +# NOALIAS-NEXT: 8. 1 1.0 1.0 0.0 addq $44, 576(%r14) +# NOALIAS-NEXT: 9. 1 1.0 1.0 0.0 addq $44, 640(%r14) +# NOALIAS-NEXT: 1 1.0 1.0 0.0 + +# YESALIAS-NEXT: 1. 
1 7.0 0.0 0.0 addq $44, 128(%r14) +# YESALIAS-NEXT: 2. 1 13.0 0.0 0.0 addq $44, 192(%r14) +# YESALIAS-NEXT: 3. 1 19.0 0.0 0.0 addq $44, 256(%r14) +# YESALIAS-NEXT: 4. 1 25.0 0.0 0.0 addq $44, 320(%r14) +# YESALIAS-NEXT: 5. 1 31.0 0.0 0.0 addq $44, 384(%r14) +# YESALIAS-NEXT: 6. 1 37.0 0.0 0.0 addq $44, 448(%r14) +# YESALIAS-NEXT: 7. 1 43.0 0.0 0.0 addq $44, 512(%r14) +# YESALIAS-NEXT: 8. 1 49.0 0.0 0.0 addq $44, 576(%r14) +# YESALIAS-NEXT: 9. 1 55.0 0.0 0.0 addq $44, 640(%r14) +# YESALIAS-NEXT: 1 28.0 0.1 0.0 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/independent-load-stores.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/independent-load-stores.s new file mode 100644 index 0000000..03d7bcd --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/independent-load-stores.s @@ -0,0 +1,142 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -timeline-max-iterations=1 < %s | FileCheck %s -check-prefixes=ALL,NOALIAS +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -timeline-max-iterations=1 -noalias=false < %s | FileCheck %s -check-prefixes=ALL,YESALIAS + + addq $44, 64(%r14) + addq $44, 128(%r14) + addq $44, 192(%r14) + addq $44, 256(%r14) + addq $44, 320(%r14) + addq $44, 384(%r14) + addq $44, 448(%r14) + addq $44, 512(%r14) + addq $44, 576(%r14) + addq $44, 640(%r14) + +# ALL: Iterations: 100 +# ALL-NEXT: Instructions: 1000 + +# NOALIAS-NEXT: Total Cycles: 1009 +# YESALIAS-NEXT: Total Cycles: 7003 + +# ALL-NEXT: Total uOps: 3000 + +# ALL: Dispatch Width: 6 + +# NOALIAS-NEXT: uOps Per Cycle: 2.97 +# NOALIAS-NEXT: IPC: 0.99 + +# YESALIAS-NEXT: uOps Per Cycle: 0.43 +# YESALIAS-NEXT: IPC: 0.14 + +# ALL-NEXT: Block RThroughput: 10.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + 
+# ALL: [1] [2] [3] [4] [5] [6] Instructions: +# ALL-NEXT: 3 7 1.00 * * addq $44, 64(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 128(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 192(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 256(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 320(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 384(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 448(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 512(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 576(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 640(%r14) + +# ALL: Resources: +# ALL-NEXT: [0] - SKLDivider +# ALL-NEXT: [1] - SKLFPDivider +# ALL-NEXT: [2] - SKLPort0 +# ALL-NEXT: [3] - SKLPort1 +# ALL-NEXT: [4] - SKLPort2 +# ALL-NEXT: [5] - SKLPort3 +# ALL-NEXT: [6] - SKLPort4 +# ALL-NEXT: [7] - SKLPort5 +# ALL-NEXT: [8] - SKLPort6 +# ALL-NEXT: [9] - SKLPort7 + +# ALL: Resource pressure per iteration: +# ALL-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +# ALL-NEXT: - - 2.50 2.50 6.66 6.67 10.00 2.50 2.50 6.67 + +# ALL: Resource pressure by instruction: +# ALL-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +# ALL-NEXT: - - - 0.50 0.66 0.67 1.00 - 0.50 0.67 addq $44, 64(%r14) +# ALL-NEXT: - - 0.50 - 0.67 0.66 1.00 0.50 - 0.67 addq $44, 128(%r14) +# ALL-NEXT: - - - 0.50 0.67 0.67 1.00 - 0.50 0.66 addq $44, 192(%r14) +# ALL-NEXT: - - 0.50 - 0.66 0.67 1.00 0.50 - 0.67 addq $44, 256(%r14) +# ALL-NEXT: - - - 0.50 0.67 0.66 1.00 - 0.50 0.67 addq $44, 320(%r14) +# ALL-NEXT: - - 0.50 - 0.67 0.67 1.00 0.50 - 0.66 addq $44, 384(%r14) +# ALL-NEXT: - - - 0.50 0.66 0.67 1.00 - 0.50 0.67 addq $44, 448(%r14) +# ALL-NEXT: - - 0.50 - 0.67 0.66 1.00 0.50 - 0.67 addq $44, 512(%r14) +# ALL-NEXT: - - - 0.50 0.67 0.67 1.00 - 0.50 0.66 addq $44, 576(%r14) +# ALL-NEXT: - - 0.50 - 0.66 0.67 1.00 0.50 - 0.67 addq $44, 640(%r14) + +# ALL: Timeline view: + +# NOALIAS-NEXT: 012345678 +# NOALIAS-NEXT: Index 0123456789 + +# YESALIAS-NEXT: 0123456789 0123456789 0123456789 012 +# YESALIAS-NEXT: Index 0123456789 0123456789 0123456789 0123456789 
+ +# NOALIAS: [0,0] DeeeeeeeER. . . addq $44, 64(%r14) +# NOALIAS-NEXT: [0,1] D=eeeeeeeER . . addq $44, 128(%r14) +# NOALIAS-NEXT: [0,2] .D=eeeeeeeER . . addq $44, 192(%r14) +# NOALIAS-NEXT: [0,3] .D==eeeeeeeER . . addq $44, 256(%r14) +# NOALIAS-NEXT: [0,4] . D==eeeeeeeER . . addq $44, 320(%r14) +# NOALIAS-NEXT: [0,5] . D===eeeeeeeER. . addq $44, 384(%r14) +# NOALIAS-NEXT: [0,6] . D===eeeeeeeER . addq $44, 448(%r14) +# NOALIAS-NEXT: [0,7] . D====eeeeeeeER . addq $44, 512(%r14) +# NOALIAS-NEXT: [0,8] . D====eeeeeeeER. addq $44, 576(%r14) +# NOALIAS-NEXT: [0,9] . D=====eeeeeeeER addq $44, 640(%r14) + +# YESALIAS: [0,0] DeeeeeeeER. . . . . . . . . . . . . . addq $44, 64(%r14) +# YESALIAS-NEXT: [0,1] D=======eeeeeeeER . . . . . . . . . . . . addq $44, 128(%r14) +# YESALIAS-NEXT: [0,2] .D=============eeeeeeeER . . . . . . . . . . . addq $44, 192(%r14) +# YESALIAS-NEXT: [0,3] .D====================eeeeeeeER . . . . . . . . . addq $44, 256(%r14) +# YESALIAS-NEXT: [0,4] . D==========================eeeeeeeER . . . . . . . . addq $44, 320(%r14) +# YESALIAS-NEXT: [0,5] . D=================================eeeeeeeER. . . . . . . addq $44, 384(%r14) +# YESALIAS-NEXT: [0,6] . D=======================================eeeeeeeER . . . . . addq $44, 448(%r14) +# YESALIAS-NEXT: [0,7] . D==============================================eeeeeeeER . . . . addq $44, 512(%r14) +# YESALIAS-NEXT: [0,8] . D====================================================eeeeeeeER . . addq $44, 576(%r14) +# YESALIAS-NEXT: [0,9] . D===========================================================eeeeeeeER addq $44, 640(%r14) + +# ALL: Average Wait times (based on the timeline view): +# ALL-NEXT: [0]: Executions +# ALL-NEXT: [1]: Average time spent waiting in a scheduler's queue +# ALL-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# ALL-NEXT: [3]: Average time elapsed from WB until retire stage + +# ALL: [0] [1] [2] [3] +# ALL-NEXT: 0. 1 1.0 1.0 0.0 addq $44, 64(%r14) + +# NOALIAS-NEXT: 1. 
1 2.0 1.0 0.0 addq $44, 128(%r14) +# NOALIAS-NEXT: 2. 1 2.0 1.0 0.0 addq $44, 192(%r14) +# NOALIAS-NEXT: 3. 1 3.0 1.0 0.0 addq $44, 256(%r14) +# NOALIAS-NEXT: 4. 1 3.0 1.0 0.0 addq $44, 320(%r14) +# NOALIAS-NEXT: 5. 1 4.0 1.0 0.0 addq $44, 384(%r14) +# NOALIAS-NEXT: 6. 1 4.0 1.0 0.0 addq $44, 448(%r14) +# NOALIAS-NEXT: 7. 1 5.0 1.0 0.0 addq $44, 512(%r14) +# NOALIAS-NEXT: 8. 1 5.0 1.0 0.0 addq $44, 576(%r14) +# NOALIAS-NEXT: 9. 1 6.0 1.0 0.0 addq $44, 640(%r14) +# NOALIAS-NEXT: 1 3.5 1.0 0.0 + +# YESALIAS-NEXT: 1. 1 8.0 0.0 0.0 addq $44, 128(%r14) +# YESALIAS-NEXT: 2. 1 14.0 0.0 0.0 addq $44, 192(%r14) +# YESALIAS-NEXT: 3. 1 21.0 0.0 0.0 addq $44, 256(%r14) +# YESALIAS-NEXT: 4. 1 27.0 0.0 0.0 addq $44, 320(%r14) +# YESALIAS-NEXT: 5. 1 34.0 0.0 0.0 addq $44, 384(%r14) +# YESALIAS-NEXT: 6. 1 40.0 0.0 0.0 addq $44, 448(%r14) +# YESALIAS-NEXT: 7. 1 47.0 0.0 0.0 addq $44, 512(%r14) +# YESALIAS-NEXT: 8. 1 53.0 0.0 0.0 addq $44, 576(%r14) +# YESALIAS-NEXT: 9. 1 60.0 0.0 0.0 addq $44, 640(%r14) +# YESALIAS-NEXT: 1 30.5 0.1 0.0 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/independent-load-stores.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/independent-load-stores.s new file mode 100644 index 0000000..4ebdee9 --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/independent-load-stores.s @@ -0,0 +1,142 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -timeline -timeline-max-iterations=1 < %s | FileCheck %s -check-prefixes=ALL,NOALIAS +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -timeline -timeline-max-iterations=1 -noalias=false < %s | FileCheck %s -check-prefixes=ALL,YESALIAS + + addq $44, 64(%r14) + addq $44, 128(%r14) + addq $44, 192(%r14) + addq $44, 256(%r14) + addq $44, 320(%r14) + addq $44, 384(%r14) + addq $44, 448(%r14) + addq $44, 512(%r14) + addq $44, 576(%r14) + addq $44, 640(%r14) + +# ALL: Iterations: 100 +# 
ALL-NEXT: Instructions: 1000 + +# NOALIAS-NEXT: Total Cycles: 1009 +# YESALIAS-NEXT: Total Cycles: 7003 + +# ALL-NEXT: Total uOps: 3000 + +# ALL: Dispatch Width: 6 + +# NOALIAS-NEXT: uOps Per Cycle: 2.97 +# NOALIAS-NEXT: IPC: 0.99 + +# YESALIAS-NEXT: uOps Per Cycle: 0.43 +# YESALIAS-NEXT: IPC: 0.14 + +# ALL-NEXT: Block RThroughput: 10.0 + +# ALL: Instruction Info: +# ALL-NEXT: [1]: #uOps +# ALL-NEXT: [2]: Latency +# ALL-NEXT: [3]: RThroughput +# ALL-NEXT: [4]: MayLoad +# ALL-NEXT: [5]: MayStore +# ALL-NEXT: [6]: HasSideEffects (U) + +# ALL: [1] [2] [3] [4] [5] [6] Instructions: +# ALL-NEXT: 3 7 1.00 * * addq $44, 64(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 128(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 192(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 256(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 320(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 384(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 448(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 512(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 576(%r14) +# ALL-NEXT: 3 7 1.00 * * addq $44, 640(%r14) + +# ALL: Resources: +# ALL-NEXT: [0] - SKXDivider +# ALL-NEXT: [1] - SKXFPDivider +# ALL-NEXT: [2] - SKXPort0 +# ALL-NEXT: [3] - SKXPort1 +# ALL-NEXT: [4] - SKXPort2 +# ALL-NEXT: [5] - SKXPort3 +# ALL-NEXT: [6] - SKXPort4 +# ALL-NEXT: [7] - SKXPort5 +# ALL-NEXT: [8] - SKXPort6 +# ALL-NEXT: [9] - SKXPort7 + +# ALL: Resource pressure per iteration: +# ALL-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +# ALL-NEXT: - - 2.50 2.50 6.66 6.67 10.00 2.50 2.50 6.67 + +# ALL: Resource pressure by instruction: +# ALL-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +# ALL-NEXT: - - - 0.50 0.66 0.67 1.00 - 0.50 0.67 addq $44, 64(%r14) +# ALL-NEXT: - - 0.50 - 0.67 0.66 1.00 0.50 - 0.67 addq $44, 128(%r14) +# ALL-NEXT: - - - 0.50 0.67 0.67 1.00 - 0.50 0.66 addq $44, 192(%r14) +# ALL-NEXT: - - 0.50 - 0.66 0.67 1.00 0.50 - 0.67 addq $44, 256(%r14) +# ALL-NEXT: - - - 0.50 0.67 0.66 1.00 - 0.50 0.67 addq $44, 320(%r14) +# ALL-NEXT: - - 0.50 - 0.67 
0.67 1.00 0.50 - 0.66 addq $44, 384(%r14) +# ALL-NEXT: - - - 0.50 0.66 0.67 1.00 - 0.50 0.67 addq $44, 448(%r14) +# ALL-NEXT: - - 0.50 - 0.67 0.66 1.00 0.50 - 0.67 addq $44, 512(%r14) +# ALL-NEXT: - - - 0.50 0.67 0.67 1.00 - 0.50 0.66 addq $44, 576(%r14) +# ALL-NEXT: - - 0.50 - 0.66 0.67 1.00 0.50 - 0.67 addq $44, 640(%r14) + +# ALL: Timeline view: + +# NOALIAS-NEXT: 012345678 +# NOALIAS-NEXT: Index 0123456789 + +# YESALIAS-NEXT: 0123456789 0123456789 0123456789 012 +# YESALIAS-NEXT: Index 0123456789 0123456789 0123456789 0123456789 + +# NOALIAS: [0,0] DeeeeeeeER. . . addq $44, 64(%r14) +# NOALIAS-NEXT: [0,1] D=eeeeeeeER . . addq $44, 128(%r14) +# NOALIAS-NEXT: [0,2] .D=eeeeeeeER . . addq $44, 192(%r14) +# NOALIAS-NEXT: [0,3] .D==eeeeeeeER . . addq $44, 256(%r14) +# NOALIAS-NEXT: [0,4] . D==eeeeeeeER . . addq $44, 320(%r14) +# NOALIAS-NEXT: [0,5] . D===eeeeeeeER. . addq $44, 384(%r14) +# NOALIAS-NEXT: [0,6] . D===eeeeeeeER . addq $44, 448(%r14) +# NOALIAS-NEXT: [0,7] . D====eeeeeeeER . addq $44, 512(%r14) +# NOALIAS-NEXT: [0,8] . D====eeeeeeeER. addq $44, 576(%r14) +# NOALIAS-NEXT: [0,9] . D=====eeeeeeeER addq $44, 640(%r14) + +# YESALIAS: [0,0] DeeeeeeeER. . . . . . . . . . . . . . addq $44, 64(%r14) +# YESALIAS-NEXT: [0,1] D=======eeeeeeeER . . . . . . . . . . . . addq $44, 128(%r14) +# YESALIAS-NEXT: [0,2] .D=============eeeeeeeER . . . . . . . . . . . addq $44, 192(%r14) +# YESALIAS-NEXT: [0,3] .D====================eeeeeeeER . . . . . . . . . addq $44, 256(%r14) +# YESALIAS-NEXT: [0,4] . D==========================eeeeeeeER . . . . . . . . addq $44, 320(%r14) +# YESALIAS-NEXT: [0,5] . D=================================eeeeeeeER. . . . . . . addq $44, 384(%r14) +# YESALIAS-NEXT: [0,6] . D=======================================eeeeeeeER . . . . . addq $44, 448(%r14) +# YESALIAS-NEXT: [0,7] . D==============================================eeeeeeeER . . . . addq $44, 512(%r14) +# YESALIAS-NEXT: [0,8] . 
D====================================================eeeeeeeER . . addq $44, 576(%r14) +# YESALIAS-NEXT: [0,9] . D===========================================================eeeeeeeER addq $44, 640(%r14) + +# ALL: Average Wait times (based on the timeline view): +# ALL-NEXT: [0]: Executions +# ALL-NEXT: [1]: Average time spent waiting in a scheduler's queue +# ALL-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# ALL-NEXT: [3]: Average time elapsed from WB until retire stage + +# ALL: [0] [1] [2] [3] +# ALL-NEXT: 0. 1 1.0 1.0 0.0 addq $44, 64(%r14) + +# NOALIAS-NEXT: 1. 1 2.0 1.0 0.0 addq $44, 128(%r14) +# NOALIAS-NEXT: 2. 1 2.0 1.0 0.0 addq $44, 192(%r14) +# NOALIAS-NEXT: 3. 1 3.0 1.0 0.0 addq $44, 256(%r14) +# NOALIAS-NEXT: 4. 1 3.0 1.0 0.0 addq $44, 320(%r14) +# NOALIAS-NEXT: 5. 1 4.0 1.0 0.0 addq $44, 384(%r14) +# NOALIAS-NEXT: 6. 1 4.0 1.0 0.0 addq $44, 448(%r14) +# NOALIAS-NEXT: 7. 1 5.0 1.0 0.0 addq $44, 512(%r14) +# NOALIAS-NEXT: 8. 1 5.0 1.0 0.0 addq $44, 576(%r14) +# NOALIAS-NEXT: 9. 1 6.0 1.0 0.0 addq $44, 640(%r14) +# NOALIAS-NEXT: 1 3.5 1.0 0.0 + +# YESALIAS-NEXT: 1. 1 8.0 0.0 0.0 addq $44, 128(%r14) +# YESALIAS-NEXT: 2. 1 14.0 0.0 0.0 addq $44, 192(%r14) +# YESALIAS-NEXT: 3. 1 21.0 0.0 0.0 addq $44, 256(%r14) +# YESALIAS-NEXT: 4. 1 27.0 0.0 0.0 addq $44, 320(%r14) +# YESALIAS-NEXT: 5. 1 34.0 0.0 0.0 addq $44, 384(%r14) +# YESALIAS-NEXT: 6. 1 40.0 0.0 0.0 addq $44, 448(%r14) +# YESALIAS-NEXT: 7. 1 47.0 0.0 0.0 addq $44, 512(%r14) +# YESALIAS-NEXT: 8. 1 53.0 0.0 0.0 addq $44, 576(%r14) +# YESALIAS-NEXT: 9. 1 60.0 0.0 0.0 addq $44, 640(%r14) +# YESALIAS-NEXT: 1 30.5 0.1 0.0 -- 2.7.4