From eebe841a47cbbd55bdcc32da943c92d18f88a5b8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 27 Sep 2018 09:36:28 +1000
Subject: [PATCH] RegAlloc: Allow targets to split register allocation

AMDGPU normally spills SGPRs to VGPRs. Previously, this was problematic
because all register classes were handled at the same time: we do not know
ahead of time how many VGPRs will need to be reserved for the spills. If no
VGPRs were left for spilling, we would have to try to spill to memory. If the
spilled SGPRs were required for exec mask manipulation, that is highly
problematic, because the lanes active at the point of spill are not
necessarily the same as at the restore point.

Avoid this problem by fully allocating SGPRs in a separate regalloc run from
VGPRs. This way we know the exact number of VGPRs needed, and can reserve
them for the second run. This fixes the most serious issues, but it is still
possible to make all VGPRs unavailable using inline asm, so start emitting an
error in any case where an SGPR spill would require memory.

This is implemented by giving each regalloc pass a callback which reports
whether a register class should be handled or not. A few passes need small
changes to deal with leftover virtual registers.

In the AMDGPU implementation, a new pass is introduced to take the place of
PrologEpilogInserter for SGPR spills emitted during the first run. One
disadvantage is that StackSlotColoring is currently no longer applied to SGPR
spills; it would need to be run a second time, which will require more work.

Error if the standard -regalloc option is used, and introduce separate
-sgpr-regalloc and -vgpr-regalloc flags so the two runs can be controlled
individually. PBQP is not currently supported, so this also prevents
selecting the unhandled allocator.
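As a rough illustration of the new interface (a minimal sketch: the target
name, MyTargetRegisterInfo, and the FPR/non-FPR split are hypothetical, while
RegClassFilterFunc, createGreedyRegisterAllocator, createVirtRegRewriter, and
VirtRegRewriterID are the interfaces this patch uses):

    // Predicate deciding which register classes the first run handles.
    static bool onlyAllocateFPRs(const TargetRegisterInfo &TRI,
                                 const TargetRegisterClass &RC) {
      // Hypothetical target hook; AMDGPU uses SIRegisterInfo::isSGPRClass.
      return static_cast<const MyTargetRegisterInfo &>(TRI).isFPRClass(&RC);
    }

    bool MyTargetPassConfig::addRegAssignAndRewriteOptimized() {
      // First run: allocate only the filtered classes. Leftover virtual
      // registers survive this run, so rewrite without clearing them.
      addPass(createGreedyRegisterAllocator(onlyAllocateFPRs));
      addPass(createVirtRegRewriter(false));

      // Target-specific lowering between the runs (AMDGPU runs
      // SILowerSGPRSpills here) can reserve exactly the registers the
      // second run must not touch.

      // Second run: allocate everything the first run skipped.
      addPass(createGreedyRegisterAllocator(
          [](const TargetRegisterInfo &TRI, const TargetRegisterClass &RC) {
            return !onlyAllocateFPRs(TRI, RC);
          }));
      addPass(&VirtRegRewriterID);
      return true;
    }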
--- llvm/include/llvm/CodeGen/Passes.h | 6 + llvm/include/llvm/CodeGen/RegAllocCommon.h | 32 ++ llvm/include/llvm/CodeGen/RegAllocRegistry.h | 1 + llvm/lib/CodeGen/LiveIntervals.cpp | 7 +- llvm/lib/CodeGen/RegAllocBase.cpp | 18 + llvm/lib/CodeGen/RegAllocBase.h | 11 +- llvm/lib/CodeGen/RegAllocBasic.cpp | 15 +- llvm/lib/CodeGen/RegAllocFast.cpp | 32 +- llvm/lib/CodeGen/RegAllocGreedy.cpp | 32 +- llvm/lib/CodeGen/TargetPassConfig.cpp | 4 +- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 195 ++++++- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 70 +++ llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 52 +- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 22 + llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 54 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 5 +- .../GlobalISel/extractelement-stack-lower.ll | 617 +++++++++------------ llvm/test/CodeGen/AMDGPU/agpr-csr.ll | 19 +- .../CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir | 2 +- .../CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir | 2 +- .../attr-amdgpu-flat-work-group-size-vgpr-limit.ll | 12 +- llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll | 34 +- .../CodeGen/AMDGPU/gfx-callable-argument-types.ll | 90 +-- .../AMDGPU/gfx-callable-preserved-registers.ll | 104 ++-- llvm/test/CodeGen/AMDGPU/indirect-call.ll | 42 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 31 +- llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll | 50 +- llvm/test/CodeGen/AMDGPU/pei-build-spill.mir | 16 +- llvm/test/CodeGen/AMDGPU/remat-vop.mir | 2 +- llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll | 108 ++++ llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll | 234 ++++++++ .../CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir | 7 +- llvm/test/CodeGen/AMDGPU/sibling-call.ll | 20 +- .../CodeGen/AMDGPU/spill-empty-live-interval.mir | 2 +- llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll | 2 +- .../AMDGPU/spill_more_than_wavesize_csr_sgprs.ll | 9 +- .../AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir | 5 +- .../AMDGPU/unstructured-cfg-def-use-issue.ll | 31 +- llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll | 126 ++--- .../AMDGPU/virtregrewrite-undef-identity-copy.mir | 4 +- 40 files changed, 1399 insertions(+), 726 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/RegAllocCommon.h create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index 76667ea..324474c 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -16,6 +16,8 @@ #include "llvm/Support/CodeGen.h" #include "llvm/Support/Discriminator.h" +#include "llvm/CodeGen/RegAllocCommon.h" + #include #include @@ -173,16 +175,20 @@ namespace llvm { /// possible. It is best suited for debug code where live ranges are short. /// FunctionPass *createFastRegisterAllocator(); + FunctionPass *createFastRegisterAllocator(RegClassFilterFunc F, + bool ClearVirtRegs); /// BasicRegisterAllocation Pass - This pass implements a degenerate global /// register allocator using the basic regalloc framework. /// FunctionPass *createBasicRegisterAllocator(); + FunctionPass *createBasicRegisterAllocator(RegClassFilterFunc F); /// Greedy register allocation pass - This pass implements a global register /// allocator for optimized builds. 
 ///
 FunctionPass *createGreedyRegisterAllocator();
+FunctionPass *createGreedyRegisterAllocator(RegClassFilterFunc F);
 
 /// PBQPRegisterAllocation Pass - This pass implements the Partitioned Boolean
 /// Quadratic Programming (PBQP) based register allocator.
diff --git a/llvm/include/llvm/CodeGen/RegAllocCommon.h b/llvm/include/llvm/CodeGen/RegAllocCommon.h
new file mode 100644
index 0000000..39b77d9
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/RegAllocCommon.h
@@ -0,0 +1,32 @@
+//===- RegAllocCommon.h - Utilities shared between allocators --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGALLOCCOMMON_H
+#define LLVM_CODEGEN_REGALLOCCOMMON_H
+
+#include <functional>
+
+namespace llvm {
+
+class TargetRegisterClass;
+class TargetRegisterInfo;
+
+typedef std::function<bool(const TargetRegisterInfo &TRI,
+                           const TargetRegisterClass &RC)> RegClassFilterFunc;
+
+/// Default register class filter function for register allocation. All virtual
+/// registers should be allocated.
+static inline bool allocateAllRegClasses(const TargetRegisterInfo &,
+                                         const TargetRegisterClass &) {
+  return true;
+}
+
+}
+
+#endif // LLVM_CODEGEN_REGALLOCCOMMON_H
diff --git a/llvm/include/llvm/CodeGen/RegAllocRegistry.h b/llvm/include/llvm/CodeGen/RegAllocRegistry.h
index 9a63674..e33d16c 100644
--- a/llvm/include/llvm/CodeGen/RegAllocRegistry.h
+++ b/llvm/include/llvm/CodeGen/RegAllocRegistry.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_CODEGEN_REGALLOCREGISTRY_H
 #define LLVM_CODEGEN_REGALLOCREGISTRY_H
 
+#include "llvm/CodeGen/RegAllocCommon.h"
 #include "llvm/CodeGen/MachinePassRegistry.h"
 
 namespace llvm {
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 6075f64..23036c2 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -713,10 +713,15 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
     if (LI.empty())
       continue;
 
+    // Target may have not allocated this yet.
+    Register PhysReg = VRM->getPhys(Reg);
+    if (!PhysReg)
+      continue;
+
     // Find the regunit intervals for the assigned register. They may overlap
     // the virtual register live range, cancelling any kills.
RU.clear(); - for (MCRegUnitIterator Unit(VRM->getPhys(Reg), TRI); Unit.isValid(); + for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) { const LiveRange &RURange = getRegUnit(*Unit); if (RURange.empty()) diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp index a2e9f21..d891d4c 100644 --- a/llvm/lib/CodeGen/RegAllocBase.cpp +++ b/llvm/lib/CodeGen/RegAllocBase.cpp @@ -175,3 +175,21 @@ void RegAllocBase::postOptimization() { } DeadRemats.clear(); } + +void RegAllocBase::enqueue(LiveInterval *LI) { + const Register Reg = LI->reg(); + + assert(Reg.isVirtual() && "Can only enqueue virtual registers"); + + if (VRM->hasPhys(Reg)) + return; + + const TargetRegisterClass &RC = *MRI->getRegClass(Reg); + if (ShouldAllocateClass(*TRI, RC)) { + LLVM_DEBUG(dbgs() << "Enqueuing " << printReg(Reg, TRI) << '\n'); + enqueueImpl(LI); + } else { + LLVM_DEBUG(dbgs() << "Not enqueueing " << printReg(Reg, TRI) + << " in skipped register class\n"); + } +} diff --git a/llvm/lib/CodeGen/RegAllocBase.h b/llvm/lib/CodeGen/RegAllocBase.h index 3144605..1fb56db 100644 --- a/llvm/lib/CodeGen/RegAllocBase.h +++ b/llvm/lib/CodeGen/RegAllocBase.h @@ -37,6 +37,7 @@ #define LLVM_LIB_CODEGEN_REGALLOCBASE_H #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/RegAllocCommon.h" #include "llvm/CodeGen/RegisterClassInfo.h" namespace llvm { @@ -67,6 +68,7 @@ protected: LiveIntervals *LIS = nullptr; LiveRegMatrix *Matrix = nullptr; RegisterClassInfo RegClassInfo; + const RegClassFilterFunc ShouldAllocateClass; /// Inst which is a def of an original reg and whose defs are already all /// dead after remat is saved in DeadRemats. The deletion of such inst is @@ -74,7 +76,9 @@ protected: /// always available for the remat of all the siblings of the original reg. SmallPtrSet DeadRemats; - RegAllocBase() = default; + RegAllocBase(const RegClassFilterFunc F = allocateAllRegClasses) : + ShouldAllocateClass(F) {} + virtual ~RegAllocBase() = default; // A RegAlloc pass should call this before allocatePhysRegs. @@ -92,7 +96,10 @@ protected: virtual Spiller &spiller() = 0; /// enqueue - Add VirtReg to the priority queue of unassigned registers. - virtual void enqueue(LiveInterval *LI) = 0; + virtual void enqueueImpl(LiveInterval *LI) = 0; + + /// enqueue - Add VirtReg to the priority queue of unassigned registers. + void enqueue(LiveInterval *LI); /// dequeue - Return the next unassigned register, or NULL. virtual LiveInterval *dequeue() = 0; diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index f8de3e3..b65d580 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -76,7 +76,7 @@ class RABasic : public MachineFunctionPass, void LRE_WillShrinkVirtReg(Register) override; public: - RABasic(); + RABasic(const RegClassFilterFunc F = allocateAllRegClasses); /// Return the pass name. 
StringRef getPassName() const override { return "Basic Register Allocator"; } @@ -88,7 +88,7 @@ public: Spiller &spiller() override { return *SpillerInstance; } - void enqueue(LiveInterval *LI) override { + void enqueueImpl(LiveInterval *LI) override { Queue.push(LI); } @@ -171,7 +171,9 @@ void RABasic::LRE_WillShrinkVirtReg(Register VirtReg) { enqueue(&LI); } -RABasic::RABasic(): MachineFunctionPass(ID) { +RABasic::RABasic(RegClassFilterFunc F): + MachineFunctionPass(ID), + RegAllocBase(F) { } void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { @@ -332,7 +334,10 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) { return true; } -FunctionPass* llvm::createBasicRegisterAllocator() -{ +FunctionPass* llvm::createBasicRegisterAllocator() { return new RABasic(); } + +FunctionPass* llvm::createBasicRegisterAllocator(RegClassFilterFunc F) { + return new RABasic(F); +} diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 8a964ef..707161d 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegAllocCommon.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -69,7 +70,13 @@ namespace { public: static char ID; - RegAllocFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1) {} + RegAllocFast(const RegClassFilterFunc F = allocateAllRegClasses, + bool ClearVirtRegs_ = true) : + MachineFunctionPass(ID), + ShouldAllocateClass(F), + StackSlotForVirtReg(-1), + ClearVirtRegs(ClearVirtRegs_) { + } private: MachineFrameInfo *MFI; @@ -77,6 +84,7 @@ namespace { const TargetRegisterInfo *TRI; const TargetInstrInfo *TII; RegisterClassInfo RegClassInfo; + const RegClassFilterFunc ShouldAllocateClass; /// Basic block currently being allocated. MachineBasicBlock *MBB; @@ -84,6 +92,8 @@ namespace { /// Maps virtual regs to the frame index where these values are spilled. IndexedMap StackSlotForVirtReg; + bool ClearVirtRegs; + /// Everything we know about a live virtual register. struct LiveReg { MachineInstr *LastUse = nullptr; ///< Last instr to use reg. @@ -213,8 +223,12 @@ namespace { } MachineFunctionProperties getSetProperties() const override { - return MachineFunctionProperties().set( + if (ClearVirtRegs) { + return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); + } + + return MachineFunctionProperties(); } MachineFunctionProperties getClearedProperties() const override { @@ -1539,9 +1553,11 @@ bool RegAllocFast::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) allocateBasicBlock(MBB); - // All machine operands and other references to virtual registers have been - // replaced. Remove the virtual registers. - MRI->clearVirtRegs(); + if (ClearVirtRegs) { + // All machine operands and other references to virtual registers have been + // replaced. Remove the virtual registers. 
+    MRI->clearVirtRegs();
+  }
 
   StackSlotForVirtReg.clear();
   LiveDbgValueMap.clear();
@@ -1551,3 +1567,9 @@ bool RegAllocFast::runOnMachineFunction(MachineFunction &MF) {
 FunctionPass *llvm::createFastRegisterAllocator() {
   return new RegAllocFast();
 }
+
+FunctionPass *llvm::createFastRegisterAllocator(
+  std::function<bool(const TargetRegisterInfo &TRI,
+                     const TargetRegisterClass &RC)> Ftor, bool ClearVirtRegs) {
+  return new RegAllocFast(Ftor, ClearVirtRegs);
+}
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index f71df3f..b58b700 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -412,7 +412,7 @@ class RAGreedy : public MachineFunctionPass,
   ArrayRef<uint8_t> RegCosts;
 
 public:
-  RAGreedy();
+  RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses);
 
   /// Return the pass name.
   StringRef getPassName() const override { return "Greedy Register Allocator"; }
@@ -421,7 +421,7 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override;
   void releaseMemory() override;
   Spiller &spiller() override { return *SpillerInstance; }
-  void enqueue(LiveInterval *LI) override;
+  void enqueueImpl(LiveInterval *LI) override;
   LiveInterval *dequeue() override;
   MCRegister selectOrSplit(LiveInterval &,
                            SmallVectorImpl<Register> &) override;
@@ -636,7 +636,22 @@
 FunctionPass* llvm::createGreedyRegisterAllocator() {
   return new RAGreedy();
 }
 
-RAGreedy::RAGreedy(): MachineFunctionPass(ID) {
+namespace llvm {
+FunctionPass* createGreedyRegisterAllocator(
+  std::function<bool(const TargetRegisterInfo &TRI,
+                     const TargetRegisterClass &RC)> Ftor);
+
+}
+
+FunctionPass* llvm::createGreedyRegisterAllocator(
+  std::function<bool(const TargetRegisterInfo &TRI,
+                     const TargetRegisterClass &RC)> Ftor) {
+  return new RAGreedy(Ftor);
+}
+
+RAGreedy::RAGreedy(RegClassFilterFunc F):
+  MachineFunctionPass(ID),
+  RegAllocBase(F) {
 }
 
 void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -693,7 +708,7 @@ void RAGreedy::LRE_WillShrinkVirtReg(Register VirtReg) {
   // Register is assigned, put it back on the queue for reassignment.
   LiveInterval &LI = LIS->getInterval(VirtReg);
   Matrix->unassign(LI);
-  enqueue(&LI);
+  RegAllocBase::enqueue(&LI);
 }
 
 void RAGreedy::LRE_DidCloneVirtReg(Register New, Register Old) {
@@ -716,7 +731,7 @@ void RAGreedy::releaseMemory() {
   GlobalCand.clear();
 }
 
-void RAGreedy::enqueue(LiveInterval *LI) { enqueue(Queue, LI); }
+void RAGreedy::enqueueImpl(LiveInterval *LI) { enqueue(Queue, LI); }
 
 void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) {
   // Prioritize live ranges by size, assigning larger ranges first.
@@ -2936,7 +2951,12 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
     if (Register::isPhysicalRegister(Reg))
       continue;
 
-    assert(VRM->hasPhys(Reg) && "We have unallocated variable!!");
+    // This may be a skipped class
+    if (!VRM->hasPhys(Reg)) {
+      assert(!ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg)) &&
+             "We have an unallocated variable which should have been handled");
+      continue;
+    }
 
     // Get the live interval mapped with this virtual register to be able
     // to check for the interference with the new color.
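With the target changes below, the two runs can then be steered independently
from the command line; a representative invocation (foo.ll standing in for
any input module) would be

    llc -march=amdgcn -sgpr-regalloc=basic -vgpr-regalloc=greedy foo.ll

where "default" for either flag keeps the usual -O-based choice. The new
sgpr-regalloc-flags.ll test added by this patch exercises these flags.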
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 2a4f6bf..86d2463 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1333,8 +1333,8 @@ FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) {
 }
 
 bool TargetPassConfig::addRegAssignAndRewriteFast() {
-  if (RegAlloc != &useDefaultRegisterAllocator &&
-      RegAlloc != &createFastRegisterAllocator)
+  if (RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&useDefaultRegisterAllocator &&
+      RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&createFastRegisterAllocator)
     report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc.");
 
   addPass(createRegAllocPass(false));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 81d848a..f2a27e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -32,6 +32,8 @@
 #include "llvm/CodeGen/GlobalISel/Localizer.h"
 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
 #include "llvm/CodeGen/MIRParser/MIParser.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/PassManager.h"
@@ -52,6 +54,115 @@
 using namespace llvm;
 
+namespace {
+class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
+public:
+  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+    : RegisterRegAllocBase(N, D, C) {}
+};
+
+class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
+public:
+  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+    : RegisterRegAllocBase(N, D, C) {}
+};
+
+static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
+                              const TargetRegisterClass &RC) {
+  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+}
+
+static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
+                              const TargetRegisterClass &RC) {
+  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+}
+
+
+/// -{sgpr|vgpr}-regalloc=... command line option.
+static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
+
+/// A dummy default pass factory indicates whether the register allocator is
+/// overridden on the command line.
+static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
+static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
+
+static SGPRRegisterRegAlloc
+defaultSGPRRegAlloc("default",
+                    "pick SGPR register allocator based on -O option",
+                    useDefaultRegisterAllocator);
+
+static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
+               RegisterPassParser<SGPRRegisterRegAlloc>>
+SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
+             cl::desc("Register allocator to use for SGPRs"));
+
+static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
+               RegisterPassParser<VGPRRegisterRegAlloc>>
+VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
+             cl::desc("Register allocator to use for VGPRs"));
+
+
+static void initializeDefaultSGPRRegisterAllocatorOnce() {
+  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
+
+  if (!Ctor) {
+    Ctor = SGPRRegAlloc;
+    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
+  }
+}
+
+static void initializeDefaultVGPRRegisterAllocatorOnce() {
+  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
+
+  if (!Ctor) {
+    Ctor = VGPRRegAlloc;
+    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
+  }
+}
+
+static FunctionPass *createBasicSGPRRegisterAllocator() {
+  return createBasicRegisterAllocator(onlyAllocateSGPRs);
+}
+
+static FunctionPass *createGreedySGPRRegisterAllocator() {
+  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
+}
+
+static FunctionPass *createFastSGPRRegisterAllocator() {
+  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
+}
+
+static FunctionPass *createBasicVGPRRegisterAllocator() {
+  return createBasicRegisterAllocator(onlyAllocateVGPRs);
+}
+
+static FunctionPass *createGreedyVGPRRegisterAllocator() {
+  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
+}
+
+static FunctionPass *createFastVGPRRegisterAllocator() {
+  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
+}
+
+static SGPRRegisterRegAlloc basicRegAllocSGPR(
+  "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
+static SGPRRegisterRegAlloc greedyRegAllocSGPR(
+  "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
+
+static SGPRRegisterRegAlloc fastRegAllocSGPR(
+  "fast", "fast register allocator", createFastSGPRRegisterAllocator);
+
+
+static VGPRRegisterRegAlloc basicRegAllocVGPR(
+  "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc greedyRegAllocVGPR(
+  "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
+
+static VGPRRegisterRegAlloc fastRegAllocVGPR(
+  "fast", "fast register allocator", createFastVGPRRegisterAllocator);
+}
+
+
 static cl::opt<bool> EnableR600StructurizeCFG(
   "r600-ir-structurize",
   cl::desc("Use StructurizeCFG IR pass"),
@@ -852,6 +963,14 @@ public:
   bool addGlobalInstructionSelect() override;
   void addFastRegAlloc() override;
   void addOptimizedRegAlloc() override;
+
+  FunctionPass *createSGPRAllocPass(bool Optimized);
+  FunctionPass *createVGPRAllocPass(bool Optimized);
+  FunctionPass *createRegAllocPass(bool Optimized) override;
+
+  bool addRegAssignAndRewriteFast() override;
+  bool addRegAssignAndRewriteOptimized() override;
+
   void addPreRegAlloc() override;
   bool addPreRewrite() override;
   void addPostRegAlloc() override;
@@ -1237,14 +1356,84 @@ bool GCNPassConfig::addPreRewrite() {
   return true;
 }
 
+FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
+  // Initialize the global default.
+  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
+                  initializeDefaultSGPRRegisterAllocatorOnce);
+
+  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
+  if (Ctor != useDefaultRegisterAllocator)
+    return Ctor();
+
+  if (Optimized)
+    return createGreedyRegisterAllocator(onlyAllocateSGPRs);
+
+  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
+}
+
+FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
+  // Initialize the global default.
+  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
+                  initializeDefaultVGPRRegisterAllocatorOnce);
+
+  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
+  if (Ctor != useDefaultRegisterAllocator)
+    return Ctor();
+
+  if (Optimized)
+    return createGreedyVGPRRegisterAllocator();
+
+  return createFastVGPRRegisterAllocator();
+}
+
+FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
+  llvm_unreachable("should not be used");
+}
+
+static const char RegAllocOptNotSupportedMessage[] =
+  "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
+
+bool GCNPassConfig::addRegAssignAndRewriteFast() {
+  if (!usingDefaultRegAlloc())
+    report_fatal_error(RegAllocOptNotSupportedMessage);
+
+  addPass(createSGPRAllocPass(false));
+
+  // Equivalent of PEI for SGPRs.
+  addPass(&SILowerSGPRSpillsID);
+
+  addPass(createVGPRAllocPass(false));
+  return true;
+}
+
+bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
+  if (!usingDefaultRegAlloc())
+    report_fatal_error(RegAllocOptNotSupportedMessage);
+
+  addPass(createSGPRAllocPass(true));
+
+  // Commit allocated register changes. This is mostly necessary because too
+  // many things rely on the use lists of the physical registers, such as the
+  // verifier. This is only necessary with allocators which use LiveIntervals,
+  // since FastRegAlloc does the replacements itself.
+  addPass(createVirtRegRewriter(false));
+
+  // Equivalent of PEI for SGPRs.
+  addPass(&SILowerSGPRSpillsID);
+
+  addPass(createVGPRAllocPass(true));
+
+  addPreRewrite();
+  addPass(&VirtRegRewriterID);
+
+  return true;
+}
+
 void GCNPassConfig::addPostRegAlloc() {
   addPass(&SIFixVGPRCopiesID);
   if (getOptLevel() > CodeGenOpt::None)
     addPass(&SIOptimizeExecMaskingID);
   TargetPassConfig::addPostRegAlloc();
-
-  // Equivalent of PEI for SGPRs.
-  addPass(&SILowerSGPRSpillsID);
 }
 
 void GCNPassConfig::addPreSched2() {
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6e281e1..51b8ef1 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -20,6 +20,12 @@
 using namespace llvm;
 
 #define DEBUG_TYPE "frame-info"
+static cl::opt<bool> EnableSpillVGPRToAGPR(
+    "amdgpu-spill-vgpr-to-agpr",
+    cl::desc("Enable spilling VGPRs to AGPRs"),
+    cl::ReallyHidden,
+    cl::init(true));
+
 // Find a scratch register that we can use in the prologue.
We avoid using // callee-save registers since they may appear to be free when this is called // from canUseAsPrologue (during shrink wrapping), but then no longer be free @@ -1127,9 +1133,73 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineFrameInfo &MFI = MF.getFrameInfo(); const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() + && EnableSpillVGPRToAGPR; + + if (SpillVGPRToAGPR) { + // To track the spill frame indices handled in this pass. + BitVector SpillFIs(MFI.getObjectIndexEnd(), false); + + bool SeenDbgInstr = false; + + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator Next; + for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { + MachineInstr &MI = *I; + Next = std::next(I); + + if (MI.isDebugInstr()) + SeenDbgInstr = true; + + if (TII->isVGPRSpill(MI)) { + // Try to eliminate stack used by VGPR spills before frame + // finalization. + unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vaddr); + int FI = MI.getOperand(FIOp).getIndex(); + Register VReg = + TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); + if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, + TRI->isAGPR(MRI, VReg))) { + // FIXME: change to enterBasicBlockEnd() + RS->enterBasicBlock(MBB); + TRI->eliminateFrameIndex(MI, 0, FIOp, RS); + SpillFIs.set(FI); + continue; + } + } + } + } + + for (MachineBasicBlock &MBB : MF) { + for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) + MBB.addLiveIn(Reg); + + for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs()) + MBB.addLiveIn(Reg); + + MBB.sortUniqueLiveIns(); + + if (!SpillFIs.empty() && SeenDbgInstr) { + // FIXME: The dead frame indices are replaced with a null register from + // the debug value instructions. We should instead, update it with the + // correct register value. 
But not sure the register value alone is + for (MachineInstr &MI : MBB) { + if (MI.isDebugValue() && MI.getOperand(0).isFI() && + SpillFIs[MI.getOperand(0).getIndex()]) { + MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/); + MI.getOperand(0).setIsDebug(); + } + } + } + } + } + FuncInfo->removeDeadFrameIndices(MFI); assert(allSGPRSpillsAreDead(MF) && "SGPR spill should have been removed in SILowerSGPRSpills"); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 7eaebbe..38b9d85 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -31,12 +31,6 @@ using MBBVector = SmallVector; namespace { -static cl::opt EnableSpillVGPRToAGPR( - "amdgpu-spill-vgpr-to-agpr", - cl::desc("Enable spilling VGPRs to AGPRs"), - cl::ReallyHidden, - cl::init(true)); - class SILowerSGPRSpills : public MachineFunctionPass { private: const SIRegisterInfo *TRI = nullptr; @@ -71,6 +65,7 @@ char SILowerSGPRSpills::ID = 0; INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) @@ -295,6 +290,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { TRI = &TII->getRegisterInfo(); VRM = getAnalysisIfAvailable(); + LIS = getAnalysisIfAvailable(); assert(SaveBlocks.empty() && RestoreBlocks.empty()); @@ -318,21 +314,14 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { return false; } - const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() - && EnableSpillVGPRToAGPR; - bool MadeChange = false; - - const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts(); - std::unique_ptr RS; - bool NewReservedRegs = false; // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be // handled as SpilledToReg in regular PrologEpilogInserter. const bool HasSGPRSpillToVGPR = TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs()); - if (HasSGPRSpillToVGPR || SpillVGPRToAGPR) { + if (HasSGPRSpillToVGPR) { // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs // are spilled to VGPRs, in which case we can eliminate the stack usage. // @@ -350,36 +339,15 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *I; Next = std::next(I); - if (SpillToAGPR && TII->isVGPRSpill(MI)) { - // Try to eliminate stack used by VGPR spills before frame - // finalization. 
- unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::vaddr); - int FI = MI.getOperand(FIOp).getIndex(); - Register VReg = - TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); - if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, - TRI->isAGPR(MRI, VReg))) { - NewReservedRegs = true; - if (!RS) - RS.reset(new RegScavenger()); - - // FIXME: change to enterBasicBlockEnd() - RS->enterBasicBlock(MBB); - TRI->eliminateFrameIndex(MI, 0, FIOp, RS.get()); - SpillFIs.set(FI); - continue; - } - } - - if (!TII->isSGPRSpill(MI) || !TRI->spillSGPRToVGPR()) + if (!TII->isSGPRSpill(MI)) continue; int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { NewReservedRegs = true; - bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr); + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, + nullptr, LIS); (void)Spilled; assert(Spilled && "failed to spill SGPR to VGPR when allocated"); SpillFIs.set(FI); @@ -387,16 +355,10 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { } } + // FIXME: Adding to live-ins redundant with reserving registers. for (MachineBasicBlock &MBB : MF) { for (auto SSpill : FuncInfo->getSGPRSpillVGPRs()) MBB.addLiveIn(SSpill.VGPR); - - for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) - MBB.addLiveIn(Reg); - - for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs()) - MBB.addLiveIn(Reg); - MBB.sortUniqueLiveIns(); // FIXME: The dead frame indices are replaced with a null register from diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index c9a7944..33d4fb4 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -8,7 +8,22 @@ #include "SIMachineFunctionInfo.h" #include "AMDGPUTargetMachine.h" +#include "AMDGPUSubtarget.h" +#include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MIRParser/MIParser.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" +#include +#include #define MAX_LANES 64 @@ -315,6 +330,13 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, // partially spill the SGPR to VGPRs. SGPRToVGPRSpills.erase(FI); NumVGPRSpillLanes -= I; + +#if 0 + DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(), + "VGPRs for SGPR spilling", + 0, DS_Error); + MF.getFunction().getContext().diagnose(DiagOutOfRegs); +#endif return false; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 7fd0765..5c46343 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -572,6 +572,17 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, Reg.first); } + // Reserve VGPRs used for SGPR spilling. + // Note we treat freezeReservedRegs unusually because we run register + // allocation in two phases. It's OK to re-freeze with new registers for the + // second run. 
+#if 0 + for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) { + for (auto &SpilledVGPR : SpilledFI.second) + reserveRegisterTuples(Reserved, SpilledVGPR.VGPR); + } +#endif + // FIXME: Stop using reserved registers for this. for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) reserveRegisterTuples(Reserved, Reg); @@ -1311,6 +1322,7 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, + LiveIntervals *LIS, bool OnlyToVGPR) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); @@ -1340,6 +1352,12 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, .addReg(SubReg, getKillRegState(UseKill)) .addImm(Spill.Lane) .addReg(Spill.VGPR); + if (LIS) { + if (i == 0) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } if (i == 0 && SB.NumSubRegs > 1) { // We may be spilling a super-register which is only partially defined, @@ -1383,6 +1401,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, .addReg(SB.TmpVGPR, TmpVGPRFlags); TmpVGPRFlags = 0; + if (LIS) { + if (i == 0) + LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane); + else + LIS->InsertMachineInstrInMaps(*WriteLane); + } + // There could be undef components of a spilled super register. // TODO: Can we detect this and skip the spill? if (SB.NumSubRegs > 1) { @@ -1403,12 +1428,17 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, MI->eraseFromParent(); SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); + + if (LIS) + LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); + return true; } bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, + LiveIntervals *LIS, bool OnlyToVGPR) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); @@ -1432,6 +1462,13 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, .addImm(Spill.Lane); if (SB.NumSubRegs > 1 && i == 0) MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); + if (LIS) { + if (i == e - 1) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } + } } else { SB.prepare(); @@ -1459,6 +1496,12 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, .addImm(i); if (SB.NumSubRegs > 1 && i == 0) MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); + if (LIS) { + if (i == e - 1) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } } } @@ -1466,6 +1509,10 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, } MI->eraseFromParent(); + + if (LIS) + LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); + return true; } @@ -1475,7 +1522,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( MachineBasicBlock::iterator MI, int FI, - RegScavenger *RS) const { + RegScavenger *RS, + LiveIntervals *LIS) const { switch (MI->getOpcode()) { case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: @@ -1487,7 +1535,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: - return spillSGPR(MI, FI, RS, true); + return spillSGPR(MI, FI, RS, LIS, true); case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: @@ -1498,7 +1546,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( case 
AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: - return restoreSGPR(MI, FI, RS, true); + return restoreSGPR(MI, FI, RS, LIS, true); default: llvm_unreachable("not an SGPR spill instruction"); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 25e7046..2a92051 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -114,10 +114,12 @@ public: /// If \p OnlyToVGPR is true, this will only succeed if this bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, + LiveIntervals *LIS = nullptr, bool OnlyToVGPR = false) const; bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, + LiveIntervals *LIS = nullptr, bool OnlyToVGPR = false) const; void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, @@ -125,7 +127,8 @@ public: RegScavenger *RS) const override; bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, - int FI, RegScavenger *RS) const; + int FI, RegScavenger *RS, + LiveIntervals *LIS = nullptr) const; StringRef getRegAsmName(MCRegister Reg) const override; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll index c0fcbd3..1b146dd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -12,20 +12,22 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 ; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 @@ -44,8 +46,8 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 ; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 ; GCN-NEXT: s_movk_i32 s4, 0xc0 ; GCN-NEXT: v_mov_b32_e32 v6, s5 ; GCN-NEXT: v_mov_b32_e32 v5, s4 @@ -59,44 +61,25 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -115,6 +98,7 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 ; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 ; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 @@ -155,26 +139,26 @@ define i32 
@v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 ; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 ; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded 
Reload ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v12, v20 ; GCN-NEXT: v_mov_b32_e32 v13, v21 @@ -192,31 +176,6 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 ; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -234,29 +193,35 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v15 -; GCN-NEXT: v_mov_b32_e32 v13, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: v_mov_b32_e32 v15, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_mov_b32_e32 v8, v11 +; GCN-NEXT: v_mov_b32_e32 v9, v12 +; GCN-NEXT: v_mov_b32_e32 v10, v13 +; GCN-NEXT: v_mov_b32_e32 v11, v14 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v11, off, 
s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -273,20 +238,22 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 ; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded 
Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 @@ -305,8 +272,8 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 ; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 ; GCN-NEXT: s_movk_i32 s4, 0xc0 ; GCN-NEXT: v_mov_b32_e32 v6, s5 ; GCN-NEXT: v_mov_b32_e32 v5, s4 @@ -319,44 +286,25 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 
4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -375,6 +323,7 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 ; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 ; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 @@ -415,26 +364,26 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 ; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 ; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v54, 
off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v12, v20 ; GCN-NEXT: v_mov_b32_e32 v13, v21 @@ -452,31 +401,6 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 ; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 
offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -494,34 +418,40 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v15 -; GCN-NEXT: v_mov_b32_e32 v13, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: v_mov_b32_e32 v15, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 -; GCN-NEXT: v_lshrrev_b32_e64 v15, 6, s33 -; GCN-NEXT: v_add_u32_e32 v15, 0x100, v15 -; GCN-NEXT: v_add_u32_e32 v0, v15, v0 +; GCN-NEXT: v_mov_b32_e32 v8, v11 +; GCN-NEXT: v_mov_b32_e32 v9, v12 +; GCN-NEXT: v_mov_b32_e32 v10, v13 +; GCN-NEXT: v_mov_b32_e32 v11, v14 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_lshrrev_b32_e64 v11, 6, s33 +; GCN-NEXT: v_add_u32_e32 v11, 0x100, v11 +; GCN-NEXT: v_add_u32_e32 v0, v11, v0 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, 
s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s6 -; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: s_waitcnt vmcnt(16) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -538,20 +468,22 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 ; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded 
Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 @@ -570,8 +502,8 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 ; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 ; GCN-NEXT: s_movk_i32 s4, 0xc0 ; GCN-NEXT: v_mov_b32_e32 v6, s5 ; GCN-NEXT: v_mov_b32_e32 v5, s4 @@ -585,44 +517,25 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill 
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[52:55], v[60:61], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -641,6 +554,7 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 ; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 ; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 @@ -681,26 +595,26 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 ; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 ; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 
offset:412 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v12, v20 ; GCN-NEXT: v_mov_b32_e32 v13, v21 @@ -718,31 +632,6 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_store_dword v5, off, 
s[0:3], s33 offset:468 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 ; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -760,30 +649,36 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v15 -; GCN-NEXT: v_mov_b32_e32 v13, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: v_mov_b32_e32 v15, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_mov_b32_e32 v8, v11 +; GCN-NEXT: v_mov_b32_e32 v9, v12 +; GCN-NEXT: v_mov_b32_e32 v10, v13 +; GCN-NEXT: v_mov_b32_e32 v11, v14 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 +; GCN-NEXT: 
buffer_store_dword v63, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll index 57ab426..8bb9d95 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll @@ -35,11 +35,11 @@ define void @func_areg_32() #0 { ; GCN-LABEL: {{^}}func_areg_33: ; GCN-NOT: a32 -; GFX90A: buffer_store_dword a32, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A: v_accvgpr_read_b32 v0, a32 ; Reload Reuse ; GCN-NOT: a32 ; GCN: use agpr32 ; GCN-NOT: a32 -; GFX90A: buffer_load_dword a32, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A: v_accvgpr_write_b32 a32, v0 ; Reload Reuse ; GCN-NOT: a32 ; GCN: s_setpc_b64 define void @func_areg_33() #0 { @@ -50,9 +50,9 @@ define void @func_areg_33() #0 { ; 
GCN-LABEL: {{^}}func_areg_64: ; GFX908-NOT: buffer_ ; GCN-NOT: v_accvgpr -; GFX90A: buffer_store_dword a63, +; GFX90A: v_accvgpr_read_b32 v0, a63 ; Reload Reuse ; GCN: use agpr63 -; GFX90A: buffer_load_dword a63, +; GFX90A: v_accvgpr_write_b32 a63, v0 ; Reload Reuse ; GCN-NOT: v_accvgpr ; GCN: s_setpc_b64 define void @func_areg_64() #0 { @@ -62,12 +62,13 @@ define void @func_areg_64() #0 { ; GCN-LABEL: {{^}}func_areg_31_63: ; GFX908-NOT: buffer_ -; GCN-NOT: v_accvgpr -; GFX90A: buffer_store_dword a63, +; GFX908-NOT: v_accvgpr +; GFX908-NOT: buffer +; GFX90A: v_accvgpr_read_b32 v0, a63 ; Reload Reuse ; GCN: use agpr31, agpr63 -; GFX90A: buffer_load_dword a63, -; GCN-NOT: buffer_ -; GCN-NOT: v_accvgpr +; GFX90A: v_accvgpr_write_b32 a63, v0 ; Reload Reuse +; GFX908-NOT: v_accvgpr +; GFX908-NOT: buffer ; GCN: s_setpc_b64 define void @func_areg_31_63() #0 { call void asm sideeffect "; use agpr31, agpr63", "~{a31},~{a63}" () diff --git a/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir b/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir index 0664657..c6dfafc 100644 --- a/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir +++ b/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx908.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=gfx908 -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX908 %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX908 %s --- # GCN-LABEL: name: alloc_vgpr_64 diff --git a/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir index 24a3001..5aea865 100644 --- a/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir +++ b/llvm/test/CodeGen/AMDGPU/alloc-aligned-tuples-gfx90a.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s # Using the unaligned vector tuples are OK as long as they aren't used # in a real instruction. 
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll index da32323..97ad823 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll @@ -1,11 +1,11 @@ ; -enable-misched=false makes the register usage more predictable ; -regalloc=fast just makes the test run faster -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX9 -; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX90A -; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE32 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE64 +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX9 +; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX90A +; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE32 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode,+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE64 define internal void @use256vgprs() { %v0 = call i32 asm sideeffect "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index 97c6a42..52aa154 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -273,22 +273,22 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v1, s33, 63 -; GCN-COUNT-60: v_writelane_b32 v1 +; GCN-NEXT: v_writelane_b32 v0, s33, 63 +; GCN-COUNT-60: v_writelane_b32 v0 ; GCN: s_mov_b32 s33, s32 -; GCN-COUNT-2: v_writelane_b32 v1 +;
GCN-COUNT-2: v_writelane_b32 v0 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8 ; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v1 +; GCN: v_writelane_b32 v0 ; MUBUF: s_addk_i32 s32, 0x400 ; MUBUF: s_addk_i32 s32, 0xfc00 ; FLATSCR: s_add_i32 s32, s32, 16 ; FLATSCR: s_add_i32 s32, s32, -16 -; GCN-NEXT: v_readlane_b32 s33, v1, 63 +; GCN-NEXT: v_readlane_b32 s33, v0, 63 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:12 ; 4-byte Folded Reload @@ -318,21 +318,21 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-COUNT-62: v_writelane_b32 v1, +; GCN-COUNT-62: v_writelane_b32 v0, ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN: v_writelane_b32 v1, +; GCN: v_writelane_b32 v0, ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; MUBUF: buffer_store_dword ; FLATSCR: scratch_store_dword ; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v1, +; GCN: v_writelane_b32 v0, ; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload ; MUBUF: s_addk_i32 s32, 0x400 ; FLATSCR: s_add_i32 s32, s32, 16 -; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 +; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v0 ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] @@ -388,24 +388,24 @@ define void @realign_stack_no_fp_elim() #1 { ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v1, s33, 2 -; GCN-NEXT: v_writelane_b32 v1, s30, 0 +; GCN-NEXT: v_writelane_b32 v0, s33, 2 +; GCN-NEXT: v_writelane_b32 v0, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN: v_writelane_b32 v1, s31, 1 +; GCN: v_writelane_b32 v0, s31, 1 ; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 ; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN: ;;#ASMSTART ; MUBUF: s_addk_i32 s32, 0x300 -; MUBUF-NEXT: v_readlane_b32 s4, v1, 0 -; MUBUF-NEXT: v_readlane_b32 s5, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s4, v0, 0 +; MUBUF-NEXT: v_readlane_b32 s5, v0, 1 ; FLATSCR: s_add_i32 s32, s32, 12 -; FLATSCR-NEXT: v_readlane_b32 s0, v1, 0 -; FLATSCR-NEXT: v_readlane_b32 s1, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s0, v0, 0 +; FLATSCR-NEXT: v_readlane_b32 s1, v0, 1 ; MUBUF-NEXT: s_addk_i32 s32, 0xfd00 ; FLATSCR-NEXT: s_add_i32 s32, s32, -12 -; GCN-NEXT: v_readlane_b32 s33, v1, 2 +; GCN-NEXT: v_readlane_b32 s33, v0, 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; 
FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index d679979..8486676 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -4225,32 +4225,32 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v42, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_writelane_b32 v42, s30, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v42, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v42, v1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: global_store_dword v[40:41], v0, off +; GFX9-NEXT: global_store_dword v[41:42], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v42, 0 -; GFX9-NEXT: v_readlane_b32 s5, v42, 1 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v42, 2 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] @@ -4260,34 +4260,34 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v42, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: buffer_store_dword v40, 
off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_mov_b32_e32 v40, v0 -; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v41, v0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v41, v1 -; GFX10-NEXT: v_writelane_b32 v42, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v42, v1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: global_store_dword v[40:41], v0, off +; GFX10-NEXT: global_store_dword v[41:42], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s4, v42, 0 -; GFX10-NEXT: v_readlane_b32 s5, v42, 1 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v42, 2 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4298,34 +4298,34 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s32 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, v0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; 
GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off +; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v42, 1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v42, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s32 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll index 9d6d3b4..35d6618 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -184,33 +184,33 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v41, s33, 2 -; GFX9-NEXT: v_writelane_b32 v41, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v31 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v41, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v40, v31 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v41, v31 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: v_mov_b32_e32 v31, v41 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v41, 0 -; GFX9-NEXT: v_readlane_b32 s5, v41, 1 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 2 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: 
s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] @@ -220,34 +220,34 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v41, s33, 2 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v40, v31 -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v41, v31 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v31, v40 +; GFX10-NEXT: v_mov_b32_e32 v31, v41 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s4, v41, 0 -; GFX10-NEXT: v_readlane_b32 s5, v41, 1 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 2 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -762,14 +762,14 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v41, s33, 3 -; GFX9-NEXT: v_writelane_b32 v41, s40, 0 -; GFX9-NEXT: v_writelane_b32 v41, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; GFX9-NEXT: ;;#ASMEND @@ -779,23 +779,23 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 
v41, s31, 2 -; GFX9-NEXT: v_mov_b32_e32 v40, v32 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_mov_b32_e32 v41, v32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s40 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v40 +; GFX9-NEXT: ; use v41 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v41, 1 -; GFX9-NEXT: v_readlane_b32 s5, v41, 2 -; GFX9-NEXT: v_readlane_b32 s40, v41, 0 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v40, 1 +; GFX9-NEXT: v_readlane_b32 s5, v40, 2 +; GFX9-NEXT: v_readlane_b32 s40, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v41, 3 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] @@ -805,41 +805,41 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v41, s33, 3 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v41, s40, 0 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v40, s40, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v40, v32 -; GFX10-NEXT: v_writelane_b32 v41, s30, 1 -; GFX10-NEXT: v_writelane_b32 v41, s31, 2 +; GFX10-NEXT: v_mov_b32_e32 v41, v32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s40 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v40 +; GFX10-NEXT: ; use v41 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s4, v41, 1 -; GFX10-NEXT: v_readlane_b32 s5, v41, 2 -; GFX10-NEXT: v_readlane_b32 s40, v41, 0 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s4, v40, 1 +; GFX10-NEXT: v_readlane_b32 s5, v40, 2 +; GFX10-NEXT: v_readlane_b32 s40, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v41, 3 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 7a735b4..62714db 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -598,44 +598,44 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr) ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v41, s33, 6 +; GCN-NEXT: v_writelane_b32 v40, s33, 6 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v41, s34, 0 -; GCN-NEXT: v_writelane_b32 v41, s35, 1 -; GCN-NEXT: v_writelane_b32 v41, s36, 2 -; GCN-NEXT: v_writelane_b32 v41, s37, 3 -; GCN-NEXT: v_writelane_b32 v41, s30, 4 -; GCN-NEXT: v_writelane_b32 v41, s31, 5 -; GCN-NEXT: v_mov_b32_e32 v40, v0 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s30, 4 +; GCN-NEXT: v_writelane_b32 v40, s31, 5 +; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[34:35], exec ; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_readfirstlane_b32 s5, v2 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2] ; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc -; GCN-NEXT: v_mov_b32_e32 v0, v40 +; GCN-NEXT: v_mov_b32_e32 v0, v41 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] ; GCN-NEXT: s_cbranch_execnz BB7_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: v_mov_b32_e32 v0, v40 -; GCN-NEXT: v_readlane_b32 s4, v41, 4 -; GCN-NEXT: v_readlane_b32 s5, v41, 5 -; GCN-NEXT: v_readlane_b32 s37, v41, 3 -; GCN-NEXT: v_readlane_b32 s36, v41, 2 -; GCN-NEXT: v_readlane_b32 s35, v41, 1 -; GCN-NEXT: v_readlane_b32 s34, v41, 0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: v_mov_b32_e32 v0, v41 +; GCN-NEXT: v_readlane_b32 s4, v40, 4 +; GCN-NEXT: v_readlane_b32 s5, v40, 5 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v41, 6 +; GCN-NEXT: v_readlane_b32 s33, v40, 6 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 2a1c3dc..c24cf56 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -123,8 +123,9 @@ ; GCN-O0-NEXT: Live Register 
Matrix ; GCN-O0-NEXT: SI Pre-allocate WWM Registers ; GCN-O0-NEXT: Fast Register Allocator -; GCN-O0-NEXT: SI Fix VGPR copies ; GCN-O0-NEXT: SI lower SGPR spill instructions +; GCN-O0-NEXT: Fast Register Allocator +; GCN-O0-NEXT: SI Fix VGPR copies ; GCN-O0-NEXT: Fixup Statepoint Caller Saved ; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O0-NEXT: Machine Optimization Remark Emitter @@ -344,6 +345,12 @@ ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-NEXT: Machine Optimization Remark Emitter ; GCN-O1-NEXT: Greedy Register Allocator +; GCN-O1-NEXT: Virtual Register Rewriter +; GCN-O1-NEXT: SI lower SGPR spill instructions +; GCN-O1-NEXT: Virtual Register Map +; GCN-O1-NEXT: Live Register Matrix +; GCN-O1-NEXT: Machine Optimization Remark Emitter +; GCN-O1-NEXT: Greedy Register Allocator ; GCN-O1-NEXT: GCN NSA Reassign ; GCN-O1-NEXT: Virtual Register Rewriter ; GCN-O1-NEXT: Stack Slot Coloring @@ -351,7 +358,6 @@ ; GCN-O1-NEXT: Machine Loop Invariant Code Motion ; GCN-O1-NEXT: SI Fix VGPR copies ; GCN-O1-NEXT: SI optimize exec mask operations -; GCN-O1-NEXT: SI lower SGPR spill instructions ; GCN-O1-NEXT: Fixup Statepoint Caller Saved ; GCN-O1-NEXT: PostRA Machine Sink ; GCN-O1-NEXT: MachineDominator Tree Construction @@ -622,6 +628,12 @@ ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Greedy Register Allocator +; GCN-O1-OPTS-NEXT: Virtual Register Rewriter +; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions +; GCN-O1-OPTS-NEXT: Virtual Register Map +; GCN-O1-OPTS-NEXT: Live Register Matrix +; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter +; GCN-O1-OPTS-NEXT: Greedy Register Allocator ; GCN-O1-OPTS-NEXT: GCN NSA Reassign ; GCN-O1-OPTS-NEXT: Virtual Register Rewriter ; GCN-O1-OPTS-NEXT: Stack Slot Coloring @@ -629,7 +641,6 @@ ; GCN-O1-OPTS-NEXT: Machine Loop Invariant Code Motion ; GCN-O1-OPTS-NEXT: SI Fix VGPR copies ; GCN-O1-OPTS-NEXT: SI optimize exec mask operations -; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions ; GCN-O1-OPTS-NEXT: Fixup Statepoint Caller Saved ; GCN-O1-OPTS-NEXT: PostRA Machine Sink ; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction @@ -901,6 +912,12 @@ ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O2-NEXT: Machine Optimization Remark Emitter ; GCN-O2-NEXT: Greedy Register Allocator +; GCN-O2-NEXT: Virtual Register Rewriter +; GCN-O2-NEXT: SI lower SGPR spill instructions +; GCN-O2-NEXT: Virtual Register Map +; GCN-O2-NEXT: Live Register Matrix +; GCN-O2-NEXT: Machine Optimization Remark Emitter +; GCN-O2-NEXT: Greedy Register Allocator ; GCN-O2-NEXT: GCN NSA Reassign ; GCN-O2-NEXT: Virtual Register Rewriter ; GCN-O2-NEXT: Stack Slot Coloring @@ -908,7 +925,6 @@ ; GCN-O2-NEXT: Machine Loop Invariant Code Motion ; GCN-O2-NEXT: SI Fix VGPR copies ; GCN-O2-NEXT: SI optimize exec mask operations -; GCN-O2-NEXT: SI lower SGPR spill instructions ; GCN-O2-NEXT: Fixup Statepoint Caller Saved ; GCN-O2-NEXT: PostRA Machine Sink ; GCN-O2-NEXT: MachineDominator Tree Construction @@ -1193,6 +1209,12 @@ ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O3-NEXT: Machine Optimization Remark Emitter ; GCN-O3-NEXT: Greedy Register Allocator +; GCN-O3-NEXT: Virtual Register Rewriter +; GCN-O3-NEXT: SI lower SGPR spill instructions +; GCN-O3-NEXT: Virtual Register Map +; GCN-O3-NEXT: Live Register Matrix +; GCN-O3-NEXT: Machine Optimization Remark Emitter +; GCN-O3-NEXT: Greedy Register Allocator ; GCN-O3-NEXT: GCN NSA Reassign ; 
GCN-O3-NEXT: Virtual Register Rewriter ; GCN-O3-NEXT: Stack Slot Coloring @@ -1200,7 +1222,6 @@ ; GCN-O3-NEXT: Machine Loop Invariant Code Motion ; GCN-O3-NEXT: SI Fix VGPR copies ; GCN-O3-NEXT: SI optimize exec mask operations -; GCN-O3-NEXT: SI lower SGPR spill instructions ; GCN-O3-NEXT: Fixup Statepoint Caller Saved ; GCN-O3-NEXT: PostRA Machine Sink ; GCN-O3-NEXT: MachineDominator Tree Construction diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 2ee55df..e8da961 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -189,44 +189,44 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v43, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v43, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v43, s35, 1 +; GFX9-NEXT: v_writelane_b32 v40, s35, 1 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v0 -; GFX9-NEXT: v_writelane_b32 v43, s30, 2 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 -; GFX9-NEXT: v_writelane_b32 v43, s31, 3 -; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: v_mov_b32_e32 v42, v0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 -; GFX9-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43 +; GFX9-NEXT: v_mov_b32_e32 v0, v41 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 +; GFX9-NEXT: v_add_u32_e32 v0, v41, v43 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v43, 2 -; GFX9-NEXT: v_readlane_b32 s5, v43, 3 -; GFX9-NEXT: v_readlane_b32 s35, v43, 1 -; GFX9-NEXT: v_readlane_b32 s34, v43, 0 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 3 +; GFX9-NEXT: v_readlane_b32 s35, v40, 1 +; GFX9-NEXT: v_readlane_b32 s34, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v43, 4 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir index 9b75f92..8b00aee 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir @@ -1,12 +1,12 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF %s -# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=MUBUF-V2A %s -# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR %s -# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-V2A %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A-V2A %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A-V2A %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=1 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-V2A %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-spill-vgpr-to-agpr=1 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-V2A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-spill-vgpr-to-agpr=1 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A-V2A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-spill-vgpr-to-agpr=0 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs 
-amdgpu-enable-flat-scratch -amdgpu-spill-vgpr-to-agpr=1 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A-V2A %s --- name: test_spill_v1 diff --git a/llvm/test/CodeGen/AMDGPU/remat-vop.mir b/llvm/test/CodeGen/AMDGPU/remat-vop.mir index 2b82f11..1de8efe 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-vop.mir +++ b/llvm/test/CodeGen/AMDGPU/remat-vop.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs --stress-regalloc=2 -start-before=greedy -stop-after=virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs --stress-regalloc=2 -start-before=greedy,0 -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s --- name: test_remat_v_mov_b32_e32 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll new file mode 100644 index 0000000..79f288a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll @@ -0,0 +1,108 @@ +; REQUIRES: asserts + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s +; RUN: llc -sgpr-regalloc=greedy -vgpr-regalloc=greedy -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s + +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=O0 %s + +; RUN: llc -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT-BASIC %s +; RUN: llc -sgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-DEFAULT %s +; RUN: llc -sgpr-regalloc=basic -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-BASIC %s + +; RUN: not --crash llc -regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s +; RUN: not --crash llc -regalloc=fast -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s + + +; REGALLOC: -regalloc not supported with amdgcn. 
Use -sgpr-regalloc and -vgpr-regalloc + +; DEFAULT: Greedy Register Allocator +; DEFAULT-NEXT: Virtual Register Rewriter +; DEFAULT-NEXT: SI lower SGPR spill instructions +; DEFAULT-NEXT: Virtual Register Map +; DEFAULT-NEXT: Live Register Matrix +; DEFAULT-NEXT: Machine Optimization Remark Emitter +; DEFAULT-NEXT: Greedy Register Allocator +; DEFAULT-NEXT: GCN NSA Reassign +; DEFAULT-NEXT: Virtual Register Rewriter +; DEFAULT-NEXT: Stack Slot Coloring + +; O0: Fast Register Allocator +; O0-NEXT: SI lower SGPR spill instructions +; O0-NEXT: Fast Register Allocator +; O0-NEXT: SI Fix VGPR copies + + + + +; BASIC-DEFAULT: Debug Variable Analysis +; BASIC-DEFAULT-NEXT: Live Stack Slot Analysis +; BASIC-DEFAULT-NEXT: Machine Natural Loop Construction +; BASIC-DEFAULT-NEXT: Machine Block Frequency Analysis +; BASIC-DEFAULT-NEXT: Virtual Register Map +; BASIC-DEFAULT-NEXT: Live Register Matrix +; BASIC-DEFAULT-NEXT: Basic Register Allocator +; BASIC-DEFAULT-NEXT: Virtual Register Rewriter +; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions +; BASIC-DEFAULT-NEXT: Virtual Register Map +; BASIC-DEFAULT-NEXT: Live Register Matrix +; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges +; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis +; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis +; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter +; BASIC-DEFAULT-NEXT: Greedy Register Allocator +; BASIC-DEFAULT-NEXT: GCN NSA Reassign +; BASIC-DEFAULT-NEXT: Virtual Register Rewriter +; BASIC-DEFAULT-NEXT: Stack Slot Coloring + + + +; DEFAULT-BASIC: Greedy Register Allocator +; DEFAULT-BASIC-NEXT: Virtual Register Rewriter +; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions +; DEFAULT-BASIC-NEXT: Virtual Register Map +; DEFAULT-BASIC-NEXT: Live Register Matrix +; DEFAULT-BASIC-NEXT: Basic Register Allocator +; DEFAULT-BASIC-NEXT: GCN NSA Reassign +; DEFAULT-BASIC-NEXT: Virtual Register Rewriter +; DEFAULT-BASIC-NEXT: Stack Slot Coloring + + + +; BASIC-BASIC: Debug Variable Analysis +; BASIC-BASIC-NEXT: Live Stack Slot Analysis +; BASIC-BASIC-NEXT: Machine Natural Loop Construction +; BASIC-BASIC-NEXT: Machine Block Frequency Analysis +; BASIC-BASIC-NEXT: Virtual Register Map +; BASIC-BASIC-NEXT: Live Register Matrix +; BASIC-BASIC-NEXT: Basic Register Allocator +; BASIC-BASIC-NEXT: Virtual Register Rewriter +; BASIC-BASIC-NEXT: SI lower SGPR spill instructions +; BASIC-BASIC-NEXT: Virtual Register Map +; BASIC-BASIC-NEXT: Live Register Matrix +; BASIC-BASIC-NEXT: Basic Register Allocator +; BASIC-BASIC-NEXT: GCN NSA Reassign +; BASIC-BASIC-NEXT: Virtual Register Rewriter +; BASIC-BASIC-NEXT: Stack Slot Coloring + + +declare void @bar() + +; Something with some CSR SGPR spills +define void @foo() { + call void asm sideeffect "; clobber", "~{s33}"() + call void @bar() + ret void +} + +; Block live out spills with fast regalloc +define amdgpu_kernel void @control_flow(i1 %cond) { + %s33 = call i32 asm sideeffect "; clobber", "={s33}"() + br i1 %cond, label %bb0, label %bb1 + +bb0: + call void asm sideeffect "; use %0", "s"(i32 %s33) + br label %bb1 + +bb1: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll new file mode 100644 index 0000000..14b6dde --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -0,0 +1,234 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN 
%s + +; The first 64 SGPR spills can go to a VGPR, but there isn't a second VGPR available, +; so some spills must go to memory. The last 16-element spill runs out of lanes at the 15th element. + +define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { +; GCN-LABEL: partial_no_vgprs_last_sgpr_spill: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_load_dword s4, s[4:5], 0x2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v23, s8, 0 +; GCN-NEXT: v_writelane_b32 v23, s9, 1 +; GCN-NEXT: v_writelane_b32 v23, s10, 2 +; GCN-NEXT: v_writelane_b32 v23, s11, 3 +; GCN-NEXT: v_writelane_b32 v23, s12, 4 +; GCN-NEXT: v_writelane_b32 v23, s13, 5 +; GCN-NEXT: v_writelane_b32 v23, s14, 6 +; GCN-NEXT: v_writelane_b32 v23, s15, 7 +; GCN-NEXT: v_writelane_b32 v23, s16, 8 +; GCN-NEXT: v_writelane_b32 v23, s17, 9 +; GCN-NEXT: v_writelane_b32 v23, s18, 10 +; GCN-NEXT: v_writelane_b32 v23, s19, 11 +; GCN-NEXT: v_writelane_b32 v23, s20, 12 +; GCN-NEXT: v_writelane_b32 v23, s21, 13 +; GCN-NEXT: v_writelane_b32 v23, s22, 14 +; GCN-NEXT: v_writelane_b32 v23, s23, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v23, s8, 16 +; GCN-NEXT: v_writelane_b32 v23, s9, 17 +; GCN-NEXT: v_writelane_b32 v23, s10, 18 +; GCN-NEXT: v_writelane_b32 v23, s11, 19 +; GCN-NEXT: v_writelane_b32 v23, s12, 20 +; GCN-NEXT: v_writelane_b32 v23, s13, 21 +; GCN-NEXT: v_writelane_b32 v23, s14, 22 +; GCN-NEXT: v_writelane_b32 v23, s15, 23 +; GCN-NEXT: v_writelane_b32 v23, s16, 24 +; GCN-NEXT: v_writelane_b32 v23, s17, 25 +; GCN-NEXT: v_writelane_b32 v23, s18, 26 +; GCN-NEXT: v_writelane_b32 v23, s19, 27 +; GCN-NEXT: v_writelane_b32 v23, s20, 28 +; GCN-NEXT: v_writelane_b32 v23, s21, 29 +; GCN-NEXT: v_writelane_b32 v23, s22, 30 +; GCN-NEXT: v_writelane_b32 v23, s23, 31 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v23, s8, 32 +; GCN-NEXT: v_writelane_b32 v23, s9, 33 +; GCN-NEXT: v_writelane_b32 v23, s10, 34 +; GCN-NEXT: v_writelane_b32 v23, s11, 35 +; GCN-NEXT: v_writelane_b32 v23, s12, 36 +; GCN-NEXT: v_writelane_b32 v23, s13, 37 +; GCN-NEXT: v_writelane_b32 v23, s14, 38 +; GCN-NEXT: v_writelane_b32 v23, s15, 39 +; GCN-NEXT: v_writelane_b32 v23, s16, 40 +; GCN-NEXT: v_writelane_b32 v23, s17, 41 +; GCN-NEXT: v_writelane_b32 v23, s18, 42 +; GCN-NEXT: v_writelane_b32 v23, s19, 43 +; GCN-NEXT: v_writelane_b32 v23, s20, 44 +; GCN-NEXT: v_writelane_b32 v23, s21, 45 +; GCN-NEXT: v_writelane_b32 v23, s22, 46 +; GCN-NEXT: v_writelane_b32 v23, s23, 47 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v23, s8, 48 +; GCN-NEXT: v_writelane_b32 v23, s9, 49 +; GCN-NEXT: v_writelane_b32 v23, s10, 50 +; GCN-NEXT: v_writelane_b32 v23, s11, 51 +; GCN-NEXT: v_writelane_b32 v23, s12, 52 +; GCN-NEXT: v_writelane_b32 v23, s13, 53 +; GCN-NEXT: v_writelane_b32 v23, s14, 54 +; GCN-NEXT: v_writelane_b32 v23, s15, 55 +; GCN-NEXT: v_writelane_b32 v23, s16, 56 +; GCN-NEXT: v_writelane_b32 v23, s17, 57 +; GCN-NEXT: v_writelane_b32 v23, s18, 58 +; GCN-NEXT: v_writelane_b32 v23, s19, 59 +; GCN-NEXT: v_writelane_b32 v23, s20, 60 +; GCN-NEXT: 
v_writelane_b32 v23, s21, 61 +; GCN-NEXT: v_writelane_b32 v23, s22, 62 +; GCN-NEXT: v_writelane_b32 v23, s23, 63 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[6:7] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b64 s[8:9], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_writelane_b32 v0, s6, 0 +; GCN-NEXT: v_writelane_b32 v0, s7, 1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s4, s5 +; GCN-NEXT: s_cbranch_scc1 BB0_2 +; GCN-NEXT: ; %bb.1: ; %bb0 +; GCN-NEXT: v_readlane_b32 s4, v23, 0 +; GCN-NEXT: v_readlane_b32 s5, v23, 1 +; GCN-NEXT: v_readlane_b32 s6, v23, 2 +; GCN-NEXT: v_readlane_b32 s7, v23, 3 +; GCN-NEXT: v_readlane_b32 s8, v23, 4 +; GCN-NEXT: v_readlane_b32 s9, v23, 5 +; GCN-NEXT: v_readlane_b32 s10, v23, 6 +; GCN-NEXT: v_readlane_b32 s11, v23, 7 +; GCN-NEXT: v_readlane_b32 s12, v23, 8 +; GCN-NEXT: v_readlane_b32 s13, v23, 9 +; GCN-NEXT: v_readlane_b32 s14, v23, 10 +; GCN-NEXT: v_readlane_b32 s15, v23, 11 +; GCN-NEXT: v_readlane_b32 s16, v23, 12 +; GCN-NEXT: v_readlane_b32 s17, v23, 13 +; GCN-NEXT: v_readlane_b32 s18, v23, 14 +; GCN-NEXT: v_readlane_b32 s19, v23, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s4, v23, 16 +; GCN-NEXT: v_readlane_b32 s5, v23, 17 +; GCN-NEXT: v_readlane_b32 s6, v23, 18 +; GCN-NEXT: v_readlane_b32 s7, v23, 19 +; GCN-NEXT: v_readlane_b32 s8, v23, 20 +; GCN-NEXT: v_readlane_b32 s9, v23, 21 +; GCN-NEXT: v_readlane_b32 s10, v23, 22 +; GCN-NEXT: v_readlane_b32 s11, v23, 23 +; GCN-NEXT: v_readlane_b32 s12, v23, 24 +; GCN-NEXT: v_readlane_b32 s13, v23, 25 +; GCN-NEXT: v_readlane_b32 s14, v23, 26 +; GCN-NEXT: v_readlane_b32 s15, v23, 27 +; GCN-NEXT: v_readlane_b32 s16, v23, 28 +; GCN-NEXT: v_readlane_b32 s17, v23, 29 +; GCN-NEXT: v_readlane_b32 s18, v23, 30 +; GCN-NEXT: v_readlane_b32 s19, v23, 31 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s4, v23, 32 +; GCN-NEXT: v_readlane_b32 s5, v23, 33 +; GCN-NEXT: v_readlane_b32 s6, v23, 34 +; GCN-NEXT: v_readlane_b32 s7, v23, 35 +; GCN-NEXT: v_readlane_b32 s8, v23, 36 +; GCN-NEXT: v_readlane_b32 s9, v23, 37 +; GCN-NEXT: v_readlane_b32 s10, v23, 38 +; GCN-NEXT: v_readlane_b32 s11, v23, 39 +; GCN-NEXT: v_readlane_b32 s12, v23, 40 +; GCN-NEXT: v_readlane_b32 s13, v23, 41 +; GCN-NEXT: v_readlane_b32 s14, v23, 42 +; GCN-NEXT: v_readlane_b32 s15, v23, 43 +; GCN-NEXT: v_readlane_b32 s16, v23, 44 +; GCN-NEXT: v_readlane_b32 s17, v23, 45 +; GCN-NEXT: v_readlane_b32 s18, v23, 46 +; GCN-NEXT: v_readlane_b32 s19, v23, 47 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s8, v23, 48 +; GCN-NEXT: v_readlane_b32 s9, v23, 49 +; GCN-NEXT: v_readlane_b32 s10, v23, 50 +; GCN-NEXT: v_readlane_b32 s11, v23, 51 +; GCN-NEXT: v_readlane_b32 s12, v23, 52 +; GCN-NEXT: v_readlane_b32 s13, v23, 53 +; GCN-NEXT: v_readlane_b32 s14, v23, 54 +; GCN-NEXT: v_readlane_b32 s15, v23, 55 +; GCN-NEXT: v_readlane_b32 s16, v23, 56 +; GCN-NEXT: v_readlane_b32 s17, v23, 57 +; GCN-NEXT: v_readlane_b32 s18, v23, 58 +; GCN-NEXT: v_readlane_b32 s19, v23, 59 +; GCN-NEXT: v_readlane_b32 s20, v23, 60 +; GCN-NEXT: v_readlane_b32 s21, v23, 61 +; GCN-NEXT: v_readlane_b32 s22, v23, 62 +; GCN-NEXT: v_readlane_b32 s23, 
v23, 63 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-NEXT: v_readlane_b32 s5, v0, 1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[8:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:5] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: BB0_2: ; %ret +; GCN-NEXT: s_endpgm + call void asm sideeffect "", "~{v[0:7]}" () #0 + call void asm sideeffect "", "~{v[8:15]}" () #0 + call void asm sideeffect "", "~{v[16:19]}"() #0 + call void asm sideeffect "", "~{v[20:21]}"() #0 + call void asm sideeffect "", "~{v22}"() #0 + + %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr3 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 + %cmp = icmp eq i32 %in, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr3) #0 + call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0 + br label %ret + +ret: + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir index 7ad302e..c822d4f 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -1,5 +1,8 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -o - %s | FileCheck -check-prefixes=SHARE,GCN %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -no-stack-slot-sharing -o - %s | FileCheck -check-prefixes=NOSHARE,GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -run-pass=greedy,virtregrewriter,stack-slot-coloring -o - %s | FileCheck -check-prefixes=SHARE,GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -run-pass=greedy,virtregrewriter,stack-slot-coloring -no-stack-slot-sharing -o - %s | FileCheck -check-prefixes=NOSHARE,GCN %s + +# -run-pass is used to artificially avoid using split register allocation, which would otherwise avoid stressing StackSlotColoring. + + # Make sure that stack slot coloring doesn't try to merge frame # indexes used for SGPR spilling with those that aren't. 
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 74cd5070..e5baafb 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -197,15 +197,15 @@ entry: ; Have another non-tail in the function ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec ; GCN: s_mov_b32 s33, s32 ; GCN-DAG: s_addk_i32 s32, 0x400 -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 v42, s34, 0 -; GCN-DAG: v_writelane_b32 v42, s35, 1 +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: v_writelane_b32 [[CSRV]], s34, 0 +; GCN-DAG: v_writelane_b32 [[CSRV]], s35, 1 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -214,20 +214,20 @@ entry: ; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 -; GCN-DAG: v_readlane_b32 s34, v42, 0 -; GCN-DAG: v_readlane_b32 s35, v42, 1 +; GCN-DAG: v_readlane_b32 s34, [[CSRV]], 0 +; GCN-DAG: v_readlane_b32 s35, [[CSRV]], 1 ; GCN: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: v_readlane_b32 s33, ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir index 2989795..29eac20 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-dce-in-ra=0 -verify-machineinstrs -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -amdgpu-dce-in-ra=0 -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy,1 -o - %s | FileCheck %s # https://bugs.llvm.org/show_bug.cgi?id=33620 --- diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 26a9672..e0247b1 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX6 %s -; RUN: llc 
-regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -sgpr-regalloc=basic -vgpr-regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck --check-prefix=CHECK %s ; RUN: llc -march=amdgcn -mattr=-xnack -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX9-FLATSCR,FLATSCR %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX10-FLATSCR,FLATSCR %s ; diff --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll index 6d46665..d85774e 100644 --- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll @@ -22,12 +22,11 @@ define void @spill_more_than_wavesize_csr_sgprs() { } ; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs_with_stack_object: -; CHECK-DAG: v_writelane_b32 v1, s98, 63 -; CHECK-DAG: v_writelane_b32 v2, s99, 0 +; CHECK-DAG: v_writelane_b32 v0, s98, 63 +; CHECK-DAG: v_writelane_b32 v1, s99, 0 ; CHECK-NOT: dummy -; CHECK-DAG: v_readlane_b32 s99, v2, 0 -; CHECK-DAG: v_readlane_b32 s98, v1, 63 - +; CHECK-DAG: v_readlane_b32 s99, v1, 0 +; CHECK-DAG: v_readlane_b32 s98, v0, 63 define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir index 8636314..1c55dc5 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir +++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir @@ -1,4 +1,7 @@ -# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -stress-regalloc=1 -start-before=greedy -stop-after=stack-slot-coloring -o - %s | FileCheck %s +# Note we are NOT using the normal register allocator pipeline. We are +# forcing allocation of VGPRs and SGPRs at the same time. 
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -stress-regalloc=1 -run-pass=greedy,virtregrewriter,stack-slot-coloring -o - %s | FileCheck %s + --- # CHECK-LABEL: name: no_merge_sgpr_vgpr_spill_slot{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index 1eb31cc..2582a87 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -89,7 +89,6 @@ define hidden void @widget() { ; SI-OPT: bb12: ; SI-OPT-NEXT: store float 0.000000e+00, float addrspace(1)* null, align 8 ; SI-OPT-NEXT: ret void -; bb: %tmp = load i32, i32 addrspace(1)* null, align 16 %tmp1 = icmp slt i32 %tmp, 21 @@ -187,30 +186,30 @@ define hidden void @blam() { ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v43, s33, 4 +; GCN-NEXT: v_writelane_b32 v40, s33, 4 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s37, 3 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: flat_load_dword v40, v[1:2] -; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: flat_load_dword v41, v[1:2] +; GCN-NEXT: v_mov_b32_e32 v43, 0 ; GCN-NEXT: s_getpc_b64 s[36:37] ; GCN-NEXT: s_add_u32 s36, s36, spam@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s37, s37, spam@rel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v40 +; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v41 ; GCN-NEXT: s_branch BB1_3 ; GCN-NEXT: BB1_1: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 @@ -229,7 +228,7 @@ define hidden void @blam() { ; GCN-NEXT: BB1_4: ; %bb2 ; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: flat_load_dword v0, v[41:42] +; GCN-NEXT: flat_load_dword v0, v[42:43] ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) @@ -273,7 +272,7 @@ define hidden void @blam() { ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: BB1_10: ; %bb17 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], 0 ; GCN-NEXT: s_branch BB1_2 bb: %tmp = load float, float* null, align 16 diff --git 
a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll index 3b065a0..447dd9a 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -8,11 +8,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; preserved across the call and should get 8 scratch registers. ; GFX9-LABEL: non_preserved_vgpr_tuple8: -; GFX9: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9: v_mov_b32_e32 v36, v16 ; GFX9-NEXT: v_mov_b32_e32 v35, v15 @@ -21,28 +21,28 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: v_mov_b32_e32 v32, v12 ; GFX9: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 +; GFX9: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v44, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9: s_setpc_b64 s[4:5] ; ; GFX10-LABEL: non_preserved_vgpr_tuple8: -; GFX10: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, 
s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10: v_mov_b32_e32 v36, v16 ; GFX10-NEXT: v_mov_b32_e32 v35, v15 @@ -53,7 +53,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 @@ -62,12 +62,12 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10: buffer_load_dword v43, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 +; GFX10: buffer_load_dword v44, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 -; GFX10: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX10: s_setpc_b64 s[4:5] main_body: call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 @@ -86,20 +86,20 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; The upper 3 sub-registers are unused. 
; GFX9-LABEL: call_preserved_vgpr_tuple8: -; GFX9: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill - -; GFX9: v_mov_b32_e32 v44, v16 -; GFX9-NEXT: v_mov_b32_e32 v43, v15 -; GFX9-NEXT: v_mov_b32_e32 v42, v14 -; GFX9-NEXT: v_mov_b32_e32 v41, v13 -; GFX9-NEXT: v_mov_b32_e32 v40, v12 - -; GFX9: image_gather4_c_b_cl v[0:3], v[40:44], s[36:43], s[4:7] dmask:0x1 +; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill + +; GFX9: v_mov_b32_e32 v45, v16 +; GFX9-NEXT: v_mov_b32_e32 v44, v15 +; GFX9-NEXT: v_mov_b32_e32 v43, v14 +; GFX9-NEXT: v_mov_b32_e32 v42, v13 +; GFX9-NEXT: v_mov_b32_e32 v41, v12 + +; GFX9: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 @@ -108,24 +108,24 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[36:43], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1 -; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9: s_setpc_b64 s[4:5] ; ; GFX10-LABEL: call_preserved_vgpr_tuple8: -; GFX10: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword 
v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D @@ -133,24 +133,24 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v40, v16 +; GFX10-NEXT: v_mov_b32_e32 v41, v16 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v41, v15 -; GFX10-NEXT: v_mov_b32_e32 v42, v14 -; GFX10-NEXT: v_mov_b32_e32 v43, v13 -; GFX10-NEXT: v_mov_b32_e32 v44, v12 +; GFX10-NEXT: v_mov_b32_e32 v42, v15 +; GFX10-NEXT: v_mov_b32_e32 v43, v14 +; GFX10-NEXT: v_mov_b32_e32 v44, v13 +; GFX10-NEXT: v_mov_b32_e32 v45, v12 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D - -; GFX10: buffer_load_dword v44, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 -; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D + +; GFX10: buffer_load_dword v45, off, s[0:3], s33{{$}} +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 +; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; GFX10: s_setpc_b64 s[4:5] main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir index 67676e6..06aec73 100644 --- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - %s | FileCheck %s # The undef copy of %4 is allocated to $vgpr3, and the identity copy # 
was deleted, and $vgpr3 was considered undef. The code to replace @@ -31,7 +31,7 @@ body: | ; CHECK-LABEL: name: undef_identity_copy ; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128), addrspace 1) ; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc - ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95, implicit-def $scc ; CHECK: $sgpr4 = COPY $sgpr95 ; CHECK: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @foo, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4 ; CHECK: ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 -- 2.7.4