From caf1294d95785503a1d114c8c167e181fff7068b Mon Sep 17 00:00:00 2001 From: Baptiste Saleil Date: Mon, 26 Apr 2021 15:48:12 -0400 Subject: [PATCH] [AMDGPU] Remove the GCNRegBankReassign pass Experiments show that the GCNRegBankReassign pass significantly impacts compilation time, and there is no case in which we see any improvement in performance. Remove the pass and its associated test cases from the tree. Differential Revision: https://reviews.llvm.org/D101313 Change-Id: I0599169a7609c19a887f8d847a71e664030cc141 --- llvm/lib/Target/AMDGPU/AMDGPU.h | 13 - llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +- llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 - llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp | 900 --------------------- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 8 +- .../AMDGPU/GlobalISel/extractelement.i128.ll | 96 +-- .../AMDGPU/GlobalISel/extractelement.i16.ll | 4 +- .../CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll | 28 +- .../CodeGen/AMDGPU/GlobalISel/extractelement.ll | 208 ++--- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll | 156 ++-- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll | 422 +++++----- llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll | 8 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 326 ++++---- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 318 ++++---- .../CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll | 48 +- .../CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll | 428 +++++----- .../CodeGen/AMDGPU/GlobalISel/insertelement.ll | 184 ++--- .../AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll | 8 +- .../llvm.amdgcn.image.gather4.a16.dim.ll | 40 +- .../GlobalISel/llvm.amdgcn.image.gather4.dim.ll | 4 +- .../AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll | 44 +- .../GlobalISel/llvm.amdgcn.image.sample.g16.ll | 86 +- .../CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll | 4 +- .../CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll | 4 +- .../AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll | 20 +- .../CodeGen/AMDGPU/GlobalISel/load-local.128.ll | 8 +- .../CodeGen/AMDGPU/GlobalISel/load-local.96.ll | 23 +- .../CodeGen/AMDGPU/GlobalISel/load-unaligned.ll | 31 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 8 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 130 +-- llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll | 6 +- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 178 ++-- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 8 +- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 178 ++-- .../CodeGen/AMDGPU/GlobalISel/store-local.128.ll | 16 +- .../CodeGen/AMDGPU/GlobalISel/store-local.96.ll | 12 +- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 78 +- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 60 +- .../AMDGPU/atomic_optimizations_local_pointer.ll | 72 +- .../AMDGPU/atomic_optimizations_pixelshader.ll | 8 +- llvm/test/CodeGen/AMDGPU/ctlz.ll | 4 +- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 8 +- llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll | 8 +- .../fneg-fold-legalize-dag-increase-insts.ll | 4 +- llvm/test/CodeGen/AMDGPU/frem.ll | 176 ++-- llvm/test/CodeGen/AMDGPU/fshr.ll | 68 +- llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 32 +- llvm/test/CodeGen/AMDGPU/idot2.ll | 4 +- llvm/test/CodeGen/AMDGPU/idot4s.ll | 20 +- llvm/test/CodeGen/AMDGPU/idot4u.ll | 46 +- llvm/test/CodeGen/AMDGPU/idot8s.ll | 144 ++-- llvm/test/CodeGen/AMDGPU/idot8u.ll | 120 +-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll | 110 +-- .../AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll | 68 +- .../AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll | 4 +-
.../CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll | 8 +- .../AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll | 40 +- .../CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll | 40 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll | 20 +- llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 16 +- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 8 +- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 12 +- .../test/CodeGen/AMDGPU/regbank-reassign-split.mir | 38 - .../CodeGen/AMDGPU/regbank-reassign-wave64.mir | 69 -- llvm/test/CodeGen/AMDGPU/regbank-reassign.mir | 611 -------------- llvm/test/CodeGen/AMDGPU/saddo.ll | 20 +- llvm/test/CodeGen/AMDGPU/saddsat.ll | 14 +- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 8 +- llvm/test/CodeGen/AMDGPU/ssubsat.ll | 14 +- llvm/test/CodeGen/AMDGPU/store-local.128.ll | 20 +- llvm/test/CodeGen/AMDGPU/store-local.96.ll | 8 +- llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll | 24 +- llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll | 27 +- llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll | 32 +- llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll | 24 +- llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll | 24 +- llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll | 50 +- 78 files changed, 2151 insertions(+), 3983 deletions(-) delete mode 100644 llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp delete mode 100644 llvm/test/CodeGen/AMDGPU/regbank-reassign-split.mir delete mode 100644 llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir delete mode 100644 llvm/test/CodeGen/AMDGPU/regbank-reassign.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 7220616..a38d0a77 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -74,16 +74,6 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass(); ModulePass *createAMDGPULowerModuleLDSPass(); FunctionPass *createSIModeRegisterPass(); -namespace AMDGPU { -enum RegBankReassignMode { - RM_VGPR = 1, - RM_SGPR = 2, - RM_BOTH = RM_VGPR | RM_SGPR -}; -} -MachineFunctionPass * -createGCNRegBankReassignPass(AMDGPU::RegBankReassignMode Mode); - struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> { AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); @@ -342,9 +332,6 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass(); void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; -void initializeGCNRegBankReassignPass(PassRegistry &); -extern char &GCNRegBankReassignID; - void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b7fcffb..b50e0eb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -262,7 +262,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUPrintfRuntimeBindingPass(*PR); - initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); } @@ -1177,10 +1176,8 @@ void GCNPassConfig::addOptimizedRegAlloc() { } bool GCNPassConfig::addPreRewrite() { - if (EnableRegReassign) { + if (EnableRegReassign) addPass(&GCNNSAReassignID); - addPass(createGCNRegBankReassignPass(AMDGPU::RM_BOTH)); - } return true; } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 4a4fee5..41d58d5 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -139,7 +139,6 @@ add_llvm_target(AMDGPUCodeGen SIShrinkInstructions.cpp SIWholeQuadMode.cpp GCNILPSched.cpp - GCNRegBankReassign.cpp GCNNSAReassign.cpp GCNDPPCombine.cpp SIModeRegister.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp deleted file mode 100644 index b877ef9..0000000 --- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ /dev/null @@ -1,900 +0,0 @@ -//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Try to reassign registers on GFX10+ to reduce register bank -/// conflicts. -/// -/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in -/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to -/// bank 1, etc. SGPRs have 8 banks and are allocated in pairs, so that s0:s1, -/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc. -/// -/// The shader can read one dword from each of these banks once per cycle. -/// If an instruction has to read more register operands from the same bank -/// an additional cycle is needed. HW attempts to pre-load registers through -/// input operand gathering, but a stall cycle may occur if that fails. For -/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands, -/// potentially incurring 2 stall cycles. -/// -/// The pass tries to reassign registers to reduce bank conflicts. -/// -/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so -/// that 4 has to be subtracted from an SGPR bank number to get the real value. -/// This also corresponds to bit numbers in bank masks used in the pass. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LiveRegMatrix.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/InitializePasses.h" - -using namespace llvm; -using namespace AMDGPU; - -static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign", - cl::desc("Verify stall cycles in the regbanks reassign pass"), - cl::value_desc("0|1|2"), - cl::init(0), cl::Hidden); - -// Threshold to keep compile time reasonable.
-static cl::opt<unsigned> VRegThresh("amdgpu-regbanks-reassign-threshold", - cl::desc("Max number of vregs to run the regbanks reassign pass"), - cl::init(15000), cl::Hidden); - -#define DEBUG_TYPE "amdgpu-regbanks-reassign" - -#define NUM_VGPR_BANKS 4 -#define NUM_SGPR_BANKS 8 -#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS) -#define SGPR_BANK_OFFSET NUM_VGPR_BANKS -#define VGPR_BANK_MASK 0xf -#define SGPR_BANK_MASK 0xff0 -#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET) - -STATISTIC(NumStallsDetected, - "Number of operand read stalls detected"); -STATISTIC(NumStallsRecovered, - "Number of operand read stalls recovered"); - -namespace { - -class GCNRegBankReassign : public MachineFunctionPass { - - class OperandMask { - public: - OperandMask(unsigned r, unsigned s, unsigned m) - : Reg(r), SubReg(s), Mask(m) {} - Register Reg; - unsigned SubReg; - unsigned Mask; - }; - - class Candidate { - public: - Candidate(MachineInstr *mi, Register reg, unsigned subreg, - unsigned freebanks) - : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void dump(const GCNRegBankReassign *P) const { - MI->dump(); - dbgs() << P->printReg(Reg) << " to banks "; - dumpFreeBanks(FreeBanks); - dbgs() << '\n'; - } -#endif - - MachineInstr *MI; - Register Reg; - unsigned SubReg; - unsigned FreeBanks; - }; - - class CandidateList : public std::map<unsigned, std::list<Candidate>> { - public: - void push(unsigned Weight, const Candidate&& C) { - operator[](Weight).push_front(C); - } - - Candidate &back() { - return rbegin()->second.back(); - } - - void pop_back() { - rbegin()->second.pop_back(); - if (rbegin()->second.empty()) - erase(rbegin()->first); - } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void dump(const GCNRegBankReassign *P) const { - dbgs() << "\nCandidates:\n\n"; - for (auto &B : *this) { - dbgs() << " Weight " << B.first << ":\n"; - for (auto &C : B.second) - C.dump(P); - } - dbgs() << "\n\n"; - } -#endif - }; - -public: - static char ID; - -public: - GCNRegBankReassign(RegBankReassignMode Mode = RM_BOTH) - : MachineFunctionPass(ID), Mode(Mode) { - initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return "GCN RegBank Reassign"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LiveIntervals>(); - AU.addRequired<MachineLoopInfo>(); - AU.addRequired<VirtRegMap>(); - AU.addRequired<LiveRegMatrix>(); - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - -private: - const GCNSubtarget *ST; - - const MachineRegisterInfo *MRI; - - const SIRegisterInfo *TRI; - - MachineLoopInfo *MLI; - - VirtRegMap *VRM; - - LiveRegMatrix *LRM; - - LiveIntervals *LIS; - - RegBankReassignMode Mode; - - unsigned MaxNumVGPRs; - - unsigned MaxNumSGPRs; - - BitVector RegsUsed; - - SmallVector<OperandMask, 8> OperandMasks; - - CandidateList Candidates; - - const MCPhysReg *CSRegs; - - // Returns bank for a phys reg. - unsigned getPhysRegBank(Register Reg, unsigned SubReg) const; - - // Return a bit set for each register bank used. 4 banks for VGPRs and - // 8 banks for SGPRs. - // Registers already processed and recorded in RegsUsed are excluded. - // If Bank is not -1 assume Reg:SubReg to belong to that Bank. - uint32_t getRegBankMask(Register Reg, unsigned SubReg, int Bank); - - // Analyze one instruction returning the number of stalls and a mask of the - // banks used by all operands.
- // If Reg and Bank are provided, assume all uses of Reg will be replaced with - // a register chosen from Bank. - std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI, - Register Reg = Register(), - unsigned SubReg = 0, int Bank = -1); - - // Return true if register is regular VGPR or SGPR or their tuples. - // Returns false for special registers like m0, vcc etc. - bool isReassignable(Register Reg) const; - - // Check if registers' defs are old and may be pre-loaded. - // Returns 0 if both registers are old enough, 1 or 2 if one or both - // registers will not likely be pre-loaded. - unsigned getOperandGatherWeight(const MachineInstr& MI, - Register Reg1, - Register Reg2, - unsigned StallCycles) const; - - - // Find all bank bits in UsedBanks where Mask can be relocated to. - unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const; - - // Find all bank bits in UsedBanks where Mask can be relocated to. - // Bank is relative to the register and not its subregister component. - // Returns 0 if a register is not reassignable. - unsigned getFreeBanks(Register Reg, unsigned SubReg, unsigned Mask, - unsigned UsedBanks) const; - - // Add candidate instruction to the work list. - void collectCandidates(MachineInstr& MI, unsigned UsedBanks, - unsigned StallCycles); - - // Collect candidate instructions across the function. Returns the number of - // stall cycles detected. Only counts stalls if Collect is false. - unsigned collectCandidates(MachineFunction &MF, bool Collect = true); - - // Remove all candidates that read specified register. - void removeCandidates(Register Reg); - - // Compute stalls within the uses of SrcReg replaced by a register from - // Bank. If Bank is -1 does not perform substitution. If Collect is set - // candidates are collected and added to work list. - unsigned computeStallCycles(Register SrcReg, - Register Reg = Register(), - unsigned SubReg = 0, int Bank = -1, - bool Collect = false); - - // Search for a register in Bank unused within LI. - // Returns phys reg or NoRegister. - MCRegister scavengeReg(LiveInterval &LI, unsigned Bank, - unsigned SubReg) const; - - // Try to reassign candidate. Returns the number of stall cycles saved. - unsigned tryReassign(Candidate &C); - - bool verifyCycles(MachineFunction &MF, - unsigned OriginalCycles, unsigned CyclesSaved); - - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -public: - Printable printReg(Register Reg, unsigned SubReg = 0) const { - return Printable([Reg, SubReg, this](raw_ostream &OS) { - if (Reg.isPhysical()) { - OS << llvm::printReg(Reg, TRI); - return; - } - if (!VRM->isAssignedReg(Reg)) - OS << " " << llvm::printReg(Reg, TRI); - else - OS << llvm::printReg(Reg, TRI) << '(' - << llvm::printReg(VRM->getPhys(Reg), TRI) << ')'; - if (SubReg) - OS << ':' << TRI->getSubRegIndexName(SubReg); - }); - } - - static Printable printBank(unsigned Bank) { - return Printable([Bank](raw_ostream &OS) { - OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank); - }); - } - - static void dumpFreeBanks(unsigned FreeBanks) { - for (unsigned L = 0; L < NUM_BANKS; ++L) - if (FreeBanks & (1 << L)) - dbgs() << printBank(L) << ' '; - } -#endif -}; - -} // End anonymous namespace.
- -INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_DEPENDENCY(VirtRegMap) -INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) -INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", false, false) - - -char GCNRegBankReassign::ID = 0; - -char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; - -unsigned GCNRegBankReassign::getPhysRegBank(Register Reg, - unsigned SubReg) const { - assert(Reg.isPhysical()); - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - unsigned Size = TRI->getRegSizeInBits(*RC); - if (Size == 16) - Reg = TRI->get32BitRegister(Reg); - else if (Size > 32) { - if (SubReg) { - const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg); - Reg = TRI->getSubReg(Reg, SubReg); - if (TRI->getRegSizeInBits(*SubRC) > 32) - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); - } else { - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); - } - } - - if (TRI->hasVGPRs(RC)) { - unsigned RegNo = Reg - AMDGPU::VGPR0; - return RegNo % NUM_VGPR_BANKS; - } - - unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2; - return RegNo % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; -} - -uint32_t GCNRegBankReassign::getRegBankMask(Register Reg, unsigned SubReg, - int Bank) { - if (Reg.isVirtual()) { - if (!VRM->isAssignedReg(Reg)) - return 0; - - Reg = VRM->getPhys(Reg); - if (!Reg) - return 0; - if (SubReg) - Reg = TRI->getSubReg(Reg, SubReg); - } - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - unsigned Size = TRI->getRegSizeInBits(*RC); - - if (Size == 16) { - Reg = TRI->get32BitRegister(Reg); - Size = 1; - } else { - Size /= 32; - if (Size > 1) - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); - } - - if (TRI->hasVGPRs(RC)) { - // VGPRs have 4 banks assigned in a round-robin fashion. - unsigned RegNo = Reg - AMDGPU::VGPR0; - uint32_t Mask = maskTrailingOnes<uint32_t>(Size); - unsigned Used = 0; - // Bitmask lacks an extract method - for (unsigned I = 0; I < Size; ++I) - if (RegsUsed.test(RegNo + I)) - Used |= 1 << I; - RegsUsed.set(RegNo, RegNo + Size); - Mask &= ~Used; - Mask <<= (Bank == -1) ? RegNo % NUM_VGPR_BANKS : uint32_t(Bank); - return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; - } - - // SGPRs have 8 banks holding 2 consecutive registers each. - unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2; - unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs(); - if (RegNo + StartBit >= RegsUsed.size()) - return 0; - - if (Size > 1) - Size /= 2; - unsigned Mask = (1 << Size) - 1; - unsigned Used = 0; - for (unsigned I = 0; I < Size; ++I) - if (RegsUsed.test(StartBit + RegNo + I)) - Used |= 1 << I; - RegsUsed.set(StartBit + RegNo, StartBit + RegNo + Size); - Mask &= ~Used; - Mask <<= (Bank == -1) ? RegNo % NUM_SGPR_BANKS - : unsigned(Bank - SGPR_BANK_OFFSET); - Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; - // Reserve 4 bank ids for VGPRs.
- return Mask << SGPR_BANK_OFFSET; -} - -std::pair<unsigned, unsigned> -GCNRegBankReassign::analyzeInst(const MachineInstr &MI, Register Reg, - unsigned SubReg, int Bank) { - unsigned StallCycles = 0; - unsigned UsedBanks = 0; - - if (MI.isMetaInstruction()) - return std::make_pair(StallCycles, UsedBanks); - - if (!(Mode & RM_SGPR) && - MI.getDesc().TSFlags & (SIInstrFlags::SMRD | SIInstrFlags::SALU)) - return std::make_pair(StallCycles, UsedBanks); - - RegsUsed.reset(); - OperandMasks.clear(); - for (const auto& Op : MI.explicit_uses()) { - // Undef can be assigned to any register, so two vregs can be assigned - // the same phys reg within the same instruction. - if (!Op.isReg() || Op.isUndef()) - continue; - - const Register R = Op.getReg(); - const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R); - - // Do not compute stalls for AGPRs - if (TRI->hasAGPRs(RC)) - continue; - if ((Mode != RM_BOTH) && !(Mode & (TRI->hasVGPRs(RC) ? RM_VGPR : RM_SGPR))) - continue; - - // Do not compute stalls if sub-register covers all banks - if (Op.getSubReg()) { - LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()); - if (TRI->hasVGPRs(RC)) { - if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS) - continue; - } else { - if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS) - continue; - } - } - - unsigned ShiftedBank = Bank; - - if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) { - unsigned RegOffset = - TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0); - unsigned Offset = TRI->getChannelFromSubReg( - Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0); - if (Bank < NUM_VGPR_BANKS) { - unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset); - ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS; - } else if (Bank >= SGPR_BANK_OFFSET) { - unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1); - ShiftedBank = SGPR_BANK_OFFSET + - (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS; - } - } - - uint32_t Mask = getRegBankMask(R, Op.getSubReg(), - (Reg == R) ? ShiftedBank : -1); - StallCycles += countPopulation(UsedBanks & Mask); - UsedBanks |= Mask; - OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); - } - - return std::make_pair(StallCycles, UsedBanks); -} - -unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, - Register Reg1, - Register Reg2, - unsigned StallCycles) const -{ - unsigned Defs = 0; - MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); - MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); - for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) { - if (MI.isDebugInstr()) - continue; - --Def; - if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) - continue; - if (Def->modifiesRegister(Reg1, TRI)) - Defs |= 1; - if (Def->modifiesRegister(Reg2, TRI)) - Defs |= 2; - } - return countPopulation(Defs); -} - -bool GCNRegBankReassign::isReassignable(Register Reg) const { - if (Reg.isPhysical() || !VRM->isAssignedReg(Reg)) - return false; - - // InlineSpiller does not call LRM::assign() after an LI split leaving it - // in an inconsistent state, so we cannot call LRM::unassign(). - // See llvm bug #48911. - // Skip reassign if a register has originated from such split. - // FIXME: Remove the workaround when bug #48911 is fixed.
- if (VRM->getPreSplitReg(Reg)) - return false; - - const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); - - Register PhysReg = VRM->getPhys(Reg); - - if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) - return false; - - for (auto U : MRI->use_nodbg_operands(Reg)) { - if (U.isImplicit()) - return false; - const MachineInstr *UseInst = U.getParent(); - if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) - return false; - } - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); - unsigned Size = TRI->getRegSizeInBits(*RC); - - // TODO: Support 16 bit registers. Those need to be moved with their - // parent VGPR_32 and potentially a sibling 16 bit sub-register. - if (Size < 32) - return false; - - if (TRI->hasVGPRs(RC)) - return true; - - if (Size == 16) - return AMDGPU::SGPR_LO16RegClass.contains(PhysReg); - - if (Size > 32) - PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); - - return AMDGPU::SGPR_32RegClass.contains(PhysReg); -} - -unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, - unsigned UsedBanks) const { - unsigned Size = countPopulation(Mask); - unsigned FreeBanks = 0; - unsigned Bank = findFirstSet(Mask); - - UsedBanks &= ~Mask; - - // Find free VGPR banks - if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) { - for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) { - if (Bank == I) - continue; - unsigned NewMask = ((1 << Size) - 1) << I; - NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; - if (!(UsedBanks & NewMask)) - FreeBanks |= 1 << I; - } - return FreeBanks; - } - - // Find free SGPR banks - // SGPR tuples must be aligned, so step is size in banks it - // crosses. - Bank -= SGPR_BANK_OFFSET; - for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) { - if (Bank == I) - continue; - unsigned NewMask = ((1 << Size) - 1) << I; - NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; - if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) - FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; - } - - return FreeBanks; -} - -unsigned GCNRegBankReassign::getFreeBanks(Register Reg, - unsigned SubReg, - unsigned Mask, - unsigned UsedBanks) const { - if (!isReassignable(Reg)) - return 0; - - unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); - - unsigned Offset = TRI->getChannelFromSubReg(SubReg); - if (Offset && (Mask & VGPR_BANK_MASK)) { - unsigned Shift = Offset; - if (Shift >= NUM_VGPR_BANKS) - return 0; - unsigned VB = FreeBanks & VGPR_BANK_MASK; - FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & - VGPR_BANK_MASK; - } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) { - unsigned Shift = Offset >> 1; - if (Shift >= NUM_SGPR_BANKS) - return 0; - unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; - FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & - SGPR_BANK_SHIFTED_MASK; - FreeBanks <<= SGPR_BANK_OFFSET; - } - - LLVM_DEBUG(if (FreeBanks) { - dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) - << " to banks: "; dumpFreeBanks(FreeBanks); - dbgs() << '\n'; }); - - return FreeBanks; -} - -void GCNRegBankReassign::collectCandidates(MachineInstr& MI, - unsigned UsedBanks, - unsigned StallCycles) { - LLVM_DEBUG(MI.dump()); - - if (!StallCycles) - return; - - LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n'); - - for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) { - for (unsigned J = I + 1; J != E; ++J) { - if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) - continue; - - Register Reg1 = OperandMasks[I].Reg; - Register Reg2 =
OperandMasks[J].Reg; - unsigned SubReg1 = OperandMasks[I].SubReg; - unsigned SubReg2 = OperandMasks[J].SubReg; - unsigned Mask1 = OperandMasks[I].Mask; - unsigned Mask2 = OperandMasks[J].Mask; - unsigned Size1 = countPopulation(Mask1); - unsigned Size2 = countPopulation(Mask2); - - LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << - " and " << printReg(Reg2, SubReg2) << '\n'); - - unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); - Weight += MLI->getLoopDepth(MI.getParent()) * 10; - - LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); - - unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); - unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); - if (FreeBanks1) - Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0), - Candidate(&MI, Reg1, SubReg1, FreeBanks1)); - if (FreeBanks2) - Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0), - Candidate(&MI, Reg2, SubReg2, FreeBanks2)); - } - } -} - -unsigned GCNRegBankReassign::computeStallCycles(Register SrcReg, Register Reg, - unsigned SubReg, int Bank, - bool Collect) { - unsigned TotalStallCycles = 0; - SmallSet<const MachineInstr *, 8> Visited; - - for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { - if (MI.isBundle()) - continue; - if (!Visited.insert(&MI).second) - continue; - unsigned StallCycles; - unsigned UsedBanks; - std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank); - TotalStallCycles += StallCycles; - if (Collect) - collectCandidates(MI, UsedBanks, StallCycles); - } - - return TotalStallCycles; -} - -MCRegister GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank, - unsigned SubReg) const { - const TargetRegisterClass *RC = MRI->getRegClass(LI.reg()); - unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs - : MaxNumSGPRs; - unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 - : AMDGPU::SGPR0); - - for (MCRegister Reg : RC->getRegisters()) { - // Check occupancy limit. - if (TRI->isSubRegisterEq(Reg, MaxReg)) - break; - - if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank) - continue; - - for (unsigned I = 0; CSRegs[I]; ++I) - if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && - !LRM->isPhysRegUsed(CSRegs[I])) - return MCRegister::from(AMDGPU::NoRegister); - - LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n'); - - if (!LRM->checkInterference(LI, Reg)) - return Reg; - } - - return MCRegister::from(AMDGPU::NoRegister); -} - -unsigned GCNRegBankReassign::tryReassign(Candidate &C) { - if (!LIS->hasInterval(C.Reg)) - return 0; - - LiveInterval &LI = LIS->getInterval(C.Reg); - LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); - LI.dump()); - - // For each candidate bank walk all instructions in the range of live - // interval and check if replacing the register with one belonging to - // the candidate bank reduces conflicts.
- - unsigned OrigStalls = computeStallCycles(C.Reg); - LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); - if (!OrigStalls) - return 0; - - struct BankStall { - BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; - bool operator<(const BankStall &RHS) const { - if (Stalls == RHS.Stalls) - return Bank < RHS.Bank; - return Stalls > RHS.Stalls; - } - unsigned Bank; - unsigned Stalls; - }; - SmallVector<BankStall, 8> BankStalls; - - for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { - if (C.FreeBanks & (1 << Bank)) { - LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); - unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank); - if (Stalls < OrigStalls) { - LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " - << Stalls << '\n'); - BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); - } - } - } - llvm::sort(BankStalls); - - MCRegister OrigReg = VRM->getPhys(C.Reg); - LRM->unassign(LI); - while (!BankStalls.empty()) { - BankStall BS = BankStalls.pop_back_val(); - MCRegister Reg = scavengeReg(LI, BS.Bank, C.SubReg); - if (Reg == AMDGPU::NoRegister) { - LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) - << '\n'); - continue; - } - LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) - << (LRM->isPhysRegUsed(Reg) ? "" : " (new)") - << " in bank " << printBank(BS.Bank) << '\n'); - - LRM->assign(LI, Reg); - - LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); - - return OrigStalls - BS.Stalls; - } - LRM->assign(LI, OrigReg); - - return 0; -} - -unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, - bool Collect) { - unsigned TotalStallCycles = 0; - - for (MachineBasicBlock &MBB : MF) { - - LLVM_DEBUG(if (Collect) { - if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber(); - else dbgs() << MBB.getName(); dbgs() << ":\n"; - }); - - for (MachineInstr &MI : MBB.instrs()) { - if (MI.isBundle()) - continue; // we analyze the instructions inside the bundle individually - - unsigned StallCycles; - unsigned UsedBanks; - std::tie(StallCycles, UsedBanks) = analyzeInst(MI); - - if (Collect) - collectCandidates(MI, UsedBanks, StallCycles); - - TotalStallCycles += StallCycles; - } - - LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); - } - - return TotalStallCycles; -} - -void GCNRegBankReassign::removeCandidates(Register Reg) { - typename CandidateList::iterator Next; - for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) { - Next = std::next(I); - I->second.remove_if([Reg, this](const Candidate& C) { - return C.MI->readsRegister(Reg, TRI); - }); - if (I->second.empty()) - Candidates.erase(I); - } -} - -bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, - unsigned OriginalCycles, - unsigned CyclesSaved) { - unsigned StallCycles = collectCandidates(MF, false); - LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles - << " stall cycles left\n"); - return StallCycles + CyclesSaved == OriginalCycles; -} - -bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { - ST = &MF.getSubtarget<GCNSubtarget>(); - if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())) - return false; - - MRI = &MF.getRegInfo(); - - LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " - << MF.getName() << '\n' - << ((Mode & RM_VGPR) ? "VGPR " : "") - << ((Mode & RM_SGPR) ?
"SGPR " : "") << "mode\n" - << "NumVirtRegs = " << MRI->getNumVirtRegs() << "\n\n"); - - if (MRI->getNumVirtRegs() > VRegThresh) { - LLVM_DEBUG(dbgs() << "NumVirtRegs > " << VRegThresh - << " threshold, skipping function.\n\n"); - return false; - } - - TRI = ST->getRegisterInfo(); - MLI = &getAnalysis(); - VRM = &getAnalysis(); - LRM = &getAnalysis(); - LIS = &getAnalysis(); - - const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned Occupancy = MFI->getOccupancy(); - MaxNumVGPRs = ST->getMaxNumVGPRs(MF); - MaxNumSGPRs = ST->getMaxNumSGPRs(MF); - MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); - MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); - - CSRegs = MRI->getCalleeSavedRegs(); - unsigned NumRegBanks = AMDGPU::VGPR_32RegClass.getNumRegs() + - // Not a tight bound - AMDGPU::SReg_32RegClass.getNumRegs() / 2 + 1; - RegsUsed.resize(NumRegBanks); - - unsigned StallCycles = collectCandidates(MF); - NumStallsDetected += StallCycles; - - LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " - "function " << MF.getName() << '\n'); - - LLVM_DEBUG(Candidates.dump(this)); - - unsigned CyclesSaved = 0; - while (!Candidates.empty()) { - Candidate C = Candidates.back(); - unsigned LocalCyclesSaved = tryReassign(C); - CyclesSaved += LocalCyclesSaved; - - if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) - report_fatal_error("RegBank reassign stall cycles verification failed."); - - Candidates.pop_back(); - if (LocalCyclesSaved) { - removeCandidates(C.Reg); - computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true); - - LLVM_DEBUG(Candidates.dump(this)); - } - } - NumStallsRecovered += CyclesSaved; - - LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved - << " cycles saved in function " << MF.getName() << '\n'); - - Candidates.clear(); - - if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) - report_fatal_error("RegBank reassign stall cycles verification failed."); - - RegsUsed.clear(); - - return CyclesSaved > 0; -} - -MachineFunctionPass * -llvm::createGCNRegBankReassignPass(RegBankReassignMode Mode) { - return new GCNRegBankReassign(Mode); -} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index c3e3167..a178f05 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1643,12 +1643,8 @@ define <2 x i64> @v_ashr_v2i64(<2 x i64> %value, <2 x i64> %amount) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], v4, v[10:11] -; GFX10-NEXT: v_ashrrev_i64 v[2:3], v6, v[7:8] +; GFX10-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1] +; GFX10-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i64> %value, %amount ret <2 x i64> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll index 80b599f..5741091 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -314,45 +314,45 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr, ; GFX10-NEXT: v_add_nc_u32_e32 v19, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v19 ; 
GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v3, v5, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v19 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v27, v18, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v8, vcc_lo +; GFX10-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v22, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v23, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v10, vcc_lo -; GFX10-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v27, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v3, v9, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v4, v10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v19 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v11, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v12, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v22, v11, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v12, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v3, v13, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v4, v14, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v14, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v19 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v15, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v16, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v16, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc_lo @@ -577,54 +577,54 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace( ; ; GFX10-LABEL: extractelement_sgpr_v4i128_vgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: 
v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_cndmask_b32_e32 v4, s8, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, s9, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, s8, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 4, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 4, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 6, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 6, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s20, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s21, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s20, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s21, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 7, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s23, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s22, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s22, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s23, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll index 
525b2c2..3a88af6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -581,9 +581,9 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(<8 x i16> addrspace(1)* %ptr, i32 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll index 50def72..c820562 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -223,9 +223,9 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %i ; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_and_b32_sdwa v4, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v6, v0, s4, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX10-NEXT: v_or3_b32 v0, v6, v4, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v4, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1036,12 +1036,12 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p ; GFX10-NEXT: v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v11, v0, s1, v2 -; GFX10-NEXT: v_and_or_b32 v2, v1, s1, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v3 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 -; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX10-NEXT: s_and_b32 s0, s2, 3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -2613,25 +2613,25 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)* ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX10-NEXT: v_and_b32_sdwa v13, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v14, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_b32_sdwa v14, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_and_or_b32 v23, 
v1, s1, v8 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v8 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_sdwa v17, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v15, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX10-NEXT: v_and_or_b32 v2, v2, s1, v19 +; GFX10-NEXT: v_and_or_b32 v2, v2, s1, v10 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v13, v7 -; GFX10-NEXT: v_or3_b32 v1, v23, v14, v9 +; GFX10-NEXT: v_or3_b32 v1, v1, v14, v9 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 ; GFX10-NEXT: v_and_or_b32 v5, v3, v4, v5 ; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or3_b32 v2, v2, v17, v11 +; GFX10-NEXT: v_or3_b32 v2, v2, v15, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 647d22b..a944adb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -186,9 +186,9 @@ define float @dyn_extract_v8f32_v_v(<8 x float> %vec, i32 %sel) { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 @@ -227,9 +227,9 @@ define amdgpu_ps float @dyn_extract_v8f32_v_s(<8 x float> %vec, i32 inreg %sel) ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6 @@ -346,20 +346,20 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) { ; GFX10-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-NEXT: s_mov_b64 s[4:5], 1 ; GFX10-NEXT: s_mov_b64 s[8:9], 3 -; GFX10-NEXT: s_mov_b64 s[14:15], 4 +; GFX10-NEXT: s_mov_b64 s[10:11], 4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, s5, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 ; GFX10-NEXT: s_mov_b64 s[12:13], 5 +; GFX10-NEXT: s_mov_b64 s[14:15], 6 ; GFX10-NEXT: s_mov_b64 s[16:17], 7 ; GFX10-NEXT: s_mov_b64 s[18:19], 8 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, 
v2, s11, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX10-NEXT: s_mov_b64 s[14:15], 6
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
@@ -561,11 +561,11 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
 ; GFX10-NEXT: s_mov_b32 s7, s9
 ; GFX10-NEXT: s_mov_b32 s8, s10
 ; GFX10-NEXT: s_mov_b32 s9, s11
-; GFX10-NEXT: s_mov_b32 s46, s12
+; GFX10-NEXT: s_mov_b32 s10, s12
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT: s_mov_b32 s47, s13
+; GFX10-NEXT: s_mov_b32 s11, s13
 ; GFX10-NEXT: s_mov_b32 s12, s14
 ; GFX10-NEXT: s_mov_b32 s13, s15
 ; GFX10-NEXT: s_mov_b32 s14, s16
@@ -576,8 +576,8 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s46, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s47, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo
@@ -624,23 +624,23 @@ define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
@@ -860,9 +860,9 @@ define float @dyn_extract_v8f32_v_v_offset3(<8 x float> %vec, i32 %sel) {
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8
@@ -1360,23 +1360,23 @@ define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) {
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_add_nc_u32_e32 v16, 3, v16
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
@@ -1416,9 +1416,9 @@ define i8 addrspace(3)* @dyn_extract_v8p3_v_v(<8 x i8 addrspace(3)*> %vec, i32 %
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8
@@ -1530,23 +1530,23 @@ define i8 addrspace(1)* @dyn_extract_v8p1_v_v(<8 x i8 addrspace(1)*> %vec, i32 %
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
@@ -2001,9 +2001,9 @@ define float @dyn_extract_v6f32_v_v(<6 x float> %vec, i32 %sel) {
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v6
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v6
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v6
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v6
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2034,9 +2034,9 @@ define amdgpu_ps float @dyn_extract_v6f32_v_s(<6 x float> %vec, i32 inreg %sel)
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT: ; return to shader part epilog
@@ -2162,9 +2162,9 @@ define float @dyn_extract_v7f32_v_v(<7 x float> %vec, i32 %sel) {
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v7
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v7
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v7
@@ -2199,9 +2199,9 @@ define amdgpu_ps float @dyn_extract_v7f32_v_s(<7 x float> %vec, i32 inreg %sel)
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6
@@ -2311,19 +2311,19 @@ define amdgpu_ps double @dyn_extract_v6f64_s_v(<6 x double> inreg %vec, i32 %sel
 ; GFX10-NEXT: s_mov_b32 s7, s9
 ; GFX10-NEXT: s_mov_b32 s8, s10
 ; GFX10-NEXT: s_mov_b32 s9, s11
-; GFX10-NEXT: s_mov_b32 s14, s12
+; GFX10-NEXT: s_mov_b32 s10, s12
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT: s_mov_b32 s47, s13
+; GFX10-NEXT: s_mov_b32 s11, s13
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s47, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s11, vcc_lo
 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT: ; return to shader part epilog
@@ -2358,17 +2358,17 @@ define double @dyn_extract_v6f64_v_v(<6 x double> %vec, i32 %sel) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v0, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v15, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v12
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v12
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
@@ -2520,11 +2520,11 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
 ; GFX10-NEXT: s_mov_b32 s7, s9
 ; GFX10-NEXT: s_mov_b32 s8, s10
 ; GFX10-NEXT: s_mov_b32 s9, s11
-; GFX10-NEXT: s_mov_b32 s46, s12
+; GFX10-NEXT: s_mov_b32 s10, s12
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT: s_mov_b32 s47, s13
+; GFX10-NEXT: s_mov_b32 s11, s13
 ; GFX10-NEXT: s_mov_b32 s12, s14
 ; GFX10-NEXT: s_mov_b32 s13, s15
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo
@@ -2533,8 +2533,8 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s46, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s47, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s12, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s13, vcc_lo
@@ -2575,23 +2575,23 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v0, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v14
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v15, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v14
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v14
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v14
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v14
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <7 x double> %vec, i32 %sel
@@ -3168,8 +3168,8 @@ define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
 ; GFX10-NEXT: s_mov_b32 s9, s11
-; GFX10-NEXT: s_mov_b32 s46, s12
-; GFX10-NEXT: s_mov_b32 s47, s13
+; GFX10-NEXT: s_mov_b32 s10, s12
+; GFX10-NEXT: s_mov_b32 s11, s13
 ; GFX10-NEXT: s_mov_b32 s12, s14
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
@@ -3187,9 +3187,9 @@ define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s46, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s47, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0
@@ -3245,25 +3245,25 @@ define float @dyn_extract_v15f32_v_v(<15 x float> %vec, i32 %sel) {
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15
@@ -3476,25 +3476,25 @@ define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) {
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo
 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index b399aad..de48249 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -637,9 +637,9 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
 ; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4
 ; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT: v_div_fixup_f16 v0, v7, v1, v0
+; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0
 ; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
 ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
@@ -849,9 +849,9 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
 ; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4
 ; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT: v_div_fixup_f16 v0, v7, v1, v0
+; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0
 ; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
 ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
@@ -1515,9 +1515,9 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
 ; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4
 ; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT: v_div_fixup_f16 v0, v7, v1, v0
+; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0
 ; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
 ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 6431eab..aa6a244 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -712,27 +712,27 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1
-; GFX10-IEEE-NEXT: v_div_scale_f32 v15, vcc_lo, v0, v2, v0
+; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0
 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4
-; GFX10-IEEE-NEXT: v_rcp_f32_e32 v10, v5
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5
 ; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0
-; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v10, 1.0
+; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0
 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v10, v9, v10
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v15, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v10
-; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v15
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
+; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
 ; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v10
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v15, -v4, v9
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
+; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
 ; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v7, v15, v6, v9
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v6, v8, v10, v11
-; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v7, v2, v0
-; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v6, v3, v1
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
+; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_v2f32:
@@ -752,18 +752,18 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX10-FLUSH-NEXT: s_denorm_mode 0
 ; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v3, v3, v1
 ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v5, v6, v5, v7
-; GFX10-FLUSH-NEXT: v_div_scale_f32 v11, vcc_lo, v1, v3, v1
 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v4
 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v5, v2, v0
+; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1
 ; GFX10-FLUSH-NEXT: s_denorm_mode 3
 ; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v6, 1.0
 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v6, v5, v6
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v11, v6
-; GFX10-FLUSH-NEXT: v_fma_f32 v7, v5, -v4, v11
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v6
+; GFX10-FLUSH-NEXT: v_fma_f32 v7, v5, -v4, v2
 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v6
-; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v11, -v4, v5
+; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v2, -v4, v5
 ; GFX10-FLUSH-NEXT: s_denorm_mode 0
-; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v11, v6, v5
+; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v6, v5
 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x float> %a, %b
@@ -874,27 +874,27 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1
-; GFX10-IEEE-NEXT: v_div_scale_f32 v15, vcc_lo, v0, v2, v0
+; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0
 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4
-; GFX10-IEEE-NEXT: v_rcp_f32_e32 v10, v5
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5
 ; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0
-; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v10, 1.0
+; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0
 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v10, v9, v10
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v15, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v10
-; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v15
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
+; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
 ; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v10
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v15, -v4, v9
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
+; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
 ; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v7, v15, v6, v9
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v6, v8, v10, v11
-; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v7, v2, v0
-; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v6, v3, v1
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
+; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_v2f32_ulp25:
@@ -905,16 +905,16 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000
 ; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4
 ; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4
-; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v7, 1.0, s5, s6
-; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v6, 1.0, s5, s4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6
+; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2
 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v7, v0
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v6, v1
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1
 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x float> %a, %b, !fpmath !0
   ret <2 x float> %fdiv
@@ -1044,25 +1044,25 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0
-; GFX10-IEEE-NEXT: v_div_scale_f32 v13, vcc_lo, 1.0, v0, 1.0
+; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0
 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX10-IEEE-NEXT: v_rcp_f32_e32 v8, v3
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
 ; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
-; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v8, 1.0
+; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v8, v7, v8
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v13, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v8
-; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v13
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5
+; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8
 ; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6
 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v8
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v13, -v2, v7
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5
+; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7
 ; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v13, v4, v7
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7
 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v8, v9
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9
 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -1226,25 +1226,25 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0
-; GFX10-IEEE-NEXT: v_div_scale_f32 v13, vcc_lo, 1.0, v0, 1.0
+; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0
 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX10-IEEE-NEXT: v_rcp_f32_e32 v8, v3
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
 ; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
-; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v8, 1.0
+; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v8, v7, v8
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v13, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v8
-; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v13
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5
+; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8
 ; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6
 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v8
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v13, -v2, v7
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5
+; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7
 ; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v13, v4, v7
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7
 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v8, v9
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9
 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -1465,27 +1465,27 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1
-; GFX10-IEEE-NEXT: v_div_scale_f32 v15, vcc_lo, v0, v2, v0
+; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0
 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4
-; GFX10-IEEE-NEXT: v_rcp_f32_e32 v10, v5
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5
 ; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0
-; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v10, 1.0
+; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0
 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v10, v9, v10
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
 ; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v15, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v10
-; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v15
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
+; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
 ; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v10
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v15, -v4, v9
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
+; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
 ; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v7, v15, v6, v9
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v6, v8, v10, v11
-; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v7, v2, v0
-; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v6, v3, v1
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
+; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25:
@@ -1496,16 +1496,16 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000
 ; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4
 ; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4
-; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v7, 1.0, s5, s6
-; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v6, 1.0, s5, s4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6
+; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2
 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v7, v0
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v6, v1
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1
 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
   ret <2 x float> %fdiv
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
index 946f54d..ca83689 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
@@ -105,10 +105,10 @@ define double @v_fdiv_f64_afn(double %a, double %b) {
 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX10-NEXT: v_fma_f64 v[10:11], v[6:7], v[4:5], v[4:5]
-; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
-; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[10:11], v[6:7]
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv afn double %a, %b
   ret double %fdiv
@@ -355,9 +355,9 @@ define double @v_rcp_f64_arcp_afn(double %x) {
 ; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; GFX10-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
 ; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT: v_mul_f64 v[6:7], 1.0, v[2:3]
-; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[6:7], 1.0
-; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
+; GFX10-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn double 1.0, %x
   ret double %fdiv
@@ -458,10 +458,10 @@ define double @v_fdiv_f64_afn_ulp25(double %a, double %b) {
 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX10-NEXT: v_fma_f64 v[10:11], v[6:7], v[4:5], v[4:5]
-; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
-; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[10:11], v[6:7]
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv afn double %a, %b, !fpmath !0
   ret double %fdiv
@@ -634,33 +634,29 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v30, v4
-; GFX10-NEXT: v_mov_b32_e32 v31, v5
-; GFX10-NEXT: v_mov_b32_e32 v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v5, v7
-; GFX10-NEXT: v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1]
-; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1]
-; GFX10-NEXT: v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3]
-; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3]
-; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[26:27]
-; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[24:25]
-; GFX10-NEXT: v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
-; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13]
-; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
-; GFX10-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
-; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
-; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[6:7]
+; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3]
+; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
+; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
+; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
+; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
+; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3]
+; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13]
 ; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15]
-; GFX10-NEXT: v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21]
-; GFX10-NEXT: v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17]
-; GFX10-NEXT: v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19]
+; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21]
+; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17]
+; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
 ; GFX10-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23]
-; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1]
-; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3]
+; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
+; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
+; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x double> %a, %b
   ret <2 x double> %fdiv
@@ -692,30 +688,22 @@ define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) {
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v18, v4
-; GFX10-NEXT: v_mov_b32_e32 v19, v5
-; GFX10-NEXT: v_mov_b32_e32 v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v5, v7
-; GFX10-NEXT: v_mov_b32_e32 v22, v0
-; GFX10-NEXT: v_mov_b32_e32 v23, v1
-; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[18:19]
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
-; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
-; GFX10-NEXT: v_mul_f64 v[12:13], v[22:23], v[8:9]
-; GFX10-NEXT: v_mul_f64 v[14:15], v[0:1], v[10:11]
-; GFX10-NEXT: v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23]
-; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1]
-; GFX10-NEXT: v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13]
-; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15]
+; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GFX10-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
+; GFX10-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
+; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
+; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x double> %a, %b
   ret <2 x double> %fdiv
@@ -816,33 +804,29 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v30, v4
-; GFX10-NEXT: v_mov_b32_e32 v31, v5
-; GFX10-NEXT: v_mov_b32_e32 v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v5, v7
-; GFX10-NEXT: v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1]
-; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1]
-; GFX10-NEXT: v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3]
-; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3]
-; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[26:27]
-; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[24:25]
-; GFX10-NEXT: v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
-; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13]
-; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
-; GFX10-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
-; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
-; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[6:7]
+; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3]
+; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
+; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
+; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
+; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
+; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3]
+; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13]
 ; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15]
-; GFX10-NEXT: v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21]
-; GFX10-NEXT: v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17]
-; GFX10-NEXT: v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19]
+; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21]
+; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17]
+; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
 ; GFX10-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23]
-; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1]
-; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3]
+; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
+; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
+; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
@@ -943,29 +927,29 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0
-; GFX10-NEXT: v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0
+; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
+; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
 ; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
+; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
-; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[22:23]
-; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[20:21]
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
-; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[6:7]
+; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9]
 ; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11]
-; GFX10-NEXT: v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17]
-; GFX10-NEXT: v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13]
-; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15]
+; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13]
+; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
 ; GFX10-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19]
-; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
-; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0
+; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
+; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x double> , %x
   ret <2 x double> %fdiv
@@ -1066,29 +1050,29 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0
-; GFX10-NEXT: v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0
+; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
+; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
 ; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
+; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
-; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[22:23]
-; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[20:21]
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
-; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[6:7]
+; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9]
 ; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11]
-; GFX10-NEXT: v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17]
-; GFX10-NEXT: v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13]
-; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15]
+; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13]
+; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
 ; GFX10-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19]
-; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
-; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0
+; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
+; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x double> , %x
   ret <2 x double> %fdiv
@@ -1120,26 +1104,22 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v14, v0
-; GFX10-NEXT: v_mov_b32_e32 v15, v1
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[14:15]
-; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[0:1]
-; GFX10-NEXT: v_fma_f64 v[2:3], -v[14:15], v[4:5], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[6:7], 1.0
-; GFX10-NEXT: v_fma_f64 v[4:5], v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
-; GFX10-NEXT: v_fma_f64 v[2:3], -v[14:15], v[4:5], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[6:7], 1.0
-; GFX10-NEXT: v_fma_f64 v[4:5], v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
+; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
+; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
+; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; GFX10-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; GFX10-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; GFX10-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; GFX10-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
 ; GFX10-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5]
 ; GFX10-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7]
-; GFX10-NEXT: v_fma_f64 v[14:15], -v[14:15], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[12:13], -v[0:1], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn <2 x double> , %x
   ret <2 x double> %fdiv
@@ -1240,29 +1220,29 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0
-; GFX10-NEXT: v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0
+; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
+; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
 ; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
+; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
-; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[22:23]
-; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[20:21]
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
-; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[6:7]
+; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9]
 ; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11]
-; GFX10-NEXT: v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17]
-; GFX10-NEXT: v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13]
-; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15]
+; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13]
+; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
 ; GFX10-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19]
-; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
-; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0
+; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
+; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x double> , %x, !fpmath !0
   ret <2 x double> %fdiv
@@ -1294,30 +1274,22 @@ define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v18, v4
-; GFX10-NEXT: v_mov_b32_e32 v19, v5
-; GFX10-NEXT: v_mov_b32_e32 v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v5, v7
-; GFX10-NEXT: v_mov_b32_e32 v22, v0
-; GFX10-NEXT: v_mov_b32_e32 v23, v1
-; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[18:19]
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
-; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
-; GFX10-NEXT: v_mul_f64 v[12:13], v[22:23], v[8:9]
-; GFX10-NEXT: v_mul_f64 v[14:15], v[0:1], v[10:11]
-; GFX10-NEXT: v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23]
-; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1]
-; GFX10-NEXT: v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13]
-; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15]
+; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GFX10-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
+; GFX10-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
+; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
+; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
@@ -1418,33 +1390,29 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v30, v4
-; GFX10-NEXT: v_mov_b32_e32 v31, v5
-; GFX10-NEXT: v_mov_b32_e32 v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v5, v7
-; GFX10-NEXT: v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1]
-; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1]
-; GFX10-NEXT: v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3]
-; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3]
-; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[26:27]
-; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[24:25]
-; GFX10-NEXT: v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
-; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13]
-; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
-; GFX10-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
-; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
-; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[6:7]
+; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3]
+; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
+; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
+; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
+; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
+; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
+; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
+; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3]
+; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13]
 ; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15]
-; GFX10-NEXT: v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21]
-; GFX10-NEXT: v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17]
-; GFX10-NEXT: v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19]
+; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21]
+; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17]
+; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
 ; GFX10-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23]
-; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1]
-; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3]
+; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
+; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
+; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
@@ -1476,30 +1444,22 @@ define <2 x double> @v_fdiv_v2f64_arcp_afn_ulp25(<2 x double> %a, <2 x double> %
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v18, v4
-; GFX10-NEXT: v_mov_b32_e32 v19, v5
-; GFX10-NEXT: v_mov_b32_e32 v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v5, v7
-; GFX10-NEXT: v_mov_b32_e32 v22, v0
-; GFX10-NEXT: v_mov_b32_e32 v23, v1
-; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[18:19]
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
-; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
-; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
-; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
-; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
-; GFX10-NEXT: v_mul_f64 v[12:13], v[22:23], v[8:9]
-; GFX10-NEXT: v_mul_f64 v[14:15], v[0:1], v[10:11]
-; GFX10-NEXT: v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23]
-; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1]
-; GFX10-NEXT: v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13]
-; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15]
+; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
+; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
+; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GFX10-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
+; GFX10-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
+; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
+; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv afn arcp <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index aad28b9..077f913 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -479,12 +479,8 @@ define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double>
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v14, v0
-; GFX10-NEXT: v_mov_b32_e32 v15, v1
-; GFX10-NEXT: v_mov_b32_e32 v12, v2
-; GFX10-NEXT: v_mov_b32_e32 v13, v3
-; GFX10-NEXT: v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9]
-; GFX10-NEXT: v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11]
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z)
   ret <2 x double> %fma
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index fdd450f..83ec29d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1159,7 +1159,6 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v2
 ; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2
 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0
 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0
@@ -1167,13 +1166,14 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v8
 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8
 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1
-; GFX10-NEXT: v_mov_b32_e32 v15, 0xff
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; GFX10-NEXT: v_mov_b32_e32 v13, 0xff
 ; GFX10-NEXT: s_movk_i32 s4, 0xff
-; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2
 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
 ; GFX10-NEXT: v_and_b32_e32 v12, s4, v1
 ; GFX10-NEXT: v_and_b32_e32 v6, s4, v6
-; GFX10-NEXT: v_and_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2
 ; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3
 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9
 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
@@ -2190,13 +2190,13 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24
 ; GFX10-NEXT: s_sub_i32 s4, 0, 24
-; GFX10-NEXT: v_mov_b32_e32 v12, 0xffffff
+; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff
 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6
 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7
-; GFX10-NEXT: v_and_b32_e32 v5, v5, v12
-; GFX10-NEXT: v_and_b32_e32 v2, v2, v12
-; GFX10-NEXT: v_and_b32_e32 v3, v3, v12
+; GFX10-NEXT: v_and_b32_e32 v5, v5, v10
+; GFX10-NEXT: v_and_b32_e32 v2, v2, v10
+; GFX10-NEXT: v_and_b32_e32 v3, v3, v10
 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2
 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
@@ -2224,19 +2224,19 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4
 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v4, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v15
+; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4
 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v11, v6, v12
+; GFX10-NEXT: v_and_b32_e32 v4, v4, v10
+; GFX10-NEXT: v_and_b32_e32 v6, v6, v10
 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5
-; GFX10-NEXT: v_and_b32_e32 v10, v5, v12
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, v11, v2
-; GFX10-NEXT: v_and_b32_e32 v6, v7, v12
-; GFX10-NEXT: v_and_b32_e32 v7, v15, v12
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v6, v3
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, v7, v2
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, v10, v3
+; GFX10-NEXT: v_and_b32_e32 v5, v5, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2
+; GFX10-NEXT: v_and_b32_e32 v7, v7, v10
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, v4, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3
+; GFX10-NEXT: v_lshl_or_b32 v1, v1, v5, v3
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
   ret <2 x i24> %result
@@ -2617,13 +2617,13 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_alignbit_b32 v2, v0, v2, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 1, v0
-; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4
 ; GFX10-NEXT: v_alignbit_b32 v3, v1, v3, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1
 ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v5
-; GFX10-NEXT: v_alignbit_b32 v0, v7, v2, v4
-; GFX10-NEXT: v_alignbit_b32 v1, v6, v3, v5
+; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
+; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt)
   ret <2 x i32> %result
@@ -2770,22 +2770,22 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_alignbit_b32 v22, v1, v5, 1
-; GFX10-NEXT: v_alignbit_b32 v18, v0, v4, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 1, v0
+; GFX10-NEXT: v_alignbit_b32 v4, v0, v4, 1
+; GFX10-NEXT: v_alignbit_b32 v5, v1, v5, 1
+; GFX10-NEXT: v_alignbit_b32 v6, v2, v6, 1
+; GFX10-NEXT: v_alignbit_b32 v7, v3, v7, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v19, 1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1
 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v9
-; GFX10-NEXT: v_alignbit_b32 v5, v2, v6, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v23, 1, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2
 ; GFX10-NEXT: v_xor_b32_e32 v10, -1, v10
-; GFX10-NEXT: v_alignbit_b32 v13, v3, v7, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 1, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3
 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11
-; GFX10-NEXT: v_alignbit_b32 v0, v15, v18, v8
-; GFX10-NEXT: v_alignbit_b32 v1, v19, v22, v9
-; GFX10-NEXT: v_alignbit_b32 v2, v23, v5, v10
-; GFX10-NEXT: v_alignbit_b32 v3, v14, v13, v11
+; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8
+; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9
+; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10
+; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt)
   ret <4 x i32> %result
@@ -4176,15 +4176,15 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5
 ; GFX10-NEXT: s_mov_b32 s4, 0xf000f
 ; GFX10-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_e32 v11, s4, v4
-; GFX10-NEXT: v_and_b32_e32 v15, s4, v6
-; GFX10-NEXT: v_and_b32_e32 v19, s4, v5
-; GFX10-NEXT: v_and_b32_e32 v6, s4, v7
+; GFX10-NEXT: v_and_b32_e32 v4, s4, v4
+; GFX10-NEXT: v_and_b32_e32 v6, s4, v6
+; GFX10-NEXT: v_and_b32_e32 v5, s4, v5
+; GFX10-NEXT: v_and_b32_e32 v7, s4, v7
 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1]
-; GFX10-NEXT: v_pk_lshlrev_b16 v0, v11, v0
-; GFX10-NEXT: v_pk_lshrrev_b16 v2, v15, v2
-; GFX10-NEXT: v_pk_lshlrev_b16 v1, v19, v1
-; GFX10-NEXT: v_pk_lshrrev_b16 v3, v6, v3
+; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0
+; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2
+; GFX10-NEXT: v_pk_lshlrev_b16 v1, v5, v1
+; GFX10-NEXT: v_pk_lshrrev_b16 v3, v7, v3
 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -4290,9 +4290,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4
 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT: v_and_b32_e32 v7, 63, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 63, v4
 ; GFX10-NEXT: v_and_b32_e32 v5, 63, v5
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
@@ -4703,18 +4703,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10
 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT: v_and_b32_e32 v15, 63, v8
-; GFX10-NEXT: v_and_b32_e32 v19, 63, v9
-; GFX10-NEXT: v_and_b32_e32 v13, 63, v11
-; GFX10-NEXT: v_and_b32_e32 v9, 63, v10
-; GFX10-NEXT: v_lshlrev_b64 v[11:12], v15, v[0:1]
-; GFX10-NEXT: v_lshrrev_b64 v[4:5], v19, v[4:5]
-; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7]
-; GFX10-NEXT: v_lshlrev_b64 v[15:16], v9, v[2:3]
-; GFX10-NEXT: v_or_b32_e32 v0, v11, v4
-; GFX10-NEXT: v_or_b32_e32 v1, v12, v5
-; GFX10-NEXT: v_or_b32_e32 v2, v15, v6
-; GFX10-NEXT: v_or_b32_e32
v3, v16, v7 +; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 +; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) ret <2 x i64> %result @@ -5178,16 +5178,14 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 ; GFX10-NEXT: s_movk_i32 s4, 0x7f -; GFX10-NEXT: v_mov_b32_e32 v27, v2 ; GFX10-NEXT: v_and_b32_e32 v18, s4, v8 -; GFX10-NEXT: v_mov_b32_e32 v28, v3 ; GFX10-NEXT: v_and_b32_e32 v19, s4, v9 ; GFX10-NEXT: s_sub_i32 s4, 64, 1 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], s4, v[6:7] ; GFX10-NEXT: s_sub_i32 s4, 1, 64 ; GFX10-NEXT: s_cmp_lt_u32 1, 64 -; GFX10-NEXT: v_lshrrev_b64 v[15:16], s4, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[12:13], s4, v[6:7] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 1, 0 ; GFX10-NEXT: v_or_b32_e32 v8, v8, v10 @@ -5197,48 +5195,48 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: s_and_b32 s4, 1, s4 ; GFX10-NEXT: v_sub_nc_u32_e32 v14, 64, v18 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v9, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], v14, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[27:28] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, v7, s4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v23, 64, v18 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v7, s4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 ; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 ; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[21:22] +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[21:22] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[6:7] ; GFX10-NEXT: v_or_b32_e32 v14, v14, v16 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19 ; GFX10-NEXT: v_or_b32_e32 v15, v15, v17 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[21:22] -; GFX10-NEXT: v_cndmask_b32_e64 v23, v8, v14, s4 -; GFX10-NEXT: 
v_cmp_eq_u32_e64 s5, 0, v19 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v14, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v31, 0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v23, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v10, v27, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v28, s6 -; GFX10-NEXT: v_or_b32_e32 v0, v31, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s6 +; GFX10-NEXT: v_or_b32_e32 v0, v12, v4 ; GFX10-NEXT: v_or_b32_e32 v1, v7, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v15, v6 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) @@ -5473,7 +5471,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v0, s[8:9] -; GFX10-NEXT: v_lshlrev_b64 v[15:16], v10, s[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] ; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[8:9] @@ -5481,25 +5479,25 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 ; GFX10-NEXT: v_or_b32_e32 v7, v7, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v15, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v10, v16, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[8:9] -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, 0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v19, v8, s2, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s6, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, s2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, s3, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, s3, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 -; GFX10-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -5756,7 +5754,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; 
GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] ; GFX10-NEXT: s_sub_i32 s0, 1, 64 ; GFX10-NEXT: s_cmp_lt_u32 1, 64 -; GFX10-NEXT: v_lshrrev_b64 v[11:12], s0, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 1, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 @@ -5765,12 +5763,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX10-NEXT: s_sub_i32 s0, 64, s4 @@ -5778,7 +5776,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] ; GFX10-NEXT: s_sub_i32 s0, s4, 64 ; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[11:12], s0, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 @@ -5787,12 +5785,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 @@ -6025,7 +6023,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[11:12], s5, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s12, 1, s6 ; GFX10-NEXT: s_sub_i32 s13, 1, 64 @@ -6045,10 +6043,10 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s14, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 
0, s12 @@ -6419,7 +6417,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_movk_i32 s4, 0x41 -; GFX10-NEXT: v_lshrrev_b32_e32 v19, 31, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v16, 31, v5 ; GFX10-NEXT: s_sub_i32 s5, 64, s4 ; GFX10-NEXT: v_lshlrev_b64 v[10:11], s4, v[2:3] ; GFX10-NEXT: v_lshrrev_b64 v[8:9], s5, v[0:1] @@ -6431,39 +6429,39 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { ; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_sub_i32 s5, 64, 63 -; GFX10-NEXT: v_or_b32_e32 v15, v9, v11 ; GFX10-NEXT: v_or_b32_e32 v14, v8, v10 +; GFX10-NEXT: v_or_b32_e32 v15, v9, v11 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[6:7] ; GFX10-NEXT: s_and_b32 s6, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s7, 1, s4 ; GFX10-NEXT: s_sub_i32 s4, 63, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v27, 0, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 -; GFX10-NEXT: v_lshrrev_b64 v[23:24], s4, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], s4, v[6:7] ; GFX10-NEXT: s_cmp_lt_u32 63, 64 -; GFX10-NEXT: v_or_b32_e32 v6, v19, v8 +; GFX10-NEXT: v_or_b32_e32 v6, v16, v8 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 63, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s6, 0, s7 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v23, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v6, s4 ; GFX10-NEXT: s_and_b32 s5, 1, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v24, v9, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v9, s4 ; GFX10-NEXT: s_and_b32 s4, 1, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v0, v2, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, v2, s6 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 31, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v1, v15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v15, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v5, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v3, s6 -; GFX10-NEXT: v_or_b32_e32 v0, v11, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v27, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v19, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s6 +; GFX10-NEXT: v_or_b32_e32 v0, v12, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v13, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65) ret i128 %result @@ -6810,21 +6808,19 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_movk_i32 s18, 0x7f ; GFX10-NEXT: s_mov_b32 s19, 0 -; GFX10-NEXT: s_mov_b32 s30, s0 ; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX10-NEXT: s_sub_i32 s17, s22, 64 ; GFX10-NEXT: s_sub_i32 s23, 64, s22 ; GFX10-NEXT: s_cmp_lt_u32 s22, 64 -; GFX10-NEXT: s_mov_b32 s31, s1 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s22, 0 ; GFX10-NEXT: s_cselect_b32 s29, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[30:31], s23 +; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s23 ; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s22 -; GFX10-NEXT: s_lshl_b64 s[22:23], s[30:31], s22 +; GFX10-NEXT: 
s_lshl_b64 s[22:23], s[0:1], s22 ; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[30:31], s17 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 ; GFX10-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] @@ -6844,7 +6840,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] ; GFX10-NEXT: s_cmp_lg_u32 s30, 0 -; GFX10-NEXT: s_cselect_b64 s[46:47], s[8:9], s[0:1] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[26:27], 0 ; GFX10-NEXT: s_sub_i32 s26, s16, 64 @@ -6853,7 +6849,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_cselect_b32 s27, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s16, 0 ; GFX10-NEXT: s_cselect_b32 s30, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[46:47], s16 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 ; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 ; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 ; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] @@ -6861,7 +6857,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s30, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[46:47], s[8:9] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] @@ -7329,8 +7325,6 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s4, 1, s4 -; GFX10-NEXT: v_mov_b32_e32 v29, v2 -; GFX10-NEXT: v_mov_b32_e32 v30, v3 ; GFX10-NEXT: v_sub_nc_u32_e32 v23, 64, v27 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v16, vcc_lo @@ -7338,20 +7332,20 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v23, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[29:30] +; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] ; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v34, v21, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v35, v22, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v21, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, v22, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v10, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, v11, s4 ; GFX10-NEXT: v_or_b32_e32 v18, v16, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v31, 64, v27 +; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 -; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[34:35] +; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] ; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v31, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 ; GFX10-NEXT: v_or_b32_e32 v23, v23, v25 @@ -7363,77 +7357,77 @@ define <2 x i128> @v_fshl_v2i128(<2 x 
i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v24, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27 ; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v22, vcc_lo -; GFX10-NEXT: s_cmp_lt_u32 1, 64 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v16, v34, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v35, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, v16, v8, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v9, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v19, v3, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, v1, s4 ; GFX10-NEXT: v_xor_b32_e32 v16, -1, v20 -; GFX10-NEXT: v_or_b32_e32 v0, v21, v8 ; GFX10-NEXT: v_or_b32_e32 v1, v11, v9 +; GFX10-NEXT: v_or_b32_e32 v0, v21, v8 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], s8, v[14:15] -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27 -; GFX10-NEXT: v_and_b32_e32 v27, s7, v16 +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: v_and_b32_e32 v25, s7, v16 +; GFX10-NEXT: v_and_b32_e32 v24, s7, v20 ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 1, 0 ; GFX10-NEXT: v_lshrrev_b64 v[16:17], s9, v[14:15] -; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 ; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s4, 1, s4 -; GFX10-NEXT: v_and_b32_e32 v24, s7, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[14:15] -; GFX10-NEXT: v_cndmask_b32_e64 v22, v19, v30, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v19, v16, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v31, v17, v11, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v24 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v29, s6 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v24 +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v18, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[14:15], v24, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v25 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v8, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v9, s4 -; GFX10-NEXT: v_sub_nc_u32_e32 v31, 64, v27 -; GFX10-NEXT: v_lshrrev_b64 v[35:36], v18, v[4:5] ; GFX10-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v27 ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[18:19], v27, v[12:13] -; GFX10-NEXT: v_lshlrev_b64 v[20:21], v31, v[8:9] +; GFX10-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v25 +; GFX10-NEXT: v_lshrrev_b64 v[18:19], v25, v[12:13] +; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[8:9] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] -; GFX10-NEXT: v_or_b32_e32 v5, v36, v15 -; GFX10-NEXT: v_or_b32_e32 v14, v35, v14 +; GFX10-NEXT: v_or_b32_e32 v5, v11, v15 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[8:9] +; GFX10-NEXT: 
v_cmp_gt_u32_e64 s4, 64, v25 ; GFX10-NEXT: v_cndmask_b32_e32 v15, 0, v16, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v16, v18, v20 -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v27 ; GFX10-NEXT: v_or_b32_e32 v18, v19, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v31, v3, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, v3, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[3:4], v27, v[8:9] +; GFX10-NEXT: v_lshrrev_b64 v[3:4], v25, v[8:9] ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v24 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v18, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v27 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v14, v5, v7, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v5, v7, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, v4, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v31, v6, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v14, v6, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v10, v12, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v3, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v13, s5 ; GFX10-NEXT: v_or_b32_e32 v3, v22, v23 -; GFX10-NEXT: v_or_b32_e32 v7, v14, v11 +; GFX10-NEXT: v_or_b32_e32 v7, v7, v11 ; GFX10-NEXT: v_or_b32_e32 v4, v15, v5 -; GFX10-NEXT: v_or_b32_e32 v6, v19, v10 +; GFX10-NEXT: v_or_b32_e32 v6, v6, v10 ; GFX10-NEXT: v_or_b32_e32 v5, v9, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index d9abd35..35d17d8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -260,9 +260,9 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f ; GFX10-NEXT: v_sub_nc_u16 v4, 6, v2 ; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v7, v4, v3 +; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0 +; GFX10-NEXT: v_lshlrev_b16 v0, v4, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt) @@ -1158,38 +1158,38 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6 +; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_and_b32_e32 v15, 7, v8 +; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v14, 7, v11 -; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 +; GFX10-NEXT: v_mov_b32_e32 v13, 0xff +; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v15, v0 -; GFX10-NEXT: v_mov_b32_e32 v15, 0xff -; GFX10-NEXT: v_lshlrev_b16 v3, v14, v3 -; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 ; GFX10-NEXT: 
s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX10-NEXT: v_and_b32_e32 v8, s4, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v15, 7, v14 +; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 ; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 ; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 ; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v5, v15, v5 +; GFX10-NEXT: v_lshlrev_b16 v5, v13, v5 ; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 @@ -2190,14 +2190,14 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 ; GFX10-NEXT: s_sub_i32 s4, 0, 24 -; GFX10-NEXT: v_mov_b32_e32 v12, 0xffffff +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v12 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v12 -; GFX10-NEXT: v_and_b32_e32 v3, v3, v12 +; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v10 +; GFX10-NEXT: v_and_b32_e32 v3, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 @@ -2224,18 +2224,18 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX10-NEXT: v_and_b32_e32 v4, v11, v12 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v4, v4, v10 +; GFX10-NEXT: v_and_b32_e32 v6, v6, v10 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v12 -; GFX10-NEXT: v_and_b32_e32 v11, v6, v12 -; GFX10-NEXT: v_and_b32_e32 v4, v7, v12 +; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v4, v7, v10 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, v11, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, v6, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, v4, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) @@ -4424,9 +4424,9 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 63, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] 
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4833,18 +4833,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v19, 63, v8 -; GFX10-NEXT: v_and_b32_e32 v15, 63, v9 -; GFX10-NEXT: v_and_b32_e32 v9, 63, v11 -; GFX10-NEXT: v_and_b32_e32 v13, 63, v10 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v19, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[11:12], v15, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[15:16], v9, v[2:3] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7] -; GFX10-NEXT: v_or_b32_e32 v0, v11, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v12, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v15, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v16, v7 +; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) ret <2 x i64> %result @@ -5317,46 +5317,44 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s6 ; GFX10-NEXT: v_and_b32_e32 v19, s5, v15 -; GFX10-NEXT: v_and_b32_e32 v20, s5, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v20, s5, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4 -; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v19 -; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v20 -; GFX10-NEXT: v_mov_b32_e32 v25, v4 -; GFX10-NEXT: v_mov_b32_e32 v26, v5 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v19 ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v11, v[9:10] -; GFX10-NEXT: v_lshlrev_b64 v[11:12], v19, v[0:1] +; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v20 ; GFX10-NEXT: v_lshlrev_b64 v[13:14], v19, v[9:10] -; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v20 +; GFX10-NEXT: v_lshlrev_b64 v[11:12], v19, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[9:10] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, v[9:10] -; GFX10-NEXT: v_lshrrev_b64 v[15:16], v20, v[25:26] +; GFX10-NEXT: v_lshrrev_b64 v[15:16], v20, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[17:18], v17, v[6:7] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v19 -; GFX10-NEXT: v_or_b32_e32 v10, v3, v12 -; GFX10-NEXT: v_or_b32_e32 v11, v2, v11 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v21, v[6:7] ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v13, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v11, v2, v11 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v20 +; GFX10-NEXT: v_or_b32_e32 v10, v3, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v13, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v13, v15, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v9, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo +; GFX10-NEXT: 
v_lshrrev_b64 v[2:3], v2, v[6:7] +; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v10, v16, v18 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v20, v[6:7] ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v8, v11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v10, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v20 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v15, v1, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v0, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v25, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v26, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v1, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v5, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v7, s4 -; GFX10-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v12, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX10-NEXT: v_or_b32_e32 v2, v8, v2 ; GFX10-NEXT: v_or_b32_e32 v3, v9, v3 @@ -5591,31 +5589,31 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v12 ; GFX10-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v12 -; GFX10-NEXT: v_lshlrev_b64 v[15:16], v10, s[10:11] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[10:11] ; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX10-NEXT: v_or_b32_e32 v7, v7, v9 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v13, s[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v8, v15, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v10, v16, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v12, s[6:7] -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, 0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v19, v8, s8, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, s8, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, s9, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, s9, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 -; GFX10-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -5870,7 +5868,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] ; GFX10-NEXT: s_sub_i32 s0, s8, 64 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; 
GFX10-NEXT: v_lshrrev_b64 v[11:12], s0, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 @@ -5879,12 +5877,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], s8, v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX10-NEXT: v_or_b32_e32 v0, s4, v0 @@ -6128,10 +6126,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: s_sub_i32 s5, 1, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX10-NEXT: s_cmp_lt_u32 1, 64 -; GFX10-NEXT: v_lshlrev_b64 v[13:14], s5, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX10-NEXT: s_cselect_b32 s7, 1, 0 ; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v6, v4, v6 @@ -6139,8 +6137,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX10-NEXT: s_and_b32 s5, 1, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v13, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v14, v5, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc_lo ; GFX10-NEXT: s_sub_i32 s5, s6, 64 @@ -6148,7 +6146,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4 ; GFX10-NEXT: s_sub_i32 s4, 64, s6 ; GFX10-NEXT: s_cmp_lt_u32 s6, 64 -; GFX10-NEXT: v_lshrrev_b64 v[11:12], s4, v[4:5] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[4:5] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s6, v[0:1] ; GFX10-NEXT: s_cmp_eq_u32 s6, 0 @@ -6156,8 +6154,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 ; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo ; GFX10-NEXT: v_lshlrev_b64 v[4:5], s5, v[4:5] -; GFX10-NEXT: v_or_b32_e32 v2, v11, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v12, v7 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 ; GFX10-NEXT: s_sub_i32 s10, s8, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo @@ -6538,22 +6536,22 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) { ; GFX10-NEXT: s_cmp_eq_u32 63, 0 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], s4, v[0:1] ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[14:15], s5, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 31, v2 -; GFX10-NEXT: 
v_cndmask_b32_e32 v23, 0, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v11, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX10-NEXT: s_movk_i32 s6, 0x41 ; GFX10-NEXT: s_and_b32 s4, 1, s4 ; GFX10-NEXT: s_sub_i32 s5, 64, s6 ; GFX10-NEXT: v_or_b32_e32 v12, v9, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v19, v14, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, v0, v8, vcc_lo ; GFX10-NEXT: v_lshlrev_b64 v[10:11], s5, v[6:7] ; GFX10-NEXT: v_lshrrev_b64 v[8:9], s6, v[4:5] ; GFX10-NEXT: s_sub_i32 s5, s6, 64 ; GFX10-NEXT: s_cmp_lt_u32 s6, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[15:16], s5, v[6:7] +; GFX10-NEXT: v_cndmask_b32_e32 v12, v1, v12, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s5, v[6:7] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: v_or_b32_e32 v8, v8, v10 @@ -6563,17 +6561,17 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) { ; GFX10-NEXT: s_and_b32 s5, 1, s5 ; GFX10-NEXT: s_and_b32 s6, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s5 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v7, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX10-NEXT: v_or_b32_e32 v1, v13, v1 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6921,10 +6919,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX10-NEXT: s_sub_i32 s31, 64, 1 ; GFX10-NEXT: s_cmp_lt_u32 1, 64 -; GFX10-NEXT: s_mov_b32 s62, s10 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 1, 0 -; GFX10-NEXT: s_mov_b32 s63, s11 ; GFX10-NEXT: s_cselect_b32 s23, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s31 ; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], 1 @@ -6935,23 +6931,23 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_cselect_b64 s[26:27], s[28:29], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s23, 0 -; GFX10-NEXT: s_cselect_b64 s[46:47], s[2:3], s[0:1] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] ; GFX10-NEXT: s_sub_i32 s23, s16, 64 ; GFX10-NEXT: s_sub_i32 s2, 64, s16 ; GFX10-NEXT: s_cmp_lt_u32 s16, 64 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s16, 0 ; GFX10-NEXT: s_cselect_b32 s29, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[46:47], s16 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[0:1], s16 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[26:27], s2 ; GFX10-NEXT: s_lshl_b64 s[16:17], s[26:27], s16 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[24:25] ; GFX10-NEXT: s_lshl_b64 s[24:25], s[26:27], s23 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 -; GFX10-NEXT: s_cselect_b64 s[78:79], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[24:25] ; GFX10-NEXT: 
s_cmp_lg_u32 s29, 0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[46:47], s[2:3] +; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3] ; GFX10-NEXT: s_sub_i32 s26, s22, 64 ; GFX10-NEXT: s_sub_i32 s23, 64, s22 ; GFX10-NEXT: s_cmp_lt_u32 s22, 64 @@ -6959,17 +6955,17 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_cmp_eq_u32 s22, 0 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s22 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[62:63], s23 -; GFX10-NEXT: s_lshr_b64 s[22:23], s[62:63], s22 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s23 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[10:11], s22 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[10:11], s[62:63], s26 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 -; GFX10-NEXT: s_or_b64 s[0:1], s[78:79], s[0:1] +; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] ; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] @@ -7413,7 +7409,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: s_sub_i32 s5, 64, 1 ; GFX10-NEXT: s_sub_i32 s6, 1, 64 ; GFX10-NEXT: s_cmp_lt_u32 1, 64 -; GFX10-NEXT: v_lshrrev_b64 v[27:28], s5, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[17:18], s5, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[21:22], 1, v[2:3] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 1, 0 @@ -7421,117 +7417,115 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo ; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v19, -1, v16 -; GFX10-NEXT: v_or_b32_e32 v21, v27, v21 -; GFX10-NEXT: v_or_b32_e32 v18, v28, v22 +; GFX10-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 +; GFX10-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX10-NEXT: v_xor_b32_e32 v19, -1, v16 ; GFX10-NEXT: s_movk_i32 s7, 0x7f ; GFX10-NEXT: s_and_b32 s8, 1, s8 -; GFX10-NEXT: v_and_b32_e32 v31, s7, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v17, 0, v23, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v18, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v21, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v18, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8 +; GFX10-NEXT: v_and_b32_e32 v25, s7, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v17, 0, v23, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v18, 0, v24, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v19, 64, v31 ; GFX10-NEXT: v_and_b32_e32 v26, s7, v16 -; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v31 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v19, v[17:18] -; GFX10-NEXT: v_mov_b32_e32 v35, v10 -; GFX10-NEXT: v_mov_b32_e32 v36, v11 -; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v26 -; GFX10-NEXT: v_lshlrev_b64 v[21:22], v31, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[23:24], v31, v[17:18] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v31 -; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v26 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v25 +; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v25 +; GFX10-NEXT: v_sub_nc_u32_e32 v19, 64, v26 +; GFX10-NEXT: v_lshlrev_b64 v[23:24], v25, v[17:18] +; 
GFX10-NEXT: v_lshlrev_b64 v[21:22], v25, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[17:18] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 +; GFX10-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v26 -; GFX10-NEXT: v_lshrrev_b64 v[27:28], s5, v[4:5] +; GFX10-NEXT: s_cmp_lt_u32 1, 64 ; GFX10-NEXT: v_or_b32_e32 v21, v2, v21 ; GFX10-NEXT: v_or_b32_e32 v22, v3, v22 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v16, v[17:18] -; GFX10-NEXT: v_lshlrev_b64 v[18:19], v25, v[35:36] ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9] +; GFX10-NEXT: v_lshlrev_b64 v[18:19], v19, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v39, 0, v24, vcc_lo -; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v24, 0, v24, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v21, v2, v21, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v22, v3, v22, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v29, v[35:36] -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v27, v[10:11] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25 ; GFX10-NEXT: v_or_b32_e32 v16, v16, v18 ; GFX10-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], 1, v[6:7] -; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: v_xor_b32_e32 v25, -1, v20 ; GFX10-NEXT: v_cndmask_b32_e32 v18, v21, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v31, v22, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v22, v22, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v16, s4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 -; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v26, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v19, v3, v17, s4 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_lshlrev_b64 v[10:11], 1, v[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[16:17], 1, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v21, v2, v8, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s5, v[4:5] +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], s6, v[4:5] +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: s_and_b32 s6, 1, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v21, v2, v8, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v25, -1, v20 -; GFX10-NEXT: v_or_b32_e32 v2, v27, v10 -; GFX10-NEXT: v_or_b32_e32 v3, v28, v11 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX10-NEXT: v_cmp_ne_u32_e64 s6, 0, s6 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX10-NEXT: s_and_b32 s8, 1, s8 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v26, v[35:36] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v16, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v19, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v11, v4, v2, s6 -; GFX10-NEXT: v_and_b32_e32 v30, s7, v25 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v2, s6 +; GFX10-NEXT: v_and_b32_e32 v25, s7, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v3, s6 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v17, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, v0, s4 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v30 ; GFX10-NEXT: v_or_b32_e32 v0, v23, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v23, s7, v20 -; GFX10-NEXT: v_lshrrev_b64 v[5:6], v2, v[8:9] -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 64, v30 -; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, v1, s4 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v30, v[3:4] +; GFX10-NEXT: v_cndmask_b32_e32 v3, v2, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v25 +; 
GFX10-NEXT: v_subrev_nc_u32_e32 v7, 64, v25 ; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v23 -; GFX10-NEXT: v_or_b32_e32 v1, v39, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, v1, s4 +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[3:4] +; GFX10-NEXT: v_lshrrev_b64 v[5:6], v2, v[8:9] +; GFX10-NEXT: v_or_b32_e32 v1, v24, v16 ; GFX10-NEXT: v_or_b32_e32 v2, v18, v19 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v30, v[8:9] +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v25, v[8:9] ; GFX10-NEXT: v_lshrrev_b64 v[18:19], v23, v[12:13] +; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] ; GFX10-NEXT: v_or_b32_e32 v10, v5, v10 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, 64, v23 -; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v30 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 ; GFX10-NEXT: v_lshlrev_b64 v[7:8], v7, v[8:9] ; GFX10-NEXT: v_or_b32_e32 v9, v6, v11 -; GFX10-NEXT: v_lshrrev_b64 v[34:35], v5, v[14:15] ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v23 +; GFX10-NEXT: v_lshrrev_b64 v[5:6], v5, v[14:15] ; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX10-NEXT: v_or_b32_e32 v18, v19, v21 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v7, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[7:8], v23, v[14:15] -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v30 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v34, v16, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v25 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v16, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v23 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v35, v18, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v18, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v15, v10, v3, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v3, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v4, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v12, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v13, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v7, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v8, s4 -; GFX10-NEXT: v_or_b32_e32 v3, v31, v26 +; GFX10-NEXT: v_or_b32_e32 v3, v22, v26 ; GFX10-NEXT: v_or_b32_e32 v4, v11, v4 ; GFX10-NEXT: v_or_b32_e32 v5, v14, v5 -; GFX10-NEXT: v_or_b32_e32 v6, v15, v6 +; GFX10-NEXT: v_or_b32_e32 v6, v10, v6 ; GFX10-NEXT: v_or_b32_e32 v7, v9, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index 9502d23..b4b0037 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -2235,8 +2235,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v11, v2, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v11, v6, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 ; GFX10-NEXT: v_and_or_b32 v7, v2, v7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 @@ -2482,8 +2482,8 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, v3, 
v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX10-NEXT: v_and_or_b32 v3, v3, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo @@ -2902,21 +2902,21 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: s_and_b32 s9, s2, s8 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s7, 7 ; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_mov_b32_e32 v13, 0 +; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_lshl_b32 s3, s3, 4 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: s_lshl_b32 s8, s8, s3 ; GFX10-NEXT: s_lshl_b32 s3, s9, s3 ; GFX10-NEXT: s_not_b32 s8, s8 +; GFX10-NEXT: v_mov_b32_e32 v13, 0 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v11, v2, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v0, v7, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v8, s6 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2 ; GFX10-NEXT: v_and_or_b32 v12, v0, s8, s3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s7, 0 @@ -3822,19 +3822,19 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v5, s0 ; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, s5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v1, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v9, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 ; GFX10-NEXT: v_and_or_b32 v13, v1, v11, v2 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 @@ -4020,16 +4020,16 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: s_lshl_b32 s7, s8, s7 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s6, 0 ; GFX10-NEXT: s_not_b32 s7, s7 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: v_mov_b32_e32 v14, 0 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v11, v0, v7, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v8, s4 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s4 ; 
GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v10, s5 ; GFX10-NEXT: v_and_or_b32 v13, v0, s7, v1 @@ -4201,6 +4201,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v15, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 @@ -4220,9 +4221,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v15, v1, v8, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v9, s3 -; GFX10-NEXT: v_mov_b32_e32 v15, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5 ; GFX10-NEXT: v_and_or_b32 v14, v1, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index 7ac27af..adf7a49 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -1638,11 +1638,11 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_and_b32_sdwa v4, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v6, v0, s0, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_lshl_b32 s1, s0, s1 ; GFX10-NEXT: s_not_b32 s1, s1 -; GFX10-NEXT: v_or3_b32 v0, v6, v4, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v4, v3 ; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 @@ -1794,9 +1794,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v0, s1, v3 -; GFX10-NEXT: v_or3_b32 v0, v3, v6, v4 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 ; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 @@ -1804,10 +1804,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v0, v3, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_or3_b32 v2, v0, v2, v4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v3, v2, v4 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr 
@@ -2324,13 +2324,13 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v11, v0, s1, v2 -; GFX10-NEXT: v_and_or_b32 v2, v1, s1, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v3 ; GFX10-NEXT: s_lshr_b32 s0, s3, 2 ; GFX10-NEXT: s_and_b32 s3, s3, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 -; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX10-NEXT: s_lshl_b32 s3, s3, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 ; GFX10-NEXT: s_lshl_b32 s4, s1, s3 @@ -2629,12 +2629,12 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v11, v0, s2, v2 -; GFX10-NEXT: v_and_or_b32 v2, v1, s2, v3 -; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 -; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -2905,20 +2905,20 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v11, v0, s2, v2 -; GFX10-NEXT: v_and_or_b32 v2, v1, s2, v3 -; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 -; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -3186,20 +3186,20 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX10-NEXT: s_mov_b32 
s0, 8 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v11, v0, s2, v2 -; GFX10-NEXT: v_and_or_b32 v2, v1, s2, v3 -; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 -; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -3397,7 +3397,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; ; GFX10-LABEL: insertelement_v_v8i8_s_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dwordx2 v[11:12], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v2 ; GFX10-NEXT: s_movk_i32 s1, 0xff @@ -3405,22 +3405,22 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v12 -; GFX10-NEXT: v_and_b32_sdwa v8, v11, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v8, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_sdwa v9, v12, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v9, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v11, v11, s1, v4 -; GFX10-NEXT: v_and_or_b32 v10, v12, s1, v5 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v4 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v5 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, s1 ; GFX10-NEXT: s_and_b32 s0, s2, s1 -; GFX10-NEXT: v_or3_b32 v0, v11, v8, v6 -; GFX10-NEXT: 
v_or3_b32 v1, v10, v9, v7 +; GFX10-NEXT: v_or3_b32 v0, v0, v8, v6 +; GFX10-NEXT: v_or3_b32 v1, v1, v9, v7 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 @@ -3906,34 +3906,34 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_and_b32_sdwa v11, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_and_or_b32 v15, v0, s1, v6 -; GFX10-NEXT: v_and_or_b32 v14, v1, s1, v7 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, v4, v5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_or3_b32 v0, v15, v10, v8 -; GFX10-NEXT: v_or3_b32 v1, v14, v11, v9 +; GFX10-NEXT: v_or3_b32 v0, v0, v10, v8 +; GFX10-NEXT: v_or3_b32 v1, v1, v11, v9 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v2, v7, v4, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v6, v4, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 8 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v4, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_b32_sdwa v4, v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v3, v0, v5, v3 -; GFX10-NEXT: v_and_or_b32 v1, v11, v5, v2 -; GFX10-NEXT: v_or3_b32 v0, v3, v8, v6 +; GFX10-NEXT: v_and_or_b32 v0, v0, v5, v3 +; GFX10-NEXT: v_and_or_b32 v1, v1, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_or3_b32 v1, v1, v4, v7 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_or3_b32 v0, v0, v8, v6 +; GFX10-NEXT: v_or3_b32 v1, v1, v4, v7 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -4820,60 +4820,60 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v13, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10 ; 
GFX10-NEXT: v_and_or_b32 v2, v2, s4, v9 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_or3_b32 v7, v1, v14, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_or3_b32 v1, v1, v14, v8 ; GFX10-NEXT: v_and_b32_sdwa v16, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v12 ; GFX10-NEXT: v_or3_b32 v2, v2, v15, v5 -; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v7, vcc_lo +; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 2 ; GFX10-NEXT: v_or3_b32 v3, v3, v16, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s1 ; GFX10-NEXT: v_and_or_b32 v5, v5, s3, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s5, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 ; GFX10-NEXT: v_and_b32_sdwa v13, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v5 -; GFX10-NEXT: v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_and_or_b32 v5, v2, s4, v9 ; GFX10-NEXT: v_and_b32_sdwa v14, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_sdwa v16, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v5 +; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX10-NEXT: v_and_or_b32 v18, v3, s4, v4 
-; GFX10-NEXT: v_or3_b32 v2, v5, v15, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_or3_b32 v0, v0, v13, v6 ; GFX10-NEXT: v_or3_b32 v1, v1, v14, v8 -; GFX10-NEXT: v_or3_b32 v3, v18, v16, v11 +; GFX10-NEXT: v_or3_b32 v3, v3, v16, v11 +; GFX10-NEXT: v_or3_b32 v2, v2, v15, v10 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -5323,12 +5323,11 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 2 -; GFX10-NEXT: v_and_b32_sdwa v0, v15, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo @@ -5337,7 +5336,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX10-NEXT: v_and_or_b32 v6, v1, s5, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5347,18 +5346,19 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_or_b32 v19, v15, s5, v4 +; GFX10-NEXT: v_and_or_b32 v4, v0, s5, v4 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_or_b32 v8, v2, s5, v8 -; GFX10-NEXT: v_and_b32_sdwa v15, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_and_or_b32 v9, v3, s5, v9 -; GFX10-NEXT: v_and_b32_sdwa v14, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX10-NEXT: v_or3_b32 v0, v19, v0, v5 +; GFX10-NEXT: v_or3_b32 v0, v4, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_or3_b32 v1, v6, v1, v7 -; GFX10-NEXT: v_or3_b32 v2, v8, v15, v10 -; GFX10-NEXT: v_or3_b32 v3, v9, v14, v11 +; GFX10-NEXT: v_or3_b32 v2, v8, v2, v10 +; GFX10-NEXT: v_or3_b32 v3, v9, v3, v11 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -5814,16 +5814,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> 
addrspace(4)* inreg % ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: s_mov_b32 s2, 8 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 @@ -5831,23 +5831,23 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GFX10-NEXT: v_and_or_b32 v19, v15, s5, v4 -; GFX10-NEXT: v_and_b32_sdwa v0, v15, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v4, v0, s5, v4 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_and_or_b32 v6, v1, s5, v6 +; GFX10-NEXT: v_and_or_b32 v8, v2, s5, v8 +; GFX10-NEXT: v_and_or_b32 v9, v3, s5, v9 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v8, v2, s5, v8 -; GFX10-NEXT: v_and_b32_sdwa v15, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_and_or_b32 v9, v3, s5, v9 -; GFX10-NEXT: v_and_b32_sdwa v14, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX10-NEXT: v_or3_b32 v0, v19, v0, v5 +; GFX10-NEXT: v_or3_b32 v0, v4, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_or3_b32 v1, v6, v1, v7 -; GFX10-NEXT: v_or3_b32 v2, v8, v15, v10 -; GFX10-NEXT: v_or3_b32 v3, v9, v14, v11 +; GFX10-NEXT: v_or3_b32 v2, v8, v2, v10 +; GFX10-NEXT: v_or3_b32 v3, v9, v3, v11 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -6300,16 +6300,16 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: s_mov_b32 s2, 8 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 
8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 @@ -6317,23 +6317,23 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GFX10-NEXT: v_and_or_b32 v19, v15, s8, v4 -; GFX10-NEXT: v_and_b32_sdwa v0, v15, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v4, v0, s8, v4 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_and_or_b32 v6, v1, s8, v6 +; GFX10-NEXT: v_and_or_b32 v8, v2, s8, v8 +; GFX10-NEXT: v_and_or_b32 v9, v3, s8, v9 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v8, v2, s8, v8 -; GFX10-NEXT: v_and_b32_sdwa v15, v2, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_and_or_b32 v9, v3, s8, v9 -; GFX10-NEXT: v_and_b32_sdwa v14, v3, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v3, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX10-NEXT: v_or3_b32 v0, v19, v0, v5 +; GFX10-NEXT: v_or3_b32 v0, v4, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_or3_b32 v1, v6, v1, v7 -; GFX10-NEXT: v_or3_b32 v2, v8, v15, v10 -; GFX10-NEXT: v_or3_b32 v3, v9, v14, v11 +; GFX10-NEXT: v_or3_b32 v2, v8, v2, v10 +; GFX10-NEXT: v_or3_b32 v3, v9, v3, v11 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -6659,7 +6659,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX10-LABEL: insertelement_v_v16i8_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v22, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_and_b32_e32 v0, 3, v2 @@ -6669,76 +6669,76 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v26, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: 
v_lshrrev_b32_e32 v12, 24, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 8, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_or_b32 v19, v4, s3, v9 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_sdwa v15, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_sdwa v16, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v26, v3, s3, v26 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_sdwa v17, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v6 +; GFX10-NEXT: v_or3_b32 v3, v3, v15, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v12 -; GFX10-NEXT: v_and_or_b32 v30, v5, s3, v11 -; GFX10-NEXT: v_or3_b32 v3, v26, v15, v8 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_or3_b32 v26, v19, v16, v10 +; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v11 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_or3_b32 v4, v4, v16, v10 ; GFX10-NEXT: v_and_b32_sdwa v18, v6, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or3_b32 v5, v30, v17, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v14 -; GFX10-NEXT: v_and_or_b32 v11, v6, s3, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v26, vcc_lo +; GFX10-NEXT: v_or3_b32 v5, v5, v17, v7 +; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 ; GFX10-NEXT: v_lshlrev_b32_e64 v9, v0, s3 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX10-NEXT: v_or3_b32 v6, v11, v18, v8 +; GFX10-NEXT: v_or3_b32 v6, v6, v18, v8 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v5, s0 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v9 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v6, s1 -; GFX10-NEXT: v_and_or_b32 v0, v7, v10, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v18, v26, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0 +; GFX10-NEXT: v_and_or_b32 v0, v7, v8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v0, s1 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v18 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v18 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v22, v7 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v13, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v19, v2, s3, v5 -; GFX10-NEXT: v_and_b32_sdwa v14, v18, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v2, v2, s3, v5 ; GFX10-NEXT: v_and_b32_sdwa v16, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v5, v0, s3, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX10-NEXT: v_and_or_b32 v3, v18, s3, v7 -; GFX10-NEXT: v_and_or_b32 v5, v0, s3, v1 ; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9 -; GFX10-NEXT: v_and_b32_sdwa v13, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_or3_b32 v0, v2, v13, v6 ; GFX10-NEXT: v_or3_b32 v1, v3, v14, v8 ; GFX10-NEXT: v_or3_b32 v3, v5, v16, v11 ; GFX10-NEXT: v_or3_b32 v2, v4, v15, v10 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_or3_b32 v0, v19, v13, v6 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -7063,7 +7063,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX10-LABEL: insertelement_v_v16i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v18, 8 +; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: s_lshr_b32 s4, s2, 2 @@ -7079,69 +7079,69 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v27, 8, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v6 ; GFX10-NEXT: v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v22, v4, s3, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_and_or_b32 v1, v3, s3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_and_or_b32 v3, v4, s3, v8 +; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_sdwa v23, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v16, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v6 ; GFX10-NEXT: v_or3_b32 v1, v1, v14, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v11 ; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_or3_b32 v3, v22, v15, v9 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_or3_b32 v3, v3, v15, v9 ; GFX10-NEXT: v_and_b32_sdwa v17, v6, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v13 -; GFX10-NEXT: v_or3_b32 v4, v5, v23, v4 +; GFX10-NEXT: v_or3_b32 v4, v5, v16, v4 ; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 -; GFX10-NEXT: v_or3_b32 v7, v6, v17, v7 +; GFX10-NEXT: v_or3_b32 v6, v6, v17, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v6, s1 ; GFX10-NEXT: v_and_or_b32 v2, v5, s2, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v1, v2, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v22, v3, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v2, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v19 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v19 -; GFX10-NEXT: v_and_b32_sdwa v13, v19, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v13, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v19, v19, s3, v5 -; GFX10-NEXT: v_and_b32_sdwa v14, v22, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v5 ; GFX10-NEXT: v_and_b32_sdwa v16, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v5, v2, s3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v3, v22, s3, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 ; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9 -; GFX10-NEXT: v_and_or_b32 v5, v2, s3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_or3_b32 v1, v3, v14, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v7 +; GFX10-NEXT: v_or3_b32 v0, v1, v13, v6 ; GFX10-NEXT: v_or3_b32 v2, v4, v15, v10 +; GFX10-NEXT: v_or3_b32 v1, v3, v14, v8 ; GFX10-NEXT: v_or3_b32 v3, v5, v16, v11 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_or3_b32 v0, v19, v13, v6 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -7489,66 +7489,66 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_and_b32_sdwa v18, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v6 ; GFX10-NEXT: v_and_or_b32 v4, v4, s1, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v15, 8, v7 ; GFX10-NEXT: v_and_or_b32 v5, v5, s1, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v15, 8, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_sdwa v19, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v14 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 24, v7 +; GFX10-NEXT: v_or3_b32 v4, v4, v17, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v14 ; GFX10-NEXT: v_and_or_b32 v6, v6, s1, v13 -; GFX10-NEXT: v_or3_b32 v15, v4, v17, v10 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_or3_b32 v5, v5, v18, v12 ; GFX10-NEXT: v_and_b32_sdwa v20, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v16 -; GFX10-NEXT: v_and_or_b32 v7, v7, v1, v14 ; GFX10-NEXT: v_or3_b32 v6, v6, v19, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v15, v5, vcc_lo +; GFX10-NEXT: v_and_or_b32 v7, v7, v1, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v4, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, v0, v1 -; GFX10-NEXT: v_or3_b32 v7, v7, v20, v10 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v3 +; GFX10-NEXT: v_or3_b32 v7, v7, v20, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v6, s0 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v7, s1 ; GFX10-NEXT: v_and_or_b32 v0, v9, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v0, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v7, v0, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v15, v0, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v18, v5, v0, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v19, 8, v27 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s1 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v18 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v18 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v27 -; GFX10-NEXT: v_lshlrev_b32_sdwa v23, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_sdwa v21, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v15, v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v19, v2, v1, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v14, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v15, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_sdwa v16, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v10, v4, v1, v10 -; GFX10-NEXT: v_and_b32_sdwa v17, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v2, v2, v1, v5 +; GFX10-NEXT: v_and_b32_sdwa v17, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v5, v0, v1, v8 ; 
GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v13 -; GFX10-NEXT: v_and_or_b32 v3, v27, v1, v8 -; GFX10-NEXT: v_and_or_b32 v2, v18, v1, v7 +; GFX10-NEXT: v_and_or_b32 v3, v3, v1, v7 +; GFX10-NEXT: v_and_or_b32 v4, v4, v1, v10 +; GFX10-NEXT: v_or3_b32 v0, v2, v14, v6 +; GFX10-NEXT: v_or3_b32 v1, v3, v15, v9 +; GFX10-NEXT: v_or3_b32 v2, v4, v16, v11 +; GFX10-NEXT: v_or3_b32 v3, v5, v17, v12 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_or3_b32 v0, v19, v21, v6 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_or3_b32 v3, v3, v17, v12 -; GFX10-NEXT: v_or3_b32 v1, v2, v15, v9 -; GFX10-NEXT: v_or3_b32 v2, v10, v16, v11 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index df72995..b5ccf47 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -989,8 +989,8 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; MOVREL-NEXT: s_mov_b32 s14, s16 ; MOVREL-NEXT: v_mov_b32_e32 v16, s15 ; MOVREL-NEXT: v_mov_b32_e32 v2, s1 -; MOVREL-NEXT: v_mov_b32_e32 v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; MOVREL-NEXT: v_mov_b32_e32 v1, s0 ; MOVREL-NEXT: v_mov_b32_e32 v15, s14 ; MOVREL-NEXT: v_mov_b32_e32 v14, s13 ; MOVREL-NEXT: v_mov_b32_e32 v13, s12 @@ -1005,30 +1005,28 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; MOVREL-NEXT: v_mov_b32_e32 v4, s3 ; MOVREL-NEXT: v_mov_b32_e32 v3, s2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; MOVREL-NEXT: s_mov_b32 s30, s18 -; MOVREL-NEXT: s_mov_b32 s31, s19 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v0 -; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s30, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s31, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 5, v0 -; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s30, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s31, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 4, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 6, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 7, v0 -; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s30, s1 -; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, s31, s1 -; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, s30, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s31, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, s30, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, s31, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, s30, s2 -; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, s31, s2 -; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, s30, s3 -; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s31, s3 -; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s30, s4 -; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s31, s4 +; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s18, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, s19, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, s18, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s19, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, s18, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, s19, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, s18, s2 +; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, s19, s2 +; 
MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, s18, s3
+; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s19, s3
+; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s18, s4
+; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s19, s4
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[1:4], off
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[5:8], off
@@ -1525,19 +1523,17 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
;
; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v:
; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18
-; MOVREL-NEXT: v_mov_b32_e32 v19, v0
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18
-; MOVREL-NEXT: v_mov_b32_e32 v23, v1
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 6, v18
-; MOVREL-NEXT: v_cndmask_b32_e32 v0, v19, v16, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0
-; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v16, s2
@@ -2161,8 +2157,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v_add_1:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_add_nc_u32_e32 v18, 1, v18
-; MOVREL-NEXT: v_mov_b32_e32 v19, v0
-; MOVREL-NEXT: v_mov_b32_e32 v23, v1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18
@@ -2171,9 +2165,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 6, v18
-; MOVREL-NEXT: v_cndmask_b32_e32 v0, v19, v16, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0
-; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v16, s2
@@ -3550,28 +3544,28 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: v_mov_b32_e32 v16, s7
-; MOVREL-NEXT: v_mov_b32_e32 v9, s0
+; MOVREL-NEXT: v_mov_b32_e32 v14, s7
+; MOVREL-NEXT: v_mov_b32_e32 v7, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 0
-; MOVREL-NEXT: v_mov_b32_e32 v10, s1
-; MOVREL-NEXT: v_mov_b32_e32 v11, s2
-; MOVREL-NEXT: v_mov_b32_e32 v12, s3
-; MOVREL-NEXT: v_mov_b32_e32 v13, s4
-; MOVREL-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo
+; MOVREL-NEXT: v_mov_b32_e32 v8, s1
+; MOVREL-NEXT: v_mov_b32_e32 v9, s2
+; MOVREL-NEXT: v_mov_b32_e32 v10, s3
+; MOVREL-NEXT: v_mov_b32_e32 v11, s4
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 1
-; MOVREL-NEXT: v_mov_b32_e32 v14, s5
-; MOVREL-NEXT: v_mov_b32_e32 v15, s6
-; MOVREL-NEXT: v_cndmask_b32_e32 v1, v10, v0, vcc_lo
+; MOVREL-NEXT: v_mov_b32_e32 v12, s5
+; MOVREL-NEXT: v_mov_b32_e32 v13, s6
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 2
-; MOVREL-NEXT: v_cndmask_b32_e32 v2, v11, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 3
-; MOVREL-NEXT: v_cndmask_b32_e32 v3, v12, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 4
-; MOVREL-NEXT: v_cndmask_b32_e32 v4, v13, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v11, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 5
-; MOVREL-NEXT: v_cndmask_b32_e32 v5, v14, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v12, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 6
-; MOVREL-NEXT: v_cndmask_b32_e32 v6, v15, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v13, v0, vcc_lo
; MOVREL-NEXT: v_mov_b32_e32 v0, v7
; MOVREL-NEXT: ; return to shader part epilog
entry:
@@ -3624,29 +3618,29 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: v_mov_b32_e32 v16, s7
-; MOVREL-NEXT: v_mov_b32_e32 v9, s0
+; MOVREL-NEXT: v_mov_b32_e32 v15, s7
+; MOVREL-NEXT: v_mov_b32_e32 v8, s0
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; MOVREL-NEXT: v_mov_b32_e32 v10, s1
-; MOVREL-NEXT: v_mov_b32_e32 v11, s2
-; MOVREL-NEXT: v_mov_b32_e32 v12, s3
-; MOVREL-NEXT: v_mov_b32_e32 v13, s4
-; MOVREL-NEXT: v_cndmask_b32_e32 v8, v9, v0, vcc_lo
+; MOVREL-NEXT: v_mov_b32_e32 v9, s1
+; MOVREL-NEXT: v_mov_b32_e32 v10, s2
+; MOVREL-NEXT: v_mov_b32_e32 v11, s3
+; MOVREL-NEXT: v_mov_b32_e32 v12, s4
+; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; MOVREL-NEXT: v_mov_b32_e32 v14, s5
-; MOVREL-NEXT: v_mov_b32_e32 v15, s6
-; MOVREL-NEXT: v_cndmask_b32_e32 v7, v10, v0, vcc_lo
+; MOVREL-NEXT: v_mov_b32_e32 v13, s5
+; MOVREL-NEXT: v_mov_b32_e32 v14, s6
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; MOVREL-NEXT: v_cndmask_b32_e32 v2, v11, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; MOVREL-NEXT: v_cndmask_b32_e32 v3, v12, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
-; MOVREL-NEXT: v_cndmask_b32_e32 v4, v13, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
-; MOVREL-NEXT: v_cndmask_b32_e32 v5, v14, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
; MOVREL-NEXT: v_mov_b32_e32 v1, v7
-; MOVREL-NEXT: v_cndmask_b32_e32 v6, v15, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc_lo
; MOVREL-NEXT: v_mov_b32_e32 v0, v8
; MOVREL-NEXT: ; return to shader part epilog
entry:
@@ -4128,23 +4122,21 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
; MOVREL-LABEL: dyn_insertelement_v7f64_v_v_v:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 6, v16
-; MOVREL-NEXT: v_mov_b32_e32 v19, v2
-; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v16
-; MOVREL-NEXT: v_mov_b32_e32 v18, v3
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2
; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v14, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4
; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v14, s5
; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2
-; MOVREL-NEXT: v_cndmask_b32_e64 v2, v19, v14, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v14, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3
-; MOVREL-NEXT: v_cndmask_b32_e64 v3, v18, v15, s0
-; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s4
; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v14, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v15, s5
@@ -4271,38 +4263,38 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
; MOVREL-NEXT: s_mov_b32 s7, s9
; MOVREL-NEXT: s_mov_b32 s8, s10
; MOVREL-NEXT: s_mov_b32 s9, s11
-; MOVREL-NEXT: v_mov_b32_e32 v20, s15
-; MOVREL-NEXT: v_mov_b32_e32 v19, s14
-; MOVREL-NEXT: v_mov_b32_e32 v18, s13
-; MOVREL-NEXT: v_mov_b32_e32 v17, s12
-; MOVREL-NEXT: v_mov_b32_e32 v16, s11
-; MOVREL-NEXT: v_mov_b32_e32 v15, s10
-; MOVREL-NEXT: v_mov_b32_e32 v14, s9
-; MOVREL-NEXT: v_mov_b32_e32 v13, s8
-; MOVREL-NEXT: v_mov_b32_e32 v12, s7
-; MOVREL-NEXT: v_mov_b32_e32 v11, s6
-; MOVREL-NEXT: v_mov_b32_e32 v10, s5
-; MOVREL-NEXT: v_mov_b32_e32 v9, s4
-; MOVREL-NEXT: v_mov_b32_e32 v8, s3
-; MOVREL-NEXT: v_mov_b32_e32 v7, s2
-; MOVREL-NEXT: v_mov_b32_e32 v6, s1
-; MOVREL-NEXT: v_mov_b32_e32 v5, s0
+; MOVREL-NEXT: v_mov_b32_e32 v17, s15
+; MOVREL-NEXT: v_mov_b32_e32 v16, s14
+; MOVREL-NEXT: v_mov_b32_e32 v15, s13
+; MOVREL-NEXT: v_mov_b32_e32 v14, s12
+; MOVREL-NEXT: v_mov_b32_e32 v13, s11
+; MOVREL-NEXT: v_mov_b32_e32 v12, s10
+; MOVREL-NEXT: v_mov_b32_e32 v11, s9
+; MOVREL-NEXT: v_mov_b32_e32 v10, s8
+; MOVREL-NEXT: v_mov_b32_e32 v9, s7
+; MOVREL-NEXT: v_mov_b32_e32 v8, s6
+; MOVREL-NEXT: v_mov_b32_e32 v7, s5
+; MOVREL-NEXT: v_mov_b32_e32 v6, s4
+; MOVREL-NEXT: v_mov_b32_e32 v5, s3
+; MOVREL-NEXT: v_mov_b32_e32 v4, s2
+; MOVREL-NEXT: v_mov_b32_e32 v3, s1
+; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 1
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, s12, 4
-; MOVREL-NEXT: v_cndmask_b32_e32 v2, v5, v0, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e64 v4, v7, v0, s0
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2
-; MOVREL-NEXT: v_cndmask_b32_e64 v5, v8, v1, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v0, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v1, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 3
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc_lo
; MOVREL-NEXT: v_readfirstlane_b32 s2, v4
-; MOVREL-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e32 v7, v10, v1, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e64 v8, v11, v0, s0
-; MOVREL-NEXT: v_cndmask_b32_e64 v9, v12, v1, s0
-; MOVREL-NEXT: v_cndmask_b32_e64 v0, v13, v0, s1
-; MOVREL-NEXT: v_cndmask_b32_e64 v1, v14, v1, s1
+; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v0, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v1, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v0, v10, v0, s1
+; MOVREL-NEXT: v_cndmask_b32_e64 v1, v11, v1, s1
; MOVREL-NEXT: v_readfirstlane_b32 s0, v2
; MOVREL-NEXT: v_readfirstlane_b32 s1, v3
; MOVREL-NEXT: v_readfirstlane_b32 s3, v5
@@ -4466,15 +4458,13 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec,
; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_s:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
-; MOVREL-NEXT: v_mov_b32_e32 v15, v2
-; MOVREL-NEXT: v_mov_b32_e32 v14, v3
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
; MOVREL-NEXT: v_readfirstlane_b32 s0, v0
; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
-; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
; MOVREL-NEXT: v_readfirstlane_b32 s3, v3
; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
@@ -4531,15 +4521,13 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_v:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
-; MOVREL-NEXT: v_mov_b32_e32 v15, v2
-; MOVREL-NEXT: v_mov_b32_e32 v14, v3
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
; MOVREL-NEXT: v_readfirstlane_b32 s0, v0
; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
-; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12
; MOVREL-NEXT: v_readfirstlane_b32 s2, v2
; MOVREL-NEXT: v_readfirstlane_b32 s3, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
index f3bc046..062c0ad 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -1828,10 +1828,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v8, s3
-; GFX10-NEXT: v_mov_b32_e32 v7, s2
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v7, v4
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v8, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 40
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
index b438719..9c01dda 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -29,7 +29,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
;
; GFX10NSA-LABEL: gather4_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -45,7 +45,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@@ -83,7 +83,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
;
; GFX10NSA-LABEL: gather4_cube:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
@@ -102,7 +102,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@@ -140,7 +140,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
;
; GFX10NSA-LABEL: gather4_2darray:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
@@ -159,7 +159,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@@ -195,7 +195,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
;
; GFX10NSA-LABEL: gather4_c_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -211,7 +211,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@@ -249,7 +249,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
;
; GFX10NSA-LABEL: gather4_cl_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
@@ -268,7 +268,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@@ -306,7 +306,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
;
; GFX10NSA-LABEL: gather4_c_cl_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff
@@ -325,7 +325,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@@ -361,7 +361,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
;
; GFX10NSA-LABEL: gather4_b_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -377,7 +377,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@@ -413,7 +413,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
;
; GFX10NSA-LABEL: gather4_c_b_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
@@ -429,7 +429,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@@ -467,7 +467,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
;
; GFX10NSA-LABEL: gather4_b_cl_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff
@@ -486,7 +486,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@@ -524,7 +524,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX10NSA-LABEL: gather4_c_b_cl_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff
@@ -543,7 +543,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
index 5e82ab8..f597fa9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
@@ -80,7 +80,7 @@ define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> in
;
; GFX10NSA-LABEL: gather4_2d_tfe:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v5, v0
@@ -101,7 +101,7 @@ define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10NSA-NEXT: v_mov_b32_e32 v2, v0
; GFX10NSA-NEXT: v_mov_b32_e32 v3, v0
; GFX10NSA-NEXT: v_mov_b32_e32 v4, v0
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
index 5226382..d19db8b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
@@ -65,16 +65,16 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
;
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v11, 0
+; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v7, v2
; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: v_mov_b32_e32 v12, v11
-; GFX10-NEXT: v_mov_b32_e32 v13, v11
-; GFX10-NEXT: v_mov_b32_e32 v14, v11
-; GFX10-NEXT: v_mov_b32_e32 v15, v11
-; GFX10-NEXT: v_mov_b32_e32 v0, v11
+; GFX10-NEXT: v_mov_b32_e32 v9, v8
+; GFX10-NEXT: v_mov_b32_e32 v10, v8
+; GFX10-NEXT: v_mov_b32_e32 v11, v8
+; GFX10-NEXT: v_mov_b32_e32 v12, v8
+; GFX10-NEXT: v_mov_b32_e32 v0, v8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
@@ -82,13 +82,13 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v1, v12
-; GFX10-NEXT: v_mov_b32_e32 v2, v13
-; GFX10-NEXT: v_mov_b32_e32 v3, v14
-; GFX10-NEXT: v_mov_b32_e32 v4, v15
+; GFX10-NEXT: v_mov_b32_e32 v1, v9
+; GFX10-NEXT: v_mov_b32_e32 v2, v10
+; GFX10-NEXT: v_mov_b32_e32 v3, v11
+; GFX10-NEXT: v_mov_b32_e32 v4, v12
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v11, v4, s[10:11]
+; GFX10-NEXT: global_store_dword v8, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0)
@@ -129,16 +129,16 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
;
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v11, 0
+; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v7, v2
; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: v_mov_b32_e32 v12, v11
-; GFX10-NEXT: v_mov_b32_e32 v13, v11
-; GFX10-NEXT: v_mov_b32_e32 v14, v11
-; GFX10-NEXT: v_mov_b32_e32 v15, v11
-; GFX10-NEXT: v_mov_b32_e32 v0, v11
+; GFX10-NEXT: v_mov_b32_e32 v9, v8
+; GFX10-NEXT: v_mov_b32_e32 v10, v8
+; GFX10-NEXT: v_mov_b32_e32 v11, v8
+; GFX10-NEXT: v_mov_b32_e32 v12, v8
+; GFX10-NEXT: v_mov_b32_e32 v0, v8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
@@ -146,13 +146,13 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v1, v12
-; GFX10-NEXT: v_mov_b32_e32 v2, v13
-; GFX10-NEXT: v_mov_b32_e32 v3, v14
-; GFX10-NEXT: v_mov_b32_e32 v4, v15
+; GFX10-NEXT: v_mov_b32_e32 v1, v9
+; GFX10-NEXT: v_mov_b32_e32 v2, v10
+; GFX10-NEXT: v_mov_b32_e32 v3, v11
+; GFX10-NEXT: v_mov_b32_e32 v4, v12
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v11, v4, s[10:11]
+; GFX10-NEXT: global_store_dword v8, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
index d4d526b..1f1b34b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
@@ -22,9 +22,9 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1
-; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
-; GFX10-NEXT: image_sample_d_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1
+; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3
+; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -35,14 +35,14 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
; GFX10-LABEL: sample_d_3d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v11, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10-NEXT: v_and_or_b32 v0, v0, v11, v1
-; GFX10-NEXT: v_and_or_b32 v1, v2, v11, s12
-; GFX10-NEXT: v_and_or_b32 v2, v3, v11, v4
-; GFX10-NEXT: v_and_or_b32 v3, v5, v11, s12
+; GFX10-NEXT: v_and_or_b32 v0, v0, v9, v1
+; GFX10-NEXT: v_and_or_b32 v1, v2, v9, s12
+; GFX10-NEXT: v_and_or_b32 v2, v3, v9, v4
+; GFX10-NEXT: v_and_or_b32 v3, v5, v9, s12
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@@ -72,9 +72,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2
+; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
-; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -85,10 +85,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_d_cl_1d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10-NEXT: v_and_or_b32 v0, v0, v7, s12
-; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12
+; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12
+; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@@ -102,10 +102,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1
-; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9
-; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1
+; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3
+; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -116,10 +116,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_c_d_cl_1d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12
-; GFX10-NEXT: v_and_or_b32 v2, v2, v7, s12
+; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12
+; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@@ -133,9 +133,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2
-; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v10
+; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v4
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@@ -165,9 +165,9 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1
-; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
-; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1
+; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3
+; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -196,9 +196,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2
+; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
-; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -209,10 +209,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10-NEXT: v_and_or_b32 v0, v0, v7, s12
-; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12
+; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12
+; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@@ -226,10 +226,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1
-; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9
-; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1
+; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3
+; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -240,10 +240,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_c_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12
-; GFX10-NEXT: v_and_or_b32 v2, v2, v7, s12
+; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12
+; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@@ -257,9 +257,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2
-; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v10
+; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v4
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@@ -273,9 +273,9 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3
-; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v11
+; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v5
; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@@ -289,9 +289,9 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3
-; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v11
+; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v5
; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index 72a9dbb..866bae4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -79,9 +79,9 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6
-; GFX10-NEXT: v_or3_b32 v7, v0, v1, v2
+; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2
; GFX10-NEXT: v_or3_b32 v1, v3, v4, v5
-; GFX10-NEXT: v_dot4_i32_i8 v0, v7, v1, v8
+; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%a.cast = bitcast <4 x i8> %a to i32
%b.cast = bitcast <4 x i8> %b to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index 70e4021..ffcc4ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -79,9 +79,9 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6
-; GFX10-NEXT: v_or3_b32 v7, v0, v1, v2
+; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2
; GFX10-NEXT: v_or3_b32 v1, v3, v4, v5
-; GFX10-NEXT: v_dot4_u32_u8 v0, v7, v1, v8
+; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%a.cast = bitcast <4 x i8> %a to i32
%b.cast = bitcast <4 x i8> %b to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
index 1b8689d..23cc4fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -351,8 +351,8 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 BB3_4
; GFX10-32-NEXT: ; %bb.2: ; %.demote
-; GFX10-32-NEXT: s_wqm_b32 s28, s12
-; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT: s_wqm_b32 s14, s12
+; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10-32-NEXT: BB3_3: ; %.continue
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
@@ -374,7 +374,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-64-NEXT: s_wqm_b64 exec, exec
; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc
-; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
; GFX10-64-NEXT: s_cbranch_execz BB3_3
; GFX10-64-NEXT: ; %bb.1: ; %.demote
; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec
@@ -383,7 +383,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17]
; GFX10-64-NEXT: BB3_3: ; %.continue
-; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT: s_waitcnt vmcnt(0)
; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0
@@ -487,8 +487,8 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 BB4_4
; GFX10-32-NEXT: ; %bb.2: ; %.demote
-; GFX10-32-NEXT: s_wqm_b32 s28, s12
-; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT: s_wqm_b32 s14, s12
+; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10-32-NEXT: BB4_3: ; %.continue
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0
@@ -510,7 +510,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-64-NEXT: s_waitcnt vmcnt(0)
; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc
-; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
; GFX10-64-NEXT: s_cbranch_execz BB4_3
; GFX10-64-NEXT: ; %bb.1: ; %.demote
; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec
@@ -519,7 +519,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17]
; GFX10-64-NEXT: BB4_3: ; %.continue
-; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
@@ -632,8 +632,8 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15]
; GFX10-64-NEXT: s_cbranch_scc0 BB5_2
; GFX10-64-NEXT: ; %bb.1: ; %.entry
-; GFX10-64-NEXT: s_wqm_b64 s[28:29], s[12:13]
-; GFX10-64-NEXT: s_and_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13]
+; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15]
; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index 38634ea..939b491 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -192,7 +192,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: ds_read_u8 v10, v0 offset:8
; GFX10-NEXT: ds_read_u8 v12, v0 offset:10
; GFX10-NEXT: ds_read_u8 v13, v0 offset:11
-; GFX10-NEXT: ds_read_u8 v25, v0 offset:12
+; GFX10-NEXT: ds_read_u8 v14, v0 offset:12
; GFX10-NEXT: ds_read_u8 v15, v0 offset:13
; GFX10-NEXT: ds_read_u8 v16, v0 offset:14
; GFX10-NEXT: ds_read_u8 v0, v0 offset:15
@@ -213,7 +213,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: s_waitcnt lgkmcnt(10)
; GFX10-NEXT: v_and_b32_e32 v6, v6, v11
; GFX10-NEXT: s_waitcnt lgkmcnt(9)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v21, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(8)
; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
@@ -221,7 +221,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
; GFX10-NEXT: v_and_b32_e32 v9, v13, v11
-; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v21
+; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v7
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(1)
@@ -230,7 +230,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: v_and_b32_e32 v0, v0, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT: v_and_or_b32 v10, v25, v11, v10
+; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
index 6dda1f4..eeef6bc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -158,11 +158,11 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: ds_read_u8 v5, v0 offset:6
; GFX10-NEXT: ds_read_u8 v6, v0 offset:7
; GFX10-NEXT: ds_read_u8 v7, v0 offset:9
-; GFX10-NEXT: ds_read_u8 v15, v0 offset:10
+; GFX10-NEXT: ds_read_u8 v8, v0 offset:10
; GFX10-NEXT: ds_read_u8 v9, v0 offset:11
; GFX10-NEXT: ds_read_u8 v10, v0
; GFX10-NEXT: ds_read_u8 v11, v0 offset:4
-; GFX10-NEXT: ds_read_u8 v14, v0 offset:8
+; GFX10-NEXT: ds_read_u8 v0, v0 offset:8
; GFX10-NEXT: v_mov_b32_e32 v12, 0xff
; GFX10-NEXT: v_mov_b32_e32 v13, 8
; GFX10-NEXT: s_movk_i32 s4, 0xff
@@ -182,19 +182,18 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
-; GFX10-NEXT: v_and_b32_e32 v8, v15, v12
+; GFX10-NEXT: v_and_b32_e32 v8, v8, v12
; GFX10-NEXT: s_waitcnt lgkmcnt(3)
; GFX10-NEXT: v_and_b32_e32 v9, v9, v12
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT: s_waitcnt lgkmcnt(1)
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7
; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_and_or_b32 v7, v14, v12, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3
@@ -266,9 +265,9 @@ define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: ds_read_u16 v1, v0 offset:2
; GFX10-NEXT: ds_read_u16 v2, v0 offset:6
; GFX10-NEXT: ds_read_u16 v3, v0 offset:10
-; GFX10-NEXT: ds_read_u16 v7, v0
-; GFX10-NEXT: ds_read_u16 v11, v0 offset:4
-; GFX10-NEXT: ds_read_u16 v15, v0 offset:8
+; GFX10-NEXT: ds_read_u16 v4, v0
+; GFX10-NEXT: ds_read_u16 v5, v0 offset:4
+; GFX10-NEXT: ds_read_u16 v6, v0 offset:8
; GFX10-NEXT: s_mov_b32 s4, 0xffff
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
; GFX10-NEXT: v_and_b32_e32 v0, s4, v1
@@ -280,11 +279,11 @@ define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
-; GFX10-NEXT: v_and_or_b32 v0, v7, s4, v0
+; GFX10-NEXT: v_and_or_b32 v0, v4, s4, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(1)
-; GFX10-NEXT: v_and_or_b32 v1, v11, s4, v1
+; GFX10-NEXT: v_and_or_b32 v1, v5, s4, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_and_or_b32 v2, v15, s4, v2
+; GFX10-NEXT: v_and_or_b32 v2, v6, s4, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
ret <3 x i32> %load
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index 73e1da0..0b8efd5e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -108,7 +108,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: ds_read_u8 v10, v0 offset:8
; GFX10-NEXT: ds_read_u8 v12, v0 offset:10
; GFX10-NEXT: ds_read_u8 v13, v0 offset:11
-; GFX10-NEXT: ds_read_u8 v25, v0 offset:12
+; GFX10-NEXT: ds_read_u8 v14, v0 offset:12
; GFX10-NEXT: ds_read_u8 v15, v0 offset:13
; GFX10-NEXT: ds_read_u8 v16, v0 offset:14
; GFX10-NEXT: ds_read_u8 v0, v0 offset:15
@@ -129,7 +129,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: s_waitcnt lgkmcnt(10)
; GFX10-NEXT: v_and_b32_e32 v6, v6, v11
; GFX10-NEXT: s_waitcnt lgkmcnt(9)
-; GFX10-NEXT: v_lshlrev_b32_sdwa v21, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(8)
; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
@@ -137,7 +137,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
; GFX10-NEXT: v_and_b32_e32 v9, v13, v11
-; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v21
+; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v7
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(1)
@@ -146,7 +146,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: v_and_b32_e32 v0, v0, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT: v_and_or_b32 v10, v25, v11, v10
+; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
@@ -242,11 +242,11 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: ds_read_u8 v5, v0 offset:6
; GFX10-NEXT: ds_read_u8 v6, v0 offset:7
; GFX10-NEXT: ds_read_u8 v7, v0 offset:9
-; GFX10-NEXT: ds_read_u8 v15, v0 offset:10
+; GFX10-NEXT: ds_read_u8 v8, v0 offset:10
; GFX10-NEXT: ds_read_u8 v9, v0 offset:11
; GFX10-NEXT: ds_read_u8 v10, v0
; GFX10-NEXT: ds_read_u8 v11, v0 offset:4
-; GFX10-NEXT: ds_read_u8 v14, v0 offset:8
+; GFX10-NEXT: ds_read_u8 v0, v0 offset:8
; GFX10-NEXT: v_mov_b32_e32 v12, 0xff
; GFX10-NEXT: v_mov_b32_e32 v13, 8
; GFX10-NEXT: s_movk_i32 s4, 0xff
@@ -266,19 +266,18 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
-; GFX10-NEXT: v_and_b32_e32 v8, v15, v12
+; GFX10-NEXT: v_and_b32_e32 v8, v8, v12
; GFX10-NEXT: s_waitcnt lgkmcnt(3)
; GFX10-NEXT: v_and_b32_e32 v9, v9, v12
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT: s_waitcnt lgkmcnt(1)
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7
; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_and_or_b32 v7, v14, v12, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3
@@ -410,27 +409,27 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1
; GFX10-NEXT: ds_write_b8 v0, v1
-; GFX10-NEXT: ds_write_b8 v0, v7 offset:1
+; GFX10-NEXT: ds_write_b8 v0, v4 offset:1
; GFX10-NEXT: ds_write_b8 v0, v5 offset:2
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v2
; GFX10-NEXT: ds_write_b8 v0, v6 offset:3
; GFX10-NEXT: ds_write_b8 v0, v2 offset:4
; GFX10-NEXT: ds_write_b8 v0, v1 offset:5
-; GFX10-NEXT: ds_write_b8 v0, v7 offset:6
+; GFX10-NEXT: ds_write_b8 v0, v4 offset:6
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v3
; GFX10-NEXT: ds_write_b8 v0, v5 offset:7
; GFX10-NEXT: ds_write_b8 v0, v3 offset:8
; GFX10-NEXT: ds_write_b8 v0, v1 offset:9
; GFX10-NEXT: ds_write_b8 v0, v2 offset:10
-; GFX10-NEXT: ds_write_b8 v0, v7 offset:11
+; GFX10-NEXT: ds_write_b8 v0, v4 offset:11
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 494593e..b390c73 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1608,12 +1608,8 @@ define <2 x i64> @v_lshr_v2i64(<2 x i64> %value, <2 x i64> %amount) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v10, v0
-; GFX10-NEXT: v_mov_b32_e32 v11, v1
-; GFX10-NEXT: v_mov_b32_e32 v7, v2
-; GFX10-NEXT: v_mov_b32_e32 v8, v3
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v4, v[10:11]
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], v6, v[7:8]
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v4, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v6, v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = lshr <2 x i64> %value, %amount
ret <2 x i64> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index fbf6d90..dddad69 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -585,12 +585,12 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3
; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v7
; GFX10-NEXT: v_mul_hi_u32 v7, v1, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v9
; GFX10-NEXT: v_add_co_u32 v1, s4, v6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v2, v2, v5, v7
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v11, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v6
; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i96 %num, %den
@@ -997,24 +997,24 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX10-NEXT: v_add_co_u32 v8, s5, v9, v12
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v18, s4, v13, v11
+; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v9, v10, v9
; GFX10-NEXT: v_mul_lo_u32 v10, v2, v5
-; GFX10-NEXT: v_add_co_u32 v11, s4, v18, v15
+; GFX10-NEXT: v_add_co_u32 v11, s4, v11, v15
; GFX10-NEXT: v_mul_hi_u32 v15, v2, v4
; GFX10-NEXT: v_add3_u32 v12, v14, v12, v13
; GFX10-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX10-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v2, s4, v11, v9
-; GFX10-NEXT: v_add_nc_u32_e32 v10, v3, v10
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v6, v0, v6
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX10-NEXT: v_add3_u32 v10, v10, v13, v7
+; GFX10-NEXT: v_add3_u32 v3, v3, v13, v7
; GFX10-NEXT: v_add3_u32 v4, v12, v14, v5
-; GFX10-NEXT: v_add3_u32 v1, v10, v15, v1
+; GFX10-NEXT: v_add3_u32 v1, v3, v15, v1
; GFX10-NEXT: v_add3_u32 v3, v1, v6, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2758,13 +2758,15 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v17
; GFX10-NEXT: v_mul_hi_u32 v27, v0, v10
; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s4
-; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8
-; GFX10-NEXT: v_mul_lo_u32 v15, v0, v15
+; GFX10-NEXT: v_mul_hi_u32 v29, v3, v9
+; GFX10-NEXT: v_mul_hi_u32 v31, v4, v9
; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v18
+; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v19, s4, v19, v20
; GFX10-NEXT: v_mul_lo_u32 v20, v2, v9
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
+; GFX10-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX10-NEXT: v_add_nc_u32_e32 v17, v17, v18
; GFX10-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX10-NEXT: v_add_co_u32 v18, s4, v19, v18
@@ -2781,7 +2783,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: v_mul_hi_u32 v21, v2, v8
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v22
-; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v17, s5, v18, v17
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s5
@@ -2791,11 +2793,10 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: v_mul_lo_u32 v25, v4, v8
; GFX10-NEXT: v_mul_lo_u32 v26, v3, v9
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v23
-; GFX10-NEXT: v_add3_u32 v18, v19, v29, v18
+; GFX10-NEXT: v_add3_u32 v18, v19, v22, v18
; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s4
-; GFX10-NEXT: v_mul_hi_u32 v29, v3, v9
; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27
-; GFX10-NEXT: v_add3_u32 v30, v21, v24, v23
+; GFX10-NEXT: v_add3_u32 v19, v21, v24, v23
; GFX10-NEXT: v_mul_lo_u32 v21, v2, v10
; GFX10-NEXT: v_add_co_u32 v22, s4, v25, v26
; GFX10-NEXT: v_mul_lo_u32 v24, v1, v11
@@ -2813,7 +2814,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: v_mul_hi_u32 v22, v2, v9
; GFX10-NEXT: v_add3_u32 v24, v25, v27, v24
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
-; GFX10-NEXT: v_add3_u32 v19, v30, v23, v20
+; GFX10-NEXT: v_add3_u32 v19, v19, v23, v20
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v26
; GFX10-NEXT: v_mul_hi_u32 v20, v1, v10
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
@@ -2822,120 +2823,119 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX10-NEXT: v_add3_u32 v23, v24, v25, v26
; GFX10-NEXT: v_mul_lo_u32 v24, v4, v9
-; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v21, v20
; GFX10-NEXT: v_mul_lo_u32 v26, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27
-; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v24
-; GFX10-NEXT: v_add3_u32 v35, v23, v30, v21
+; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v24
+; GFX10-NEXT: v_add3_u32 v21, v23, v25, v21
; GFX10-NEXT: v_mul_lo_u32 v23, v2, v11
-; GFX10-NEXT: v_cndmask_b32_e64 v34, 0, 1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v22, s4, v31, v26
+; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v26
; GFX10-NEXT: v_mul_lo_u32 v26, v1, v12
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v19, s5, v20, v19
-; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v23
+; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v23
; GFX10-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v22, s4, v31, v26
+; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v26
; GFX10-NEXT: v_mul_hi_u32 v26, v4, v8
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
-; GFX10-NEXT: v_add3_u32 v20, v35, v25, v20
-; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v23
-; GFX10-NEXT: v_add3_u32 v23, v34, v27, v28
+; GFX10-NEXT: v_add3_u32 v20, v21, v25, v20
+; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v23
+; GFX10-NEXT: v_add3_u32 v23, v24, v27, v28
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
-; GFX10-NEXT: v_mul_hi_u32 v22, v2, v10
+; GFX10-NEXT: v_mul_lo_u32 v27, v6, v8
; GFX10-NEXT: v_mul_lo_u32 v28, v5, v9
-; GFX10-NEXT: v_add_co_u32 v27, s4, v31, v26
-; GFX10-NEXT: v_mul_hi_u32 v26, v1, v11
+; GFX10-NEXT: v_add_co_u32 v21, s4, v22, v26
+; GFX10-NEXT: v_mul_hi_u32 v22, v2, v10
; GFX10-NEXT: v_add3_u32 v23, v23, v30, v24
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
-; GFX10-NEXT: v_add_co_u32 v21, s4, v27, v29
-; GFX10-NEXT: v_mul_lo_u32 v27, v6, v8
-; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
+; GFX10-NEXT: v_mul_hi_u32 v26, v1, v11
+; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v29
; GFX10-NEXT: v_mul_hi_u32 v29, v0, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_add3_u32 v23, v23, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v4, v10
-; GFX10-NEXT: v_cndmask_b32_e64 v33, 0, 1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v25, s4, v27, v28
-; GFX10-NEXT: v_add_co_u32 v31, s5, v21, v26
+; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v26
; GFX10-NEXT: v_mul_lo_u32 v27, v3, v11
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v24, s4, v25, v24
-; GFX10-NEXT: v_add_co_u32 v21, s5, v31, v29
-; GFX10-NEXT: v_add3_u32 v39, v23, v33, v26
+; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v29
+; GFX10-NEXT: v_add3_u32 v22, v23, v22, v26
; GFX10-NEXT: v_mul_lo_u32 v23, v2, v12
-; GFX10-NEXT: v_cndmask_b32_e64 v35, 0, 1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v24, s4, v24, v27
; GFX10-NEXT: v_mul_lo_u32 v27, v1, v13
; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s4
-; GFX10-NEXT: v_mul_hi_u32 v31, v4, v9
-; GFX10-NEXT: v_mul_hi_u32 v25, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5
-; GFX10-NEXT: v_add_co_u32 v34, s4, v24, v23
+; GFX10-NEXT: v_add_co_u32 v20, s5, v21, v20
+; GFX10-NEXT: v_add_co_u32 v23, s4, v24, v23
; GFX10-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
-; GFX10-NEXT: v_add3_u32 v35, v28, v35, v29
-; GFX10-NEXT: v_add_co_u32 v20, s5, v21, v20
-; GFX10-NEXT: v_add_co_u32 v23, s4, v34, v27
+; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5
+; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27
; GFX10-NEXT: v_mul_hi_u32 v27, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5
-; GFX10-NEXT: v_mul_lo_u32 v29, v3, v12
-; GFX10-NEXT: v_add_co_u32 v34, s4, v23, v24
-; GFX10-NEXT: v_mul_hi_u32 v3, v3, v11
-; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
-; GFX10-NEXT: v_add3_u32 v22, v35, v30, v32
-; GFX10-NEXT: v_add3_u32 v21, v39, v26, v21
-; GFX10-NEXT: v_add_co_u32 v34, s4, v34, v27
+; GFX10-NEXT: v_add3_u32 v21, v22, v26, v21
; GFX10-NEXT: v_mul_hi_u32 v26, v2, v11
+; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v24
+; GFX10-NEXT: v_add3_u32 v24, v28, v25, v29
+; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
+; GFX10-NEXT: v_mul_hi_u32 v25, v3, v10
+; GFX10-NEXT: v_mul_lo_u32 v29, v3, v12
+; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27
+; GFX10-NEXT: v_add3_u32 v24, v24, v30, v32
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
-; GFX10-NEXT: v_add_co_u32 v23, s4, v34, v31
+; GFX10-NEXT: v_mul_hi_u32 v3, v3, v11
+; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v31
+; GFX10-NEXT: v_add3_u32 v22, v24, v28, v27
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
-; GFX10-NEXT: v_add3_u32 v22, v22, v28, v27
; GFX10-NEXT: v_mul_lo_u32 v28, v6, v9
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v25
; GFX10-NEXT: v_mul_hi_u32 v27, v1, v12
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v6, v6, v8
-; GFX10-NEXT: v_add_co_u32 v30, s4, v23, v26
-; GFX10-NEXT: v_add3_u32 v33, v22, v24, v25
+; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v26
+; GFX10-NEXT: v_add3_u32 v22, v22, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10
; GFX10-NEXT: v_mul_lo_u32 v25, v4, v11
; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v28
; GFX10-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
-; GFX10-NEXT: v_add_co_u32 v23, s4, v30, v27
; GFX10-NEXT: v_mul_hi_u32 v5, v5, v9
-; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
+; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27
; GFX10-NEXT: v_mul_hi_u32 v4, v4, v10
+; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v7, v7, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v1, v14
; GFX10-NEXT: v_mul_hi_u32 v25, v0, v13
-; GFX10-NEXT: v_add3_u32 v33, v33, v26, v27
; GFX10-NEXT: v_mul_hi_u32 v2, v2, v12
-; GFX10-NEXT: v_add3_u32 v26, v7, v29, v28
; GFX10-NEXT: v_mul_hi_u32 v1, v1, v13
-; GFX10-NEXT: v_add3_u32 v7, v26, v24, v15
-; GFX10-NEXT: v_add_co_u32 v11, s4, v23, v25
+; GFX10-NEXT: v_add3_u32 v7, v7, v29, v28
+; GFX10-NEXT: v_add3_u32 v22, v22, v26, v27
+; GFX10-NEXT: v_add3_u32 v7, v7, v24, v15
+; GFX10-NEXT: v_add_co_u32 v9, s4, v23, v25
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v5, v7, v6, v5
-; GFX10-NEXT: v_add_co_u32 v6, s4, v11, v21
+; GFX10-NEXT: v_add_co_u32 v6, s4, v9, v21
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v3, v5, v4, v3
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v14
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v8
-; GFX10-NEXT: v_add3_u32 v5, v33, v10, v7
-; GFX10-NEXT: v_add3_u32 v3, v3, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v16
+; GFX10-NEXT: v_add3_u32 v5, v22, v10, v7
+; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v2, v17
-; GFX10-NEXT: v_add3_u32 v7, v3, v4, v5
; GFX10-NEXT: v_mov_b32_e32 v3, v18
+; GFX10-NEXT: v_add3_u32 v7, v1, v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v1, v16
; GFX10-NEXT: v_mov_b32_e32 v4, v19
; GFX10-NEXT: v_mov_b32_e32 v5, v20
; GFX10-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
index 1e0d7e88..16c4871 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -413,12 +413,12 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_rndne_f16_e32 v3, v1
; GFX10-NEXT: v_rndne_f16_e32 v2, v0
-; GFX10-NEXT: v_rndne_f16_sdwa v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_rndne_f16_e32 v3, v1
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v7
+; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0
; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 50fa5c7..12b3b54 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4199,16 +4199,16 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v11
-; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[0:1]
+; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
; GFX10-NEXT: v_add_co_u32 v0, s5, v6, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5
; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
@@ -4543,30 +4543,26 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v14, v0
-; GFX10-NEXT: v_mov_b32_e32 v15, v1
-; GFX10-NEXT: v_mov_b32_e32 v17, v2
-; GFX10-NEXT: v_mov_b32_e32 v18, v3
+; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v4
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5]
-; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v14, v4
+; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6
; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v15, v5, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v19, vcc_lo, v17, v6
-; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, v18, v7, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9
-; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[14:15]
-; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v20
+; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
+; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11
; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
-; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[19:20], v[17:18]
+; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo
; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
ret <2 x i64> %result
@@ -5327,7 +5323,6 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5]
; GFX10-NEXT: s_movk_i32 s0, 0x7f
; GFX10-NEXT: s_sub_i32 s1, 64, s0
-; GFX10-NEXT: v_lshrrev_b64 v[15:16], s0, v[4:5]
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
@@ -5335,33 +5330,34 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[8:9], s1, v[6:7]
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5]
; GFX10-NEXT: s_sub_i32 s1, s0, 64
; GFX10-NEXT: s_cmp_lt_u32 s0, 64
-; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX10-NEXT: v_or_b32_e32 v9, v16, v9
-; GFX10-NEXT: v_ashrrev_i32_e32 v15, 31, v7
+; GFX10-NEXT: v_or_b32_e32 v8, v0, v8
+; GFX10-NEXT: v_or_b32_e32 v9, v1, v9
+; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i64 v[2:3], s1, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc_lo
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX10-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
; GFX10-NEXT: s_and_b32 s0, 1, s1
; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s0
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0
; GFX10-NEXT:
v_add_co_u32 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 @@ -5569,64 +5565,60 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; ; GFX10-LABEL: saddsat_i128_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX10-NEXT: v_add_co_u32 v15, vcc_lo, v5, s0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, s2, v9, vcc_lo -; GFX10-NEXT: s_and_b32 s1, 1, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, s3, v10, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[15:16], v[5:6] +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[2:3], 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v20 +; GFX10-NEXT: s_and_b32 s1, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[19:20], v[9:10] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX10-NEXT: s_movk_i32 s0, 0x7f ; GFX10-NEXT: s_sub_i32 s2, 64, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e32 v10, v1, v0, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[15:16] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[19:20] ; GFX10-NEXT: s_sub_i32 s1, s0, 64 ; GFX10-NEXT: s_cmp_lt_u32 s0, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[19:20] +; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[6:7] ; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[19:20] +; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7] ; GFX10-NEXT: s_and_b32 s0, 1, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s0 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v1, s0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; 
GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v20, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5959,28 +5951,20 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v22, v0 -; GFX10-NEXT: v_mov_b32_e32 v23, v1 -; GFX10-NEXT: v_mov_b32_e32 v20, v2 -; GFX10-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-NEXT: v_add_co_u32 v16, vcc_lo, v0, v8 ; GFX10-NEXT: s_movk_i32 s5, 0x7f -; GFX10-NEXT: v_add_co_u32 v16, vcc_lo, v22, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo ; GFX10-NEXT: s_sub_i32 s6, 64, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v23, v9, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo ; GFX10-NEXT: s_sub_i32 s7, s5, 64 -; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v20, v10, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] ; GFX10-NEXT: s_cmp_lt_u32 s5, 64 -; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v21, v11, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[22:23] -; GFX10-NEXT: v_mov_b32_e32 v26, v4 -; GFX10-NEXT: v_mov_b32_e32 v27, v5 -; GFX10-NEXT: v_mov_b32_e32 v24, v6 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v25, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[20:21] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[20:21] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19] ; GFX10-NEXT: v_cndmask_b32_e32 v20, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0, v[8:9] ; GFX10-NEXT: v_lshrrev_b64 v[0:1], s5, v[16:17] @@ -5991,7 +5975,6 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_ashrrev_i64 v[0:1], s5, v[18:19] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: v_ashrrev_i64 v[8:9], s7, v[18:19] @@ -5999,33 +5982,34 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_and_b32 s8, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s4, 1, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v19 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8 ; GFX10-NEXT: v_xor_b32_e32 v9, v10, v20 ; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: 
v_cndmask_b32_e32 v2, v2, v16, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s4 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_add_co_u32 v8, s4, v26, v12 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v27, v13, s4 -; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s4, v24, v14, s4 +; GFX10-NEXT: v_add_co_u32 v8, s4, v4, v12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v5, v13, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s4, v6, v14, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s4, v25, v15, s4 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[26:27] +; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s4, v7, v15, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[4:5] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v20, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[3:4], s5, v[8:9] ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[24:25] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 ; GFX10-NEXT: v_cmp_gt_u64_e64 s4, 0, v[12:13] ; GFX10-NEXT: v_lshlrev_b64 v[12:13], s6, v[10:11] @@ -6035,7 +6019,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_or_b32_e32 v13, v4, v13 ; GFX10-NEXT: v_ashrrev_i64 v[3:4], s5, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[24:25] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v5, s4 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, 0, v[14:15] ; GFX10-NEXT: v_ashrrev_i64 v[5:6], s7, v[10:11] @@ -6049,13 +6033,13 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: s_and_b32 s6, 1, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 ; GFX10-NEXT: v_xor_b32_e32 v7, v14, v7 -; GFX10-NEXT: v_ashrrev_i32_e32 v18, 31, v11 +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v9, s4 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v18, v3, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v3, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v4, s5 ; GFX10-NEXT: v_add_co_u32 v5, s4, v5, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v6, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v7 @@ -6592,23 +6576,21 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s16, s0, s8 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_mov_b32 s46, s0 ; GFX10-NEXT: s_and_b32 s17, s17, 1 -; GFX10-NEXT: s_mov_b32 s47, s1 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_addc_u32 s17, s1, s9 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[46:47] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] ; GFX10-NEXT: s_and_b32 s18, s18, 1 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 -; GFX10-NEXT: s_addc_u32 s30, s2, s10 +; 
GFX10-NEXT: s_addc_u32 s18, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s19, s19, 1 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_addc_u32 s31, s3, s11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] -; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] +; GFX10-NEXT: s_addc_u32 s19, s3, s11 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[8:9], 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 @@ -6628,13 +6610,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: s_cselect_b32 s24, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[30:31], s22 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_ashr_i32 s10, s31, 31 +; GFX10-NEXT: s_ashr_i32 s10, s19, 31 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_ashr_i64 s[0:1], s[30:31], s20 -; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s21 +; GFX10-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 +; GFX10-NEXT: s_ashr_i64 s[8:9], s[18:19], s21 ; GFX10-NEXT: s_cmp_lg_u32 s23, 0 ; GFX10-NEXT: s_mov_b32 s11, s10 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9] @@ -6655,7 +6637,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 +; GFX10-NEXT: v_mov_b32_e32 v3, s19 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 @@ -6669,7 +6651,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s3, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo ; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s30 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 ; GFX10-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10-NEXT: s_addc_u32 s3, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 174df2d..4dcbd7c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1571,12 +1571,8 @@ define <2 x i64> @v_shl_v2i64(<2 x i64> %value, <2 x i64> %amount) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[10:11] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v6, v[7:8] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i64> %value, %amount ret <2 x i64> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index e4858b8..4e99dac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4185,16 +4185,16 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v0, v2 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] ; GFX10-NEXT: v_add_co_u32 v0, s5, v6, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result @@ -4529,30 +4529,26 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v14, v0 -; GFX10-NEXT: v_mov_b32_e32 v15, v1 -; GFX10-NEXT: v_mov_b32_e32 v17, v2 -; GFX10-NEXT: v_mov_b32_e32 v18, v3 +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v14, v4 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6 ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v15, v5, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v19, vcc_lo, v17, v6 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v18, v7, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo ; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[14:15] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v20 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 ; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[19:20], v[17:18] +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] ; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result @@ -5313,7 +5309,6 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] ; GFX10-NEXT: s_movk_i32 s0, 0x7f ; GFX10-NEXT: s_sub_i32 s1, 64, s0 -; GFX10-NEXT: v_lshrrev_b64 v[15:16], s0, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo @@ -5321,33 +5316,34 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 
v[8:9], s1, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] ; GFX10-NEXT: s_sub_i32 s1, s0, 64 ; GFX10-NEXT: s_cmp_lt_u32 s0, 64 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX10-NEXT: v_or_b32_e32 v9, v16, v9 -; GFX10-NEXT: v_ashrrev_i32_e32 v15, 31, v7 +; GFX10-NEXT: v_or_b32_e32 v8, v0, v8 +; GFX10-NEXT: v_or_b32_e32 v9, v1, v9 +; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i64 v[2:3], s1, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc_lo ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s0, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo ; GFX10-NEXT: s_and_b32 s0, 1, s1 ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s0 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 @@ -5555,64 +5551,60 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; ; GFX10-LABEL: ssubsat_i128_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v5, s0 +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0 ; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v19, vcc_lo, s2, v9, vcc_lo -; GFX10-NEXT: s_and_b32 s1, 1, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v20, vcc_lo, s3, v10, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[15:16], v[5:6] +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v20 +; GFX10-NEXT: s_and_b32 s1, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[19:20], v[9:10] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX10-NEXT: s_movk_i32 s0, 0x7f ; GFX10-NEXT: s_sub_i32 s2, 64, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, 
v[6:7] ; GFX10-NEXT: v_cndmask_b32_e32 v10, v1, v0, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[15:16] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[19:20] ; GFX10-NEXT: s_sub_i32 s1, s0, 64 ; GFX10-NEXT: s_cmp_lt_u32 s0, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[19:20] +; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[6:7] ; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[19:20] +; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7] ; GFX10-NEXT: s_and_b32 s0, 1, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s0 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v1, s0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v20, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5945,28 +5937,20 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v22, v0 -; GFX10-NEXT: v_mov_b32_e32 v23, v1 -; GFX10-NEXT: v_mov_b32_e32 v20, v2 -; GFX10-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8 ; GFX10-NEXT: s_movk_i32 s5, 0x7f -; GFX10-NEXT: v_sub_co_u32 v16, vcc_lo, v22, v8 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo ; GFX10-NEXT: s_sub_i32 s6, 64, s5 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v23, v9, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo ; GFX10-NEXT: s_sub_i32 s7, s5, 64 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v20, v10, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] ; GFX10-NEXT: s_cmp_lt_u32 s5, 64 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v21, v11, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[22:23] -; GFX10-NEXT: v_mov_b32_e32 v26, v4 -; GFX10-NEXT: v_mov_b32_e32 v27, v5 -; GFX10-NEXT: v_mov_b32_e32 v24, 
v6 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v25, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[20:21] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[20:21] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19] ; GFX10-NEXT: v_cndmask_b32_e32 v20, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] ; GFX10-NEXT: v_lshrrev_b64 v[0:1], s5, v[16:17] @@ -5977,7 +5961,6 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_ashrrev_i64 v[0:1], s5, v[18:19] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: v_ashrrev_i64 v[8:9], s7, v[18:19] @@ -5985,33 +5968,34 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_and_b32 s8, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s4, 1, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v19 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8 ; GFX10-NEXT: v_xor_b32_e32 v9, v10, v20 ; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s4 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_sub_co_u32 v8, s4, v26, v12 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s4, v27, v13, s4 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s4, v24, v14, s4 +; GFX10-NEXT: v_sub_co_u32 v8, s4, v4, v12 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s4, v5, v13, s4 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s4, v6, v14, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s4, v25, v15, s4 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[26:27] +; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s4, v7, v15, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[4:5] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v20, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[3:4], s5, v[8:9] ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[24:25] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, 0, v[12:13] ; GFX10-NEXT: v_lshlrev_b64 v[12:13], s6, v[10:11] @@ -6021,7 +6005,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_or_b32_e32 v13, v4, v13 ; GFX10-NEXT: v_ashrrev_i64 v[3:4], s5, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[24:25] +; 
GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v5, s4 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, 0, v[14:15] ; GFX10-NEXT: v_ashrrev_i64 v[5:6], s7, v[10:11] @@ -6035,13 +6019,13 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: s_and_b32 s6, 1, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 ; GFX10-NEXT: v_xor_b32_e32 v7, v14, v7 -; GFX10-NEXT: v_ashrrev_i32_e32 v18, 31, v11 +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v9, s4 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v18, v3, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v3, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v4, s5 ; GFX10-NEXT: v_add_co_u32 v5, s4, v5, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v6, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v7 @@ -6578,23 +6562,21 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s16, s0, s8 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_mov_b32 s46, s0 ; GFX10-NEXT: s_and_b32 s17, s17, 1 -; GFX10-NEXT: s_mov_b32 s47, s1 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_subb_u32 s17, s1, s9 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[46:47] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] ; GFX10-NEXT: s_and_b32 s18, s18, 1 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 -; GFX10-NEXT: s_subb_u32 s30, s2, s10 +; GFX10-NEXT: s_subb_u32 s18, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s19, s19, 1 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_subb_u32 s31, s3, s11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] -; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] +; GFX10-NEXT: s_subb_u32 s19, s3, s11 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 @@ -6614,13 +6596,13 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: s_cselect_b32 s24, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[30:31], s22 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_ashr_i32 s10, s31, 31 +; GFX10-NEXT: s_ashr_i32 s10, s19, 31 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_ashr_i64 s[0:1], s[30:31], s20 -; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s21 +; GFX10-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 +; GFX10-NEXT: s_ashr_i64 s[8:9], s[18:19], s21 ; GFX10-NEXT: s_cmp_lg_u32 s23, 0 ; GFX10-NEXT: s_mov_b32 s11, s10 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9] @@ -6641,7 +6623,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 +; GFX10-NEXT: v_mov_b32_e32 v3, s19 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 @@ -6655,7 +6637,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg 
%lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s3, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo ; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s30 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 ; GFX10-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10-NEXT: s_subb_u32 s3, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index f6fc451..8c1bc5f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -176,22 +176,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: s_lshr_b32 s9, s6, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: v_mov_b32_e32 v15, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_lshr_b32 s0, s6, 24 ; GFX10-NEXT: v_mov_b32_e32 v6, s3 ; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, s4 -; GFX10-NEXT: v_mov_b32_e32 v19, s8 +; GFX10-NEXT: v_mov_b32_e32 v9, s8 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:1 -; GFX10-NEXT: ds_write_b8 v1, v15 offset:2 +; GFX10-NEXT: ds_write_b8 v1, v5 offset:2 ; GFX10-NEXT: ds_write_b8 v1, v6 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:5 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:6 -; GFX10-NEXT: ds_write_b8 v1, v19 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_lshr_b32 s0, s7, 8 ; GFX10-NEXT: s_lshr_b32 s1, s7, 16 @@ -202,12 +202,12 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: s_lshr_b32 s2, s7, 24 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v7, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:12 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:13 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:14 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:15 +; GFX10-NEXT: ds_write_b8 v1, v5 offset:15 ; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void @@ -286,7 +286,7 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: s_lshr_b32 s2, s6, 16 ; GFX10-NEXT: s_lshr_b32 s3, s7, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s7 -; GFX10-NEXT: v_mov_b32_e32 v11, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, s3 @@ -294,7 +294,7 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b16 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b16 v1, v4 offset:12 -; GFX10-NEXT: ds_write_b16 v1, v11 offset:2 +; GFX10-NEXT: ds_write_b16 v1, v5 offset:2 ; GFX10-NEXT: ds_write_b16 v1, v6 offset:6 ; GFX10-NEXT: ds_write_b16 v1, v7 offset:10 ; GFX10-NEXT: ds_write_b16 v1, v8 offset:14 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index 88277f4..c96a98f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -147,12 +147,12 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, ; GFX10-NEXT: s_lshr_b32 
s3, s12, 24 ; GFX10-NEXT: s_lshr_b32 s6, s14, 8 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v15, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, s5 ; GFX10-NEXT: s_lshr_b32 s2, s13, 8 ; GFX10-NEXT: s_lshr_b32 s4, s13, 16 ; GFX10-NEXT: s_lshr_b32 s7, s14, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s14 -; GFX10-NEXT: v_mov_b32_e32 v11, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: s_lshr_b32 s8, s14, 24 ; GFX10-NEXT: v_mov_b32_e32 v6, s3 ; GFX10-NEXT: v_mov_b32_e32 v10, s6 @@ -161,13 +161,13 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:1 -; GFX10-NEXT: ds_write_b8 v1, v11 offset:2 +; GFX10-NEXT: ds_write_b8 v1, v5 offset:2 ; GFX10-NEXT: ds_write_b8 v1, v6 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:5 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:6 ; GFX10-NEXT: v_mov_b32_e32 v0, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: ds_write_b8 v1, v15 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 @@ -239,13 +239,13 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, ; GFX10-NEXT: v_mov_b32_e32 v3, s14 ; GFX10-NEXT: s_lshr_b32 s2, s14, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: ds_write_b16 v1, v0 ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b16 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b16 v1, v4 offset:2 -; GFX10-NEXT: ds_write_b16 v1, v7 offset:6 +; GFX10-NEXT: ds_write_b16 v1, v5 offset:6 ; GFX10-NEXT: ds_write_b16 v1, v6 offset:10 ; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 0e23a16..681b8f0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2819,20 +2819,16 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v10, v4 -; GFX10-NEXT: v_mov_b32_e32 v11, v5 -; GFX10-NEXT: v_mov_b32_e32 v15, v6 -; GFX10-NEXT: v_mov_b32_e32 v16, v7 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v10 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v11, vcc_lo -; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v2, v15 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v3, v16, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[10:11] -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[5:6], v[15:16] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, -1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, -1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) 
ret <2 x i64> %result @@ -3203,22 +3199,22 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; ; GFX10-LABEL: uaddsat_i128_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, s0, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[0:1] +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[2:3] +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[2:3] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v10, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -3435,33 +3431,25 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v18, v8 -; GFX10-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-NEXT: v_mov_b32_e32 v16, v10 -; GFX10-NEXT: v_mov_b32_e32 v17, v11 -; GFX10-NEXT: v_mov_b32_e32 v10, v12 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v18 -; GFX10-NEXT: v_mov_b32_e32 v11, v13 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v19, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v20, v14 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v16, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v21, v15 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v17, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[18:19] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v10 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v11, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v20, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v21, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[16:17] +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo 
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[10:11] +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[20:21] +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15] ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[16:17] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[20:21] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15] ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v12, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index f5c9bb5..b71703a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -2689,16 +2689,12 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v10, v4 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v11, v5, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[4:5] -; GFX10-NEXT: v_sub_co_u32 v4, s4, v0, v6 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v1, v7, s4 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[0:1], v[6:7] +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_sub_co_u32 v4, s4, v2, v6 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v3, v7, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, s4 @@ -2974,7 +2970,7 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s10, s10, 1 ; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: s_subb_u32 s14, s2, s6 +; GFX10-NEXT: s_subb_u32 s10, s2, s6 ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 ; GFX10-NEXT: s_and_b32 s11, s11, 1 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 @@ -2989,7 +2985,7 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, s14, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -3305,41 +3301,33 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v22, v0 -; GFX10-NEXT: v_mov_b32_e32 v23, v1 -; GFX10-NEXT: v_mov_b32_e32 v20, v2 -; GFX10-NEXT: v_mov_b32_e32 v21, v3 -; GFX10-NEXT: v_mov_b32_e32 v26, v4 -; GFX10-NEXT: v_mov_b32_e32 v27, v5 -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[22:23], v[8:9] -; GFX10-NEXT: v_mov_b32_e32 v24, v6 -; GFX10-NEXT: v_mov_b32_e32 v25, v7 +; 
GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[6:7], v[14:15] ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[20:21], v[10:11] -; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[24:25], v[14:15] +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[10:11] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[26:27], v[12:13] +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX10-NEXT: v_and_b32_e32 v16, 1, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[24:25], v[14:15] +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15] ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v22, v8 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v23, v9, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v8 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v8, v18, v17, s5 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v20, v10, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v21, v11, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v26, v12 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v12 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v27, v13, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s4 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, v24, v14, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v8 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v25, v15, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s5 @@ -3630,7 +3618,7 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_and_b32 s1, s1, 1 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[4:5], s[12:13] -; GFX10-NEXT: s_subb_u32 s30, s6, s14 +; GFX10-NEXT: s_subb_u32 s10, s6, s14 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s0, s0, 1 @@ -3656,7 +3644,7 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_readfirstlane_b32 s2, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, s30, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, 0, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s3, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 921e0b3..5f28f31 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -500,12 +500,12 @@ define amdgpu_kernel 
void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB2_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 +; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB2_2: @@ -551,11 +551,11 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB2_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 +; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB2_2: @@ -1680,12 +1680,12 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 +; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB9_2: @@ -1731,11 +1731,11 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 +; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB9_2: @@ -2534,12 +2534,12 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB14_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 +; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB14_2: @@ -2585,11 +2585,11 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB14_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 +; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB14_2: @@ -2768,12 +2768,12 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB15_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 +; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB15_2: @@ -2819,11 +2819,11 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB15_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 +; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB15_2: @@ -3002,12 +3002,12 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB16_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 +; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB16_2: @@ -3053,11 +3053,11 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB16_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 +; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB16_2: @@ -3238,12 +3238,12 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB17_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 +; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB17_2: @@ -3291,11 +3291,11 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: 
s_cbranch_execz BB17_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 +; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB17_2: @@ -3655,12 +3655,12 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB19_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 +; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB19_2: @@ -3708,11 +3708,11 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB19_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 +; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB19_2: @@ -4070,12 +4070,12 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB21_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 +; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB21_2: @@ -4121,11 +4121,11 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB21_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 +; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB21_2: @@ -4480,12 +4480,12 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB23_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 +; GFX1064-NEXT: ds_min_rtn_u32 
v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB23_2: @@ -4531,11 +4531,11 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB23_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 +; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB23_2: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 2781993..765a681 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -92,7 +92,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX1064-NEXT: s_cbranch_execz BB0_3 ; GFX1064-NEXT: ; %bb.2: ; GFX1064-NEXT: s_bcnt1_i32_b64 s12, s[12:13] @@ -101,7 +101,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i ; GFX1064-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX1064-NEXT: BB0_3: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s4 @@ -328,14 +328,14 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in ; GFX1064-NEXT: s_mov_b64 exec, s[10:11] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_3 ; GFX1064-NEXT: ; %bb.2: ; GFX1064-NEXT: v_mov_b32_e32 v0, s12 ; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX1064-NEXT: BB1_3: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index ab20b16..8213a37 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -233,7 +233,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc_lo -; GFX10-NEXT: 
global_store_dwordx2 v7, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 27c5fa4..e0b30ad 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1121,7 +1121,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2 @@ -1144,9 +1144,9 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 -; GFX10-NEXT: global_store_dword v11, v6, s[0:1] offset:24 -; GFX10-NEXT: global_store_dwordx2 v11, v[4:5], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dword v8, v6, s[0:1] offset:24 +; GFX10-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index a1c8e48..9dcffcd 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -90,8 +90,8 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ds_write_b32 v3, v2 offset:12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_fmas_f32 v7, s0, s0, s0 -; GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v4, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm entry: @@ -340,8 +340,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ds_write2_b32 v4, v2, v3 offset1:1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_fmas_f32 v7, s0, s0, s0 -; GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: v_div_fmas_f32 v5, s0, s0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v5, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll b/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll index 8c12686..1ec14d1 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll @@ -20,9 +20,7 @@ define { double, double } @testfn(double %arg, double %arg1, double %arg2) { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[0:1] +; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], -v[0:1] ; GFX10-NEXT: v_add_f64 v[0:1], 
v[4:5], -v[2:3] ; GFX10-NEXT: v_add_f64 v[2:3], -v[2:3], -v[4:5] ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 39af8c1..e465320 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -970,11 +970,11 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace( ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v15, 0 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v15, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v15, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -989,7 +989,7 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace( ; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX10-NEXT: global_store_dwordx2 v15, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm double addrspace(1)* %in2) #0 { %r0 = load double, double addrspace(1)* %in1, align 8 @@ -1141,10 +1141,10 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], v[6:7], v[4:5], v[4:5] -; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[14:15] +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[14:15], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] @@ -1299,10 +1299,10 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], v[6:7], v[4:5], v[4:5] -; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[14:15] +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[14:15], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] @@ -1893,49 +1893,49 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v11, s[6:7] -; GFX10-NEXT: 
global_load_dwordx2 v[15:16], v11, s[2:3] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v16 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX10-NEXT: v_rcp_f32_e32 v6, v6 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GFX10-NEXT: v_rcp_f32_e32 v7, v7 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX10-NEXT: v_div_fixup_f16 v5, v6, v16, v1 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5 -; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v16 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v10, v5 +; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 ; GFX10-NEXT: v_rcp_f32_e32 v7, v7 -; GFX10-NEXT: v_div_fixup_f16 v5, v10, v3, v4 -; GFX10-NEXT: v_trunc_f16_e32 v10, v5 -; GFX10-NEXT: v_fmac_f16_e64 v4, -v10, v3 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 +; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX10-NEXT: v_trunc_f16_e32 v5, v5 +; GFX10-NEXT: v_fmac_f16_e64 v1, -v5, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: v_and_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v15 -; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v5 ; GFX10-NEXT: v_rcp_f32_e32 v6, v6 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX10-NEXT: v_div_fixup_f16 v5, v5, v15, v0 +; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5 -; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v15 +; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX10-NEXT: v_rcp_f32_e32 v7, v7 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 ; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 @@ -1943,7 +1943,7 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; GFX10-NEXT: v_fmac_f16_e64 v0, -v5, v2 ; GFX10-NEXT: v_and_b32_e32 v2, v3, v6 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX10-NEXT: global_store_dwordx2 v11, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm <4 x half> addrspace(1)* %in2) #0 { %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4 @@ -2161,11 +2161,11 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v11, s[6:7] -; GFX10-NEXT: 
global_load_dwordx2 v[2:3], v11, s[2:3] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 @@ -2178,26 +2178,26 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; GFX10-NEXT: v_fma_f32 v8, v9, v7, v8 ; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v6, v5, v7, v8 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v2, v0 -; GFX10-NEXT: v_div_fixup_f32 v5, v6, v3, v1 -; GFX10-NEXT: v_trunc_f32_e32 v6, v5 +; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1 +; GFX10-NEXT: v_trunc_f32_e32 v5, v5 +; GFX10-NEXT: v_fma_f32 v1, v3, -v5, v1 ; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0 -; GFX10-NEXT: v_fma_f32 v1, v3, -v6, v1 +; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 ; GFX10-NEXT: v_rcp_f32_e32 v6, v5 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; GFX10-NEXT: v_fma_f32 v6, v7, v6, v6 -; GFX10-NEXT: v_mul_f32_e32 v7, v4, v6 -; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v4 +; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3 ; GFX10-NEXT: v_fma_f32 v7, v8, v6, v7 -; GFX10-NEXT: v_fma_f32 v5, -v5, v7, v4 +; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v3, v5, v6, v7 +; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7 ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 ; GFX10-NEXT: v_fmac_f32_e64 v0, -v3, v2 -; GFX10-NEXT: global_store_dwordx2 v11, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm <2 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 @@ -2538,11 +2538,11 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[15:18], v8, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v18 -; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v18, v7, v18 +; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3 +; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 ; GFX10-NEXT: v_rcp_f32_e32 v11, v10 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0 @@ -2553,55 +2553,55 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9 ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12 -; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, v17, v6, v17 -; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v18 +; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX10-NEXT: v_trunc_f32_e32 v9, v9 -; GFX10-NEXT: v_fma_f32 v18, v7, -v9, v18 -; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v17 +; GFX10-NEXT: v_fma_f32 v3, v7, -v9, v3 +; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2 +; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 ; GFX10-NEXT: v_rcp_f32_e32 v10, v9 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; GFX10-NEXT: v_fma_f32 v10, v11, v10, v10 -; GFX10-NEXT: v_mul_f32_e32 v11, v0, v10 -; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v0 +; 
GFX10-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7 ; GFX10-NEXT: v_fma_f32 v11, v12, v10, v11 -; GFX10-NEXT: v_fma_f32 v1, -v9, v11, v0 +; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v7, v1, v10, v11 -; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v17 +; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX10-NEXT: v_trunc_f32_e32 v7, v7 -; GFX10-NEXT: v_fma_f32 v17, v6, -v7, v17 -; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v16 -; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v16, v5, v16 +; GFX10-NEXT: v_fma_f32 v2, v6, -v7, v2 +; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1 +; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 ; GFX10-NEXT: v_rcp_f32_e32 v9, v7 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0 ; GFX10-NEXT: v_fma_f32 v9, v10, v9, v9 -; GFX10-NEXT: v_mul_f32_e32 v0, v6, v9 -; GFX10-NEXT: v_fma_f32 v11, -v7, v0, v6 -; GFX10-NEXT: v_fma_f32 v0, v11, v9, v0 -; GFX10-NEXT: v_fma_f32 v6, -v7, v0, v6 +; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6 +; GFX10-NEXT: v_fma_f32 v10, v11, v9, v10 +; GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v0 -; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v16 +; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; GFX10-NEXT: v_trunc_f32_e32 v6, v6 -; GFX10-NEXT: v_fma_f32 v16, v5, -v6, v16 -; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v15 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v15, v4, v15 +; GFX10-NEXT: v_fma_f32 v1, v5, -v6, v1 +; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0 +; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 ; GFX10-NEXT: v_rcp_f32_e32 v7, v6 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0 ; GFX10-NEXT: v_fma_f32 v7, v9, v7, v7 -; GFX10-NEXT: v_mul_f32_e32 v0, v5, v7 -; GFX10-NEXT: v_fma_f32 v10, -v6, v0, v5 -; GFX10-NEXT: v_fma_f32 v0, v10, v7, v0 -; GFX10-NEXT: v_fma_f32 v5, -v6, v0, v5 +; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7 +; GFX10-NEXT: v_fma_f32 v10, -v6, v9, v5 +; GFX10-NEXT: v_fma_f32 v9, v10, v7, v9 +; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v0 -; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v15 +; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9 +; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_fmac_f32_e64 v15, -v5, v4 -; GFX10-NEXT: global_store_dwordx4 v8, v[15:18], s[4:5] +; GFX10-NEXT: v_fmac_f32_e64 v0, -v5, v4 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm <4 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 @@ -2842,34 +2842,34 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[18:21], v16, s[2:3] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[20:21], v[20:21], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[18:19], v[18:19], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3] ; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX10-NEXT: 
v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[20:21], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] ; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] ; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] -; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[20:21], v[2:3] +; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[20:21], v[2:3] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[18:19], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] ; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] ; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] -; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[18:19], v[0:1] +; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; GFX10-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[18:19], v[0:1] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] ; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm <2 x double> addrspace(1)* %in2) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 550fa502..2648fde 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -843,31 +843,31 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX10-NEXT: v_and_b32_e32 v9, 15, v6 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX10-NEXT: v_and_b32_e32 v15, 15, v8 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10 -; GFX10-NEXT: v_and_b32_e32 v19, 15, v6 -; GFX10-NEXT: v_lshlrev_b16 v0, v15, v0 +; GFX10-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5 ; GFX10-NEXT: v_lshrrev_b16 v4, v9, v7 -; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v19, v10 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 15, v11 ; GFX10-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX10-NEXT: v_or_b32_e32 v11, v6, v4 +; GFX10-NEXT: v_lshlrev_b16 v6, v6, v10 +; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 15, v11 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v11, 16, v0 +; GFX10-NEXT: 
v_or_b32_e32 v4, v6, v4 +; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) @@ -1005,28 +1005,28 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GFX10-NEXT: v_lshlrev_b16 v11, 1, v11 ; GFX10-NEXT: v_lshlrev_b16 v7, v9, v8 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v10 ; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 ; GFX10-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX10-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX10-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX10-NEXT: v_and_b32_e32 v9, 15, v9 ; GFX10-NEXT: v_and_b32_e32 v10, 15, v10 -; GFX10-NEXT: v_and_b32_e32 v15, 15, v8 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_lshrrev_b16 v4, v13, v12 ; GFX10-NEXT: v_lshlrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v15, v0 ; GFX10-NEXT: v_lshlrev_b16 v5, v9, v11 -; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: v_or_b32_e32 v3, v7, v6 -; GFX10-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 ; GFX10-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v7, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) @@ -1085,9 +1085,9 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) { ; GFX10-NEXT: v_not_b32_e32 v5, v4 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 63, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1172,18 +1172,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2 ; GFX10-NEXT: v_not_b32_e32 v11, v10 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v15, 63, v8 -; GFX10-NEXT: v_and_b32_e32 v19, 63, v9 -; GFX10-NEXT: v_and_b32_e32 v9, 63, v10 -; GFX10-NEXT: v_and_b32_e32 v13, 63, v11 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v15, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[11:12], v19, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v9, v[6:7] -; GFX10-NEXT: v_lshlrev_b64 v[15:16], v13, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v0, v11, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v12, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v15, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v16, v7 +; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 +; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 +; 
GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %ret @@ -1331,10 +1331,10 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v7, 8, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 8, v5 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v7 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) ret <2 x i24> %ret diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 2d55883..32e4f58 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -466,8 +466,8 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: v_mad_f32 v7, -v2, v0, v7 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v7|, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s0, 0, v2, s0 -; GFX10-NEXT: global_store_short v[5:6], v7, off +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, 0, v2, s0 +; GFX10-NEXT: global_store_short v[5:6], v2, off ; GFX10-NEXT: s_cbranch_vccz BB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -546,16 +546,16 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: v_mul_f32_e32 v8, v7, v1 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 -; GFX10-NEXT: v_trunc_f32_e32 v10, v8 -; GFX10-NEXT: v_mad_f32 v7, -v10, v0, v7 -; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v10 +; GFX10-NEXT: v_trunc_f32_e32 v8, v8 +; GFX10-NEXT: v_mad_f32 v7, -v8, v0, v7 +; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, s4 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v7, v2, v7 -; GFX10-NEXT: global_store_short v[5:6], v7, off +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v7 +; GFX10-NEXT: global_store_short v[5:6], v2, off ; GFX10-NEXT: s_cbranch_vccz BB5_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -646,8 +646,8 @@ define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX10-NEXT: v_cmp_ge_f32_e64 s1, |v7|, |v0| ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v8, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v2, v7 -; GFX10-NEXT: global_store_short v[5:6], v7, off +; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v7 +; GFX10-NEXT: global_store_short v[5:6], v2, off ; GFX10-NEXT: s_cbranch_vccz BB6_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -725,14 +725,14 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: v_bfe_i32 v7, v4, 0, 16 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v7 +; GFX10-NEXT: v_cvt_f32_i32_e32 v5, v7 ; GFX10-NEXT: v_xor_b32_e32 v6, s1, v7 -; GFX10-NEXT: v_mul_f32_e32 v8, v11, v1 +; GFX10-NEXT: v_mul_f32_e32 v8, v5, v1 ; 
GFX10-NEXT: v_ashrrev_i32_e32 v6, 30, v6 -; GFX10-NEXT: v_trunc_f32_e32 v10, v8 +; GFX10-NEXT: v_trunc_f32_e32 v8, v8 ; GFX10-NEXT: v_or_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_mad_f32 v5, -v10, v0, v11 -; GFX10-NEXT: v_cvt_i32_f32_e32 v8, v10 +; GFX10-NEXT: v_mad_f32 v5, -v8, v0, v5 +; GFX10-NEXT: v_cvt_i32_f32_e32 v8, v8 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v5|, |v0| ; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc_lo ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] @@ -742,8 +742,8 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, s1 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, v7, v2 -; GFX10-NEXT: global_store_short v[5:6], v7, off +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v7, v2 +; GFX10-NEXT: global_store_short v[5:6], v2, off ; GFX10-NEXT: s_cbranch_vccz BB7_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index 36951b7..7a3fea9 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2732,11 +2732,11 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v7, v5, v4 +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index e3b5f81..6f44f2a 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -341,21 +341,21 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX10-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v10, v2, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GFX10-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v6, v6, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v10, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_bfe_i32 v4, v8, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v10, v9, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v7, v9, 0, 8 ; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v10, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -534,7 +534,7 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; 
GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -543,7 +543,7 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4 +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] @@ -718,14 +718,14 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 8 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v7, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX10-DL-NEXT: v_mul_i32_i24_e32 v5, v0, v3 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s2 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v7, v0, v5 +; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -908,13 +908,13 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshrrev_b16 v0, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_lshrrev_b16 v3, 8, v2 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v7, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v7, s2, v0 +; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 5747d4f..ad5a0a5 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -327,17 +327,17 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v1 -; GFX10-DL-NEXT: v_and_b32_e32 v10, s0, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v6 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v10, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: 
v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v10, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -517,7 +517,7 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -526,7 +526,7 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4 +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] @@ -841,7 +841,7 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -850,7 +850,7 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v5, v11, v4 +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4 ; GFX10-DL-NEXT: v_mad_u16 v0, v7, v6, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] @@ -1025,17 +1025,17 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* % ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v5, v7, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v5, v7, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v4, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -1215,14 +1215,14 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: 
v_and_b32_e32 v3, s3, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v0, v3 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v7, v0, v5 +; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -1412,11 +1412,11 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v6, v0, v3, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s2, v6 -; GFX10-DL-NEXT: v_add3_u32 v0, v6, v4, v3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s2, v0 +; GFX10-DL-NEXT: v_add3_u32 v0, v0, v4, v3 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] @@ -1622,7 +1622,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v9, v2, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -1631,7 +1631,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_mad_u16 v3, v6, v9, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] @@ -1809,13 +1809,13 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_and_b32_sdwa v0, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_and_b32_sdwa v3, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v7, s2, v0 +; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -2230,7 +2230,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1 ; GFX10-DL-NEXT: v_mul_lo_u16 v9, v6, v7 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v3, v1, v2, v3 +; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: v_lshlrev_b16 v4, 8, v4 ; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v8 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2239,7 +2239,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v5 +; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5 ; GFX10-DL-NEXT: v_mad_u16 v1, v6, v7, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index 84a21ad..d0cde94 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -644,26 +644,26 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v18, 12, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v18, v17, v3 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 @@ -672,13 +672,13 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v9 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 
12, v6
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10
-; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v15, v1
+; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
@@ -686,13 +686,13 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v5
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
-; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v7, v1
+; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
 ; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
 ; GFX10-DL-XNACK-NEXT: s_endpgm
 ;
@@ -722,55 +722,55 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v1
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v18, v17, v3
+; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v8
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10
-; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v15, v9, v0
+; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v4
 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
-; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v5, v0
+; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
 ; GFX10-DL-LABEL: idot8_acc16:
@@ -1218,26 +1218,26 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v18, 12, v1
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v18, v17, v3
+; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15
@@ -1246,13 +1246,13 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v9
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10
-; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v15, v1
+; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
@@ -1260,13 +1260,13 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v5
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
-; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v7, v1
+; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
 ; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1]
 ; GFX10-DL-XNACK-NEXT: s_endpgm
 ;
@@ -1296,55 +1296,55 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v1
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v18, v17, v3
+; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v8
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10
-; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v15, v9, v0
+; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v4
 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
-; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v5, v0
+; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
 ; GFX10-DL-LABEL: idot8_acc8:
@@ -1713,25 +1713,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s2
-; GFX10-DL-XNACK-NEXT: v_bfe_i32 v15, v1, 16, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v10, v2, 16, 4
 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v12, v2, 20, 4
-; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v14, v0, v7, v5
+; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v0, v0, v7, v5
 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v1, 24, 4
 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v13, v2, 24, 4
-; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v15, v10
 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9
-; GFX10-DL-XNACK-NEXT: v_add3_u32 v15, v14, v3, v4
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12
 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v2, 28, v2
-; GFX10-DL-XNACK-NEXT: v_add3_u32 v6, v15, v8, v6
-; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v7, v1, v2
-; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v6, v3, v4
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v8, v6
+; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v1, v1, v2
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v7, v5
+; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v5
 ; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-XNACK-NEXT: s_endpgm
 ;
@@ -1765,25 +1765,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s2
-; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v15, v1, 16, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 16, 4
 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v12, v0, 20, 4
 ; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, v2, v7, v5
 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v1, 24, 4
 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v13, v0, 24, 4
-; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v15, v10
 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9
-; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v15, v2, v3, v4
+; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v3, v4
 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12
 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v0, 28, v0
-; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v15, v8, v6
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v8, v6
 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v0, v1, v0
-; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v3, v2, v3, v4
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v3, v4
 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v3, v0, v5
+; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5
 ; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
 ; GFX10-DL-LABEL: idot8_multiuses_mul1:
@@ -2550,7 +2550,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11
 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v19, v2, 24, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v12, v2, 24, 4
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v2
 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 16, 4
@@ -2577,7 +2577,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, v4, v5
 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1]
-; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v19
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v12
 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v9, v2
 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v10
 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
@@ -2592,9 +2592,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v7, v1, v4
+; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v4
 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3
-; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v7, v5
+; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v5
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v2
 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2
 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
@@ -2638,7 +2638,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11
 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13
-; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v19, v0, 24, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v12, v0, 24, 4
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v0
 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 16, 4
@@ -2665,7 +2665,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, v4, v5
 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1]
-; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v19
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v12
 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v9, v0
 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v10
 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
@@ -2676,11 +2676,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5
-; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v7, v0, v6
+; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v6
 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v7, v4
+; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v4
 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v3
 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v5
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
@@ -3196,7 +3196,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v19, 0
+; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0
 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
@@ -3207,7 +3207,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: s_clause 0x1
 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v19, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1]
 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
@@ -3250,8 +3250,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
-; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12
+; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14
@@ -3262,13 +3262,13 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v0, v11
 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v23, 12, v12
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12
 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6
 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 8, v10
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8
-; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v5, v23
+; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v5, v12
 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v9
 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -3284,12 +3284,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1
 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v9, v8
 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v2
-; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v5, v23, v0
+; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v5, v12, v0
 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v7, v14, v0
 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-XNACK-NEXT: global_store_byte v19, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[0:1]
 ; GFX10-DL-XNACK-NEXT: s_endpgm
 ;
 ; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
@@ -3297,7 +3297,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v19, 0
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0
 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
@@ -3308,7 +3308,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v19, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
@@ -3347,7 +3347,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
-; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v23, v9, v0
+; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v0, v9, v0
 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17
@@ -3360,7 +3360,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15
-; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v9
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v12
@@ -3390,7 +3390,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0
 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NOXNACK-NEXT: global_store_byte v19, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT: s_endpgm
 ; GFX10-DL-LABEL: idot8_acc8_vecMul:
 ; GFX10-DL: ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index aa8fc55..d3bb2a4 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -494,31 +494,31 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3]
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4
-; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
+; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
+; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 24, 4
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3]
 ; GFX10-DL-NEXT: s_endpgm
@@ -812,31 +812,31 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4
-; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
+; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
+; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 24, 4
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT: s_endpgm
@@ -1134,31 +1134,31 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4
-; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
+; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
+; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0
 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
@@ -1441,31 +1441,31 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
 ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4
-; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
+; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
+; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
+; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0
 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
@@ -2373,49 +2373,49 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2
 ; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 4, 4
-; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 4, 4
-; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
+; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
+; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 8, 4
 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6
-; GFX10-DL-NEXT: v_bfe_u32 v19, v1, 8, 4
-; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4
-; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13
+; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
+; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
+; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12
 ; GFX10-DL-NEXT: v_lshl_or_b32 v7, v9, 16, v7
-; GFX10-DL-NEXT: v_lshl_or_b32 v6, v15, 16, v6
+; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6
 ; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4
-; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v19
-; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
-; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13
+; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4
+; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13
+; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v7, v6
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4
 ; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12
-; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v23, 28, v1
+; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 16, v6
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v6, v3
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10
 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 20, 4
 ; GFX10-DL-NEXT: v_and_b32_e32 v11, v4, v11
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
-; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v12
+; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 24, 4
 ; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
+; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9
 ; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v9
-; GFX10-DL-NEXT: v_add_nc_u16 v14, v3, v9
 ; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v10
 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v5
 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v6
-; GFX10-DL-NEXT: v_add_nc_u16 v3, v14, v7
+; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7
 ; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v9
-; GFX10-DL-NEXT: v_lshl_or_b32 v4, v23, 16, v4
+; GFX10-DL-NEXT: v_lshl_or_b32 v4, v8, 16, v4
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v1
+; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v1
 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2
-; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v5
+; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
@@ -2762,7 +2762,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v19, 0
+; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0
 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT: s_mov_b32 s10, -1
@@ -2773,7 +2773,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT: s_clause 0x1
 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: global_load_ubyte v3, v19, s[0:1]
+; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1]
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
 ; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4
 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2794,7 +2794,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v14
 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 20, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 24, 4
-; GFX10-DL-NEXT: v_bfe_u32 v23, v2, 16, 4
+; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 16, 4
 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT: v_mul_lo_u16 v1, v1, v15
 ; GFX10-DL-NEXT: v_or_b32_e32 v8, v8, v9
@@ -2804,7 +2804,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v2
 ; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1
 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8
-; GFX10-DL-NEXT: v_mul_lo_u16 v11, v5, v23
+; GFX10-DL-NEXT: v_mul_lo_u16 v11, v5, v12
 ; GFX10-DL-NEXT: v_or_b32_e32 v7, v10, v7
 ; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9
 ; GFX10-DL-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -2820,12 +2820,12 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1
 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v9, v8
 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2
-; GFX10-DL-NEXT: v_mad_u16 v0, v5, v23, v0
+; GFX10-DL-NEXT: v_mad_u16 v0, v5, v12, v0
 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7
 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v13, v0
 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NEXT: global_store_byte v19, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1]
 ; GFX10-DL-NEXT: s_endpgm
 <8 x i4> addrspace(1)* %src2,
 i8 addrspace(1)* nocapture %dst) {
@@ -3115,7 +3115,6 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 4, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 4, 4
 ; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 8, 4
-; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 24, 4
 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v5
 ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 8, 4
 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v7
@@ -3133,12 +3132,13 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4
 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v8
 ; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
+; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4
 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v7
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v11, v8
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6
+; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v8
 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2
 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5
 ; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
index dedda14..b4d0399 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -448,22 +448,22 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspa
 ;
 ; GFX10-LABEL: load_3d_tfe_lwe:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
 ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
+; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -579,22 +579,22 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace
 ;
 ; GFX10-LABEL: load_cube_lwe:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
 ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
+; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -837,22 +837,22 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrsp
 ;
 ; GFX10-LABEL: load_2darray_lwe:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
 ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
+; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -968,22 +968,22 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrsp
 ;
 ; GFX10-LABEL: load_2dmsaa_both:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
 ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
+; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -1361,22 +1361,22 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspa
 ;
 ; GFX10-LABEL: load_mip_2d_tfe:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
 ; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00]
 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
+; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
index 0e0ea50..e393248 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -566,10 +566,10 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ;
 ; GFX10-LABEL: sample_d_2d:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v4, v7, v4
-; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v4, v6, v4
+; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
+; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0
@@ -650,14 +650,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ;
 ; GFX10-LABEL: sample_c_d_2d:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
-; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
-; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v5, v7, v5
+; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
+; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
-; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -707,9 +707,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
 ; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2
-; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
-; GFX10-NEXT: image_sample_d_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -762,8 +762,8 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
-; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -804,10 +804,10 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ;
 ; GFX10-LABEL: sample_cd_2d:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v4, v7, v4
-; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v4, v6, v4
+; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
+; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0
@@ -854,14 +854,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ;
 ; GFX10-LABEL: sample_c_cd_2d:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
-; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
-; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v5, v7, v5
+; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
+; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
-; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -911,9 +911,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
 ; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2
-; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
-; GFX10-NEXT: image_sample_cd_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -966,8 +966,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
-; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -1162,8 +1162,8 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
 ; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
-; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -1196,8 +1196,8 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
 ; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
-; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
index 47765ca..9005052 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -96,13 +96,13 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32>
 ;
 ; GFX10-LABEL: image_sample_2d_f16_tfe:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: s_mov_b32 s28, exec_lo
+; GFX10-NEXT: s_mov_b32 s14, exec_lo
 ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
 ; GFX10-NEXT: v_mov_b32_e32 v5, v4
 ; GFX10-NEXT: v_mov_b32_e32 v2, v4
 ; GFX10-NEXT: v_mov_b32_e32 v3, v5
-; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14
 ; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index f75fe13..6e9daac 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -79,7 +79,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ;
 ; GFX10-LABEL: sample_1d_tfe:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe]
+; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe]
 ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe]
 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
@@ -92,7 +92,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
-; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87]
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87]
 ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00]
 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00]
@@ -499,7 +499,7 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ;
 ; GFX10-LABEL: sample_1d_lwe:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe]
+; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe]
 ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe]
 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
@@ -512,7 +512,7 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
 ; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
-; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87]
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87]
 ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00]
 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
index b4f89e5..6a3248e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
@@ -15,12 +15,12 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_d_2d:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
-; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
+; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36]
+; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36]
 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
-; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
+; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
+; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -58,9 +58,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_d_2d:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36]
-; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36]
+; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36]
+; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36]
 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06]
@@ -89,8 +89,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
 ; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
-; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
+; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
+; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -139,12 +139,12 @@ main_body:
 define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_cd_2d:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
-; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
+; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36]
+; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36]
 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
-; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
+; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
+; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -166,9 +166,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_cd_2d:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36]
-; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36]
+; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36]
+; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36]
 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06]
@@ -197,8 +197,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
 ; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
-; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
+; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
+; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
index 002d4e6..7c20bc6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
@@ -15,12 +15,12 @@ main_body:
 define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_d_2d:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
+; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
-; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -58,9 +58,9 @@ main_body:
 define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
 ; GFX10-LABEL: sample_c_d_2d:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
-; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
+; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -89,8 +89,8 @@ define amdgpu_ps <4 x
float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 -; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -139,12 +139,12 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 -; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -166,9 +166,9 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -197,8 +197,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 9edd1a3..e88b70f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -356,8 +356,8 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote -; GFX10-32-NEXT: s_wqm_b32 s28, s12 -; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: s_wqm_b32 s14, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-32-NEXT: BB3_3: ; %.continue ; GFX10-32-NEXT: 
s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D @@ -379,7 +379,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; GFX10-64-NEXT: s_cbranch_execz BB3_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec @@ -388,7 +388,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] ; GFX10-64-NEXT: BB3_3: ; %.continue -; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 @@ -492,8 +492,8 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote -; GFX10-32-NEXT: s_wqm_b32 s28, s12 -; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: s_wqm_b32 s14, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-32-NEXT: BB4_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 @@ -515,7 +515,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; GFX10-64-NEXT: s_cbranch_execz BB4_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec @@ -524,7 +524,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] ; GFX10-64-NEXT: BB4_3: ; %.continue -; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -637,8 +637,8 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32 ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] ; GFX10-64-NEXT: s_cbranch_scc0 BB5_2 ; GFX10-64-NEXT: ; %bb.1: ; %.entry -; GFX10-64-NEXT: s_wqm_b64 s[28:29], s[12:13] -; GFX10-64-NEXT: s_and_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index bdeda3e..9b2f8aa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -157,25 +157,25 @@ define { i64, i1 } @smulo_i64_s_s(i64 %x, i64 %y) { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_lo_u32 v15, v0, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, v0, v3 ; GFX10-NEXT: v_mul_hi_u32 v5, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v6, v0, v3 ; GFX10-NEXT: v_mul_lo_u32 v8, v1, v2 ; GFX10-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX10-NEXT: v_mul_hi_i32 v9, v1, v3 ; GFX10-NEXT: v_mul_lo_u32 v11, v1, v3 -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v5, v15 +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v5, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v10, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v7, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v9, vcc_lo -; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v6, v11 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v11 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v9, vcc_lo, v11, v2 +; GFX10-NEXT: v_sub_co_u32 v9, vcc_lo, v6, v2 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_add3_u32 v1, v5, v15, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v9, vcc_lo +; GFX10-NEXT: v_add3_u32 v1, v5, v4, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v6, v0 @@ -461,8 +461,8 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1] ; GFX10-NEXT: v_alignbit_b32 v3, v1, v0, 30 -; GFX10-NEXT: v_ashrrev_i64 v[6:7], 2, v[4:5] -; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1] +; GFX10-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5] +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 4f48c06..d4fa0b3 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -539,15 +539,15 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ; GFX10-LABEL: v_lshr_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] offset:8 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index d9962d2..d686af2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -330,12 +330,12 @@ define void @load_global_d16_hi(i16 addrspace(1)* %in, i16 %reg, <2 x i16> addrs ; GCN-SCRATCH: ; %bb.0: ; %entry ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v6, v2 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v5, v2 ; 
GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: global_load_short_d16_hi v6, v[0:1], off +; GCN-SCRATCH-NEXT: global_load_short_d16_hi v5, v[0:1], off ; GCN-SCRATCH-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:64 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v6, off +; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v5, off ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v2, off offset:128 ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 @@ -373,12 +373,12 @@ define void @load_global_d16_lo(i16 addrspace(1)* %in, i32 %reg, <2 x i16> addrs ; GCN-SCRATCH: ; %bb.0: ; %entry ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v6, v2 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v5, v2 ; GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: global_load_short_d16 v6, v[0:1], off +; GCN-SCRATCH-NEXT: global_load_short_d16 v5, v[0:1], off ; GCN-SCRATCH-NEXT: global_load_short_d16 v2, v[0:1], off offset:64 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v6, off +; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v5, off ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v2, off offset:128 ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign-split.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign-split.mir deleted file mode 100644 index 8862644..0000000 --- a/llvm/test/CodeGen/AMDGPU/regbank-reassign-split.mir +++ /dev/null @@ -1,38 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s - ---- | - define amdgpu_kernel void @do_not_reassign_spill() #0 { ret void } - - attributes #0 = { "amdgpu-num-vgpr"="8" } -... - -# GCN-LABEL: do_not_reassign_spill{{$}} -# GCN: V_AND_B32_e32 killed $vgpr1, killed $vgpr5, ---- -name: do_not_reassign_spill -tracksRegLiveness: true -machineFunctionInfo: - stackPtrOffsetReg: $sgpr32 -stack: - - { id: 0, type: default, offset: 0, size: 4, alignment: 4 } -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 2, class: vgpr_32, preferred-register: '$vgpr2' } - - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' } - - { id: 4, class: vgpr_32, preferred-register: '$vgpr4' } - - { id: 5, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 6, class: vgpr_32 } -body: | - bb.0: - %0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) - %1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) - %2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) - %3 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) - %4 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) - %5 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) - S_NOP 0, implicit-def dead $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5 - %6 = V_AND_B32_e32 %1, %5, implicit $exec - S_ENDPGM 0, implicit %6 -... 
diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir deleted file mode 100644 index 918e009..0000000 --- a/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir +++ /dev/null @@ -1,69 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s - - -# Test that subreg reassignments are correctly handled when whole register also -# conflicts. If this is mishandled stall counts will be incorrect and cause an -# infinite loop. -# GCN-LABEL: vgpr64_mixed_use{{$}} -# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF -# GCN: $vgpr4_vgpr5 = IMPLICIT_DEF -# GCN: $vcc = IMPLICIT_DEF -# GCN: $vgpr2_vgpr3 = IMPLICIT_DEF -# GCN: $vgpr6_vgpr7 = IMPLICIT_DEF -# GCN: $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF -# GCN: $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF -# GCN: $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF -# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF -# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF -# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF -# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF -# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF -# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF -# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF -# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr5, $vcc, implicit $exec -# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr4, killed $vcc, implicit $exec -# GCN: $sgpr0_sgpr1 = V_CMP_LT_U64_e64 killed $vgpr4_vgpr5, killed $vgpr0_vgpr1, implicit $exec ---- -name: vgpr64_mixed_use -tracksRegLiveness: true -registers: - - { id: 0, class: vreg_64, preferred-register: '$vgpr0_vgpr1' } - - { id: 1, class: vreg_64, preferred-register: '$vgpr4_vgpr5' } - - { id: 2, class: sreg_64_xexec, preferred-register: '$vcc' } - - { id: 3, class: vgpr_32 } - - { id: 4, class: vgpr_32 } - - { id: 5, class: sreg_64_xexec } - - { id: 6, class: vreg_64, preferred-register: '$vgpr2_vgpr3' } - - { id: 7, class: vreg_64, preferred-register: '$vgpr6_vgpr7' } - - { id: 8, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' } - - { id: 9, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' } - - { id: 10, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' } - - { id: 11, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } - - { id: 12, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } - - { id: 13, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } - - { id: 14, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' } - - { id: 15, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' } - - { id: 16, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' } - - { id: 17, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = IMPLICIT_DEF - %6 = IMPLICIT_DEF - %7 = IMPLICIT_DEF - %8 = IMPLICIT_DEF - %9 = IMPLICIT_DEF - %10 = IMPLICIT_DEF - %11 = IMPLICIT_DEF - %12 = IMPLICIT_DEF - %13 = IMPLICIT_DEF - %14 = IMPLICIT_DEF - %15 = IMPLICIT_DEF - %16 = IMPLICIT_DEF - %17 = IMPLICIT_DEF - %3 = V_CNDMASK_B32_e64 0, %0.sub1, 0, %1.sub1, %2, implicit $exec - %4 = V_CNDMASK_B32_e64 0, %0.sub0, 0, %1.sub0, %2, implicit $exec - %5 = V_CMP_LT_U64_e64 %1, %0, implicit $exec - S_ENDPGM 0 -... 
diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir deleted file mode 100644 index df057da..0000000 --- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir +++ /dev/null @@ -1,611 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s - -# GCN-LABEL: v1_vs_v5{{$}} -# GCN: V_AND_B32_e32 killed $vgpr3, killed $vgpr1, ---- -name: v1_vs_v5 -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 2, class: vgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: v0_1_vs_v4{{$}} -# GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr3, ---- -name: v0_1_vs_v4 -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr4' } - - { id: 1, class: vreg_64, preferred-register: '$vgpr0_vgpr1' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %1, %0, 0, 0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: v1_2_vs_v4_5{{$}} -# GCN: GLOBAL_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr4_vgpr5, ---- -name: v1_2_vs_v4_5 -tracksRegLiveness: true -registers: - - { id: 0, class: vreg_64, preferred-register: '$vgpr4_vgpr5' } - - { id: 1, class: vreg_64, preferred-register: '$vgpr1_vgpr2' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - GLOBAL_STORE_DWORDX2 %1, %0, 0, 0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: s11_vs_vcc{{$}} -# GCN: $vgpr0, $vcc_lo = V_ADDC_U32_e64 killed $sgpr14, killed $vgpr0, killed $vcc_lo, 0 ---- -name: s11_vs_vcc -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr11' } - - { id: 1, class: vgpr_32 } - - { id: 2, class: vgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - $vcc_lo = IMPLICIT_DEF - %2, $vcc_lo = V_ADDC_U32_e64 killed %0, killed %1, killed $vcc_lo, 0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: s0_vs_s16{{$}} -# GCN: S_AND_B32 killed renamable $sgpr14, $sgpr0, ---- -name: s0_vs_s16 -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' } - - { id: 1, class: sgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - $sgpr0 = IMPLICIT_DEF - %1 = S_AND_B32 %0, $sgpr0, implicit-def $scc - S_ENDPGM 0 -... - -# GCN-LABEL: s1_vs_s16{{$}} -# GCN: S_AND_B32 killed renamable $sgpr14, $sgpr1, ---- -name: s1_vs_s16 -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' } - - { id: 1, class: sgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - $sgpr1 = IMPLICIT_DEF - %1 = S_AND_B32 %0, $sgpr1, implicit-def $scc - S_ENDPGM 0 -... - -# GCN-LABEL: s12_vs_null{{$}} -# GCN: S_AND_B32 $sgpr_null, killed renamable $sgpr14, ---- -name: s12_vs_null -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr12' } - - { id: 1, class: sgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = S_AND_B32 $sgpr_null, %0, implicit-def $scc - S_ENDPGM 0 -... 
- -# GCN-LABEL: s13_vs_m0{{$}} -# GCN: S_AND_B32 $m0, killed renamable $sgpr14, ---- -name: s13_vs_m0 -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr13' } - - { id: 1, class: sgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = S_AND_B32 $m0, %0, implicit-def $scc - S_ENDPGM 0 -... - -# GCN-LABEL: s12_13_vs_s28_s29{{$}} -# GCN: S_AND_B64 $sgpr28_sgpr29, killed renamable $sgpr14_sgpr15, ---- -name: s12_13_vs_s28_s29 -tracksRegLiveness: true -registers: - - { id: 0, class: sreg_64, preferred-register: '$sgpr12_sgpr13' } - - { id: 1, class: sreg_64 } -body: | - bb.0: - %0 = IMPLICIT_DEF - $sgpr28_sgpr29 = IMPLICIT_DEF - %1 = S_AND_B64 $sgpr28_sgpr29, %0, implicit-def $scc - S_ENDPGM 0 -... - -# GCN-LABEL: livein{{$}} -# GCN: V_AND_B32_e32 killed $vgpr4, killed $vgpr0, ---- -name: livein -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' } - - { id: 2, class: vgpr_32 } -liveins: - - { reg: '$vgpr0', virtual-reg: '' } - - { reg: '$vgpr4', virtual-reg: '' } -body: | - bb.0: - liveins: $vgpr0, $vgpr4 - - %0 = COPY $vgpr0 - %1 = COPY $vgpr4 - %2 = V_AND_B32_e32 %1, %0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: liveout{{$}} -# GCN: V_AND_B32_e32 $vgpr4, $vgpr0, ---- -name: liveout -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' } - - { id: 2, class: vgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - $vgpr0 = COPY %0 - $vgpr4 = COPY %1 - S_ENDPGM 0 -... - -# GCN-LABEL: implicit{{$}} -# GCN: V_MOV_B32_indirect undef $vgpr4, undef $vgpr0, implicit $exec, implicit-def dead renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7, implicit $m0 ---- -name: implicit -tracksRegLiveness: true -registers: - - { id: 0, class: vreg_128 } - - { id: 1, class: vreg_128, preferred-register: '$vgpr4_vgpr5_vgpr6_vgpr7' } -body: | - bb.0: - %1 = IMPLICIT_DEF - V_MOV_B32_indirect undef %1.sub0:vreg_128, undef $vgpr0, implicit $exec, implicit-def %0:vreg_128, implicit %1:vreg_128, implicit $m0 - S_ENDPGM 0 -... 
- -# GCN-LABEL: occupancy_limit{{$}} -# GCN: V_AND_B32_e32 $vgpr4, $vgpr0, ---- -name: occupancy_limit -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' } - - { id: 2, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 3, class: vreg_64, preferred-register: '$vgpr2_vgpr3' } - - { id: 4, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 5, class: vreg_64, preferred-register: '$vgpr6_vgpr7' } - - { id: 6, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' } - - { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' } - - { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' } - - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } - - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } - - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } - - { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' } - - { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' } - - { id: 14, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' } - - { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %3 = IMPLICIT_DEF - %4 = IMPLICIT_DEF - %5 = IMPLICIT_DEF - %6 = IMPLICIT_DEF - %7 = IMPLICIT_DEF - %8 = IMPLICIT_DEF - %9 = IMPLICIT_DEF - %10 = IMPLICIT_DEF - %11 = IMPLICIT_DEF - %12 = IMPLICIT_DEF - %13 = IMPLICIT_DEF - %14 = IMPLICIT_DEF - %15 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - GLOBAL_STORE_DWORD %3, %0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %1, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %2, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %4, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, implicit $exec - S_ENDPGM 0 -... 
- -# GCN-LABEL: csr{{$}} -# GCN: V_AND_B32_e32 $vgpr37, $vgpr0, ---- -name: csr -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' } - - { id: 2, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 3, class: vreg_64, preferred-register: '$vgpr2_vgpr3' } - - { id: 4, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 5, class: vreg_64, preferred-register: '$vgpr6_vgpr7' } - - { id: 6, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' } - - { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' } - - { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' } - - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } - - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } - - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } - - { id: 12, class: vgpr_32, preferred-register: '$vgpr33' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %3 = IMPLICIT_DEF - %4 = IMPLICIT_DEF - %5 = IMPLICIT_DEF - %6 = IMPLICIT_DEF - %7 = IMPLICIT_DEF - %8 = IMPLICIT_DEF - %9 = IMPLICIT_DEF - %10 = IMPLICIT_DEF - %11 = IMPLICIT_DEF - %12 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - GLOBAL_STORE_DWORD %3, %0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %1, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %2, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %4, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %10, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %11, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %12, 0, 0, implicit $exec - S_ENDPGM 0 -... - -# Do not touch undefs -# GCN-LABEL: s0_vs_s16_undef{{$}} -# GCN: S_AND_B32 killed renamable $sgpr16, undef $sgpr0, ---- -name: s0_vs_s16_undef -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' } - - { id: 1, class: sgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = S_AND_B32 %0, undef $sgpr0, implicit-def $scc - S_ENDPGM 0 -... - -# GCN-LABEL: smem_bundle{{$}} -# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr14, 0 -# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0 ---- -name: smem_bundle -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_128, preferred-register: '$sgpr0_sgpr1_sgpr2_sgpr3' } - - { id: 1, class: sreg_32_xm0_xexec, preferred-register: '$sgpr16' } - - { id: 2, class: sreg_32_xm0_xexec, preferred-register: '$sgpr17' } - - { id: 3, class: sreg_32_xm0_xexec, preferred-register: '$sgpr4' } - - { id: 4, class: sreg_32_xm0_xexec, preferred-register: '$sgpr5' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = IMPLICIT_DEF - early-clobber %3, early-clobber %4 = BUNDLE %0, %1, %2 { - %3 = S_BUFFER_LOAD_DWORD_SGPR %0, %1, 0 - %4 = S_BUFFER_LOAD_DWORD_SGPR %0, %2, 0 - } - S_ENDPGM 0 -... 
- -# GCN-LABEL: vreg_512_subs{{$}} -# don't care about the assignment: this used to trigger an infinite loop ---- -name: vreg_512_subs -tracksRegLiveness: true -registers: - - { id: 1, class: vreg_512, preferred-register: '$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15' } - - { id: 2, class: vgpr_32, preferred-register: '$vgpr28' } -body: | - bb.0: - %1 = IMPLICIT_DEF - %2 = IMPLICIT_DEF - DS_WRITE2_B32_gfx9 %2, %1.sub0, %1.sub1, 0, 1, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub2, %1.sub3, 2, 3, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub4, %1.sub5, 4, 5, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub6, %1.sub7, 6, 7, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub8, %1.sub9, 8, 9, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub10, %1.sub11, 10, 11, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub12, %1.sub13, 12, 13, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub14, %1.sub15, 14, 15, 0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: vgpr_lo16_sub{{$}} -# GCN: renamable $vgpr0 = V_AND_B32_e32 killed $vgpr3, killed $vgpr1, implicit $exec -# GCN: renamable $vgpr1_lo16 = COPY killed renamable $vgpr0_lo16 ---- -name: vgpr_lo16_sub -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 2, class: vgpr_32 } - - { id: 3, class: vgpr_lo16 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - %3 = COPY %2.lo16 - $vgpr1_lo16 = COPY %3 - SI_RETURN_TO_EPILOG $vgpr1_lo16 -... - -# GCN-LABEL: vgpr_lo16{{$}} -# GCN: $vgpr1_lo16 = COPY killed renamable $vgpr0_lo16 ---- -name: vgpr_lo16 -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_lo16, preferred-register: '$vgpr4_lo16' } -body: | - bb.0: - liveins: $vgpr0_lo16 - - %0 = COPY $vgpr0_lo16 - $vgpr1_lo16 = COPY %0 - SI_RETURN_TO_EPILOG $vgpr1_lo16 -... - -# GCN-LABEL: vgpr_hi16_sub{{$}} -# GCN: renamable $vgpr0 = V_AND_B32_e32 killed $vgpr3, killed $vgpr1, implicit $exec -# GCN: renamable $vgpr1_hi16 = COPY killed renamable $vgpr0_hi16 ---- -name: vgpr_hi16_sub -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 2, class: vgpr_32 } - - { id: 3, class: vgpr_hi16 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - %3 = COPY %2.hi16 - $vgpr1_hi16 = COPY %3 - SI_RETURN_TO_EPILOG $vgpr1_hi16 -... - -# GCN-LABEL: vgpr_hi16{{$}} -# GCN: $vgpr1_hi16 = COPY killed renamable $vgpr0_hi16 ---- -name: vgpr_hi16 -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_hi16, preferred-register: '$vgpr4_hi16' } -body: | - bb.0: - liveins: $vgpr0_hi16 - - %0 = COPY $vgpr0_hi16 - $vgpr1_hi16 = COPY %0 - SI_RETURN_TO_EPILOG $vgpr1_hi16 -... - -# GCN-LABEL: sgpr_lo16_sub{{$}} -# GCN: renamable $sgpr0 = S_AND_B32 killed renamable $sgpr14, $sgpr0, implicit-def $scc -# GCN: renamable $sgpr1_lo16 = COPY killed renamable $sgpr0_lo16 ---- -name: sgpr_lo16_sub -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' } - - { id: 1, class: sgpr_32 } - - { id: 2, class: sgpr_lo16 } -body: | - bb.0: - %0 = IMPLICIT_DEF - $sgpr0 = IMPLICIT_DEF - %1 = S_AND_B32 %0, $sgpr0, implicit-def $scc - %2 = COPY %1.lo16 - $sgpr1_lo16 = COPY %2 - SI_RETURN_TO_EPILOG $sgpr1_lo16 -... 
- -# GCN-LABEL: sgpr_lo16{{$}} -# GCN: $sgpr1_lo16 = COPY killed renamable $sgpr0_lo16 ---- -name: sgpr_lo16 -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_lo16, preferred-register: '$sgpr4_lo16' } -body: | - bb.0: - liveins: $sgpr0_lo16 - - %0 = COPY $sgpr0_lo16 - $sgpr1_lo16 = COPY %0 - SI_RETURN_TO_EPILOG $sgpr1_lo16 -... - -# Check that we do not use VGPR3 which we would use otherwise. -# We cannot use it because of interference with VGPR3_LO16. -# GCN-LABEL: v1_vs_v5_src_interence{{$}} -# GCN: V_AND_B32_e32 killed $vgpr7, killed $vgpr1, ---- -name: v1_vs_v5_src_interence -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 2, class: vgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - $vgpr3_lo16 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - S_ENDPGM 0 -... - -# Test that bank of subreg is considered during scavenging. -# If handled incorrectly an infinite loop occurs. -# GCN-LABEL: s0_vs_s15_16_17_sub1{{$}} -# GCN: S_AND_B32 killed renamable $sgpr13, $sgpr0, ---- -name: s0_vs_s15_16_17_sub1 -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_96, preferred-register: '$sgpr15_sgpr16_sgpr17' } - - { id: 1, class: sgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - $sgpr0 = IMPLICIT_DEF - %1 = S_AND_B32 %0.sub1, $sgpr0, implicit-def $scc - S_ENDPGM 0 -... - -# Test that the size of subreg is correctly handled in bank calculation. -# If handled incorrectly an infinite loop occurs. -# GCN-LABEL: vgpr_sub_dependence{{$}} -# GCN: $vgpr9_vgpr10_vgpr11_vgpr12 = IMPLICIT_DEF -# GCN: $vgpr16_vgpr17 = IMPLICIT_DEF -# GCN: $vgpr14_vgpr15 = IMPLICIT_DEF -# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF -# GCN: $vgpr7_vgpr8 = IMPLICIT_DEF -# GCN: $vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF -# GCN: $vgpr18_vgpr19 = IMPLICIT_DEF -# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF -# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF -# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF -# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF -# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF -# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF -# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF -# GCN: $vgpr0_vgpr1 = V_ADD_F64_e64 0, $vgpr11_vgpr12, 0, killed $vgpr16_vgpr17, 0, 0, implicit $mode, implicit $exec -# GCN: $vgpr0_vgpr1 = V_ADD_F64_e64 0, killed $vgpr9_vgpr10, 0, killed $vgpr14_vgpr15, 0, 0, implicit $mode, implicit $exec ---- -name: vgpr_sub_dependence -tracksRegLiveness: true -registers: - - { id: 0, class: vreg_128, preferred-register: '$vgpr10_vgpr11_vgpr12_vgpr13' } - - { id: 1, class: vreg_64, preferred-register: '$vgpr16_vgpr17' } - - { id: 2, class: vreg_64, preferred-register: '$vgpr14_vgpr15' } - - { id: 3, class: vreg_64 } - - { id: 4, class: vreg_64 } - - { id: 5, class: vreg_64, preferred-register: '$vgpr0_vgpr1' } - - { id: 6, class: vreg_64, preferred-register: '$vgpr7_vgpr8' } - - { id: 7, class: vreg_128, preferred-register: '$vgpr3_vgpr4_vgpr5_vgpr6' } - - { id: 8, class: vreg_64, preferred-register: '$vgpr18_vgpr19' } - - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } - - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } - - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } - - { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' } - - { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' } - - 
{ id: 14, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' } - - { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = IMPLICIT_DEF - %5 = IMPLICIT_DEF - %6 = IMPLICIT_DEF - %7 = IMPLICIT_DEF - %8 = IMPLICIT_DEF - %9 = IMPLICIT_DEF - %10 = IMPLICIT_DEF - %11 = IMPLICIT_DEF - %12 = IMPLICIT_DEF - %13 = IMPLICIT_DEF - %14 = IMPLICIT_DEF - %15 = IMPLICIT_DEF - %3 = V_ADD_F64_e64 0, %0.sub2_sub3:vreg_128, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec - %4 = V_ADD_F64_e64 0, %0.sub0_sub1:vreg_128, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: dbg_value_v1_v5{{$}} -# GCN: renamable $vgpr1 = IMPLICIT_DEF -# GCN: renamable $vgpr5 = IMPLICIT_DEF ---- -name: dbg_value_v1_v5 -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 2, class: vgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - DBG_VALUE debug-use %1, debug-use %0 - S_ENDPGM 0, implicit %0, implicit %1 -... - -# GCN-LABEL: kill_v1_v5{{$}} -# GCN: renamable $vgpr1 = IMPLICIT_DEF -# GCN: renamable $vgpr5 = IMPLICIT_DEF -# GCN: KILL killed renamable $vgpr5, killed renamable $vgpr1 ---- -name: kill_v1_v5 -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 2, class: vgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - KILL %1, %0 - S_ENDPGM 0 -... diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index e789388..33526c9 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -458,16 +458,16 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[9:10], v6, s[8:9] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v9, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v10, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[7:8], v[9:10] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] ; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: global_store_dwordx2 v6, v[7:8], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5] ; GFX10-NEXT: global_store_byte v6, v0, s[6:7] ; GFX10-NEXT: s_endpgm %a = load i64, i64 addrspace(1)* %aptr, align 4 @@ -575,14 +575,14 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_i32 v5, v1, v3 clamp -; GFX10-NEXT: v_add_nc_u32_e32 v10, v1, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_add_nc_i32 v6, v0, v2 clamp -; GFX10-NEXT: v_add_nc_u32_e32 v9, v0, v2 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v10, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v9, v6 +; GFX10-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v4, v[9:10], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] ; GFX10-NEXT: s_endpgm %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll index 8120499..3c2b66c 100644 --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -486,17 +486,17 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[10:11] -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[0:1] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[4:5] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v6, s5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 480b026..1c7c1db 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -544,15 +544,15 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a ; GFX10-LABEL: v_shl_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] offset:8 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 4764fad..fde23b0 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -1100,17 +1100,17 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v0, v2 +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: 
v_cmp_gt_i64_e64 s5, 0, v[10:11] -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[0:1] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[4:5] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v6, s5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll index 831bf87..cf2f557 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -230,12 +230,12 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: s_lshr_b32 s1, s7, 24 ; GFX10-NEXT: s_lshr_b32 s5, s5, 24 -; GFX10-NEXT: v_mov_b32_e32 v15, s3 +; GFX10-NEXT: v_mov_b32_e32 v8, s3 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v9, s6 ; GFX10-NEXT: s_lshr_b32 s0, s4, 8 ; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: v_mov_b32_e32 v11, s4 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:12 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 @@ -243,8 +243,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 ; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 -; GFX10-NEXT: ds_write_b8 v0, v11 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v11 offset:2 +; GFX10-NEXT: ds_write_b8 v0, v4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:2 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:13 ; GFX10-NEXT: ds_write_b8 v0, v6 offset:15 ; GFX10-NEXT: ds_write_b8 v0, v7 offset:9 @@ -252,7 +252,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: ds_write_b8 v0, v15 offset:11 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:11 ; GFX10-NEXT: ds_write_b8 v0, v9 offset:5 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:7 ; GFX10-NEXT: ds_write_b8 v0, v2 offset:1 @@ -351,15 +351,15 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v7, s4 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:12 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 ; GFX10-NEXT: ds_write_b16 v0, v2 offset:8 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 ; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 -; GFX10-NEXT: ds_write_b16 v0, v7 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v7 offset:2 +; GFX10-NEXT: ds_write_b16 v0, v4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:2 ; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 ret void @@ -420,9 +420,9 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: 
v_mov_b32_e32 v6, s7 +; GFX10-NEXT: v_mov_b32_e32 v4, s7 ; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 -; GFX10-NEXT: ds_write2_b32 v0, v3, v6 offset0:2 offset1:3 +; GFX10-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3 ; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll index 6babc93..d54d418 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -196,11 +196,11 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, ; GFX10-NEXT: s_lshr_b32 s5, s4, 8 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_lshr_b32 s4, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v11, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 -; GFX10-NEXT: v_mov_b32_e32 v15, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: v_mov_b32_e32 v9, s4 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 @@ -208,11 +208,11 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 ; GFX10-NEXT: ds_write_b8 v0, v3 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 -; GFX10-NEXT: ds_write_b8 v0, v11 offset:9 +; GFX10-NEXT: ds_write_b8 v0, v4 offset:9 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:11 ; GFX10-NEXT: ds_write_b8 v0, v6 offset:5 ; GFX10-NEXT: ds_write_b8 v0, v7 offset:7 -; GFX10-NEXT: ds_write_b8 v0, v15 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:1 ; GFX10-NEXT: ds_write_b8 v0, v9 offset:3 ; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll index af94bd4..97412b1 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll @@ -65,12 +65,8 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_strict(<2 x double> %x, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], v[6:7] -; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x double> %val @@ -88,12 +84,8 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_ignore(<2 x double> %x, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], v[6:7] -; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> 
   ret <2 x double> %val
@@ -111,12 +103,8 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_maytrap(<2 x double> %x, <
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_add_f64 v[2:3], v[8:9], v[6:7]
-; GFX10-NEXT:    v_add_f64 v[0:1], v[10:11], v[4:5]
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x double> %val
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
index 8949334..110e651 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
@@ -75,10 +75,9 @@ define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x ha
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v6, v5
+; GFX10-NEXT:    v_fmac_f16_e32 v5, v1, v3
 ; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX10-NEXT:    v_fmac_f16_e32 v6, v1, v3
-; GFX10-NEXT:    v_mov_b32_e32 v1, v6
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x half> %val
@@ -128,23 +127,21 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v14, v5
-; GFX10-NEXT:    v_mov_b32_e32 v15, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v15
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_fmac_f16_e32 v15, v0, v2
+; GFX10-NEXT:    v_fmac_f16_e32 v4, v0, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
-; GFX10-NEXT:    v_fmac_f16_e32 v14, v1, v3
-; GFX10-NEXT:    v_fmac_f16_e32 v5, v8, v7
-; GFX10-NEXT:    v_fmac_f16_e32 v4, v11, v10
-; GFX10-NEXT:    v_and_b32_e32 v1, v0, v15
-; GFX10-NEXT:    v_and_b32_e32 v2, v0, v14
-; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v1, v5, 16, v2
+; GFX10-NEXT:    v_fmac_f16_e32 v5, v1, v3
+; GFX10-NEXT:    v_fmac_f16_e32 v6, v8, v7
+; GFX10-NEXT:    v_fmac_f16_e32 v9, v11, v10
+; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, v0, v5
+; GFX10-NEXT:    v_lshl_or_b32 v0, v9, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x half> %val
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
index 067640c..3807793 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
@@ -31,12 +31,8 @@ define <2 x double> @v_constained_fma_v2f64_fpexcept_strict(<2 x double> %x, <2
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v13, v3
-; GFX10-NEXT:    v_mov_b32_e32 v12, v2
-; GFX10-NEXT:    v_mov_b32_e32 v15, v1
-; GFX10-NEXT:    v_mov_b32_e32 v14, v0
-; GFX10-NEXT:    v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x double> %val
@@ -77,18 +73,10 @@ define <4 x double> @v_constained_fma_v4f64_fpexcept_strict(<4 x double> %x, <4
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v29, v7
-; GFX10-NEXT:    v_mov_b32_e32 v28, v6
-; GFX10-NEXT:    v_mov_b32_e32 v31, v5
-; GFX10-NEXT:    v_mov_b32_e32 v30, v4
-; GFX10-NEXT:    v_mov_b32_e32 v25, v3
-; GFX10-NEXT:    v_mov_b32_e32 v24, v2
-; GFX10-NEXT:    v_mov_b32_e32 v27, v1
-; GFX10-NEXT:    v_mov_b32_e32 v26, v0
-; GFX10-NEXT:    v_fma_f64 v[4:5], v[30:31], v[12:13], v[20:21]
-; GFX10-NEXT:    v_fma_f64 v[6:7], v[28:29], v[14:15], v[22:23]
-; GFX10-NEXT:    v_fma_f64 v[2:3], v[24:25], v[10:11], v[18:19]
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[26:27], v[8:9], v[16:17]
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX10-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x double> %val
@@ -162,12 +150,8 @@ define <2 x double> @v_constained_fma_v2f64_fpexcept_strict_fneg_fneg(<2 x doubl
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v13, v3
-; GFX10-NEXT:    v_mov_b32_e32 v12, v2
-; GFX10-NEXT:    v_mov_b32_e32 v15, v1
-; GFX10-NEXT:    v_mov_b32_e32 v14, v0
-; GFX10-NEXT:    v_fma_f64 v[2:3], -v[12:13], -v[6:7], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[0:1], -v[14:15], -v[4:5], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x double> %x
   %neg.y = fneg <2 x double> %y
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll
index daa7dcc..9fc32fa 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll
@@ -65,12 +65,8 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_strict(<2 x double> %x, <2
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_mul_f64 v[2:3], v[8:9], v[6:7]
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[10:11], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x double> %val
@@ -88,12 +84,8 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_ignore(<2 x double> %x, <2
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_mul_f64 v[2:3], v[8:9], v[6:7]
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[10:11], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x double> %val
@@ -111,12 +103,8 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_maytrap(<2 x double> %x, <
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_mul_f64 v[2:3], v[8:9], v[6:7]
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[10:11], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x double> %val
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll
index 8e4e406..115d52e 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll
@@ -65,12 +65,8 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_strict(<2 x double> %x, <2
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_add_f64 v[2:3], v[8:9], -v[6:7]
-; GFX10-NEXT:    v_add_f64 v[0:1], v[10:11], -v[4:5]
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x double> %val
@@ -88,12 +84,8 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_ignore(<2 x double> %x, <2
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_add_f64 v[2:3], v[8:9], -v[6:7]
-; GFX10-NEXT:    v_add_f64 v[0:1], v[10:11], -v[4:5]
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x double> %val
@@ -111,12 +103,8 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_maytrap(<2 x double> %x, <
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v9, v3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v2
-; GFX10-NEXT:    v_mov_b32_e32 v11, v1
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0
-; GFX10-NEXT:    v_add_f64 v[2:3], v[8:9], -v[6:7]
-; GFX10-NEXT:    v_add_f64 v[0:1], v[10:11], -v[4:5]
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x double> %val
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 30beac7..3500090 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -746,11 +746,11 @@ define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_sdwa v1, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v5, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -778,15 +778,15 @@ define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX10:         ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_load_dwordx2 v[9:10], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_lshl_or_b32 v0, v10, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v1, v9, 16, v2
+; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -816,12 +816,12 @@ define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1319,14 +1319,14 @@ define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readon
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    global_load_dwordx2 v[7:8], v6, s[0:1]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[8:9]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_fma_f16 v4, v7, v2, v4 op_sel_hi:[0,1,1]
-; GFX10-NEXT:    v_pk_fma_f16 v2, v8, v2, v5 op_sel_hi:[0,1,1]
-; GFX10-NEXT:    v_pk_fma_f16 v0, v7, v3, v4 op_sel:[1,0,0]
-; GFX10-NEXT:    v_pk_fma_f16 v1, v8, v3, v2 op_sel:[1,0,0]
+; GFX10-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
+; GFX10-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
+; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
+; GFX10-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
 ; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[8:9]
 ; GFX10-NEXT:    s_endpgm
 entry:
@@ -1380,14 +1380,16 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v1, v3, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_lshl_or_b32 v0, v6, 16, v1
-; GFX10-NEXT:    v_lshl_or_b32 v1, v7, 16, v2
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
+; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
-- 
2.7.4