From: Tom Stellard
Date: Mon, 29 Aug 2016 19:42:52 +0000 (+0000)
Subject: AMDGPU/SI: Implement a custom MachineSchedStrategy
X-Git-Tag: llvmorg-4.0.0-rc1~11235
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0d23ebe8883af4b280897af751614c7b433e00f7;p=platform%2Fupstream%2Fllvm.git

AMDGPU/SI: Implement a custom MachineSchedStrategy

Summary:
GCNSchedStrategy re-uses most of GenericScheduler; it just uses a different
method to compute the excess and critical register pressure limits.

It's not enabled by default; to enable it you need to pass -misched=gcn to
llc.

Shader DB stats:

32464 shaders in 17874 tests
Totals:
SGPRS: 1542846 -> 1643125 (6.50 %)
VGPRS: 1005595 -> 904653 (-10.04 %)
Spilled SGPRs: 29929 -> 27745 (-7.30 %)
Spilled VGPRs: 334 -> 352 (5.39 %)
Scratch VGPRs: 1612 -> 1624 (0.74 %) dwords per thread
Code Size: 36688188 -> 37034900 (0.95 %) bytes
LDS: 1913 -> 1913 (0.00 %) blocks
Max Waves: 254101 -> 265125 (4.34 %)
Wait states: 0 -> 0 (0.00 %)

Totals from affected shaders:
SGPRS: 1338220 -> 1438499 (7.49 %)
VGPRS: 886221 -> 785279 (-11.39 %)
Spilled SGPRs: 29869 -> 27685 (-7.31 %)
Spilled VGPRs: 334 -> 352 (5.39 %)
Scratch VGPRs: 1612 -> 1624 (0.74 %) dwords per thread
Code Size: 34315716 -> 34662428 (1.01 %) bytes
LDS: 1551 -> 1551 (0.00 %) blocks
Max Waves: 188127 -> 199151 (5.86 %)
Wait states: 0 -> 0 (0.00 %)

Reviewers: arsenm, mareko, nhaehnle, MatzeB, atrick

Subscribers: arsenm, kzhuravl, llvm-commits

Differential Revision: https://reviews.llvm.org/D23688

llvm-svn: 279995
---

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 44f59c3..03ac8cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -214,3 +214,48 @@ void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
 }
+
+unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
+  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+    if (SGPRs <= 80)
+      return 10;
+    if (SGPRs <= 88)
+      return 9;
+    if (SGPRs <= 100)
+      return 8;
+    return 7;
+  }
+  if (SGPRs <= 48)
+    return 10;
+  if (SGPRs <= 56)
+    return 9;
+  if (SGPRs <= 64)
+    return 8;
+  if (SGPRs <= 72)
+    return 7;
+  if (SGPRs <= 80)
+    return 6;
+  return 5;
+}
+
+unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
+  if (VGPRs <= 24)
+    return 10;
+  if (VGPRs <= 28)
+    return 9;
+  if (VGPRs <= 32)
+    return 8;
+  if (VGPRs <= 36)
+    return 7;
+  if (VGPRs <= 40)
+    return 6;
+  if (VGPRs <= 48)
+    return 5;
+  if (VGPRs <= 64)
+    return 4;
+  if (VGPRs <= 84)
+    return 3;
+  if (VGPRs <= 128)
+    return 2;
+  return 1;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index b15d359..985b1ea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -429,6 +429,12 @@ public:
   bool hasSGPRInitBug() const {
     return SGPRInitBug;
   }
+
+  /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
+  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
+
+  /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
+  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
 };
 
 } // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a86603a..f144ce2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -18,6 +18,7 @@
 #include "AMDGPUCallLowering.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
+#include "GCNSchedStrategy.h"
 #include "R600ISelLowering.h"
 #include "R600InstrInfo.h"
 #include "R600MachineScheduler.h"
@@ -96,6 +97,14 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
   return new SIScheduleDAGMI(C);
 }
 
+static ScheduleDAGInstrs *
+createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+  ScheduleDAGMILive *DAG =
+    new ScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
+  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  return DAG;
+}
+
 static MachineSchedRegistry
 R600SchedRegistry("r600", "Run R600's custom scheduler",
                   createR600MachineScheduler);
@@ -104,6 +113,11 @@ static MachineSchedRegistry
 SISchedRegistry("si", "Run SI's custom scheduler",
                 createSIMachineScheduler);
 
+static MachineSchedRegistry
+GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
+                             "Run GCN scheduler to maximize occupancy",
+                             createGCNMaxOccupancyMachineScheduler);
+
 static StringRef computeDataLayout(const Triple &TT) {
   if (TT.getArch() == Triple::r600) {
     // 32-bit pointers.
@@ -467,7 +481,7 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
   const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
   if (ST.enableSIScheduler())
     return createSIMachineScheduler(C);
-  return nullptr;
+  return createGCNMaxOccupancyMachineScheduler(C);
 }
 
 bool GCNPassConfig::addPreISel() {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c1ecf09..e58e5b2 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -49,6 +49,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUPromoteAlloca.cpp
   AMDGPURegisterInfo.cpp
   GCNHazardRecognizer.cpp
+  GCNSchedStrategy.cpp
   R600ClauseMergePass.cpp
   R600ControlFlowFinalizer.cpp
   R600EmitClauseMarkers.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
new file mode 100644
index 0000000..62e9f52
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -0,0 +1,312 @@
+//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This contains a MachineSchedStrategy implementation for maximizing wave
+/// occupancy on GCN hardware.
+//===----------------------------------------------------------------------===//
+
+#include "GCNSchedStrategy.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+#define DEBUG_TYPE "misched"
+
+using namespace llvm;
+
+GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
+    const MachineSchedContext *C) :
+    GenericScheduler(C) { }
+
+static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
+                            const MachineFunction &MF) {
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
+                                      ST.getOccupancyWithNumVGPRs(VGPRs));
+  return std::min(MinRegOccupancy,
+                  ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
+}
+
+void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
+                                     bool AtTop, const RegPressureTracker &RPTracker,
+                                     const SIRegisterInfo *SRI,
+                                     int SGPRPressure,
+                                     int VGPRPressure,
+                                     int SGPRExcessLimit,
+                                     int VGPRExcessLimit,
+                                     int SGPRCriticalLimit,
+                                     int VGPRCriticalLimit) {
+
+  Cand.SU = SU;
+  Cand.AtTop = AtTop;
+
+  // getDownwardPressure() and getUpwardPressure() make temporary changes to
+  // the tracker, so we need to pass those functions a non-const copy.
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  std::vector<unsigned> Pressure;
+  std::vector<unsigned> MaxPressure;
+
+  if (AtTop)
+    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
+  else {
+    // FIXME: I think for bottom up scheduling, the register pressure is cached
+    // and can be retrieved by DAG->getPressureDiff(SU).
+    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+  }
+
+  int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+  int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+
+  // If two instructions increase the pressure of different register sets
+  // by the same amount, the generic scheduler will prefer to schedule the
+  // instruction that increases the set with the least amount of registers,
+  // which in our case would be SGPRs. This is rarely what we want, so
+  // when we report excess/critical register pressure, we do it either
+  // only for VGPRs or only for SGPRs.
+
+  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
+  const int MaxVGPRPressureInc = 16;
+  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
+  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
+
+
+  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
+  // to increase the likelihood we don't go over the limits. We should improve
+  // the analysis to look through dependencies to find the path with the least
+  // register pressure.
+  // FIXME: This is also necessary, because some passes that run after
+  // scheduling and before regalloc increase register pressure.
+  const int ErrorMargin = 3;
+  VGPRExcessLimit -= ErrorMargin;
+  SGPRExcessLimit -= ErrorMargin;
+
+  // We only need to update the RPDelta for instructions that increase
+  // register pressure. Instructions that decrease or keep reg pressure
+  // the same will be marked as RegExcess in tryCandidate() when they
+  // are compared with instructions that increase the register pressure.
+  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
+    Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet());
+    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
+  }
+
+  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
+    Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet());
+    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
+  }
+
+  // Register pressure is considered 'CRITICAL' if it is approaching a value
+  // that would reduce the wave occupancy for the execution unit. When
+  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
+  // have the same cost, so we don't need to prefer one over the other.
+
+  VGPRCriticalLimit -= ErrorMargin;
+  SGPRCriticalLimit -= ErrorMargin;
+
+  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
+  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
+
+  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
+    if (SGPRDelta > VGPRDelta) {
+      Cand.RPDelta.CriticalMax = PressureChange(SRI->getSGPRPressureSet());
+      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
+    } else {
+      Cand.RPDelta.CriticalMax = PressureChange(SRI->getVGPRPressureSet());
+      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
+    }
+  }
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNodeFromQueue()
+void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
+                                         const CandPolicy &ZonePolicy,
+                                         const RegPressureTracker &RPTracker,
+                                         SchedCandidate &Cand) {
+  const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>();
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
+  unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+  unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+  unsigned SGPRExcessLimit =
+    Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
+  unsigned VGPRExcessLimit =
+    Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
+  unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF);
+  unsigned SGPRCriticalLimit = SRI->getNumSGPRsAllowed(ST, MaxWaves);
+  unsigned VGPRCriticalLimit = SRI->getNumVGPRsAllowed(MaxWaves);
+
+  ReadyQueue &Q = Zone.Available;
+  for (SUnit *SU : Q) {
+
+    SchedCandidate TryCand(ZonePolicy);
+    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
+                  SGPRPressure, VGPRPressure,
+                  SGPRExcessLimit, VGPRExcessLimit,
+                  SGPRCriticalLimit, VGPRCriticalLimit);
+    // Pass SchedBoundary only when comparing nodes from the same boundary.
+    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+    GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
+    if (TryCand.Reason != NoCand) {
+      // Initialize resource delta if needed in case future heuristics query it.
+      if (TryCand.ResDelta == SchedResourceDelta())
+        TryCand.initResourceDelta(Zone.DAG, SchedModel);
+      Cand.setBest(TryCand);
+    }
+  }
+}
+
+static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) {
+  switch (Reason) {
+  default:
+    return Reason;
+  case GenericSchedulerBase::RegCritical:
+  case GenericSchedulerBase::RegExcess:
+    return -Reason;
+  }
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNodeBidirectional()
+SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+  // Schedule as far as possible in the direction of no choice. This is most
+  // efficient, but also provides the best heuristics for CriticalPSets.
+ if (SUnit *SU = Bot.pickOnlyChoice()) { + IsTopNode = false; + return SU; + } + if (SUnit *SU = Top.pickOnlyChoice()) { + IsTopNode = true; + return SU; + } + // Set the bottom-up policy based on the state of the current bottom zone and + // the instructions outside the zone, including the top zone. + CandPolicy BotPolicy; + setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); + // Set the top-down policy based on the state of the current top zone and + // the instructions outside the zone, including the bottom zone. + CandPolicy TopPolicy; + setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); + + // See if BotCand is still valid (because we previously scheduled from Top). + DEBUG(dbgs() << "Picking from Bot:\n"); + if (!BotCand.isValid() || BotCand.SU->isScheduled || + BotCand.Policy != BotPolicy) { + BotCand.reset(CandPolicy()); + pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); + assert(BotCand.Reason != NoCand && "failed to find the first candidate"); + } else { + DEBUG(traceCandidate(BotCand)); + } + + // Check if the top Q has a better candidate. + DEBUG(dbgs() << "Picking from Top:\n"); + if (!TopCand.isValid() || TopCand.SU->isScheduled || + TopCand.Policy != TopPolicy) { + TopCand.reset(CandPolicy()); + pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); + assert(TopCand.Reason != NoCand && "failed to find the first candidate"); + } else { + DEBUG(traceCandidate(TopCand)); + } + + // Pick best from BotCand and TopCand. + DEBUG( + dbgs() << "Top Cand: "; + traceCandidate(BotCand); + dbgs() << "Bot Cand: "; + traceCandidate(TopCand); + ); + SchedCandidate Cand; + if (TopCand.Reason == BotCand.Reason) { + Cand = BotCand; + GenericSchedulerBase::CandReason TopReason = TopCand.Reason; + TopCand.Reason = NoCand; + GenericScheduler::tryCandidate(Cand, TopCand, nullptr); + if (TopCand.Reason != NoCand) { + Cand.setBest(TopCand); + } else { + TopCand.Reason = TopReason; + } + } else { + if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) { + Cand = TopCand; + } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) { + Cand = BotCand; + } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) { + Cand = TopCand; + } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) { + Cand = BotCand; + } else { + int TopRank = getBidirectionalReasonRank(TopCand.Reason); + int BotRank = getBidirectionalReasonRank(BotCand.Reason); + if (TopRank > BotRank) { + Cand = TopCand; + } else { + Cand = BotCand; + } + } + } + DEBUG( + dbgs() << "Picking: "; + traceCandidate(Cand); + ); + + IsTopNode = Cand.AtTop; + return Cand.SU; +} + +// This function is mostly cut and pasted from +// GenericScheduler::pickNode() +SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) { + if (DAG->top() == DAG->bottom()) { + assert(Top.Available.empty() && Top.Pending.empty() && + Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); + return nullptr; + } + SUnit *SU; + do { + if (RegionPolicy.OnlyTopDown) { + SU = Top.pickOnlyChoice(); + if (!SU) { + CandPolicy NoPolicy; + TopCand.reset(NoPolicy); + pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); + assert(TopCand.Reason != NoCand && "failed to find a candidate"); + SU = TopCand.SU; + } + IsTopNode = true; + } else if (RegionPolicy.OnlyBottomUp) { + SU = Bot.pickOnlyChoice(); + if (!SU) { + CandPolicy NoPolicy; + BotCand.reset(NoPolicy); + pickNodeFromQueue(Bot, NoPolicy, 
DAG->getBotRPTracker(), BotCand); + assert(BotCand.Reason != NoCand && "failed to find a candidate"); + SU = BotCand.SU; + } + IsTopNode = false; + } else { + SU = pickNodeBidirectional(IsTopNode); + } + } while (SU->isScheduled); + + if (SU->isTopReady()) + Top.removeReady(SU); + if (SU->isBottomReady()) + Bot.removeReady(SU); + + DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); + return SU; +} diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h new file mode 100644 index 0000000..4cfc0ce --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -0,0 +1,54 @@ +//===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H +#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +class SIRegisterInfo; + +/// This is a minimal scheduler strategy. The main difference between this +/// and the GenericScheduler is that GCNSchedStrategy uses different +/// heuristics to determine excess/critical pressure sets. Its goal is to +/// maximize kernel occupancy (i.e. maximum number of waves per simd). +class GCNMaxOccupancySchedStrategy : public GenericScheduler { + + SUnit *pickNodeBidirectional(bool &IsTopNode); + + void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, + const RegPressureTracker &RPTracker, + SchedCandidate &Cand); + + void initCandidate(SchedCandidate &Cand, SUnit *SU, + bool AtTop, const RegPressureTracker &RPTracker, + const SIRegisterInfo *SRI, + int SGPRPressure, int VGPRPressure, + int SGPRExcessLimit, int VGPRExcessLimit, + int SGPRCriticalLimit, int VGPRCriticalLimit); + + void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone, const SIRegisterInfo *SRI, + unsigned SGPRPressure, unsigned VGPRPressure); + +public: + GCNMaxOccupancySchedStrategy(const MachineSchedContext *C); + + SUnit *pickNode(bool &IsTopNode) override; +}; + +} // End namespace llvm + +#endif // GCNSCHEDSTRATEGY_H diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 80fdd85..7d84f7b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -249,6 +249,12 @@ unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, return VGPRLimit; } +unsigned +SIRegisterInfo::getDefaultRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const { + return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx); +} + bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { return Fn.getFrameInfo().hasStackObjects(); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index b0e852e..6e66c52 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -51,6 +51,8 @@ public: unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const override; + unsigned getDefaultRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const; bool requiresRegisterScavenging(const 
MachineFunction &Fn) const override; diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 0db92fc..be27966 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -47,6 +47,10 @@ def Write64Bit : SchedWrite; class SISchedMachineModel : SchedMachineModel { let CompleteModel = 1; + // MicroOpBufferSize = 1 means that instructions will always be added + // the ready queue when they become available. This exposes them + // to the register pressure analysis. + let MicroOpBufferSize = 1; let IssueWidth = 1; let PostRAScheduler = 1; } diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index 0046bc9..eb0bf65 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -258,10 +258,10 @@ define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) } ; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64: -; SI: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}} -; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}} -; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}} ; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}} +; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}} ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]] ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI0]] ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO1]] @@ -284,10 +284,9 @@ define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}} ; SI-NOT: and ; SI: v_and_b32_e32 v[[RESLO0:[0-9]+]], 63, v[[LO0]] -; SI-NOT: and -; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]] ; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]] ; SI-NOT: and +; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]] define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load volatile i64, i64 addrspace(1)* %aptr @@ -486,8 +485,8 @@ define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; low 32-bits, which is not a valid 64-bit inline immmediate. 
; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64: -; SI: s_load_dword s ; SI: s_load_dwordx2 +; SI: s_load_dword s ; SI-NOT: and ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index d0976b7..ee18032 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -155,8 +155,8 @@ define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind { } ; FUNC-LABEL: {{^}}s_ctpop_i65: -; GCN: s_and_b32 ; GCN: s_bcnt1_i32_b64 [[REG0:s[0-9]+]], +; GCN: s_and_b32 ; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]], ; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]] ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll index dab3c10..c028d10 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -8,7 +8,7 @@ ; SI-LABEL: {{^}}offset_order: -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 +; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44 diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll index f6cdbb4..7d351d5 100644 --- a/llvm/test/CodeGen/AMDGPU/fceil64.ll +++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll @@ -13,8 +13,10 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone ; CI: v_ceil_f64_e32 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01 -; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]] +; FIXME: We should be using s_addk_i32 here, but the reg allocation hints +; are not always followed. 
+; SI-DAG: s_add_i32 [[SEXP0:s[0-9]+]], [[SEXP]], 0xfffffc01 +; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP0]] ; SI-DAG: s_not_b64 ; SI-DAG: s_and_b64 ; SI-DAG: cmp_gt_i32 diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index 19deefe..9126a47 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -548,7 +548,7 @@ define void @test_f32_interp(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f64_interp: ; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]] -; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]] +; SI: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]] define void @test_f64_interp(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll index 9bbfe1e..4dae566 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll @@ -6,8 +6,8 @@ declare double @llvm.maxnum.f64(double, double) nounwind readnone ; SI-LABEL: {{^}}test_fmax3_f64: ; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}} ; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8 -; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 ; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]] +; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 ; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]] ; SI: buffer_store_dwordx2 [[RESULT]], ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index bf230b2..6d729f4 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -22,6 +22,7 @@ entry: ; XXX: Could do v_or_b32 directly ; CHECK-LABEL: {{^}}extract_w_offset_salu_use_vector: +; CHECK: s_mov_b32 m0 ; CHECK-DAG: s_or_b32 ; CHECK-DAG: s_or_b32 ; CHECK-DAG: s_or_b32 @@ -30,8 +31,7 @@ entry: ; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movrels_b32_e32 +; CHECK: v_movrels_b32_e32 define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) { entry: %idx = add i32 %in, 1 @@ -242,13 +242,13 @@ entry: ; FIXME: Why is vector copied in between? 
; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]] -; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7 ; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9 +; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7 ; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]] ; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]] ; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec -; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK: s_waitcnt vmcnt(0) ; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]: ; CHECK-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] @@ -303,8 +303,10 @@ bb2: ; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]] ; CHECK-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 -; CHECK-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] ; CHECK-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]] +; CHECK: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}} +; CHECK: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}} +; CHECK: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] ; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]: ; CHECK-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] @@ -324,7 +326,7 @@ bb2: ; CHECK: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] ; CHECK: s_mov_b32 m0, [[READLANE]] ; CHECK: s_and_saveexec_b64 vcc, vcc -; CHECK-NEXT: v_movreld_b32_e32 [[VEC_ELT1]], 63 +; CHECK-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63 ; CHECK-NEXT: s_xor_b64 exec, exec, vcc ; CHECK: s_cbranch_execnz [[LOOP1]] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index b1405c7..e38ee47 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -344,7 +344,7 @@ endif: ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000 -; GCN: s_mov_b32 m0, [[SCALEDIDX]] +; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]] ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, 0 ; Increment to next element folded into base register, but FileCheck diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 397e172..31ff7d9 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -343,8 +343,8 @@ define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, < ; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32: ; GCN-DAG: s_load_dwordx16 ; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]] -; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]] +; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 5e1171a..86f5d4e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -360,7 +360,7 @@ define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA-DAG: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll index 3d6d7fa..8e1b003 100644 --- a/llvm/test/CodeGen/AMDGPU/missing-store.ll +++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll @@ 
-6,12 +6,12 @@ ; resulting in losing the store to gptr ; FUNC-LABEL: {{^}}missing_store_reduced: +; SI: s_load_dwordx2 ; SI: ds_read_b64 ; SI-DAG: buffer_store_dword ; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} -; SI: s_load_dword -; SI: s_nop 2 +; SI: s_nop 3 ; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}} ; SI: buffer_store_dword ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll index 36f1257..85dfbe6 100644 --- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -10,10 +10,10 @@ ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} ; GCN-NOT: v_mov_b32 -; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] -; GCN-NOT: v_mov_b32 ; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] ; GCN-NOT: v_mov_b32 +; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] +; GCN-NOT: v_mov_b32 ; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]] ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]] diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll index 9eb76eb..58d4117 100644 --- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -103,9 +103,8 @@ define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { ; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_multi_use_f32: ; GCN: s_load_dword [[SRC:s[0-9]+]] ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -|[[SRC]]| -; GCN: buffer_store_dword [[RCP]] - ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[SRC]], -|[[SRC]]| +; GCN: buffer_store_dword [[RCP]] ; GCN: buffer_store_dword [[MUL]] define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) diff --git a/llvm/test/CodeGen/AMDGPU/ret.ll b/llvm/test/CodeGen/AMDGPU/ret.ll index 915c438..0408413 100644 --- a/llvm/test/CodeGen/AMDGPU/ret.ll +++ b/llvm/test/CodeGen/AMDGPU/ret.ll @@ -18,12 +18,13 @@ define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 i } ; GCN-LABEL: {{^}}vgpr_literal: -; GCN: exp 15, 0, 1, 1, 1, v0, v0, v0, v0 -; GCN: s_waitcnt expcnt(0) +; GCN: v_mov_b32_e32 v4, v0 +; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4 ; GCN-DAG: v_mov_b32_e32 v0, 1.0 ; GCN-DAG: v_mov_b32_e32 v1, 2.0 ; GCN-DAG: v_mov_b32_e32 v2, 4.0 ; GCN-DAG: v_mov_b32_e32 v3, -1.0 +; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) @@ -229,13 +230,14 @@ define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2 ; GCN-LABEL: {{^}}structure_literal: -; GCN: exp 15, 0, 1, 1, 1, v0, v0, v0, v0 -; GCN: s_waitcnt expcnt(0) +; GCN: v_mov_b32_e32 v3, v0 +; GCN: exp 15, 0, 1, 1, 1, v3, v3, v3, v3 ; GCN-DAG: v_mov_b32_e32 v0, 1.0 ; GCN-DAG: s_mov_b32 s0, 2 ; GCN-DAG: s_mov_b32 s1, 3 ; GCN-DAG: v_mov_b32_e32 v1, 2.0 ; GCN-DAG: v_mov_b32_e32 v2, 4.0 +; GCN: s_waitcnt expcnt(0) define amdgpu_vs {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float 
%3, float %3, float %3, float %3) ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> }} diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 87362dc..5344834 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -134,8 +134,8 @@ define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b ; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2: ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll index 52f3cce..fd73ff8 100644 --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -170,14 +170,12 @@ entry: ; CI. ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8: +; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} ; GCN-NOHSA-NOT: v_add ; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}} ; GCN-NOHSA-NOT: v_add -; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} -; GCN-NOHSA-NOT: v_add -; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} -; GCN-NOHSA-NOT: v_add ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} +; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll index faf8d8a..d5c1dd9 100644 --- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll @@ -93,13 +93,13 @@ define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> ; SI-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; SI-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}} -; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]] ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]] -; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]] ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]] +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]] ; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}} ; SI: v_cndmask_b32_e32 +; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]] ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx2 define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index c5dbfd9..b85714e 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -341,11 +341,12 @@ define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i6 ; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}} +; 
GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]] ; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]] ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3 -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO]]{{\]}} -; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}} +; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}} define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index f6ed41d..8523f1c 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -70,9 +70,9 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace } ; FUNC-LABEL: @reorder_constant_load_global_store_constant_load -; CI-DAG: buffer_store_dword ; CI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} +; CI: buffer_store_dword ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3 ; CI: buffer_store_dword @@ -136,9 +136,9 @@ define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 } ; FUNC-LABEL: @reorder_global_load_local_store_global_load +; CI: ds_write_b32 ; CI: buffer_load_dword ; CI: buffer_load_dword -; CI: ds_write_b32 ; CI: buffer_store_dword define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1 @@ -181,11 +181,11 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa ; FUNC-LABEL: @reorder_global_offsets ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 -; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408 -; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 +; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408 +; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; CI: s_endpgm define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3 diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index dbd07fe..db4c1c8 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -35,11 +35,11 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { ; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd ; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2 ; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]], +; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 
s[[LO_SREG2]] ; SI: s_addc_u32 +; SI: buffer_store_dword v[[LO_VREG]], ; SI: v_mov_b32_e32 -; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]] ; SI: v_mov_b32_e32 -; SI: buffer_store_dword v[[LO_VREG]], define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) { %aa = add i64 %a, 234 ; Prevent shrinking store. %b = shl i64 %aa, 2 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index 268f3c7..078df53 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -38,16 +38,16 @@ ; SI: v_cndmask_b32_e64 ; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]] ; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]] ; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], vcc, {{[vs][0-9]+}}, [[Num_S_Remainder]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]] ; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]], +; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]], ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll index 7a72a4c..87fcfba 100644 --- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -42,15 +42,18 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa } ; GCN-LABEL: {{^}}test_use_s_v_s: -; GCN: buffer_load_dword [[VA0:v[0-9]+]] -; GCN: buffer_load_dword [[VA1:v[0-9]+]] ; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; SI: buffer_load_dword [[VA0:v[0-9]+]] +; SI: buffer_load_dword [[VA1:v[0-9]+]] ; GCN-NOT: v_mov_b32 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; GCN-NOT: v_mov_b32 +; VI: buffer_load_dword [[VA0:v[0-9]+]] +; VI: buffer_load_dword [[VA1:v[0-9]+]] + ; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VA0]], [[SA]], [[VB]] ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]] ; GCN: buffer_store_dword [[RESULT0]] diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 7d97777..52fa0be 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -12,16 +12,16 @@ ; GCN-LABEL: {{^}}main: -; GCN-DAG: s_mov_b32 s13, s12 -; GCN-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN-DAG: s_mov_b32 s18, -1 -; SI-DAG: s_mov_b32 s19, 0xe8f000 -; VI-DAG: s_mov_b32 s19, 0xe80000 +; GCN-DAG: s_mov_b32 s11, s12 +; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-DAG: s_mov_b32 s14, -1 +; SI-DAG: s_mov_b32 s15, 0xe8f000 +; VI-DAG: s_mov_b32 s15, 0xe80000 -; s13 is offset system SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Spill -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Reload +; s11 is offset system SGPR +; GCN: buffer_store_dword 
{{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024
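
Note: the new strategy is exposed through the machine-scheduler registry added
in AMDGPUTargetMachine.cpp under the name "gcn-max-occupancy", so it can also
be requested explicitly through llc's -misched option. A minimal invocation
sketch, assuming an llc built with the AMDGPU target; kernel.ll is a
placeholder input module and the -mcpu choice is only an example:

    llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy kernel.ll -o kernel.s

As a rough worked example of the occupancy tables added to AMDGPUSubtarget.cpp:
getMaxWaves() takes the minimum of the SGPR, VGPR and LDS occupancies, so a
kernel needing 40 SGPRs and 90 VGPRs on a VI target is limited to
min(getOccupancyWithNumSGPRs(40), getOccupancyWithNumVGPRs(90)) = min(10, 2) =
2 waves per SIMD, assuming its LDS use does not restrict it further.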