using namespace llvm;
+cl::opt<bool>
+ DisableUnclusterHighRP("amdgpu-disable-unclustred-high-rp-reschedule",
+ cl::Hidden,
+ cl::desc("Disable unclustred high register pressure "
+ "reduction scheduling stage."),
+ cl::init(false));
+
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C)
: GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
- HasClusteredNodes(false), HasExcessPressure(false) {}
+ HasHighPressure(false) {}
void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- // FIXME: This is also necessary, because some passes that run after
- // scheduling and before regalloc increase register pressure.
- const unsigned ErrorMargin = 3;
-
SGPRExcessLimit =
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
VGPRExcessLimit =
// marked as RegExcess in tryCandidate() when they are compared with
// instructions that increase the register pressure.
if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
- HasExcessPressure = true;
+ HasHighPressure = true;
Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
}
if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
- HasExcessPressure = true;
+ HasHighPressure = true;
Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
}
int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
if (SGPRDelta >= 0 || VGPRDelta >= 0) {
- HasExcessPressure = true;
+ HasHighPressure = true;
if (SGPRDelta > VGPRDelta) {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
if (SU->isBottomReady())
Bot.removeReady(SU);
- if (!HasClusteredNodes && SU->getInstr()->mayLoadOrStore()) {
- for (SDep &Dep : SU->Preds) {
- if (Dep.isCluster()) {
- HasClusteredNodes = true;
- break;
- }
- }
- }
-
LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
<< *SU->getInstr());
return SU;
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
RescheduleRegions.resize(Regions.size());
- RegionsWithClusters.resize(Regions.size());
RegionsWithHighRP.resize(Regions.size());
+ RegionsWithExcessRP.resize(Regions.size());
RegionsWithMinOcc.resize(Regions.size());
RescheduleRegions.set();
- RegionsWithClusters.reset();
RegionsWithHighRP.reset();
+ RegionsWithExcessRP.reset();
RegionsWithMinOcc.reset();
runSchedStages();
void GCNScheduleDAGMILive::runSchedStages() {
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this);
- UnclusteredRescheduleStage S1(GCNSchedStageID::UnclusteredReschedule, *this);
+ UnclusteredHighRPStage S1(GCNSchedStageID::UnclusteredHighRPReschedule,
+ *this);
ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule,
*this);
PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this);
case GCNSchedStageID::InitialSchedule:
OS << "Initial Schedule";
break;
- case GCNSchedStageID::UnclusteredReschedule:
- OS << "Unclustered Reschedule";
+ case GCNSchedStageID::UnclusteredHighRPReschedule:
+ OS << "Unclustered High Register Pressure Reschedule";
break;
case GCNSchedStageID::ClusteredLowOccupancyReschedule:
OS << "Clustered Low Occupancy Reschedule";
return true;
}
-bool UnclusteredRescheduleStage::initGCNSchedStage() {
+bool UnclusteredHighRPStage::initGCNSchedStage() {
+ if (DisableUnclusterHighRP)
+ return false;
+
if (!GCNSchedStage::initGCNSchedStage())
return false;
- if (DAG.RescheduleRegions.none())
+ if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
return false;
SavedMutations.swap(DAG.Mutations);
+ InitialOccupancy = DAG.MinOccupancy;
+ // Aggressively try to reduce register pressure in the unclustered high RP
+ // stage. Temporarily increase occupancy target in the region.
+ S.ErrorMargin = S.HighRPErrorMargin;
+ if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
+ MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
+
+ LLVM_DEBUG(
+ dbgs()
+ << "Retrying function scheduling without clustering. "
+ "Aggressivly try to reduce register pressure to achieve occupancy "
+ << DAG.MinOccupancy << ".\n");
- LLVM_DEBUG(dbgs() << "Retrying function scheduling without clustering.\n");
return true;
}
LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}
-void UnclusteredRescheduleStage::finalizeGCNSchedStage() {
+void UnclusteredHighRPStage::finalizeGCNSchedStage() {
SavedMutations.swap(DAG.Mutations);
+ S.ErrorMargin = S.DefaultErrorMargin;
+ if (DAG.MinOccupancy > InitialOccupancy) {
+ for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
+ DAG.RegionsWithMinOcc[IDX] =
+ DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
+
+ LLVM_DEBUG(dbgs() << StageID
+ << " stage successfully increased occupancy to "
+ << DAG.MinOccupancy << '\n');
+ }
GCNSchedStage::finalizeGCNSchedStage();
}
llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]).print(dbgs());
dbgs() << "Region register pressure: "; PressureBefore.print(dbgs()));
- // Set HasClusteredNodes to true for late stages where we have already
- // collected it. That way pickNode() will not scan SDep's when not needed.
- S.HasClusteredNodes = StageID > GCNSchedStageID::InitialSchedule;
- S.HasExcessPressure = false;
+ S.HasHighPressure = false;
return true;
}
-bool UnclusteredRescheduleStage::initGCNRegion() {
- if (!DAG.RescheduleRegions[RegionIdx])
+bool UnclusteredHighRPStage::initGCNRegion() {
+ // Only reschedule regions with the minimum occupancy or regions that may have
+ // spilling (excess register pressure).
+ if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
+ DAG.MinOccupancy <= InitialOccupancy) &&
+ !DAG.RegionsWithExcessRP[RegionIdx])
return false;
return GCNSchedStage::initGCNRegion();
}
bool ClusteredLowOccStage::initGCNRegion() {
- // We may need to reschedule this region if it doesn't have clusters so it
- // wasn't rescheduled in the last stage, or if we found it was testing
- // critical register pressure limits in the unclustered reschedule stage. The
- // later is because we may not have been able to raise the min occupancy in
- // the previous stage so the region may be overly constrained even if it was
- // already rescheduled.
- if (!DAG.RegionsWithClusters[RegionIdx] && !DAG.RegionsWithHighRP[RegionIdx])
+ // We may need to reschedule this region if it wasn't rescheduled in the last
+ // stage, or if we found it was testing critical register pressure limits in
+ // the unclustered reschedule stage. The latter is because we may not have
+ // been able to raise the min occupancy in the previous stage so the region
+ // may be overly constrained even if it was already rescheduled.
+ if (!DAG.RegionsWithHighRP[RegionIdx])
return false;
return GCNSchedStage::initGCNRegion();
void GCNSchedStage::finalizeGCNRegion() {
DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd);
DAG.RescheduleRegions[RegionIdx] = false;
- if (S.HasExcessPressure)
+ if (S.HasHighPressure)
DAG.RegionsWithHighRP[RegionIdx] = true;
// Revert scheduling if we have dropped occupancy or there is some other
RegionIdx++;
}
-void InitialScheduleStage::finalizeGCNRegion() {
- // Record which regions have clustered nodes for the next unclustered
- // reschedule stage.
- assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule);
- if (S.HasClusteredNodes)
- DAG.RegionsWithClusters[RegionIdx] = true;
-
- GCNSchedStage::finalizeGCNRegion();
-}
-
void GCNSchedStage::checkScheduling() {
// Check the results of scheduling.
PressureAfter = DAG.getRealRegPressure(RegionIdx);
PressureAfter.getSGPRNum() > MaxSGPRs) {
DAG.RescheduleRegions[RegionIdx] = true;
DAG.RegionsWithHighRP[RegionIdx] = true;
+ DAG.RegionsWithExcessRP[RegionIdx] = true;
}
// Revert if this region's schedule would cause a drop in occupancy or
if (mayCauseSpilling(WavesAfter))
return true;
- assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule);
- // Don't reschedule the region in the next stage if it doesn't have clusters.
- if (!DAG.RegionsWithClusters[RegionIdx])
- DAG.RescheduleRegions[RegionIdx] = false;
-
return false;
}
-bool UnclusteredRescheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
- if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
- return true;
-
- // If RP is not reduced in the unclustred reschedule stage, revert to the old
- // schedule.
- if (!PressureAfter.less(ST, PressureBefore)) {
+bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
+ // If RP is not reduced in the unclustered reschedule stage, revert to the
+ // old schedule.
+ if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
+ mayCauseSpilling(WavesAfter)) ||
+ GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
return true;
}
bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
if (WavesAfter <= MFI.getMinWavesPerEU() &&
!PressureAfter.less(ST, PressureBefore) &&
- DAG.RescheduleRegions[RegionIdx]) {
+ DAG.RegionsWithExcessRP[RegionIdx]) {
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
return true;
}
PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
DAG.RescheduleRegions[RegionIdx] =
- DAG.RegionsWithClusters[RegionIdx] ||
- (nextStage(StageID)) != GCNSchedStageID::UnclusteredReschedule;
+ (nextStage(StageID)) != GCNSchedStageID::UnclusteredHighRPReschedule;
DAG.RegionEnd = DAG.RegionBegin;
int SkippedDebugInstr = 0;
for (MachineInstr *MI : Unsched) {
MachineFunction *MF;
public:
- // schedule() have seen a clustered memory operation. Set it to false
- // before a region scheduling to know if the region had such clusters.
- bool HasClusteredNodes;
+ // schedule() has seen register pressure over the critical limits and had to
+ // track register pressure for actual scheduling heuristics.
+ bool HasHighPressure;
- // schedule() have seen an excess register pressure and had to track
- // register pressure for actual scheduling heuristics.
- bool HasExcessPressure;
+ // An error margin is necessary because of poor performance of the generic RP
+ // tracker and can be adjusted up for tuning heuristics to try and more
+ // aggressively reduce register pressure.
+ const unsigned DefaultErrorMargin = 3;
+
+ const unsigned HighRPErrorMargin = 10;
+
+ unsigned ErrorMargin = DefaultErrorMargin;
unsigned SGPRCriticalLimit;
enum class GCNSchedStageID : unsigned {
InitialSchedule = 0,
- UnclusteredReschedule = 1,
+ UnclusteredHighRPReschedule = 1,
ClusteredLowOccupancyReschedule = 2,
PreRARematerialize = 3,
LastStage = PreRARematerialize
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class GCNSchedStage;
friend class InitialScheduleStage;
- friend class UnclusteredRescheduleStage;
+ friend class UnclusteredHighRPStage;
friend class ClusteredLowOccStage;
friend class PreRARematStage;
// or we generally desire to reschedule it.
BitVector RescheduleRegions;
- // Record regions which use clustered loads/stores.
- BitVector RegionsWithClusters;
-
// Record regions with high register pressure.
BitVector RegionsWithHighRP;
+ // Record regions with excess register pressure over the physical register
+ // limit. Register pressure in these regions usually will result in spilling.
+ BitVector RegionsWithExcessRP;
+
// Regions that has the same occupancy as the latest MinOccupancy
BitVector RegionsWithMinOcc;
void setupNewBlock();
// Finalize state after scheudling a region.
- virtual void finalizeGCNRegion();
+ void finalizeGCNRegion();
// Check result of scheduling.
void checkScheduling();
class InitialScheduleStage : public GCNSchedStage {
public:
- void finalizeGCNRegion() override;
-
bool shouldRevertScheduling(unsigned WavesAfter) override;
InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};
-class UnclusteredRescheduleStage : public GCNSchedStage {
+class UnclusteredHighRPStage : public GCNSchedStage {
private:
std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+ // Save the initial occupancy before starting this stage.
+ unsigned InitialOccupancy;
+
public:
bool initGCNSchedStage() override;
bool shouldRevertScheduling(unsigned WavesAfter) override;
- UnclusteredRescheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+ UnclusteredHighRPStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};
class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
public AMDGPUSubtarget {
-
+public:
using AMDGPUSubtarget::getMaxWavesPerEU;
-public:
// Following 2 enums are documented at:
// - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
enum class TrapHandlerAbi {
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
-; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
-; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20
-; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24
-; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28
-; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5)
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
-; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
-; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5)
-; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
-; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(3)
-; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
+; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
+; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16
; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
+; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20
+; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24
; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
+; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
+; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32
; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GFX9-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
-; GFX9-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
-; GFX9-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20
-; GFX9-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24
-; GFX9-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28
-; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5)
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX9-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
-; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
-; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5)
-; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
-; GFX9-DENORM-NEXT: s_waitcnt vmcnt(3)
-; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
+; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
+; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16
; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
+; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20
+; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24
; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
+; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
+; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32
; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
+; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
-; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
-; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20
-; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24
-; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28
-; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5)
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
-; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
-; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5)
-; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
-; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(3)
-; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
+; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
+; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16
; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
+; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20
+; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24
; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
+; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
+; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32
; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GFX9-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
-; GFX9-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
-; GFX9-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20
-; GFX9-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24
-; GFX9-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28
-; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5)
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX9-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
-; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
-; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5)
-; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
-; GFX9-DENORM-NEXT: s_waitcnt vmcnt(3)
-; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
+; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
+; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16
; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
+; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20
+; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24
; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
+; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
+; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32
; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
+; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x9
+; GFX11-NEXT: s_clause 0x8
; GFX11-NEXT: global_load_b128 v[32:35], v64, s[0:1]
; GFX11-NEXT: global_load_b128 v[36:39], v64, s[0:1] offset:16
; GFX11-NEXT: global_load_b128 v[40:43], v64, s[0:1] offset:32
; GFX11-NEXT: global_load_b128 v[56:59], v64, s[0:1] offset:96
; GFX11-NEXT: global_load_b128 v[60:63], v64, s[0:1] offset:112
; GFX11-NEXT: global_load_b128 v[4:7], v64, s[0:1] offset:144
-; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128
-; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0x3e7
-; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: s_clause 0x6
+; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128
; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:160
; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:176
; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:192
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:160
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: s_clause 0x8
; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:176
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: s_clause 0x8
+; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:192
; GFX11-NEXT: global_store_b128 v64, v[32:35], s[2:3]
; GFX11-NEXT: global_store_b128 v64, v[36:39], s[2:3] offset:16
; GFX11-NEXT: global_store_b128 v64, v[40:43], s[2:3] offset:32
; GFX11-NEXT: global_store_b128 v64, v[52:55], s[2:3] offset:80
; GFX11-NEXT: global_store_b128 v64, v[56:59], s[2:3] offset:96
; GFX11-NEXT: global_store_b128 v64, v[60:63], s[2:3] offset:112
-; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:192
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:208
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX7-LABEL: v_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v16, v0
-; GFX7-NEXT: v_mov_b32_e32 v17, v1
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
-; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX7-NEXT: v_mul_lo_u32 v26, v5, v10
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11
+; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v20, vcc
+; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23]
-; GFX7-NEXT: v_mov_b32_e32 v1, v18
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
+; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc
+; GFX7-NEXT: v_mov_b32_e32 v20, v18
; GFX7-NEXT: v_mov_b32_e32 v18, v19
-; GFX7-NEXT: v_mov_b32_e32 v19, v20
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19]
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23]
-; GFX7-NEXT: v_addc_u32_e64 v25, s[4:5], 0, v0, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19]
-; GFX7-NEXT: v_mov_b32_e32 v0, v23
-; GFX7-NEXT: v_mul_lo_u32 v23, v4, v11
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13
-; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12]
+; GFX7-NEXT: v_mov_b32_e32 v19, v16
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
+; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
+; GFX7-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
+; GFX7-NEXT: v_mov_b32_e32 v19, v22
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
+; GFX7-NEXT: v_mul_lo_u32 v24, v3, v12
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
+; GFX7-NEXT: v_mul_lo_u32 v22, v2, v13
+; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1]
+; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, v22
-; GFX7-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v4, s[12:13]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
+; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
+; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13]
+; GFX7-NEXT: v_mov_b32_e32 v20, v11
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
-; GFX7-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v10, s[12:13]
-; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2]
-; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v11, v3, s[12:13]
-; GFX7-NEXT: v_mul_lo_u32 v11, v16, v15
-; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v25, v4, s[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v10, v5, s[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v24, v6, s[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v10, s[12:13], v21, v11, s[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[12:13], v10, v9, s[14:15]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[10:11], v9, v13, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v27, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v23, s[6:7]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v26, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13]
+; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14
+; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
+; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13]
+; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, v10
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v16, v0
-; GFX8-NEXT: v_mov_b32_e32 v17, v1
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX8-NEXT: v_mul_lo_u32 v26, v5, v10
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11
+; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v20, vcc
+; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23]
-; GFX8-NEXT: v_mov_b32_e32 v1, v18
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
+; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc
+; GFX8-NEXT: v_mov_b32_e32 v20, v18
; GFX8-NEXT: v_mov_b32_e32 v18, v19
-; GFX8-NEXT: v_mov_b32_e32 v19, v20
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23]
-; GFX8-NEXT: v_addc_u32_e64 v25, s[4:5], 0, v0, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19]
-; GFX8-NEXT: v_mov_b32_e32 v0, v23
-; GFX8-NEXT: v_mul_lo_u32 v23, v4, v11
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13
-; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12]
+; GFX8-NEXT: v_mov_b32_e32 v19, v16
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
+; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
+; GFX8-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
+; GFX8-NEXT: v_mov_b32_e32 v19, v22
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
+; GFX8-NEXT: v_mul_lo_u32 v24, v3, v12
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
+; GFX8-NEXT: v_mul_lo_u32 v22, v2, v13
+; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, v22
-; GFX8-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v4, s[12:13]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
+; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
+; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13]
+; GFX8-NEXT: v_mov_b32_e32 v20, v11
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
-; GFX8-NEXT: v_addc_u32_e64 v10, s[12:13], 0, v10, s[12:13]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2]
-; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v11, v3, s[12:13]
-; GFX8-NEXT: v_mul_lo_u32 v11, v16, v15
-; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v25, v4, s[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v10, v5, s[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v24, v6, s[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v10, s[12:13], v21, v11, s[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[12:13], v10, v9, s[14:15]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[10:11], v9, v13, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v27, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v23, s[6:7]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v26, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13]
+; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14
+; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
+; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13]
+; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, v10
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, 0
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v12, 0
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX9-NEXT: v_mul_lo_u32 v26, v5, v10
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v11, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11
+; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v20, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v16, v10, 0
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v6, v8, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v17, v9, v[22:23]
-; GFX9-NEXT: v_mov_b32_e32 v1, v18
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
+; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v20, vcc
+; GFX9-NEXT: v_mov_b32_e32 v20, v18
; GFX9-NEXT: v_mov_b32_e32 v18, v19
-; GFX9-NEXT: v_mov_b32_e32 v19, v20
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v16, v13, v[18:19]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[22:23]
-; GFX9-NEXT: v_addc_co_u32_e64 v25, s[4:5], 0, v0, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v12, v[18:19]
-; GFX9-NEXT: v_mov_b32_e32 v0, v23
-; GFX9-NEXT: v_mul_lo_u32 v23, v4, v11
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v11, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v13, v2, v13
-; GFX9-NEXT: v_mul_lo_u32 v20, v6, v9
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[10:11], v4, v9, v[11:12]
+; GFX9-NEXT: v_mov_b32_e32 v19, v16
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
+; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
+; GFX9-NEXT: v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
+; GFX9-NEXT: v_mov_b32_e32 v19, v22
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
+; GFX9-NEXT: v_mul_lo_u32 v24, v3, v12
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
+; GFX9-NEXT: v_mul_lo_u32 v22, v2, v13
+; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v17, v10, v[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[14:15], v16, v8, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, v22
-; GFX9-NEXT: v_addc_co_u32_e64 v10, s[12:13], 0, v4, s[12:13]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[16:17], v16, v9, v[1:2]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
+; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13]
+; GFX9-NEXT: v_mov_b32_e32 v20, v11
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
-; GFX9-NEXT: v_addc_co_u32_e64 v10, s[12:13], 0, v10, s[12:13]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v17, v8, v[1:2]
-; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v11, v3, s[12:13]
-; GFX9-NEXT: v_mul_lo_u32 v11, v16, v15
-; GFX9-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v25, v4, s[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v10, v5, s[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v24, v6, s[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v10, s[12:13], v21, v11, s[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[12:13], v10, v9, s[14:15]
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[10:11], v9, v13, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v27, s[8:9]
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v23, s[6:7]
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v9, v26, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v20, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[12:13], 0, v2, s[12:13]
+; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14
+; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v12, v3, s[12:13]
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v26, v4, s[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v11, v5, s[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v25, v6, s[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v17, v0, s[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v9, s[14:15]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v22, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v24, s[8:9]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v10
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
; GFX908-NEXT: v_mov_b32_e32 v1, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1
-; GFX908-NEXT: s_sub_i32 s4, 0, s1
+; GFX908-NEXT: s_sub_i32 s7, 0, s1
; GFX908-NEXT: s_lshr_b32 s5, s6, 16
; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s6
; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX908-NEXT: v_mov_b32_e32 v7, s3
-; GFX908-NEXT: s_lshl_b64 s[6:7], s[8:9], 5
-; GFX908-NEXT: v_mov_b32_e32 v6, s2
-; GFX908-NEXT: v_mul_lo_u32 v2, s4, v0
; GFX908-NEXT: s_mov_b32 s4, 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s2
+; GFX908-NEXT: v_mul_lo_u32 v2, s7, v0
+; GFX908-NEXT: s_lshl_b64 s[6:7], s[8:9], 5
; GFX908-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX908-NEXT: v_add_u32_e32 v0, v0, v2
; GFX908-NEXT: v_mul_hi_u32 v0, s0, v0
# CHECK-NEXT: From: DBG_VALUE %17:vgpr_32, 0, 0
# CHECK-NEXT: To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32
# CHECK-NEXT: RegionInstrs: 46
-# CHECK: Unclustered reschedule did not help.
-# CHECK: Attempting to revert scheduling.
-# CHECK: Retrying function scheduling with lowest recorded occupancy 3.
-# CHECK: ********** MI Scheduling **********
-# CHECK: test_same_num_instrs:%bb.2
-# CHECK-NEXT: From: DBG_VALUE %17:vgpr_32, 0, 0
-# CHECK-NEXT: To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32
-# CHECK-NEXT: RegionInstrs: 46
# CHECK: Attempting to revert scheduling.
---
--- /dev/null
+# REQUIRES: asserts
+# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s
+
+# Debug-output test for the GCN scheduler's "Unclustered High Register
+# Pressure Reschedule" stage. Only the machine-scheduler pass is run (gfx908,
+# with the machine verifier and -verify-misched enabled); its -debug-only trace
+# on stderr is merged into stdout and matched by FileCheck. The asserts
+# requirement above is there because -debug-only output exists only in
+# assertion-enabled builds of LLVM.
+
+# Minimal IR module: an empty kernel that gives the MIR function below an IR
+# counterpart of the same name.
+--- |
+ define amdgpu_kernel void @high-RP-reschedule() { ret void }
+...
+
+# The scheduler trace must report that this stage brought occupancy to 4.
+# CHECK: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4
+
+---
+# MIR input handed to the machine scheduler.
+#   bb.0  IMPLICIT_DEF placeholders for the values that bb.1 and bb.2 take as
+#         inputs.
+#   bb.1  the region of interest: integer arithmetic (%13-%21) whose result
+#         %21 feeds seven GLOBAL_LOAD_DWORDX4_SADDR loads (%22-%28), followed
+#         by a long chain of f64 MUL/FMA/ADD operations and, at the end, eight
+#         DS_WRITE_B128 stores.
+#   bb.2  S_ENDPGM with implicit uses of %0 and %1, so those two 128-bit
+#         registers stay live across bb.1.
+# NOTE(review): "occupancy: 4" presumably seeds the occupancy recorded in the
+# machine function info before scheduling - confirm against the AMDGPU MIR
+# serialization code.
+name: high-RP-reschedule
+tracksRegLiveness: true
+machineFunctionInfo:
+ occupancy: 4
+body: |
+ bb.0:
+ %0:vreg_128 = IMPLICIT_DEF
+ %1:vreg_128 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vreg_128 = IMPLICIT_DEF
+ %4:vreg_128 = IMPLICIT_DEF
+ %5:vreg_128 = IMPLICIT_DEF
+ %6:vreg_128 = IMPLICIT_DEF
+ %7:vreg_128 = IMPLICIT_DEF
+ %8:vreg_128 = IMPLICIT_DEF
+ %9:vreg_128 = IMPLICIT_DEF
+ %10:vreg_128 = IMPLICIT_DEF
+ %11:sreg_64_xexec = IMPLICIT_DEF
+ %12:vreg_64 = IMPLICIT_DEF
+
+ bb.1:
+ %13:vgpr_32 = V_LSHRREV_B16_e32 1, %12.sub0, implicit $exec
+ %14:vgpr_32 = V_AND_B32_e32 127, %13, implicit $exec
+ %15:vgpr_32 = V_MUL_LO_U16_e32 49, %14, implicit $exec
+ %16:vgpr_32 = V_LSHRREV_B16_e32 10, %15, implicit $exec
+ %17:vgpr_32 = V_MUL_LO_U16_e32 42, %16, implicit $exec
+ %18:vgpr_32 = V_SUB_U16_e32 %12.sub0, %17, implicit $exec
+ %19:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+ %20:vgpr_32 = V_MUL_U32_U24_sdwa 0, %18, 0, %19, 0, 6, 0, 0, 6, implicit $exec
+ %21:vgpr_32 = V_LSHLREV_B32_e32 4, %20, implicit $exec
+ %22:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 608, 0, implicit $exec :: (load (s128))
+ %23:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 576, 0, implicit $exec :: (load (s128))
+ %24:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 592, 0, implicit $exec :: (load (s128))
+ %25:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 624, 0, implicit $exec :: (load (s128))
+ %26:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 672, 0, implicit $exec :: (load (s128))
+ %27:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 640, 0, implicit $exec :: (load (s128))
+ %28:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 656, 0, implicit $exec :: (load (s128))
+ %29:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %7.sub2_sub3, 0, %25.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ %30:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %7.sub2_sub3, 0, %25.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ %31:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %7.sub0_sub1, 0, %25.sub0_sub1, 1, %29, 0, 0, implicit $mode, implicit $exec
+ %32:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %7.sub0_sub1, 0, %25.sub2_sub3, 0, %30, 0, 0, implicit $mode, implicit $exec
+ %33:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %5.sub2_sub3, 0, %24.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ %34:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %5.sub2_sub3, 0, %24.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ %35:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %5.sub0_sub1, 0, %24.sub0_sub1, 1, %33, 0, 0, implicit $mode, implicit $exec
+ %36:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %5.sub0_sub1, 0, %24.sub2_sub3, 0, %34, 0, 0, implicit $mode, implicit $exec
+ %37:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %9.sub2_sub3, 0, %28.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ %38:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %9.sub2_sub3, 0, %28.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ %39:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %9.sub0_sub1, 0, %28.sub0_sub1, 1, %37, 0, 0, implicit $mode, implicit $exec
+ %40:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %9.sub0_sub1, 0, %28.sub2_sub3, 0, %38, 0, 0, implicit $mode, implicit $exec
+ %41:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %4.sub2_sub3, 0, %23.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ %42:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %4.sub2_sub3, 0, %23.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ %43:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %4.sub0_sub1, 0, %23.sub0_sub1, 1, %41, 0, 0, implicit $mode, implicit $exec
+ %44:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %4.sub0_sub1, 0, %23.sub2_sub3, 0, %42, 0, 0, implicit $mode, implicit $exec
+ %45:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %8.sub2_sub3, 0, %27.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ %46:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %8.sub2_sub3, 0, %27.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ %47:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %6.sub2_sub3, 0, %22.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ %48:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %8.sub0_sub1, 0, %27.sub0_sub1, 1, %45, 0, 0, implicit $mode, implicit $exec
+ %49:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %8.sub0_sub1, 0, %27.sub2_sub3, 0, %46, 0, 0, implicit $mode, implicit $exec
+ %50:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %10.sub2_sub3, 0, %26.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ %51:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %6.sub2_sub3, 0, %22.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ %52:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %10.sub2_sub3, 0, %26.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ %53:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %6.sub0_sub1, 0, %22.sub0_sub1, 1, %47, 0, 0, implicit $mode, implicit $exec
+ %54:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %10.sub0_sub1, 0, %26.sub0_sub1, 1, %50, 0, 0, implicit $mode, implicit $exec
+ %55:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %6.sub0_sub1, 0, %22.sub2_sub3, 0, %51, 0, 0, implicit $mode, implicit $exec
+ %56:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %10.sub0_sub1, 0, %26.sub2_sub3, 0, %52, 0, 0, implicit $mode, implicit $exec
+ %57:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %3.sub2_sub3, 1, %32, 0, 0, implicit $mode, implicit $exec
+ %58:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %35, 1, %39, 0, 0, implicit $mode, implicit $exec
+ %59:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %44, 1, %49, 0, 0, implicit $mode, implicit $exec
+ %60:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %53, 1, %54, 0, 0, implicit $mode, implicit $exec
+ %61:sreg_64 = S_MOV_B64_IMM_PSEUDO 4604544271217802189
+ %62:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %3.sub0_sub1, 1, %31, 0, 0, implicit $mode, implicit $exec
+ undef %63.sub1:sreg_64 = S_MOV_B32 -1075404642
+ %64:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %36, 1, %40, 0, 0, implicit $mode, implicit $exec
+ %65:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %43, 1, %48, 0, 0, implicit $mode, implicit $exec
+ %66:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %55, 1, %56, 0, 0, implicit $mode, implicit $exec
+ %67:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %57, 0, %58, 0, 0, implicit $mode, implicit $exec
+ %68:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %59, 0, %60, 0, 0, implicit $mode, implicit $exec
+ %69:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %3.sub2_sub3, 0, 4611686018427387904, 1, %57, 0, 0, implicit $mode, implicit $exec
+ %70:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %3.sub0_sub1, 0, 4611686018427387904, 1, %62, 0, 0, implicit $mode, implicit $exec
+ %71:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %35, 0, 4611686018427387904, 1, %58, 0, 0, implicit $mode, implicit $exec
+ %72:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %36, 0, 4611686018427387904, 1, %64, 0, 0, implicit $mode, implicit $exec
+ %73:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %43, 0, 4611686018427387904, 1, %65, 0, 0, implicit $mode, implicit $exec
+ %74:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %44, 0, 4611686018427387904, 1, %59, 0, 0, implicit $mode, implicit $exec
+ %75:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %55, 0, 4611686018427387904, 1, %66, 0, 0, implicit $mode, implicit $exec
+ %76:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %53, 0, 4611686018427387904, 1, %60, 0, 0, implicit $mode, implicit $exec
+ %77:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %62, 1, %64, 0, 0, implicit $mode, implicit $exec
+ %78:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %65, 1, %66, 0, 0, implicit $mode, implicit $exec
+ %79:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %70, 1, %71, 0, 0, implicit $mode, implicit $exec
+ %80:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %57, 0, 4611686018427387904, 1, %67, 0, 0, implicit $mode, implicit $exec
+ %81:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %59, 0, 4611686018427387904, 1, %68, 0, 0, implicit $mode, implicit $exec
+ %82:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %69, 1, %72, 0, 0, implicit $mode, implicit $exec
+ %83:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %74, 1, %75, 0, 0, implicit $mode, implicit $exec
+ %84:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %73, 1, %76, 0, 0, implicit $mode, implicit $exec
+ %85:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %62, 0, 4611686018427387904, 1, %77, 0, 0, implicit $mode, implicit $exec
+ %86:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %65, 0, 4611686018427387904, 1, %78, 0, 0, implicit $mode, implicit $exec
+ %63.sub0:sreg_64 = COPY %61.sub0
+ %87:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %68, 0, %61, 0, %67, 0, 0, implicit $mode, implicit $exec
+ %88:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %81, 0, %63, 0, %80, 0, 0, implicit $mode, implicit $exec
+ %89:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %70, 0, 4611686018427387904, 1, %79, 0, 0, implicit $mode, implicit $exec
+ %90:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %69, 0, 4611686018427387904, 1, %82, 0, 0, implicit $mode, implicit $exec
+ %91:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %74, 0, 4611686018427387904, 1, %83, 0, 0, implicit $mode, implicit $exec
+ %92:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %73, 0, 4611686018427387904, 1, %84, 0, 0, implicit $mode, implicit $exec
+ %93:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %86, 0, %63, 0, %85, 0, 0, implicit $mode, implicit $exec
+ %94:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %78, 0, %61, 0, %77, 0, 0, implicit $mode, implicit $exec
+ undef %95.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %78, 0, %61, 0, %87, 0, 0, implicit $mode, implicit $exec
+ undef %96.sub2_sub3:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %82, 0, %84, 0, 0, implicit $mode, implicit $exec
+ undef %97.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %86, 0, %61, 0, %88, 0, 0, implicit $mode, implicit $exec
+ undef %98.sub2_sub3:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %90, 1, %91, 0, 0, implicit $mode, implicit $exec
+ %98.sub0_sub1:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %89, 1, %92, 0, 0, implicit $mode, implicit $exec
+ %97.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %81, 0, %63, 0, %93, 0, 0, implicit $mode, implicit $exec
+ %96.sub0_sub1:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %79, 1, %83, 0, 0, implicit $mode, implicit $exec
+ %95.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %68, 0, %63, 0, %94, 0, 0, implicit $mode, implicit $exec
+ undef %99.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %67, 0, 4611686018427387904, 1, %95.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ undef %100.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %82, 0, 4611686018427387904, 1, %96.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ undef %101.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %80, 0, 4611686018427387904, 1, %97.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ undef %102.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %90, 0, 4611686018427387904, 1, %98.sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ %102.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %89, 0, 4611686018427387904, 1, %98.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ %101.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %85, 0, 4611686018427387904, 1, %97.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ %100.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %79, 0, 4611686018427387904, 1, %96.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ %99.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %77, 0, 4611686018427387904, 1, %95.sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ %103:vgpr_32 = V_ADD_U32_sdwa 0, %2, 0, %18, 0, 6, 0, 6, 0, implicit $exec
+ %104:vgpr_32 = V_LSHL_ADD_U32_e64 %103, 4, 0, implicit $exec
+ DS_WRITE_B128_gfx9 %104, %102, 0, 0, implicit $exec
+ DS_WRITE_B128_gfx9 %104, %101, 672, 0, implicit $exec
+ DS_WRITE_B128_gfx9 %104, %100, 1344, 0, implicit $exec
+ DS_WRITE_B128_gfx9 %104, %99, 2016, 0, implicit $exec
+ DS_WRITE_B128_gfx9 %104, %98, 2688, 0, implicit $exec
+ DS_WRITE_B128_gfx9 %104, %97, 3360, 0, implicit $exec
+ DS_WRITE_B128_gfx9 %104, %96, 4032, 0, implicit $exec
+ DS_WRITE_B128_gfx9 %104, %95, 4704, 0, implicit $exec
+
+ bb.2:
+ S_ENDPGM 0, implicit %0, implicit %1
+...
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s42
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s62
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s43
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s40
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s60
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s41
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s59
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s41
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5)
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39
; GCN-HSA-NEXT: s_and_b32 s50, s50, 0xffff
; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xf0
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s6
; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xe0
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6
; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xd0
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s6
; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xc0
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s6
; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xb0
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v31, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v30, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s6
; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xa0
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6
+; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11]
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6
; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x90
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6
+; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6
; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x80
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v35, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v34, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6
; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x70
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s42
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s43
+; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50
+; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6
; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x60
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40
-; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s41
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s38
+; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s58
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s56
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s39
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s6
; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x50
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s37
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14
-; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12
+; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s52
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s15
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s34
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23]
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s33
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s61
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s23
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s60
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s59
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s23
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5)
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19
; GCN-HSA-NEXT: s_ashr_i32 s68, s50, 16
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3
+; GCN-HSA-NEXT: s_sext_i32_i16 s47, s47
+; GCN-HSA-NEXT: s_sext_i32_i16 s46, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3
-; GCN-HSA-NEXT: s_sext_i32_i16 s49, s49
-; GCN-HSA-NEXT: s_sext_i32_i16 s48, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11]
+; GCN-HSA-NEXT: s_sext_i32_i16 s45, s45
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3
+; GCN-HSA-NEXT: s_sext_i32_i16 s44, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15]
+; GCN-HSA-NEXT: s_sext_i32_i16 s51, s51
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3
+; GCN-HSA-NEXT: s_sext_i32_i16 s50, s50
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: s_sext_i32_i16 s43, s43
; GCN-HSA-NEXT: s_sext_i32_i16 s42, s42
-; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s42
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s43
+; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT: s_sext_i32_i16 s51, s51
+; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19]
+; GCN-HSA-NEXT: s_sext_i32_i16 s49, s49
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: s_sext_i32_i16 s50, s50
+; GCN-HSA-NEXT: s_sext_i32_i16 s48, s48
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCN-HSA-NEXT: s_sext_i32_i16 s36, s36
; GCN-HSA-NEXT: s_sext_i32_i16 s39, s39
; GCN-HSA-NEXT: s_sext_i32_i16 s38, s38
; GCN-HSA-NEXT: s_sext_i32_i16 s41, s41
; GCN-HSA-NEXT: s_sext_i32_i16 s40, s40
-; GCN-HSA-NEXT: s_sext_i32_i16 s45, s45
-; GCN-HSA-NEXT: s_sext_i32_i16 s44, s44
-; GCN-HSA-NEXT: s_sext_i32_i16 s47, s47
-; GCN-HSA-NEXT: s_sext_i32_i16 s46, s46
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GCN-HSA-NEXT: s_sext_i32_i16 s29, s29
; GCN-HSA-NEXT: s_sext_i32_i16 s31, s31
; GCN-HSA-NEXT: s_sext_i32_i16 s30, s30
; GCN-HSA-NEXT: s_sext_i32_i16 s37, s37
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40
-; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s41
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s38
+; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s58
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s56
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s39
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s54
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s37
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s30
-; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s53
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s28
+; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s52
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s31
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s34
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23]
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s29
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s33
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: s_sext_i32_i16 s27, s27
; GCN-HSA-NEXT: s_sext_i32_i16 s26, s26
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15
; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s11
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s9
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s5
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s38, s3
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s1
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s14, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s11
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s9
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s5
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s3
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s38, s1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s14, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s8, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s8, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[20:21], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[18:19], 0x100000
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s6, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s0, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[8:9], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[46:47], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[42:43], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[40:41], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[42:43], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[44:45], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s37
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s17
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s17
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s15
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s8
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s13
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s40
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s41
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s4
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[54:55], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[52:53], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[34:35], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[30:31], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[28:29], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s39
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s39
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s68
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s63
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s61
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s22
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s23
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s34
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s35
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s24
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s25
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_mov_b32 s42, s15
-; GCN-HSA-NEXT: s_mov_b32 s44, s13
-; GCN-HSA-NEXT: s_mov_b32 s46, s11
-; GCN-HSA-NEXT: s_mov_b32 s48, s9
-; GCN-HSA-NEXT: s_mov_b32 s50, s7
-; GCN-HSA-NEXT: s_mov_b32 s52, s5
-; GCN-HSA-NEXT: s_mov_b32 s54, s3
-; GCN-HSA-NEXT: s_mov_b32 s56, s1
-; GCN-HSA-NEXT: s_lshr_b32 s58, s14, 16
-; GCN-HSA-NEXT: s_lshr_b32 s60, s12, 16
-; GCN-HSA-NEXT: s_lshr_b32 s62, s10, 16
-; GCN-HSA-NEXT: s_lshr_b32 s64, s8, 16
-; GCN-HSA-NEXT: s_lshr_b32 s66, s6, 16
-; GCN-HSA-NEXT: s_lshr_b32 s68, s4, 16
-; GCN-HSA-NEXT: s_lshr_b32 s70, s2, 16
-; GCN-HSA-NEXT: s_lshr_b32 s72, s0, 16
+; GCN-HSA-NEXT: s_mov_b32 s48, s13
+; GCN-HSA-NEXT: s_mov_b32 s50, s11
+; GCN-HSA-NEXT: s_mov_b32 s52, s9
+; GCN-HSA-NEXT: s_mov_b32 s54, s7
+; GCN-HSA-NEXT: s_mov_b32 s56, s5
+; GCN-HSA-NEXT: s_mov_b32 s44, s3
+; GCN-HSA-NEXT: s_mov_b32 s58, s1
+; GCN-HSA-NEXT: s_lshr_b32 s60, s14, 16
+; GCN-HSA-NEXT: s_lshr_b32 s62, s12, 16
+; GCN-HSA-NEXT: s_lshr_b32 s64, s10, 16
+; GCN-HSA-NEXT: s_lshr_b32 s66, s8, 16
+; GCN-HSA-NEXT: s_lshr_b32 s68, s6, 16
+; GCN-HSA-NEXT: s_lshr_b32 s70, s4, 16
+; GCN-HSA-NEXT: s_lshr_b32 s72, s2, 16
+; GCN-HSA-NEXT: s_lshr_b32 s74, s0, 16
; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000
; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48
; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000
; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 48
-; GCN-HSA-NEXT: s_ashr_i64 s[74:75], s[6:7], 48
+; GCN-HSA-NEXT: s_ashr_i64 s[46:47], s[6:7], 48
; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48
; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48
; GCN-HSA-NEXT: s_ashr_i64 s[80:81], s[12:13], 48
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
-; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[72:73], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[74:75], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[72:73], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[70:71], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[68:69], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[66:67], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000
-; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xf0
-; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s44
-; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xd0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45
-; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s44
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s45
-; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xb0
-; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s44
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s45
-; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90
-; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s44
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s45
-; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57
-; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0
+; GCN-HSA-NEXT: s_add_u32 s58, s16, 0xf0
+; GCN-HSA-NEXT: s_addc_u32 s59, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48
+; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xd0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s49
+; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s48
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49
+; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xb0
+; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s46
+; GCN-HSA-NEXT: s_add_u32 s46, s16, 0x90
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s59
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s47
+; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40
-; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x50
+; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x70
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41
; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v30, s40
+; GCN-HSA-NEXT: v_mov_b32_e32 v31, s41
+; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x50
+; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38
; GCN-HSA-NEXT: s_add_u32 s38, s16, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39
; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39
; GCN-HSA-NEXT: s_add_u32 s38, s16, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41
; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14
; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xe0
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s49
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15
; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v30, s44
; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12
; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s74
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s75
-; GCN-HSA-NEXT: v_mov_b32_e32 v31, s45
-; GCN-HSA-NEXT: v_mov_b32_e32 v32, s40
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13
; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s55
-; GCN-HSA-NEXT: v_mov_b32_e32 v33, s41
-; GCN-HSA-NEXT: v_mov_b32_e32 v34, s38
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s45
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s38
; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43
-; GCN-HSA-NEXT: v_mov_b32_e32 v35, s39
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s39
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s30
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s31
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s14
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23]
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v18
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v15
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v19
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v18
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v19
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v18
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[18:21], off, s[8:11], 0 offset:32
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v17
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v26
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v24
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v27
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v26
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v25
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v24
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v21
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v20
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v19
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v18
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v31
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v31
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v30
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, 0xffff, v29
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v28
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v25
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v24
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v25
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v24
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[38:41], off, s[8:11], 0 offset:64
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xffff, v23
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v22
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v41
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v42, 16, v40
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v39
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v37
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v35
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v41
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v41, 0xffff, v40
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v39
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v38
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v37
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v36
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v35
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v38, 16, v42
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v41
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v40
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v39
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v42
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, 0xffff, v41
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:96
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v40
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v39
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v23
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v22
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:96
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v55, 0xffff, v21
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v20
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[57:60], off, s[8:11], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v58
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 16, v57
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v56
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v25
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v24
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v55
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v58
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v57
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v56
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v55
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v25
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v24
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v23
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v22
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v42
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v41
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v42
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v41
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v40
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v39
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v60
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v59
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v63, 16, v58
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v57
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v60
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v59
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v62, 0xffff, v58
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v57
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[53:56], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60
+; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70
+; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s6, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0
; GCN-HSA-NEXT: s_add_u32 s8, s2, 48
-; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0
; GCN-HSA-NEXT: s_add_u32 s2, s2, 64
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
+; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s8
+; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33]
; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s4
-; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
+; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0
-; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
-; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[0:1]
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v31, 16, v13
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v29, 16, v12
+; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xffff, v13
+; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v12
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31]
+; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0
+; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xd0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v31, 16, v15
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xffff, v15
+; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v14
+; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xa0
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27]
-; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xa0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v2
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
-; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[28:31]
; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7
-; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, s9
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v9
+; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
-; GCN-HSA-NEXT: s_waitcnt vmcnt(9)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v8
+; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v11
-; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v10
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10
+; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v11
+; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v10
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(9)
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5
+; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5
+; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6
+; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[8:11]
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v7
+; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v6
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, s5
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10]
; GCN-HSA-NEXT: s_waitcnt vmcnt(10)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v13
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v12
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v13
-; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v12
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v15
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1
+; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15
-; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v14
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s4
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v3
+; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11]
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3
; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v17
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v16
; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v17
; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v19
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v18
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v19
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT: s_waitcnt vmcnt(10)
+; GCN-HSA-NEXT: s_waitcnt vmcnt(12)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v33
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v32
; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v33
; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v32
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v21
-; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v20
+; GCN-HSA-NEXT: s_waitcnt vmcnt(10)
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v25
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v24
+; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v25
+; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v24
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v23
-; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v22
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v27
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v26
+; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v27
+; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v26
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v29
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v28
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v29
-; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v28
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v21
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v20
+; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v21
+; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v20
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v31
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v23
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v31
-; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v30
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v22
+; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v23
+; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v22
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v13
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xffff, v13
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v12
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v13
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v19
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v13
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v12
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v16
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v19
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v18
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v17
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
+; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v19
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v19
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v18
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v17
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v16
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80
-; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v27
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v23
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v22
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v23
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v22
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v21
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v20
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:80
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v38
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v37
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v48, 16, v35
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v38
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v37
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v36
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v35
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v22
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v21
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v20
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v23
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v22
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, 0xffff, v21
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v20
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v42
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v41
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v40
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v39
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v42
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v41
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v53, 0xffff, v40
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v39
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:96
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:112
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v26
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v24
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v27
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v26
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v25
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v24
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v31
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v30
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v31
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v30
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v29
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v31, 0xffff, v28
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v39
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v38
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v37
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v36
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v39
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v38
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v37
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v36
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:96
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[52:55], off, s[8:11], 0 offset:112
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v62, 16, v42
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v39
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v56
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v56
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v55
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v60, 16, v41
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v40
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v39
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v61, 0xffff, v42
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v59, 0xffff, v41
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v40
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v39
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v42, 16, v58
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v40, 16, v57
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v41, 0xffff, v58
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xffff, v57
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v53
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v52
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v53
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v52
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v38
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v37
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v36
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, 0xffff, v39
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xffff, v38
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, 0xffff, v37
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, 0xffff, v36
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v55
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v54
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v55
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v54
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
-; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50
+; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s6, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0
-; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5
-; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
+; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7
+; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
-; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2
+; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
+; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33]
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0
-; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
+; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v13
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v12
; GCN-HSA-NEXT: v_bfe_i32 v26, v13, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v24, v12, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[12:13]
-; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33]
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27]
+; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v15
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v14
; GCN-HSA-NEXT: v_bfe_i32 v26, v15, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v24, v14, 0, 16
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[24:27]
; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v8
; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10
; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14]
-; GCN-HSA-NEXT: s_waitcnt vmcnt(9)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[11:14]
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v39, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
+; GCN-HSA-NEXT: s_waitcnt vmcnt(9)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v4
; GCN-HSA-NEXT: v_bfe_i32 v10, v5, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11]
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6
-; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10]
-; GCN-HSA-NEXT: s_waitcnt vmcnt(10)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v6
+; GCN-HSA-NEXT: v_bfe_i32 v14, v7, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 0, 16
+; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0
; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15]
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[8:11]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v33
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v32
-; GCN-HSA-NEXT: v_bfe_i32 v14, v33, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v12, v32, 0, 16
+; GCN-HSA-NEXT: s_waitcnt vmcnt(12)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v29
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v28
+; GCN-HSA-NEXT: v_bfe_i32 v14, v29, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v12, v28, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v35
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v31
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v34
-; GCN-HSA-NEXT: v_bfe_i32 v10, v35, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v8, v34, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v30
+; GCN-HSA-NEXT: v_bfe_i32 v10, v31, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v8, v30, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v21
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v20
; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v29
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v28
-; GCN-HSA-NEXT: v_bfe_i32 v6, v29, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v4, v28, 0, 16
+; GCN-HSA-NEXT: s_waitcnt vmcnt(14)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v33
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v32
+; GCN-HSA-NEXT: v_bfe_i32 v6, v33, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v4, v32, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v31
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v35
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v30
-; GCN-HSA-NEXT: v_bfe_i32 v2, v31, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v0, v30, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v34
+; GCN-HSA-NEXT: v_bfe_i32 v2, v35, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v0, v34, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v0
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v2
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v3
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v6
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v20
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v7
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v21
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, 0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, 0
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s
-# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler -amdgpu-disable-unclustred-high-rp-reschedule %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustred-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s
# REQUIRES: asserts
---
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustred-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s
# Check that %3 was not rematerialized before the last store since its operand %1
# is killed by that store.
; GFX8-NEXT: v_mov_b32_e32 v6, 0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: v_mov_b32_e32 v7, 0
-; GFX8-NEXT: s_movk_i32 s0, 0x7f
+; GFX8-NEXT: s_movk_i32 s4, 0x7f
; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB1_2 Depth 2
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: s_mov_b32 s1, 0
+; GFX8-NEXT: s_mov_b32 s5, 0
; GFX8-NEXT: .LBB1_2: ; %for.body
; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffb000, v4
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v5, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
+; GFX8-NEXT: s_mov_b64 s[0:1], vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffb800, v4
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v5, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11]
+; GFX8-NEXT: s_mov_b64 s[2:3], vcc
+; GFX8-NEXT: v_addc_u32_e64 v9, vcc, -1, v5, s[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffc000, v4
-; GFX8-NEXT: v_addc_u32_e32 v13, vcc, -1, v5, vcc
+; GFX8-NEXT: s_mov_b64 s[0:1], vcc
+; GFX8-NEXT: v_addc_u32_e64 v11, vcc, -1, v5, s[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11]
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xffffc800, v4
-; GFX8-NEXT: v_addc_u32_e32 v15, vcc, -1, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xffffd000, v4
+; GFX8-NEXT: s_mov_b64 s[2:3], vcc
+; GFX8-NEXT: v_addc_u32_e64 v13, vcc, -1, v5, s[0:1]
+; GFX8-NEXT: s_addk_i32 s5, 0x2000
+; GFX8-NEXT: s_cmp_gt_u32 s5, 0x3fffff
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v8, v6
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[12:13]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffd000, v4
+; GFX8-NEXT: s_mov_b64 s[0:1], vcc
+; GFX8-NEXT: v_addc_u32_e64 v15, vcc, -1, v5, s[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[14:15]
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v10, v16
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v7, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffd800, v4
+; GFX8-NEXT: s_mov_b64 s[2:3], vcc
+; GFX8-NEXT: v_addc_u32_e64 v7, vcc, -1, v5, s[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v8, v16
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffe000, v4
+; GFX8-NEXT: s_mov_b64 s[0:1], vcc
+; GFX8-NEXT: v_addc_u32_e64 v11, vcc, -1, v5, s[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v12, v14
+; GFX8-NEXT: v_addc_u32_e32 v13, vcc, v13, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffe800, v4
+; GFX8-NEXT: s_mov_b64 s[2:3], vcc
+; GFX8-NEXT: v_addc_u32_e64 v9, vcc, -1, v5, s[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v6, v14
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v13, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff000, v4
+; GFX8-NEXT: s_mov_b64 s[0:1], vcc
+; GFX8-NEXT: v_addc_u32_e64 v13, vcc, -1, v5, s[2:3]
; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13]
-; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[14:15]
-; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xffffd800, v4
-; GFX8-NEXT: v_addc_u32_e32 v19, vcc, -1, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffe000, v4
-; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17]
-; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[18:19]
-; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe800, v4
-; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xfffff000, v4
-; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[20:21]
-; GFX8-NEXT: flat_load_dwordx2 v[22:23], v[22:23]
-; GFX8-NEXT: v_addc_u32_e32 v25, vcc, -1, v5, vcc
-; GFX8-NEXT: s_addk_i32 s1, 0x2000
-; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff
-; GFX8-NEXT: s_waitcnt vmcnt(7)
-; GFX8-NEXT: v_add_u32_e32 v26, vcc, v8, v6
-; GFX8-NEXT: v_addc_u32_e32 v27, vcc, v9, v7, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[24:25]
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v4
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v5, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v10, v14
+; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v11, v7, vcc
+; GFX8-NEXT: v_addc_u32_e64 v7, s[0:1], -1, v5, s[0:1]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xfffff800, v4
; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
-; GFX8-NEXT: flat_load_dwordx2 v[24:25], v[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v5, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(3)
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v8, v14
+; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v9, v15, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5]
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x10000, v4
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(9)
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v26
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v27, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(8)
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v10
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v13, v11, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(7)
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v15, v11, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(6)
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v16, v10
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v17, v11, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(5)
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v18, v10
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v19, v11, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(4)
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v20, v10
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v21, v11, vcc
; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v22, v10
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v23, v11, vcc
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v14
+; GFX8-NEXT: v_addc_u32_e32 v13, vcc, v13, v15, vcc
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v12
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v13, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v10, v6
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v7, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v24, v6
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v25, v7, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
; GFX8-NEXT: s_cbranch_scc0 .LBB1_2
; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1
-; GFX8-NEXT: s_add_i32 s1, s0, -1
-; GFX8-NEXT: s_cmp_eq_u32 s0, 0
+; GFX8-NEXT: s_add_i32 s0, s4, -1
+; GFX8-NEXT: s_cmp_eq_u32 s4, 0
; GFX8-NEXT: s_cbranch_scc1 .LBB1_5
; GFX8-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX8-NEXT: s_mov_b32 s0, s1
+; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_branch .LBB1_1
; GFX8-NEXT: .LBB1_5: ; %while.end
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX900-NEXT: ; => This Inner Loop Header: Depth=2
; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v5, vcc
+; GFX900-NEXT: s_mov_b64 s[0:1], vcc
+; GFX900-NEXT: v_addc_co_u32_e64 v9, s[0:1], -1, v5, s[0:1]
; GFX900-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:-4096
; GFX900-NEXT: global_load_dwordx2 v[12:13], v[4:5], off offset:-2048
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v4
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v5, vcc
-; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
-; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, s2, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v5, vcc
-; GFX900-NEXT: global_load_dwordx2 v[22:23], v[14:15], off
-; GFX900-NEXT: global_load_dwordx2 v[24:25], v[16:17], off offset:-2048
-; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, s3, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v5, vcc
-; GFX900-NEXT: global_load_dwordx2 v[16:17], v[20:21], off offset:-4096
-; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s5, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v5, vcc
; GFX900-NEXT: s_addk_i32 s6, 0x2000
; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff
-; GFX900-NEXT: s_waitcnt vmcnt(4)
-; GFX900-NEXT: v_add_co_u32_e64 v28, s[0:1], v8, v6
-; GFX900-NEXT: v_addc_co_u32_e64 v29, s[0:1], v9, v7, s[0:1]
-; GFX900-NEXT: global_load_dwordx2 v[6:7], v[20:21], off offset:-2048
-; GFX900-NEXT: global_load_dwordx2 v[8:9], v[20:21], off
-; GFX900-NEXT: s_nop 0
-; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off offset:-2048
-; GFX900-NEXT: global_load_dwordx2 v[26:27], v[4:5], off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v8, v6
+; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v7, vcc
+; GFX900-NEXT: global_load_dwordx2 v[6:7], v[14:15], off offset:-2048
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v6, v8
+; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v9, vcc
+; GFX900-NEXT: global_load_dwordx2 v[7:8], v[14:15], off
+; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, s2, v4
+; GFX900-NEXT: s_mov_b64 s[0:1], vcc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v7, v16
+; GFX900-NEXT: v_addc_co_u32_e64 v7, s[0:1], -1, v5, s[0:1]
+; GFX900-NEXT: global_load_dwordx2 v[6:7], v[6:7], off offset:-2048
+; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v8, v9, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, s3, v4
+; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v5, vcc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v6, v14
+; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v7, v15, vcc
+; GFX900-NEXT: global_load_dwordx2 v[6:7], v[8:9], off offset:-4096
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v6, v14
+; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v7, v15, vcc
+; GFX900-NEXT: global_load_dwordx2 v[6:7], v[8:9], off offset:-2048
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v6, v14
+; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
+; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v7, v15, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, s5, v4
+; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v5, vcc
+; GFX900-NEXT: global_load_dwordx2 v[6:7], v[6:7], off offset:-2048
+; GFX900-NEXT: s_waitcnt vmcnt(1)
+; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v8, v14
+; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v9, v15, vcc
+; GFX900-NEXT: global_load_dwordx2 v[8:9], v[4:5], off
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, 0x10000, v4
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX900-NEXT: s_waitcnt vmcnt(7)
-; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v18, v28
-; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v19, v29, vcc
-; GFX900-NEXT: s_waitcnt vmcnt(6)
-; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v22, v14
-; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v23, v15, vcc
-; GFX900-NEXT: s_waitcnt vmcnt(5)
-; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v24, v14
-; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v25, v15, vcc
-; GFX900-NEXT: s_waitcnt vmcnt(4)
-; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v16, v14
-; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v17, v15, vcc
-; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v6, v14
; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v15, vcc
-; GFX900-NEXT: s_waitcnt vmcnt(2)
-; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6
-; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
-; GFX900-NEXT: s_waitcnt vmcnt(1)
-; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v20, v6
-; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v21, v7, vcc
; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6
; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v7, vcc
; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v12, v6
; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v7, vcc
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v26, v6
-; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v27, v7, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6
+; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
; GFX900-NEXT: s_cbranch_scc0 .LBB1_2
; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1