From e3895bfb470de6d552b73af103c783ce062e2dcd Mon Sep 17 00:00:00 2001
From: Marek Olsak
Date: Fri, 25 Nov 2016 16:03:34 +0000
Subject: [PATCH] Revert "AMDGPU: Implement SGPR spilling with scalar stores"

This reverts commit 4404d0d6e354e80dd7f8f0a0e12d8ad809cf007e.

llvm-svn: 287936
---
 llvm/lib/Target/AMDGPU/SIInsertWaits.cpp           |  43 +----
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp             |  14 +-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp          | 106 +-------------
 llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll   |  36 ++---
 llvm/test/CodeGen/AMDGPU/basic-branch.ll           |   2 +-
 llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll    |  46 +-----
 llvm/test/CodeGen/AMDGPU/spill-m0.ll               |  21 +--
 .../MIR/AMDGPU/scalar-store-cache-flush.mir        | 173 ---------------------
 8 files changed, 38 insertions(+), 403 deletions(-)
 delete mode 100644 llvm/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
index da4db63..a9e6939 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -532,7 +532,6 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
   IV = getIsaVersion(ST->getFeatureBits());
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   HardwareLimits.Named.VM = getVmcntBitMask(IV);
   HardwareLimits.Named.EXP = getExpcntBitMask(IV);
@@ -544,27 +543,20 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
   LastOpcodeType = OTHER;
   LastInstWritesM0 = false;
   IsFlatOutstanding = false;
-  ReturnsVoid = MFI->returnsVoid();
+  ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid();
 
   memset(&UsedRegs, 0, sizeof(UsedRegs));
   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
 
   SmallVector<MachineInstr *, 4> RemoveMI;
-  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
-
-  bool HaveScalarStores = false;
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI) {
 
     MachineBasicBlock &MBB = *BI;
-
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E; ++I) {
 
-      if (!HaveScalarStores && TII->isScalarStore(*I))
-        HaveScalarStores = true;
-
       if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
         // There is a hardware bug on CI/SI where SMRD instruction may corrupt
         // vccz bit, so when we detect that an instruction may read from a
@@ -633,45 +625,12 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
 
       pushInstruction(MBB, I, Increment);
       handleSendMsg(MBB, I);
-
-      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
-          I->getOpcode() == AMDGPU::SI_RETURN)
-        EndPgmBlocks.push_back(&MBB);
     }
 
     // Wait for everything at the end of the MBB
     Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
   }
 
-  if (HaveScalarStores) {
-    // If scalar writes are used, the cache must be flushed or else the next
-    // wave to reuse the same scratch memory can be clobbered.
-    //
-    // Insert s_dcache_wb at wave termination points if there were any scalar
-    // stores, and only if the cache hasn't already been flushed. This could be
-    // improved by looking across blocks for flushes in postdominating blocks
-    // from the stores but an explicitly requested flush is probably very rare.
-    for (MachineBasicBlock *MBB : EndPgmBlocks) {
-      bool SeenDCacheWB = false;
-
-      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
-           I != E; ++I) {
-
-        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
-          SeenDCacheWB = true;
-        else if (TII->isScalarStore(*I))
-          SeenDCacheWB = false;
-
-        // FIXME: It would be better to insert this before a waitcnt if any.
-        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
-             I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) {
-          Changes = true;
-          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
-        }
-      }
-    }
-  }
-
   for (MachineInstr *I : RemoveMI)
     I->eraseFromParent();
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e9fbde1..63ce258 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -544,7 +544,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
     }
 
-    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
+    BuildMI(MBB, MI, DL, OpDesc)
       .addReg(SrcReg, getKillRegState(isKill)) // data
       .addFrameIndex(FrameIndex) // addr
       .addMemOperand(MMO)
@@ -554,11 +554,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     // needing them, and need to ensure that the reserved registers are
     // correctly handled.
 
-    if (ST.hasScalarStores()) {
-      // m0 is used for offset to scalar stores if used to spill.
-      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
-    }
-
     return;
   }
 
@@ -648,17 +643,12 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
     }
 
-    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
+    BuildMI(MBB, MI, DL, OpDesc, DestReg)
      .addFrameIndex(FrameIndex) // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
 
-    if (ST.hasScalarStores()) {
-      // m0 is used for offset to scalar stores if used to spill.
-      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
-    }
-
     return;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 948ea11..54fcbb5 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -24,12 +24,6 @@
 
 using namespace llvm;
 
-static cl::opt<bool> EnableSpillSGPRToSMEM(
-  "amdgpu-spill-sgpr-to-smem",
-  cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
-  cl::init(true));
-
-
 static bool hasPressureSet(const int *PSets, unsigned PSetID) {
   for (unsigned i = 0; PSets[i] != -1; ++i) {
     if (PSets[i] == (int)PSetID)
@@ -491,21 +485,18 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
 void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                                int Index,
                                RegScavenger *RS) const {
-  MachineBasicBlock *MBB = MI->getParent();
-  MachineFunction *MF = MBB->getParent();
+  MachineFunction *MF = MI->getParent()->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineBasicBlock *MBB = MI->getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
   const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  const DebugLoc &DL = MI->getDebugLoc();
 
   unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
   unsigned SuperReg = MI->getOperand(0).getReg();
   bool IsKill = MI->getOperand(0).isKill();
-  const DebugLoc &DL = MI->getDebugLoc();
-
-  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
-  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-
-  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
 
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
 
@@ -513,55 +504,6 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
     unsigned SubReg = NumSubRegs == 1 ?
       SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
 
-    if (SpillToSMEM) {
-      if (SuperReg == AMDGPU::M0) {
-        assert(NumSubRegs == 1);
-        unsigned CopyM0
-          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
-
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), CopyM0)
-          .addReg(AMDGPU::M0, getKillRegState(IsKill));
-
-        // The real spill now kills the temp copy.
-        SubReg = SuperReg = CopyM0;
-        IsKill = true;
-      }
-
-      int64_t FrOffset = FrameInfo.getObjectOffset(Index);
-      unsigned Size = FrameInfo.getObjectSize(Index);
-      unsigned Align = FrameInfo.getObjectAlignment(Index);
-      MachinePointerInfo PtrInfo
-        = MachinePointerInfo::getFixedStack(*MF, Index);
-      MachineMemOperand *MMO
-        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
-                                   Size, Align);
-
-      unsigned OffsetReg = AMDGPU::M0;
-      // Add i * 4 wave offset.
-      //
-      // SMEM instructions only support a single offset, so increment the wave
-      // offset.
-
-      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
-      if (Offset != 0) {
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
-          .addReg(MFI->getScratchWaveOffsetReg())
-          .addImm(Offset);
-      } else {
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
-          .addReg(MFI->getScratchWaveOffsetReg());
-      }
-
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR))
-        .addReg(SubReg, getKillRegState(IsKill)) // sdata
-        .addReg(MFI->getScratchRSrcReg()) // sbase
-        .addReg(OffsetReg) // soff
-        .addImm(0) // glc
-        .addMemOperand(MMO);
-
-      continue;
-    }
-
     struct SIMachineFunctionInfo::SpilledReg Spill =
       MFI->getSpilledReg(MF, Index, i);
     if (Spill.hasReg()) {
@@ -588,9 +530,10 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
       // it are fixed.
     } else {
       // Spill SGPR to a frame index.
+      // FIXME we should use S_STORE_DWORD here for VI.
+      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
       unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 
-      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
       MachineInstrBuilder Mov
         = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
@@ -642,7 +585,6 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
 
   unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
   unsigned SuperReg = MI->getOperand(0).getReg();
-  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
 
   // m0 is not allowed as with readlane/writelane, so a temporary SGPR and
   // extra copy is needed.
@@ -652,44 +594,10 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
     SuperReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
   }
 
-  int64_t FrOffset = FrameInfo.getObjectOffset(Index);
-
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
     unsigned SubReg = NumSubRegs == 1 ?
       SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
 
-    if (SpillToSMEM) {
-      unsigned Size = FrameInfo.getObjectSize(Index);
-      unsigned Align = FrameInfo.getObjectAlignment(Index);
-      MachinePointerInfo PtrInfo
-        = MachinePointerInfo::getFixedStack(*MF, Index);
-      MachineMemOperand *MMO
-        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
-                                   Size, Align);
-
-      unsigned OffsetReg = AMDGPU::M0;
-
-      // Add i * 4 offset
-      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
-      if (Offset != 0) {
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
-          .addReg(MFI->getScratchWaveOffsetReg())
-          .addImm(Offset);
-      } else {
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
-          .addReg(MFI->getScratchWaveOffsetReg());
-      }
-
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), SubReg)
-        .addReg(MFI->getScratchRSrcReg()) // sbase
-        .addReg(OffsetReg) // soff
-        .addImm(0) // glc
-        .addMemOperand(MMO)
-        .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
-
-      continue;
-    }
-
     SIMachineFunctionInfo::SpilledReg Spill =
       MFI->getSpilledReg(MF, Index, i);
 
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index 617204f..aba0b63 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -1,20 +1,16 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
 
-; If spilling to smem, additional registers are used for the resource
-; descriptor.
-
-; ALL-LABEL: {{^}}max_14_sgprs:
+; CHECK-LABEL: {{^}}max_14_sgprs:
 
 ; FIXME: Should be ablo to skip this copying of the private segment
 ; buffer because all the SGPR spills are to VGPRs.
 
-; ALL: s_mov_b64 s[6:7], s[2:3]
-; ALL: s_mov_b64 s[4:5], s[0:1]
-; ALL: SGPRBlocks: 1
-; ALL: NumSGPRsForWavesPerEU: 14
-define void @max_14_sgprs(i32 addrspace(1)* %out1,
+; CHECK: s_mov_b64 s[6:7], s[2:3]
+; CHECK: s_mov_b64 s[4:5], s[0:1]
+; CHECK: SGPRBlocks: 1
+; CHECK: NumSGPRsForWavesPerEU: 14
+define void @max_14_sgprs(i32 addrspace(1)* %out1,
                           i32 addrspace(1)* %out2,
                           i32 addrspace(1)* %out3,
                           i32 addrspace(1)* %out4,
@@ -35,7 +31,7 @@ define void @max_14_sgprs(i32 addrspace(1)* %out1,
 ; ---------------------
 ; total: 14
 
-; + reserved vcc, xnack, flat_scratch = 20
+; + reserved vcc, flat_scratch = 18
 
 ; Because we can't handle re-using the last few input registers as the
 ; special vcc etc. registers (as well as decide to not use the unused
@@ -44,14 +40,14 @@
 ; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
 ; TOSGPR: SGPRBlocks: 2
-; TOSGPR: NumSGPRsForWavesPerEU: 20
+; TOSGPR: NumSGPRsForWavesPerEU: 18
 
 ; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b32 s9, s13
 ; TOSMEM: s_mov_b64 s[4:5], s[0:1]
-; TOSMEM: s_mov_b32 s3, s13
 
 ; TOSMEM: SGPRBlocks: 2
-; TOSMEM: NumSGPRsForWavesPerEU: 20
+; TOSMEM: NumSGPRsForWavesPerEU: 18
 define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
                                          i32 addrspace(1)* %out2,
                                          i32 addrspace(1)* %out3,
@@ -83,12 +79,12 @@ define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
 ; ; swapping the order the registers are copied from what normally
 ; ; happens.
 
-; TOSMEM: s_mov_b32 s5, s11
-; TOSMEM: s_add_u32 m0, s5,
-; TOSMEM: s_buffer_store_dword vcc_lo, s[0:3], m0
+; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+; TOSMEM: s_mov_b32 s3, s11
 
-; ALL: SGPRBlocks: 2
-; ALL: NumSGPRsForWavesPerEU: 18
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 16
 define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
                                          i32 addrspace(1)* %out2,
                                          i32 addrspace(1)* %out3,
diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
index 83313ed5..7bc4d73 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
index f267eb4..c3d9ee7 100644
--- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -1,44 +1,14 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SGPR %s
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
 
 ; Make sure this doesn't crash.
-; ALL-LABEL: {{^}}test:
-; ALL: s_mov_b32 s92, SCRATCH_RSRC_DWORD0
-; ALL: s_mov_b32 s91, s3
-
+; CHECK: {{^}}test:
 ; Make sure we are handling hazards correctly.
-; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12
-; SGPR-NEXT: s_waitcnt vmcnt(0)
-; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
-; SGPR-NEXT: s_nop 4
-; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
-
-
-; Make sure scratch wave offset register is correctly incremented and
-; then restored.
-
-; SMEM: s_mov_b32 m0, s91{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
-; SMEM: s_add_u32 m0, s91, 0x100{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
-; SMEM: s_add_u32 m0, s91, 0x200{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
-; SMEM: s_add_u32 m0, s91, 0x300{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
-
-
-; SMEM: s_mov_b32 m0, s91{{$}}
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
-; SMEM: s_add_u32 m0, s91, 0x100{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
-; SMEM: s_add_u32 m0, s91, 0x200{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
-; SMEM: s_add_u32 m0, s91, 0x300{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
-
-; ALL: s_endpgm
+; CHECK: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
+; CHECK-NEXT: s_nop 4
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
+; CHECK: s_endpgm
 define void @test(i32 addrspace(1)* %out, i32 %in) {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
index c5ef75e..74e33d1 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
@@ -1,13 +1,12 @@
 ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
 ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
 
 ; XXX - Why does it like to use vcc?
 
 ; GCN-LABEL: {{^}}spill_m0:
-; TOSMEM: s_mov_b32 s84, SCRATCH_RSRC_DWORD0
+; TOSMEM: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 
 ; GCN: s_cmp_lg_u32
 
@@ -17,13 +16,6 @@
 ; TOVMEM: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], m0
 ; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill
 ; TOVMEM: s_waitcnt vmcnt(0)
-
-; TOSMEM: s_mov_b32 vcc_hi, m0
-; TOSMEM: s_mov_b32 m0, s3{{$}}
-; TOSMEM-NOT: vcc_hi
-; TOSMEM: s_buffer_store_dword vcc_hi, s[84:87], m0 ; 4-byte Folded Spill
-; TOSMEM: s_waitcnt lgkmcnt(0)
-
 ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
 
 ; GCN: [[ENDIF]]:
@@ -35,11 +27,6 @@
 ; TOVMEM: v_readfirstlane_b32 vcc_hi, [[RELOAD_VREG]]
 ; TOVMEM: s_mov_b32 m0, vcc_hi
 
-; TOSMEM: s_mov_b32 m0, s3{{$}}
-; TOSMEM: s_buffer_load_dword vcc_hi, s[84:87], m0 ; 4-byte Folded Reload
-; TOSMEM-NOT: vcc_hi
-; TOSMEM: s_mov_b32 m0, vcc_hi
-
 ; GCN: s_add_i32 m0, m0, 1
 define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
@@ -61,8 +48,6 @@ endif:
 
 ; GCN-LABEL: {{^}}spill_m0_lds:
 ; GCN-NOT: v_readlane_b32 m0
-; GCN-NOT: s_buffer_store_dword m0
-; GCN-NOT: s_buffer_load_dword m0
 define amdgpu_ps void @spill_m0_lds(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) #0 {
 main_body:
   %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir b/llvm/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir
deleted file mode 100644
index af71086..0000000
--- a/llvm/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir
+++ /dev/null
@@ -1,173 +0,0 @@
-# RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s
-
---- |
-  define void @basic_insert_dcache_wb() {
-    ret void
-  }
-
-  define void @explicit_flush_after() {
-    ret void
-  }
-
-  define void @explicit_flush_before() {
-    ret void
-  }
-
-  define void @no_scalar_store() {
-    ret void
-  }
-
-  define void @multi_block_store() {
-  bb0:
-    br i1 undef, label %bb1, label %bb2
-
-  bb1:
-    ret void
-
-  bb2:
-    ret void
-  }
-
-  define void @one_block_store() {
-  bb0:
-    br i1 undef, label %bb1, label %bb2
-
-  bb1:
-    ret void
-
-  bb2:
-    ret void
-  }
-
-  define amdgpu_ps float @si_return() {
-    ret float undef
-  }
-
-...
----
-# CHECK-LABEL: name: basic_insert_dcache_wb
-# CHECK: bb.0:
-# CHECK-NEXT: S_STORE_DWORD
-# CHECK-NEXT: S_DCACHE_WB
-# CHECK-NEXT: S_ENDPGM
-
-name: basic_insert_dcache_wb
-tracksRegLiveness: false
-
-body: |
-  bb.0:
-    S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
-    S_ENDPGM
-...
----
-# Already has an explicitly requested flush after the last store.
-# CHECK-LABEL: name: explicit_flush_after
-# CHECK: bb.0:
-# CHECK-NEXT: S_STORE_DWORD
-# CHECK-NEXT: S_DCACHE_WB
-# CHECK-NEXT: S_ENDPGM
-
-name: explicit_flush_after
-tracksRegLiveness: false
-
-body: |
-  bb.0:
-    S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
-    S_DCACHE_WB
-    S_ENDPGM
-...
----
-# Already has an explicitly requested flush before the last store.
-# CHECK-LABEL: name: explicit_flush_before
-# CHECK: bb.0:
-# CHECK-NEXT: S_DCACHE_WB
-# CHECK-NEXT: S_STORE_DWORD
-# CHECK-NEXT: S_DCACHE_WB
-# CHECK-NEXT: S_ENDPGM
-
-name: explicit_flush_before
-tracksRegLiveness: false
-
-body: |
-  bb.0:
-    S_DCACHE_WB
-    S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
-    S_ENDPGM
-...
----
-# CHECK-LABEL: no_scalar_store
-# CHECK: bb.0
-# CHECK-NEXT: S_ENDPGM
-name: no_scalar_store
-tracksRegLiveness: false
-
-body: |
-  bb.0:
-    S_ENDPGM
-...
-
-# CHECK-LABEL: name: multi_block_store
-# CHECK: bb.0:
-# CHECK-NEXT: S_STORE_DWORD
-# CHECK-NEXT: S_DCACHE_WB
-# CHECK-NEXT: S_ENDPGM
-
-# CHECK: bb.1:
-# CHECK-NEXT: S_STORE_DWORD
-# CHECK-NEXT: S_DCACHE_WB
-# CHECK-NEXT: S_ENDPGM
-
-name: multi_block_store
-tracksRegLiveness: false
-
-body: |
-  bb.0:
-    S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
-    S_ENDPGM
-
-  bb.1:
-    S_STORE_DWORD_SGPR undef %sgpr4, undef %sgpr6_sgpr7, undef %m0, 0
-    S_ENDPGM
-...
-...
-
-# This one should be able to omit the flush in the storeless block but
-# this isn't handled now.
-
-# CHECK-LABEL: name: one_block_store
-# CHECK: bb.0:
-# CHECK-NEXT: S_DCACHE_WB
-# CHECK-NEXT: S_ENDPGM
-
-# CHECK: bb.1:
-# CHECK-NEXT: S_STORE_DWORD
-# CHECK-NEXT: S_DCACHE_WB
-# CHECK-NEXT: S_ENDPGM
-
-name: one_block_store
-tracksRegLiveness: false
-
-body: |
-  bb.0:
-    S_ENDPGM
-
-  bb.1:
-    S_STORE_DWORD_SGPR undef %sgpr4, undef %sgpr6_sgpr7, undef %m0, 0
-    S_ENDPGM
-...
----
-# CHECK-LABEL: name: si_return
-# CHECK: bb.0:
-# CHECK-NEXT: S_STORE_DWORD
-# CHECK-NEXT: S_WAITCNT
-# CHECK-NEXT: S_DCACHE_WB
-# CHECK-NEXT: SI_RETURN
-
-name: si_return
-tracksRegLiveness: false
-
-body: |
-  bb.0:
-    S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
-    SI_RETURN undef %vgpr0
-...
-- 
2.7.4