using namespace llvm;
-static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
- const MachineFrameInfo &MFI) {
- return FuncInfo->hasSpilledSGPRs() &&
- (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
-}
-
static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
const SIRegisterInfo *TRI) {
return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-
// Add wave offset in bytes to private base offset.
// See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
// We need to insert initialization of the scratch resource descriptor.
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
- assert(ScratchRsrcReg != AMDGPU::NoRegister);
+ if (ScratchRsrcReg == AMDGPU::NoRegister)
+ return AMDGPU::NoRegister;
if (ST.hasSGPRInitBug() ||
ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+ unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
+ ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI);
+ AllSGPR128s = AllSGPR128s.slice(
+     std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
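+ // For example (illustrative values): 14 preloaded SGPRs gives
+ // NumPreloaded = (14 + 3) / 4 = 4, so the first four tuples
+ // (s[0:3] .. s[12:15]) overlapping the preloaded registers are sliced off.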
+
- // Skip the last 2 elements because the last one is reserved for VCC, and
- // this is the 2nd to last element already.
- for (MCPhysReg Reg : getAllSGPR128(MF, TRI).drop_back(2).slice(NumPreloaded)) {
+ // Registers reserved at the end of the list (e.g. for VCC) are rejected by
+ // the isAllocatable check below.
+ for (MCPhysReg Reg : AllSGPR128s) {
// Pick the first unallocated one. Make sure we don't clobber the other
// reserved input we needed.
- if (!MRI.isPhysRegUsed(Reg)) {
- assert(MRI.isAllocatable(Reg));
+ if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
MRI.replaceRegWith(ScratchRsrcReg, Reg);
MFI->setScratchRSrcReg(Reg);
return Reg;
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
MachineRegisterInfo &MRI = MF.getRegInfo();
+
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+ ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI);
+ if (NumPreloaded > AllSGPRs.size())
+ return ScratchWaveOffsetReg;
+
+ AllSGPRs = AllSGPRs.slice(NumPreloaded);
+
// We need to drop registers from the end of the list that we cannot use
// for the scratch wave offset.
// + 2 because s102 and s103 do not exist on VI
// + 2 for vcc
// + 2 for xnack_mask
// + 2 for flat_scratch
// + 4 for the registers reserved for the scratch resource descriptor
// + 1 for the register reserved for the scratch wave offset. If there
//     are no other free SGPRs, then the value will stay in this register.
// ----
// 13
- for (MCPhysReg Reg : getAllSGPRs(MF, TRI).drop_back(13).slice(NumPreloaded)) {
+ if (AllSGPRs.size() < 13)
+ return ScratchWaveOffsetReg;
+
+ for (MCPhysReg Reg : AllSGPRs.drop_back(13)) {
// Pick the first unallocated SGPR. Be careful not to pick an alias of the
// scratch descriptor, since we haven't added its uses yet.
if (!MRI.isPhysRegUsed(Reg)) {
if (ST.debuggerEmitPrologue())
emitDebuggerPrologue(MF, MBB);
- if (!MF.getFrameInfo().hasStackObjects())
- return;
-
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
//
// FIXME: We should be cleaning up these unused SGPR spill frame indices
// somewhere.
- if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
- return;
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
unsigned ScratchRsrcReg
  = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
unsigned ScratchWaveOffsetReg
= getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
- assert(ScratchRsrcReg != AMDGPU::NoRegister);
- assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
+
+ if (ScratchRsrcReg == AMDGPU::NoRegister) {
+ assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
+ return;
+ }
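+ // Everything below here sets up scratch access; if no scratch resource
+ // register was reserved, the function does not access scratch memory and
+ // there is nothing to initialize.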
+
assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
- if (MFI->hasFlatScratchInit())
+ // We need to do the replacement of the private segment buffer and wave offset
+ // register even if there are no stack objects. There could be stores to undef
+ // or a constant without an associated object.
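+ // For example, a function whose only private access is
+ //   store volatile i32 0, i32* undef
+ // has no frame objects, but still emits a scratch buffer store that needs
+ // the resource descriptor and wave offset, as the tests added with this
+ // patch exercise.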
+
+ // FIXME: We still have implicit uses on SGPR spill instructions in case they
+ // need to spill to vector memory. It's likely that will not happen, but at
+ // this point it appears we need the setup. This part of the prolog should be
+ // emitted after frame indices are eliminated.
+
+ if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
emitFlatScratchInit(TII, TRI, MF, MBB);
// We need to insert initialization of the scratch resource descriptor.
unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
if (ST.isAmdCodeObjectV2()) {
PreloadedPrivateBufferReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
- // If we reserved the original input registers, we don't need to copy to the
- // reserved registers.
- if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
- // We should always reserve these 5 registers at the same time.
- assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
- "scratch wave offset and private segment buffer inconsistent");
- return;
- }
+ bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
+ bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);
// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
- MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
- MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+ if (OffsetRegUsed) {
+ assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
+ "scratch wave offset input is required");
+ MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+ MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+ }
- if (ST.isAmdCodeObjectV2()) {
+ if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
+ assert(ST.isAmdCodeObjectV2());
MRI.addLiveIn(PreloadedPrivateBufferReg);
MBB.addLiveIn(PreloadedPrivateBufferReg);
}
if (&OtherBB == &MBB)
continue;
- OtherBB.addLiveIn(ScratchRsrcReg);
- OtherBB.addLiveIn(ScratchWaveOffsetReg);
+ if (OffsetRegUsed)
+ OtherBB.addLiveIn(ScratchWaveOffsetReg);
+
+ if (ResourceRegUsed)
+ OtherBB.addLiveIn(ScratchRsrcReg);
}
DebugLoc DL;
MachineBasicBlock::iterator I = MBB.begin();
- if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
- // Make sure we emit the copy for the offset first. We may have chosen to
- // copy the buffer resource into a register that aliases the input offset
- // register.
+ // If we reserved the original input registers, we don't need to copy to the
+ // reserved registers.
+
+ bool CopyBuffer = ResourceRegUsed &&
+ PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
+ ScratchRsrcReg != PreloadedPrivateBufferReg;
+
+ // This needs to be careful about the copy order to avoid overwriting one of
+ // the input registers before it has been copied to its final
+ // destination. Usually the offset should be copied first.
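+ // For example (hypothetical registers): if the input resource arrives in
+ // s[0:3] and s1 was chosen as the scratch wave offset, copying the offset
+ // into s1 first would clobber the not-yet-copied input resource, so the
+ // buffer copy must be emitted first.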
+ bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
+ ScratchWaveOffsetReg);
+ if (CopyBuffer && CopyBufferFirst) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+ .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+ }
+
+ if (OffsetRegUsed &&
+ PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
.addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
}
- if (ST.isAmdCodeObjectV2()) {
- // Insert copies from argument register.
- assert(
- !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
- !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));
-
+ if (CopyBuffer && !CopyBufferFirst) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
.addReg(PreloadedPrivateBufferReg, RegState::Kill);
- } else {
+ }
+
+ if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
+ assert(!ST.isAmdCodeObjectV2());
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ // Add the scratch resource registers as implicit uses because we may end up
+ // needing them, and need to ensure that the reserved registers are
+ // correctly handled.
BuildMI(MBB, MI, DL, OpDesc)
  .addReg(SrcReg, getKillRegState(isKill)) // data
  .addFrameIndex(FrameIndex) // addr
-  .addMemOperand(MMO);
+  .addMemOperand(MMO)
+  .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+  .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
return;
}
BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
- .addMemOperand(MMO);
+ .addMemOperand(MMO)
+ .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+ .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
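+ // As in the store case above, add the scratch resource registers as
+ // implicit uses in case this restore is later lowered to a scratch buffer
+ // access.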
return;
}
F, "amdgpu-num-sgpr", MaxNumSGPRs);
// Make sure requested value does not violate subtarget's specifications.
- if (Requested && Requested <= getNumReservedSGPRs(ST))
+ if (Requested && (Requested <= getNumReservedSGPRs(ST)))
Requested = 0;
+ // If more SGPRs are required to support the input user/system SGPRs,
+ // increase to accommodate them.
+ //
+ // FIXME: This really ends up using the requested number of SGPRs + number
+ // of reserved special registers in total. Theoretically you could re-use
+ // the last input registers for these special registers, but this would
+ // require a lot of complexity to deal with the weird aliasing.
+ unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs();
+ if (Requested && Requested < NumInputSGPRs)
+ Requested = NumInputSGPRs;
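+ // For example, a function marked "amdgpu-num-sgpr"="12" that uses 14 input
+ // user/system SGPRs has its effective request raised to 14.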
+
// Make sure requested value is compatible with values implied by
// default/requested minimum/maximum number of waves per execution unit.
if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first))
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=TOSGPR %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=TOSMEM %s
-; CHECK-LABEL: {{^}}max_18_sgprs:
-; CHECK: SGPRBlocks: 1
-; CHECK: NumSGPRsForWavesPerEU: 13
-define void @max_18_sgprs(i32 addrspace(1)* %out1,
+; ALL-LABEL: {{^}}max_14_sgprs:
+
+; FIXME: Should be able to skip this copying of the private segment
+; buffer because all the SGPR spills are to VGPRs.
+
+; ALL: s_mov_b64 s[6:7], s[2:3]
+; ALL: s_mov_b64 s[4:5], s[0:1]
+
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 14
+define void @max_14_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
i32 addrspace(1)* %out4,
store i32 %four, i32 addrspace(1)* %out4
ret void
}
-attributes #0 = {"amdgpu-num-sgpr"="18"}
+
+; private resource: 4
+; scratch wave offset: 1
+; workgroup ids: 3
+; dispatch id: 2
+; queue ptr: 2
+; flat scratch init: 2
+; ---------------------
+; total: 14
+
+; + reserved vcc, flat_scratch = 18
+
+; Because we can't handle re-using the last few input registers for the
+; special vcc etc. registers (and can't drop the unused input features
+; once the number of registers is frozen), this ends up using more
+; registers than expected.
+
+; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
+; TOSGPR: SGPRBlocks: 2
+; TOSGPR: NumSGPRsForWavesPerEU: 18
+
+; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b32 s9, s13
+; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+
+; TOSMEM: SGPRBlocks: 2
+; TOSMEM: NumSGPRsForWavesPerEU: 18
+define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
+ i32 addrspace(1)* %out2,
+ i32 addrspace(1)* %out3,
+ i32 addrspace(1)* %out4,
+ i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+ store volatile i32 0, i32* undef
+ %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ store volatile i32 %x.0, i32 addrspace(1)* undef
+ %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %x.1, i32 addrspace(1)* undef
+ %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %x.2, i32 addrspace(1)* undef
+ %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+ store volatile i64 %x.3, i64 addrspace(1)* undef
+ %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+ store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+ %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+ store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
+
+ store i32 %one, i32 addrspace(1)* %out1
+ store i32 %two, i32 addrspace(1)* %out2
+ store i32 %three, i32 addrspace(1)* %out3
+ store i32 %four, i32 addrspace(1)* %out4
+ ret void
+}
+
+; ALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
+; Make sure copies for the input buffer are not clobbered. This requires
+; swapping the order in which the registers are copied from what normally
+; happens.
+
+; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+; TOSMEM: s_mov_b32 s3, s11
+
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 16
+define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
+ i32 addrspace(1)* %out2,
+ i32 addrspace(1)* %out3,
+ i32 addrspace(1)* %out4,
+ i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+ store volatile i32 0, i32* undef
+ %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ store volatile i32 %x.0, i32 addrspace(1)* undef
+ %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %x.1, i32 addrspace(1)* undef
+ %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %x.2, i32 addrspace(1)* undef
+ %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+ store volatile i64 %x.3, i64 addrspace(1)* undef
+ %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+ store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+
+ store i32 %one, i32 addrspace(1)* %out1
+ store i32 %two, i32 addrspace(1)* %out2
+ store i32 %three, i32 addrspace(1)* %out3
+ store i32 %four, i32 addrspace(1)* %out4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.y() #1
+declare i32 @llvm.amdgcn.workgroup.id.z() #1
+declare i64 @llvm.amdgcn.dispatch.id() #1
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1
+
+attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
+attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPTNONE %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=OPTNONE %s
+
+; There are no stack objects, but there is still a private memory access.
+; The private access registers need to be correctly initialized anyway,
+; and shifted down to the end of the used registers.
; GCN-LABEL: {{^}}store_to_undef:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
; -O0 should assume spilling, so the input scratch resource descriptor
; should be used directly without any copies.
}
; GCN-LABEL: {{^}}store_to_inttoptr:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
define void @store_to_inttoptr() #0 {
store volatile i32 0, i32* inttoptr (i32 123 to i32*)
ret void
}
; GCN-LABEL: {{^}}load_from_undef:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
define void @load_from_undef() #0 {
%ld = load volatile i32, i32* undef
ret void
}
; GCN-LABEL: {{^}}load_from_inttoptr:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
define void @load_from_inttoptr() #0 {
%ld = load volatile i32, i32* inttoptr (i32 123 to i32*)
ret void
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
; These tests check that the compiler won't crash when it needs to spill
; SGPRs.
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=TONGA %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=TONGA %s
; On Tonga and Iceland, limited SGPR availability means care must be taken to
; allocate scratch registers correctly. Check that this test compiles without
; XXX - Why does it like to use vcc?
; GCN-LABEL: {{^}}spill_m0:
+; TOSMEM: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+
; GCN: s_cmp_lg_u32
; TOVGPR: s_mov_b32 vcc_hi, m0
br i1 %cc, label %if, label %else
if:
- store volatile <4 x float> %dtex, <4 x float>* undef
+ store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
unreachable
else: