// ==>
// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
+// The same is done for certain SMEM opcodes, e.g.:
+// s_buffer_load_dword s4, s[0:3], 4
+// s_buffer_load_dword s5, s[0:3], 8
+// ==>
+// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
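+// (The offsets above are byte offsets, as encoded on VI+; SI/CI encode SMRD
+// immediates in dwords, which the pass normalizes before comparing.)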
+//
//
// Future improvements:
//
unsigned Offset0;
unsigned Offset1;
unsigned BaseOff;
+ bool GLC0; // glc bit of the first instruction in the pair
+ bool GLC1; // glc bit of the paired instruction
bool UseST64;
+ bool IsSBufferLoadImm; // merging S_BUFFER_LOAD_*_IMM rather than DS ops
+ bool IsX2; // the inputs are x2 loads, to be merged into an x4 load
SmallVector<MachineInstr*, 8> InstsToMove;
};
private:
+ const SISubtarget *STM = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
MachineRegisterInfo *MRI = nullptr;
AliasAnalysis *AA = nullptr;
+ unsigned CreatedX2; // x2 loads created so far; a second pass merges them into x4
static bool offsetsCanBeCombined(CombineInfo &CI);
- bool findMatchingDSInst(CombineInfo &CI);
-
+ bool findMatchingInst(CombineInfo &CI);
MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
-
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
+ MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
public:
static char ID;
CI.UseST64 = false;
CI.BaseOff = 0;
+ // SMEM offsets must be consecutive, and the glc bits must match.
+ if (CI.IsSBufferLoadImm) {
+ unsigned Diff = CI.IsX2 ? 2 : 1;
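+ // E.g., two x1 loads at elements 4 and 5 can merge (Diff == 1), while two
+ // x2 loads must sit two elements apart, e.g. 4 and 6 (Diff == 2), since
+ // each x2 result already covers two dwords.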
+ return (EltOffset0 + Diff == EltOffset1 ||
+ EltOffset1 + Diff == EltOffset0) &&
+ CI.GLC0 == CI.GLC1;
+ }
+
// If the offset in elements doesn't fit in 8-bits, we might be able to use
// the stride 64 versions.
if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
return false;
}
-bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
+bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = CI.I;
- int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
- AMDGPU::OpName::addr);
+ unsigned AddrOpName;
+ if (CI.IsSBufferLoadImm)
+ AddrOpName = AMDGPU::OpName::sbase;
+ else
+ AddrOpName = AMDGPU::OpName::addr;
+
+ int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName);
const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);
// We only ever merge operations with the same base address register, so don't
AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
AMDGPU::OpName::offset);
- CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff;
- CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+ CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
+ CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
CI.Paired = MBBI;
+ if (CI.IsSBufferLoadImm) {
+ CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
+ CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
+ } else {
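+ // The DS offset operand is a 16-bit field, hence the mask; SMEM
+ // immediate offsets are compared at full width.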
+ CI.Offset0 &= 0xffff;
+ CI.Offset1 &= 0xffff;
+ }
+
// Check both offsets fit in the reduced range.
// We also need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
return Next;
}
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+ unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
+ AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+
+ const TargetRegisterClass *SuperRC =
+ CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
+ unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+
+ BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.GLC0) // glc
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+ unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
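+ // With x1 inputs the 64-bit result is split as sub0/sub1; with x2 inputs
+ // the 128-bit result is split as sub0_sub1/sub2_sub3.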
+
+ // Handle descending offsets: the input with the lower offset must receive
+ // the low subregister(s) of the merged load.
+ if (CI.Offset0 > CI.Offset1)
+ std::swap(SubRegIdx0, SubRegIdx1);
+
+ // Copy to the old destination registers.
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
+ const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
+
+ BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ moveInstsAfter(Copy1, CI.InstsToMove);
+
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
+ return Next;
+}
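+
+// A minimal before/after sketch of the rewrite above, in MIR-like pseudocode
+// (virtual register names are invented for illustration; the two inputs load
+// adjacent dwords, and the merged load takes the smaller offset):
+//   %a = S_BUFFER_LOAD_DWORD_IMM %sbase, 4, 0
+//   %b = S_BUFFER_LOAD_DWORD_IMM %sbase, 5, 0
+// ==>
+//   %m = S_BUFFER_LOAD_DWORDX2_IMM %sbase, 4, 0
+//   %a = COPY %m.sub0
+//   %b = COPY %m.sub1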
+
-// Scan through looking for adjacent LDS operations with constant offsets from
-// the same base register. We rely on the scheduler to do the hard work of
-// clustering nearby loads, and assume these are all adjacent.
+// Scan through looking for adjacent LDS or SMEM operations with constant
+// offsets from the same base register. We rely on the scheduler to do the
+// hard work of clustering nearby loads, and assume these are all adjacent.
CombineInfo CI;
CI.I = I;
+ CI.IsSBufferLoadImm = false;
unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
- if (findMatchingDSInst(CI)) {
+ if (findMatchingInst(CI)) {
Modified = true;
I = mergeRead2Pair(CI);
} else {
}
continue;
- } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
+ }
+ if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
- if (findMatchingDSInst(CI)) {
+ if (findMatchingInst(CI)) {
Modified = true;
I = mergeWrite2Pair(CI);
} else {
continue;
}
+ if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
+ (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
+ Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
+ // EltSize is in units of the offset encoding.
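+ // getSMRDEncodedOffset(*STM, 4) is 1 on SI/CI, where SMRD immediates are
+ // encoded in dwords, and 4 on VI+, where they are encoded in bytes.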
+ CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
+ CI.IsSBufferLoadImm = true;
+ CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ if (findMatchingInst(CI)) {
+ Modified = true;
+ I = mergeSBufferLoadImmPair(CI);
+ if (!CI.IsX2)
+ CreatedX2++;
+ } else {
+ ++I;
+ }
+ continue;
+ }
++I;
}
if (skipFunction(*MF.getFunction()))
return false;
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- if (!STM.loadStoreOptEnabled())
+ STM = &MF.getSubtarget<SISubtarget>();
+ if (!STM->loadStoreOptEnabled())
return false;
- TII = STM.getInstrInfo();
+ TII = STM->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
bool Modified = false;
+ CreatedX2 = 0;
for (MachineBasicBlock &MBB : MF)
Modified |= optimizeBlock(MBB);
+ // The pass above may have merged adjacent x1 loads into new x2 loads; run
+ // again so pairs of those x2 loads can be combined into x4 loads.
+ if (CreatedX2 >= 1) {
+ for (MachineBasicBlock &MBB : MF)
+ Modified |= optimizeBlock(MBB);
+ }
+
return Modified;
}
-; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICI -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
; SMRD load with an immediate offset.
; GCN-LABEL: {{^}}smrd0:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
-; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
+; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
%tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
; SMRD load with the largest possible immediate offset.
; GCN-LABEL: {{^}}smrd1:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
-; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
+; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
%tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
-; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
; GCN: s_endpgm
define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
-; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
+; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
%tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
; SMRD load with an offset greater than the largest possible immediate on VI
; GCN-LABEL: {{^}}smrd5:
-; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
-; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; SIVIGFX9: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
+; SIVIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
; SMRD load using the load.const.v4i32 intrinsic with an immediate offset
; GCN-LABEL: {{^}}smrd_load_const0:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
-; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
; offset.
; GCN-LABEL: {{^}}smrd_load_const1:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
-; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
-; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
-; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
; SMRD load with an offset greater than the largest possible immediate on VI
; GCN-LABEL: {{^}}smrd_load_const4:
-; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
-; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; SIVIGFX9: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
+; SIVIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
ret float %r
}
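+
+; Six scalar buffer loads of adjacent dwords: the first four merge into a
+; single x4 load (the first pass forms two x2 loads, the rerun combines
+; them) and the last two into an x2 load. GFX9 is expected not to merge,
+; per the subtarget predicate gating the optimization.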
+; GCN-LABEL: {{^}}smrd_imm_merged:
+; GCN-NEXT: BB#
+; SICI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1
+; SICI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x7
+; VI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x4
+; VI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+define amdgpu_ps void @smrd_imm_merged(<4 x i32> inreg %desc) #0 {
+main_body:
+ %r1 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4)
+ %r2 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 8)
+ %r3 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 12)
+ %r4 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 16)
+ %r5 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 28)
+ %r6 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 32)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
+ ret void
+}
+
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1