From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 18 Jul 2016 18:34:59 +0000 (+0000)
Subject: AMDGPU/R600: Replace barrier intrinsics
X-Git-Tag: llvmorg-4.0.0-rc1~15017
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=4c519d35188bc004eb351c0c6f4b90aa3dbeade4;p=platform%2Fupstream%2Fllvm.git

AMDGPU/R600: Replace barrier intrinsics

llvm-svn: 275870
---

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index d19b648..9bf2a4d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -43,6 +43,8 @@ defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;
 
 def int_r600_read_workdim : AMDGPUReadPreloadRegisterIntrinsic;
 
+def int_r600_group_barrier : GCCBuiltin<"__builtin_r600_group_barrier">,
+  Intrinsic<[], [], [IntrConvergent]>;
 
 // AS 7 is PARAM_I_ADDRESS, used for kernel arguments
 def int_r600_implicitarg_ptr :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
index a011a85..2127391 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -30,10 +30,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
     [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
   >;
 
-  // Deprecated in favor of llvm.amdgcn.s.barrier
-  def int_AMDGPU_barrier_local  : Intrinsic<[], [], [IntrConvergent]>;
-  def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>;
-
   // Deprecated in favor of llvm.amdgcn.read.workdim
   def int_AMDGPU_read_workdim : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
 }
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 1092e61..94f05cc 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -394,7 +394,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
 def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
 
 def GROUP_BARRIER : InstR600 <
-    (outs), (ins), "  GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>,
+    (outs), (ins), "  GROUP_BARRIER", [(int_r600_group_barrier)], AnyALU>,
     R600ALU_Word0,
     R600ALU_Word1_OP2 <0x54> {
 
@@ -423,11 +423,6 @@ def GROUP_BARRIER : InstR600 <
   let ALUInst = 1;
 }
 
-def : Pat <
-	(int_AMDGPU_barrier_global),
-	(GROUP_BARRIER)
->;
-
 //===----------------------------------------------------------------------===//
 // LDS Instructions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 858505b..6427db8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2453,17 +2453,6 @@ def : Pat <
   (S_WAITCNT (as_i16imm $simm16))
 >;
 
-// FIXME: These should be removed eventually
-def : Pat <
-  (int_AMDGPU_barrier_global),
-  (S_BARRIER)
->;
-
-def : Pat <
-  (int_AMDGPU_barrier_local),
-  (S_BARRIER)
->;
-
 //===----------------------------------------------------------------------===//
 // VOP1 Patterns
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
index 44ffc36..abe472e4 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
 ;
 ; This test checks that the lds input queue will is empty at the end of
 ; the ALU clause.
@@ -14,7 +14,7 @@ define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32
 entry:
   %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
   %1 = load i32, i32 addrspace(3)* %0
-  call void @llvm.AMDGPU.barrier.local()
+  call void @llvm.r600.group.barrier()
 
   ; This will start a new clause for the vertex fetch
   %2 = load i32, i32 addrspace(1)* %in
@@ -23,7 +23,7 @@ entry:
   ret void
 }
 
-declare void @llvm.AMDGPU.barrier.local()
+declare void @llvm.r600.group.barrier() nounwind convergent
 
 ; The machine scheduler does not do proper alias analysis and assumes that
 ; loads from global values (Note that a global value is different that a
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll
deleted file mode 100644
index db88397..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}test_barrier_global:
-; EG: GROUP_BARRIER
-; SI: buffer_store_dword
-; SI: s_waitcnt
-; SI: s_barrier
-
-define void @test_barrier_global(i32 addrspace(1)* %out) {
-entry:
-  %0 = call i32 @llvm.r600.read.tidig.x()
-  %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
-  store i32 %0, i32 addrspace(1)* %1
-  call void @llvm.AMDGPU.barrier.global()
-  %2 = call i32 @llvm.r600.read.local.size.x()
-  %3 = sub i32 %2, 1
-  %4 = sub i32 %3, %0
-  %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
-  %6 = load i32, i32 addrspace(1)* %5
-  store i32 %6, i32 addrspace(1)* %1
-  ret void
-}
-
-declare void @llvm.AMDGPU.barrier.global()
-
-declare i32 @llvm.r600.read.tidig.x() #0
-declare i32 @llvm.r600.read.local.size.x() #0
-
-attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll
deleted file mode 100644
index 48fb2e0..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}test_barrier_local:
-; EG: GROUP_BARRIER
-
-; SI: buffer_store_dword
-; SI: s_waitcnt
-; SI: s_barrier
-
-define void @test_barrier_local(i32 addrspace(1)* %out) {
-entry:
-  %0 = call i32 @llvm.r600.read.tidig.x()
-  %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
-  store i32 %0, i32 addrspace(1)* %1
-  call void @llvm.AMDGPU.barrier.local()
-  %2 = call i32 @llvm.r600.read.local.size.x()
-  %3 = sub i32 %2, 1
-  %4 = sub i32 %3, %0
-  %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
-  %6 = load i32, i32 addrspace(1)* %5
-  store i32 %6, i32 addrspace(1)* %1
-  ret void
-}
-
-declare void @llvm.AMDGPU.barrier.local()
-
-declare i32 @llvm.r600.read.tidig.x() #0
-declare i32 @llvm.r600.read.local.size.x() #0
-
-attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
new file mode 100644
index 0000000..e4e6dd8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG  %s
+
+; EG-LABEL: {{^}}test_group_barrier:
+; EG: GROUP_BARRIER
+define void @test_group_barrier(i32 addrspace(1)* %out) #0 {
+entry:
+  %tmp = call i32 @llvm.r600.read.tidig.x()
+  %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
+  store i32 %tmp, i32 addrspace(1)* %tmp1
+  call void @llvm.r600.group.barrier()
+  %tmp2 = call i32 @llvm.r600.read.local.size.x()
+  %tmp3 = sub i32 %tmp2, 1
+  %tmp4 = sub i32 %tmp3, %tmp
+  %tmp5 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp4
+  %tmp6 = load i32, i32 addrspace(1)* %tmp5
+  store i32 %tmp6, i32 addrspace(1)* %tmp1
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare void @llvm.r600.group.barrier() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.local.size.x() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
new file mode 100644
index 0000000..f6c0e3c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+
+@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
+
+; Check that the LDS size emitted correctly
+; SI: .long 47180
+; SI-NEXT: .long 65668
+; CI: .long 47180
+; CI-NEXT: .long 32900
+
+; GCN-LABEL: {{^}}local_memory:
+
+; GCN-NOT: s_wqm_b64
+; GCN: ds_write_b32
+
+; GCN: s_barrier
+
+; GCN: ds_read_b32 {{v[0-9]+}},
+define void @local_memory(i32 addrspace(1)* %out) #0 {
+entry:
+  %y.i = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+  store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
+  %add = add nsw i32 %y.i, 1
+  %cmp = icmp eq i32 %add, 16
+  %.add = select i1 %cmp, i32 0, i32 %add
+  call void @llvm.amdgcn.s.barrier()
+  %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+  %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
+  store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+
+; Check that the LDS size emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 8
+; GCN: .long 47180
+; GCN-NEXT: .long 32900
+
+; GCN-LABEL: {{^}}local_memory_two_objects:
+; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
+; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
+; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
+
+; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
+
+; SI-DAG: ds_write_b32 [[ADDRW]],
+; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
+
+; GCN: s_barrier
+
+; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
+; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
+
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
+
+; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
+; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
+define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
+entry:
+  %x.i = call i32 @llvm.amdgcn.workitem.id.x()
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+  store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
+  %mul = shl nsw i32 %x.i, 1
+  %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+  store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
+  %sub = sub nsw i32 3, %x.i
+  call void @llvm.amdgcn.s.barrier()
+  %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+  %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
+  store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+  %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
+  %add = add nsw i32 %x.i, 4
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
+  store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare void @llvm.amdgcn.s.barrier() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.ll b/llvm/test/CodeGen/AMDGPU/local-memory.ll
index 8d48f59..1a11332 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.ll
@@ -1,57 +1,20 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
 
-
-; Check that the LDS size emitted correctly
-; EG: .long 166120
-; EG-NEXT: .long 128
-; SI: .long 47180
-; SI-NEXT: .long 65668
-; CI: .long 47180
-; CI-NEXT: .long 32900
-
-; FUNC-LABEL: {{^}}local_memory:
-
-; EG: LDS_WRITE
-; SI-NOT: s_wqm_b64
-; SI: ds_write_b32
-
-; GROUP_BARRIER must be the last instruction in a clause
-; EG: GROUP_BARRIER
-; EG-NEXT: ALU clause
-; SI: s_barrier
-
-; EG: LDS_READ_RET
-; SI: ds_read_b32 {{v[0-9]+}},
-
-define void @local_memory(i32 addrspace(1)* %out) {
-entry:
-  %y.i = call i32 @llvm.r600.read.tidig.x() #0
-  %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
-  store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
-  %add = add nsw i32 %y.i, 1
-  %cmp = icmp eq i32 %add, 16
-  %.add = select i1 %cmp, i32 0, i32 %add
-  call void @llvm.AMDGPU.barrier.local()
-  %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
-  %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
-  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
-  ret void
-}
-
 @lds = addrspace(3) global [512 x i32] undef, align 4
 
-; On SI we need to make sure that the base offset is a register and not
-; an immediate.
+; On SI we need to make sure that the base offset is a register and
+; not an immediate.
+
 ; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
 ; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
+
 ; R600: LDS_READ_RET
-define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
 entry:
   %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
   %tmp1 = load i32, i32 addrspace(3)* %tmp0
@@ -67,7 +30,7 @@ entry:
 ; R600: LDS_READ_RET
 ; GCN-DAG: ds_read_b32
 ; GCN-DAG: ds_read2_b32
-define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) {
+define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
   %scalar = load i32, i32 addrspace(3)* %in
   %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
   %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
@@ -78,7 +41,4 @@ define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)
   ret void
 }
 
-declare i32 @llvm.r600.read.tidig.x() #0
-declare void @llvm.AMDGPU.barrier.local()
-
-attributes #0 = { readnone }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/llvm/test/CodeGen/AMDGPU/local-memory.r600.ll
similarity index 52%
rename from llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll
rename to llvm/test/CodeGen/AMDGPU/local-memory.r600.ll
index cec334f..9841b88 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.r600.ll
@@ -1,18 +1,45 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
+@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
+
+; Check that the LDS size emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 128
+
+; FUNC-LABEL: {{^}}local_memory:
+
+; EG: LDS_WRITE
+
+; GROUP_BARRIER must be the last instruction in a clause
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+
+; EG: LDS_READ_RET
+define void @local_memory(i32 addrspace(1)* %out) #0 {
+entry:
+  %y.i = call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+  store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
+  %add = add nsw i32 %y.i, 1
+  %cmp = icmp eq i32 %add, 16
+  %.add = select i1 %cmp, i32 0, i32 %add
+  call void @llvm.r600.group.barrier()
+  %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+  %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
+  store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
 @local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 @local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 
-
 ; Check that the LDS size emitted correctly
 ; EG: .long 166120
 ; EG-NEXT: .long 8
 ; GCN: .long 47180
 ; GCN-NEXT: .long 32900
 
-
 ; FUNC-LABEL: {{^}}local_memory_two_objects:
 
 ; We would like to check the lds writes are using different
@@ -30,51 +57,31 @@
 ; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
 ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
 
-
-; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
-; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
-; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
-
-
-; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
-
-; SI-DAG: ds_write_b32 [[ADDRW]],
-; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
-
-; GCN: s_barrier
-
-; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
-; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
-
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
-
-; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
-; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
-
-define void @local_memory_two_objects(i32 addrspace(1)* %out) {
+define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
 entry:
-  %x.i = call i32 @llvm.r600.read.tidig.x() #0
+  %x.i = call i32 @llvm.r600.read.tidig.x() #1
   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
   store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
   %mul = shl nsw i32 %x.i, 1
   %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
   store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
   %sub = sub nsw i32 3, %x.i
-  call void @llvm.AMDGPU.barrier.local()
+  call void @llvm.r600.group.barrier()
   %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
-  %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
+  %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
-  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
+  store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
   %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
-  %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
+  %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
   %add = add nsw i32 %x.i, 4
   %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
-  store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
+  store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
   ret void
 }
 
-declare i32 @llvm.r600.read.tidig.x() #0
-declare void @llvm.AMDGPU.barrier.local()
+declare i32 @llvm.r600.read.tidig.x() #1
+declare void @llvm.r600.group.barrier() #2
 
-attributes #0 = { readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
index 6e00f76..9b490bb 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
@@ -1,12 +1,9 @@
-; XFAIL: *
-; REQUIRES: asserts
-; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
+; RUN: llc -O0 -march=amdgcn -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
 
-declare void @llvm.AMDGPU.barrier.local() nounwind convergent
+declare void @llvm.amdgcn.s.barrier() nounwind convergent
 
-
-; SI-LABEL: {{^}}main(
+; GCN-LABEL: {{^}}main:
 define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
@@ -39,63 +36,63 @@ ENDIF:                                            ; preds = %main_body, %Flow2
   %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ]
   %15 = extractelement <4 x float> %reg1, i32 1
   %16 = extractelement <4 x float> %reg1, i32 3
-  %17 = load <4 x float>, <4 x float> addrspace(9)* null
+  %17 = load <4 x float>, <4 x float> addrspace(2)* null
   %18 = extractelement <4 x float> %17, i32 0
   %19 = fmul float %18, %0
-  %20 = load <4 x float>, <4 x float> addrspace(9)* null
+  %20 = load <4 x float>, <4 x float> addrspace(2)* null
   %21 = extractelement <4 x float> %20, i32 1
   %22 = fmul float %21, %0
-  %23 = load <4 x float>, <4 x float> addrspace(9)* null
+  %23 = load <4 x float>, <4 x float> addrspace(2)* null
   %24 = extractelement <4 x float> %23, i32 2
   %25 = fmul float %24, %0
-  %26 = load <4 x float>, <4 x float> addrspace(9)* null
+  %26 = load <4 x float>, <4 x float> addrspace(2)* null
   %27 = extractelement <4 x float> %26, i32 3
   %28 = fmul float %27, %0
-  %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %29 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
   %30 = extractelement <4 x float> %29, i32 0
   %31 = fmul float %30, %15
   %32 = fadd float %31, %19
-  %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %33 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
   %34 = extractelement <4 x float> %33, i32 1
   %35 = fmul float %34, %15
   %36 = fadd float %35, %22
-  %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %37 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
   %38 = extractelement <4 x float> %37, i32 2
   %39 = fmul float %38, %15
   %40 = fadd float %39, %25
-  %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %41 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
   %42 = extractelement <4 x float> %41, i32 3
   %43 = fmul float %42, %15
   %44 = fadd float %43, %28
-  %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %45 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
   %46 = extractelement <4 x float> %45, i32 0
   %47 = fmul float %46, %1
   %48 = fadd float %47, %32
-  %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %49 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
   %50 = extractelement <4 x float> %49, i32 1
   %51 = fmul float %50, %1
   %52 = fadd float %51, %36
-  %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %53 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
   %54 = extractelement <4 x float> %53, i32 2
   %55 = fmul float %54, %1
   %56 = fadd float %55, %40
-  %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %57 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
   %58 = extractelement <4 x float> %57, i32 3
   %59 = fmul float %58, %1
   %60 = fadd float %59, %44
-  %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %61 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
   %62 = extractelement <4 x float> %61, i32 0
   %63 = fmul float %62, %16
   %64 = fadd float %63, %48
-  %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %65 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
   %66 = extractelement <4 x float> %65, i32 1
   %67 = fmul float %66, %16
   %68 = fadd float %67, %52
-  %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %69 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
   %70 = extractelement <4 x float> %69, i32 2
   %71 = fmul float %70, %16
   %72 = fadd float %71, %56
-  %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %73 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
   %74 = extractelement <4 x float> %73, i32 3
   %75 = fmul float %74, %16
   %76 = fadd float %75, %60
@@ -103,12 +100,12 @@ ENDIF:                                            ; preds = %main_body, %Flow2
   %78 = insertelement <4 x float> %77, float %68, i32 1
   %79 = insertelement <4 x float> %78, float %72, i32 2
   %80 = insertelement <4 x float> %79, float %76, i32 3
-  call void @llvm.AMDGPU.barrier.local()
+  call void @llvm.amdgcn.s.barrier()
   %81 = insertelement <4 x float> undef, float %temp.0, i32 0
   %82 = insertelement <4 x float> %81, float %temp1.0, i32 1
   %83 = insertelement <4 x float> %82, float %temp2.0, i32 2
   %84 = insertelement <4 x float> %83, float %temp3.0, i32 3
-  call void @llvm.AMDGPU.barrier.local()
+  call void @llvm.amdgcn.s.barrier()
   ret void
 
 LOOP:                                             ; preds = %main_body, %Flow