From 4c519d35188bc004eb351c0c6f4b90aa3dbeade4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 18 Jul 2016 18:34:59 +0000 Subject: [PATCH] AMDGPU/R600: Replace barrier intrinsics llvm-svn: 275870 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 + llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td | 4 - llvm/lib/Target/AMDGPU/EvergreenInstructions.td | 7 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 11 --- llvm/test/CodeGen/AMDGPU/lds-output-queue.ll | 6 +- .../CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll | 30 ------- .../CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll | 31 -------- .../test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll | 31 ++++++++ llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll | 92 ++++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/local-memory.ll | 58 +++----------- ...-memory-two-objects.ll => local-memory.r600.ll} | 79 ++++++++++--------- .../AMDGPU/schedule-vs-if-nested-loop-failure.ll | 47 ++++++----- 12 files changed, 203 insertions(+), 195 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll create mode 100644 llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll rename llvm/test/CodeGen/AMDGPU/{local-memory-two-objects.ll => local-memory.r600.ll} (52%) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index d19b648..9bf2a4d 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -43,6 +43,8 @@ defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz; def int_r600_read_workdim : AMDGPUReadPreloadRegisterIntrinsic; +def int_r600_group_barrier : GCCBuiltin<"__builtin_r600_group_barrier">, + Intrinsic<[], [], [IntrConvergent]>; // AS 7 is PARAM_I_ADDRESS, used for kernel arguments def int_r600_implicitarg_ptr : diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td index a011a85..2127391 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -30,10 +30,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] >; - // Deprecated in favor of llvm.amdgcn.s.barrier - def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>; - // Deprecated in favor of llvm.amdgcn.read.workdim def int_AMDGPU_read_workdim : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; } diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 1092e61..94f05cc 100644 --- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -394,7 +394,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; def GROUP_BARRIER : InstR600 < - (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, + (outs), (ins), " GROUP_BARRIER", [(int_r600_group_barrier)], AnyALU>, R600ALU_Word0, R600ALU_Word1_OP2 <0x54> { @@ -423,11 +423,6 @@ def GROUP_BARRIER : InstR600 < let ALUInst = 1; } -def : Pat < - (int_AMDGPU_barrier_global), - (GROUP_BARRIER) ->; - //===----------------------------------------------------------------------===// // LDS Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 858505b..6427db8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2453,17 +2453,6 @@ def : Pat < (S_WAITCNT (as_i16imm $simm16)) >; -// FIXME: These should be removed eventually -def : Pat < - (int_AMDGPU_barrier_global), - (S_BARRIER) ->; - -def : Pat < - (int_AMDGPU_barrier_local), - (S_BARRIER) ->; - //===----------------------------------------------------------------------===// // VOP1 Patterns //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll index 44ffc36..abe472e4 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s ; ; This test checks that the lds input queue will is empty at the end of ; the ALU clause. @@ -14,7 +14,7 @@ define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 entry: %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index %1 = load i32, i32 addrspace(3)* %0 - call void @llvm.AMDGPU.barrier.local() + call void @llvm.r600.group.barrier() ; This will start a new clause for the vertex fetch %2 = load i32, i32 addrspace(1)* %in @@ -23,7 +23,7 @@ entry: ret void } -declare void @llvm.AMDGPU.barrier.local() +declare void @llvm.r600.group.barrier() nounwind convergent ; The machine scheduler does not do proper alias analysis and assumes that ; loads from global values (Note that a global value is different that a diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll deleted file mode 100644 index db88397..0000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_barrier_global: -; EG: GROUP_BARRIER -; SI: buffer_store_dword -; SI: s_waitcnt -; SI: s_barrier - -define void @test_barrier_global(i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 - store i32 %0, i32 addrspace(1)* %1 - call void @llvm.AMDGPU.barrier.global() - %2 = call i32 @llvm.r600.read.local.size.x() - %3 = sub i32 %2, 1 - %4 = sub i32 %3, %0 - %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 - %6 = load i32, i32 addrspace(1)* %5 - store i32 %6, i32 addrspace(1)* %1 - ret void -} - -declare void @llvm.AMDGPU.barrier.global() - -declare i32 @llvm.r600.read.tidig.x() #0 -declare i32 @llvm.r600.read.local.size.x() #0 - -attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll deleted file mode 100644 index 48fb2e0..0000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_barrier_local: -; EG: GROUP_BARRIER - -; SI: buffer_store_dword -; SI: s_waitcnt -; SI: s_barrier - -define void @test_barrier_local(i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 - store i32 %0, i32 addrspace(1)* %1 - call void @llvm.AMDGPU.barrier.local() - %2 = call i32 @llvm.r600.read.local.size.x() - %3 = sub i32 %2, 1 - %4 = sub i32 %3, %0 - %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 - %6 = load i32, i32 addrspace(1)* %5 - store i32 %6, i32 addrspace(1)* %1 - ret void -} - -declare void @llvm.AMDGPU.barrier.local() - -declare i32 @llvm.r600.read.tidig.x() #0 -declare i32 @llvm.r600.read.local.size.x() #0 - -attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll new file mode 100644 index 0000000..e4e6dd8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s + +; EG-LABEL: {{^}}test_group_barrier: +; EG: GROUP_BARRIER +define void @test_group_barrier(i32 addrspace(1)* %out) #0 { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() + %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp + store i32 %tmp, i32 addrspace(1)* %tmp1 + call void @llvm.r600.group.barrier() + %tmp2 = call i32 @llvm.r600.read.local.size.x() + %tmp3 = sub i32 %tmp2, 1 + %tmp4 = sub i32 %tmp3, %tmp + %tmp5 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp4 + %tmp6 = load i32, i32 addrspace(1)* %tmp5 + store i32 %tmp6, i32 addrspace(1)* %tmp1 + ret void +} + +; Function Attrs: convergent nounwind +declare void @llvm.r600.group.barrier() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.x() #2 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll new file mode 100644 index 0000000..f6c0e3c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s + +@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4 + +; Check that the LDS size emitted correctly +; SI: .long 47180 +; SI-NEXT: .long 65668 +; CI: .long 47180 +; CI-NEXT: .long 32900 + +; GCN-LABEL: {{^}}local_memory: + +; GCN-NOT: s_wqm_b64 +; GCN: ds_write_b32 + +; GCN: s_barrier + +; GCN: ds_read_b32 {{v[0-9]+}}, +define void @local_memory(i32 addrspace(1)* %out) #0 { +entry: + %y.i = call i32 @llvm.amdgcn.workitem.id.x() #1 + %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i + store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4 + %add = add nsw i32 %y.i, 1 + %cmp = icmp eq i32 %add, 16 + %.add = select i1 %cmp, i32 0, i32 %add + call void @llvm.amdgcn.s.barrier() + %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add + %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i + store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4 +@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4 + +; Check that the LDS size emitted correctly +; EG: .long 166120 +; EG-NEXT: .long 8 +; GCN: .long 47180 +; GCN-NEXT: .long 32900 + +; GCN-LABEL: {{^}}local_memory_two_objects: +; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0 +; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16 +; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}} + +; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]] + +; SI-DAG: ds_write_b32 [[ADDRW]], +; SI-DAG: ds_write_b32 [[ADDRW_OFF]], + +; GCN: s_barrier + +; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]] +; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]] + +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]] + +; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]] +; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7 +define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { +entry: + %x.i = call i32 @llvm.amdgcn.workitem.id.x() + %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i + store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4 + %mul = shl nsw i32 %x.i, 1 + %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i + store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4 + %sub = sub nsw i32 3, %x.i + call void @llvm.amdgcn.s.barrier() + %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub + %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i + store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4 + %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub + %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4 + %add = add nsw i32 %x.i, 4 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add + store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare void @llvm.amdgcn.s.barrier() #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { convergent nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.ll b/llvm/test/CodeGen/AMDGPU/local-memory.ll index 8d48f59..1a11332 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.ll @@ -1,57 +1,20 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4 - -; Check that the LDS size emitted correctly -; EG: .long 166120 -; EG-NEXT: .long 128 -; SI: .long 47180 -; SI-NEXT: .long 65668 -; CI: .long 47180 -; CI-NEXT: .long 32900 - -; FUNC-LABEL: {{^}}local_memory: - -; EG: LDS_WRITE -; SI-NOT: s_wqm_b64 -; SI: ds_write_b32 - -; GROUP_BARRIER must be the last instruction in a clause -; EG: GROUP_BARRIER -; EG-NEXT: ALU clause -; SI: s_barrier - -; EG: LDS_READ_RET -; SI: ds_read_b32 {{v[0-9]+}}, - -define void @local_memory(i32 addrspace(1)* %out) { -entry: - %y.i = call i32 @llvm.r600.read.tidig.x() #0 - %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i - store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4 - %add = add nsw i32 %y.i, 1 - %cmp = icmp eq i32 %add, 16 - %.add = select i1 %cmp, i32 0, i32 %add - call void @llvm.AMDGPU.barrier.local() - %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add - %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i - store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 - ret void -} - @lds = addrspace(3) global [512 x i32] undef, align 4 -; On SI we need to make sure that the base offset is a register and not -; an immediate. +; On SI we need to make sure that the base offset is a register and +; not an immediate. + ; FUNC-LABEL: {{^}}load_i32_local_const_ptr: ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0 ; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4 + ; R600: LDS_READ_RET -define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 { entry: %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1 %tmp1 = load i32, i32 addrspace(3)* %tmp0 @@ -67,7 +30,7 @@ entry: ; R600: LDS_READ_RET ; GCN-DAG: ds_read_b32 ; GCN-DAG: ds_read2_b32 -define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) { +define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 { %scalar = load i32, i32 addrspace(3)* %in %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)* %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2 @@ -78,7 +41,4 @@ define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3) ret void } -declare i32 @llvm.r600.read.tidig.x() #0 -declare void @llvm.AMDGPU.barrier.local() - -attributes #0 = { readnone } +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/llvm/test/CodeGen/AMDGPU/local-memory.r600.ll similarity index 52% rename from llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll rename to llvm/test/CodeGen/AMDGPU/local-memory.r600.ll index cec334f..9841b88 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.r600.ll @@ -1,18 +1,45 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4 + +; Check that the LDS size emitted correctly +; EG: .long 166120 +; EG-NEXT: .long 128 + +; FUNC-LABEL: {{^}}local_memory: + +; EG: LDS_WRITE + +; GROUP_BARRIER must be the last instruction in a clause +; EG: GROUP_BARRIER +; EG-NEXT: ALU clause + +; EG: LDS_READ_RET +define void @local_memory(i32 addrspace(1)* %out) #0 { +entry: + %y.i = call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i + store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4 + %add = add nsw i32 %y.i, 1 + %cmp = icmp eq i32 %add, 16 + %.add = select i1 %cmp, i32 0, i32 %add + call void @llvm.r600.group.barrier() + %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add + %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i + store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + @local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4 @local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4 - ; Check that the LDS size emitted correctly ; EG: .long 166120 ; EG-NEXT: .long 8 ; GCN: .long 47180 ; GCN-NEXT: .long 32900 - ; FUNC-LABEL: {{^}}local_memory_two_objects: ; We would like to check the lds writes are using different @@ -30,51 +57,31 @@ ; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]] ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] - -; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0 -; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16 -; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}} - - -; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]] - -; SI-DAG: ds_write_b32 [[ADDRW]], -; SI-DAG: ds_write_b32 [[ADDRW_OFF]], - -; GCN: s_barrier - -; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]] -; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]] - -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]] - -; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]] -; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7 - -define void @local_memory_two_objects(i32 addrspace(1)* %out) { +define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { entry: - %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %x.i = call i32 @llvm.r600.read.tidig.x() #1 %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4 %mul = shl nsw i32 %x.i, 1 %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4 %sub = sub nsw i32 3, %x.i - call void @llvm.AMDGPU.barrier.local() + call void @llvm.r600.group.barrier() %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub - %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4 + %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4 %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i - store i32 %0, i32 addrspace(1)* %arrayidx3, align 4 + store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4 %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub - %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4 + %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4 %add = add nsw i32 %x.i, 4 %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add - store i32 %1, i32 addrspace(1)* %arrayidx5, align 4 + store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4 ret void } -declare i32 @llvm.r600.read.tidig.x() #0 -declare void @llvm.AMDGPU.barrier.local() +declare i32 @llvm.r600.read.tidig.x() #1 +declare void @llvm.r600.group.barrier() #2 -attributes #0 = { readnone } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { convergent nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll index 6e00f76..9b490bb 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll @@ -1,12 +1,9 @@ -; XFAIL: * -; REQUIRES: asserts -; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI +; RUN: llc -O0 -march=amdgcn -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s -declare void @llvm.AMDGPU.barrier.local() nounwind convergent +declare void @llvm.amdgcn.s.barrier() nounwind convergent - -; SI-LABEL: {{^}}main( +; GCN-LABEL: {{^}}main: define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) { main_body: %0 = extractelement <4 x float> %reg1, i32 0 @@ -39,63 +36,63 @@ ENDIF: ; preds = %main_body, %Flow2 %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ] %15 = extractelement <4 x float> %reg1, i32 1 %16 = extractelement <4 x float> %reg1, i32 3 - %17 = load <4 x float>, <4 x float> addrspace(9)* null + %17 = load <4 x float>, <4 x float> addrspace(2)* null %18 = extractelement <4 x float> %17, i32 0 %19 = fmul float %18, %0 - %20 = load <4 x float>, <4 x float> addrspace(9)* null + %20 = load <4 x float>, <4 x float> addrspace(2)* null %21 = extractelement <4 x float> %20, i32 1 %22 = fmul float %21, %0 - %23 = load <4 x float>, <4 x float> addrspace(9)* null + %23 = load <4 x float>, <4 x float> addrspace(2)* null %24 = extractelement <4 x float> %23, i32 2 %25 = fmul float %24, %0 - %26 = load <4 x float>, <4 x float> addrspace(9)* null + %26 = load <4 x float>, <4 x float> addrspace(2)* null %27 = extractelement <4 x float> %26, i32 3 %28 = fmul float %27, %0 - %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %29 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1) %30 = extractelement <4 x float> %29, i32 0 %31 = fmul float %30, %15 %32 = fadd float %31, %19 - %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %33 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1) %34 = extractelement <4 x float> %33, i32 1 %35 = fmul float %34, %15 %36 = fadd float %35, %22 - %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %37 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1) %38 = extractelement <4 x float> %37, i32 2 %39 = fmul float %38, %15 %40 = fadd float %39, %25 - %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %41 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1) %42 = extractelement <4 x float> %41, i32 3 %43 = fmul float %42, %15 %44 = fadd float %43, %28 - %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %45 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2) %46 = extractelement <4 x float> %45, i32 0 %47 = fmul float %46, %1 %48 = fadd float %47, %32 - %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %49 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2) %50 = extractelement <4 x float> %49, i32 1 %51 = fmul float %50, %1 %52 = fadd float %51, %36 - %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %53 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2) %54 = extractelement <4 x float> %53, i32 2 %55 = fmul float %54, %1 %56 = fadd float %55, %40 - %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %57 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2) %58 = extractelement <4 x float> %57, i32 3 %59 = fmul float %58, %1 %60 = fadd float %59, %44 - %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %61 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3) %62 = extractelement <4 x float> %61, i32 0 %63 = fmul float %62, %16 %64 = fadd float %63, %48 - %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %65 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3) %66 = extractelement <4 x float> %65, i32 1 %67 = fmul float %66, %16 %68 = fadd float %67, %52 - %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %69 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3) %70 = extractelement <4 x float> %69, i32 2 %71 = fmul float %70, %16 %72 = fadd float %71, %56 - %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %73 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3) %74 = extractelement <4 x float> %73, i32 3 %75 = fmul float %74, %16 %76 = fadd float %75, %60 @@ -103,12 +100,12 @@ ENDIF: ; preds = %main_body, %Flow2 %78 = insertelement <4 x float> %77, float %68, i32 1 %79 = insertelement <4 x float> %78, float %72, i32 2 %80 = insertelement <4 x float> %79, float %76, i32 3 - call void @llvm.AMDGPU.barrier.local() + call void @llvm.amdgcn.s.barrier() %81 = insertelement <4 x float> undef, float %temp.0, i32 0 %82 = insertelement <4 x float> %81, float %temp1.0, i32 1 %83 = insertelement <4 x float> %82, float %temp2.0, i32 2 %84 = insertelement <4 x float> %83, float %temp3.0, i32 3 - call void @llvm.AMDGPU.barrier.local() + call void @llvm.amdgcn.s.barrier() ret void LOOP: ; preds = %main_body, %Flow -- 2.7.4