From: Matt Arsenault
Date: Tue, 2 May 2017 18:33:18 +0000 (+0000)
Subject: AMDGPU: Don't promote alloca to LDS for leaf functions
X-Git-Tag: llvmorg-5.0.0-rc1~6143
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5c80618fb78e5571e859e42b7ff2f062f5fc6f81;p=platform%2Fupstream%2Fllvm.git

AMDGPU: Don't promote alloca to LDS for leaf functions

LDS use in leaf functions is not currently handled.

llvm-svn: 301958
---

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 4fb262c..36dcc69 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -677,12 +677,19 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
   }
 
   const Function &ContainingFunction = *I.getParent()->getParent();
+  CallingConv::ID CC = ContainingFunction.getCallingConv();
 
   // Don't promote the alloca to LDS for shader calling conventions as the work
   // item ID intrinsics are not supported for these calling conventions.
   // Furthermore not all LDS is available for some of the stages.
-  if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
+  switch (CC) {
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    break;
+  default:
+    DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
     return;
+  }
 
   const AMDGPUSubtarget &ST =
     TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll
new file mode 100644
index 0000000..a95e9f8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll
@@ -0,0 +1,74 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=ASM %s
+
+; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
+; IR: alloca [5 x i32]
+; ASM-LABEL: {{^}}promote_alloca_shaders:
+; ASM: ; LDSByteSize: 0 bytes/workgroup (compile time only)
+
+define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %tmp2 = load i32, i32* %arrayidx4, align 4
+  store i32 %tmp2, i32 addrspace(1)* %out, align 4
+  %arrayidx5 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %tmp3 = load i32, i32* %arrayidx5
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %tmp3, i32 addrspace(1)* %arrayidx6
+  ret void
+}
+
+; IR-LABEL: @promote_to_vector_call_c(
+; IR-NOT: alloca
+; IR: extractelement <2 x i32> %{{[0-9]+}}, i32 %in
+; ASM-NOT: LDSByteSize
+define void @promote_to_vector_call_c(i32 addrspace(1)* %out, i32 %in) #0 {
+entry:
+  %tmp = alloca [2 x i32]
+  %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+  %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+  store i32 0, i32* %tmp1
+  store i32 1, i32* %tmp2
+  %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
+  %tmp4 = load i32, i32* %tmp3
+  %tmp5 = load volatile i32, i32 addrspace(1)* undef
+  %tmp6 = add i32 %tmp4, %tmp5
+  store i32 %tmp6, i32 addrspace(1)* %out
+  ret void
+}
+
+; IR-LABEL: @no_promote_to_lds_c(
+; IR: alloca
+; ASM-NOT: LDSByteSize
+define void @no_promote_to_lds_c(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-shaders.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-shaders.ll
deleted file mode 100644
index d40fca9..0000000
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-shaders.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=ASM %s
-
-; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
-; IR: alloca [5 x i32]
-; ASM-LABEL: {{^}}promote_alloca_shaders:
-; ASM: ; LDSByteSize: 0 bytes/workgroup (compile time only)
-
-define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
-entry:
-  %stack = alloca [5 x i32], align 4
-  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
-  store i32 4, i32* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
-  store i32 5, i32* %arrayidx3, align 4
-  %arrayidx4 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
-  %tmp2 = load i32, i32* %arrayidx4, align 4
-  store i32 %tmp2, i32 addrspace(1)* %out, align 4
-  %arrayidx5 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
-  %tmp3 = load i32, i32* %arrayidx5
-  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %tmp3, i32 addrspace(1)* %arrayidx6
-  ret void
-}
-
-attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
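
For context, the gate this patch adds can be read in isolation as a small predicate over the function's calling convention. The sketch below is not part of the commit; the helper name canPromoteAllocaToLDSForCC is hypothetical, though Function::getCallingConv() and the CallingConv values are the same LLVM APIs used in the diff above. Only the two kernel entry-point conventions pass the check; graphics shaders and ordinary callable (leaf) functions skip LDS promotion, although their allocas may still be promoted to a vector, as the new test shows.

// Minimal sketch, not from the commit: the calling-convention gate factored
// into a standalone predicate. canPromoteAllocaToLDSForCC is a hypothetical
// helper name used only for illustration.
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"

static bool canPromoteAllocaToLDSForCC(const llvm::Function &F) {
  switch (F.getCallingConv()) {
  case llvm::CallingConv::AMDGPU_KERNEL: // compute kernel entry point
  case llvm::CallingConv::SPIR_KERNEL:   // SPIR/OpenCL kernel entry point
    return true;  // work-item ID intrinsics and the LDS budget are known here
  default:
    return false; // shaders and plain (leaf) functions: leave the alloca alone
  }
}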