From c5962266044b4d328a29b6c2ebe7d3587a7b0cd2 Mon Sep 17 00:00:00 2001 From: Yaxun Liu Date: Wed, 22 Nov 2017 16:13:35 +0000 Subject: [PATCH] [AMDGPU] Fix SITargetLowering::LowerCall for pointer info of byval argument SITargetLowering::LowerCall uses dummy pointer info for byval argument, which causes flat load instead of buffer load. This patch fixes that. Differential Revision: https://reviews.llvm.org/D40040 llvm-svn: 318844 --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 +-- llvm/test/CodeGen/AMDGPU/sibling-call.ll | 53 ++++++++++++++++--------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3493de9..2561f7f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2296,8 +2296,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue Cpy = DAG.getMemcpy( Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), /*isVol = */ false, /*AlwaysInline = */ true, - /*isTailCall = */ false, - DstInfo, MachinePointerInfo()); + /*isTailCall = */ false, DstInfo, + MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy( + *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)))); MemOpChains.push_back(Cpy); } else { diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 1fca549..35254130 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -1,6 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=hawaii -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s +target datalayout = "A5" ; GCN-LABEL: {{^}}i32_fastcc_i32_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -20,9 +21,9 @@ define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { ; GCN: s_setpc_b64 ; GCN: ; ScratchSize: 68 define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 { - %alloca = alloca [16 x i32], align 4 - %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5 - store volatile i32 9, i32* %gep + %alloca = alloca [16 x i32], align 4, addrspace(5) + %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5 + store volatile i32 9, i32 addrspace(5)* %gep %add0 = add i32 %arg0, %arg1 ret i32 %add0 } @@ -41,9 +42,9 @@ entry: ; GCN: ; ScratchSize: 68 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 { entry: - %alloca = alloca [16 x i32], align 4 - %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5 - store volatile i32 9, i32* %gep + %alloca = alloca [16 x i32], align 4, addrspace(5) + %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5 + store volatile i32 9, i32 addrspace(5)* %gep %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) ret i32 %ret } @@ -55,9 +56,9 @@ entry: ; GCN: ; ScratchSize: 136 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 { entry: - %alloca = alloca [16 x i32], align 4 - %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5 - store volatile i32 9, i32* %gep + %alloca = alloca [16 x i32], align 4, addrspace(5) + %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5 + store volatile i32 9, i32 addrspace(5)* %gep %ret = tail call fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b) ret i32 %ret } @@ -85,8 +86,8 @@ entry: ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_{{[_coiu]*}}32_e32 v0, vcc, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] -define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32* byval align 4 %arg1) #1 { - %arg1.load = load i32, i32* %arg1, align 4 +define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval align 4 %arg1) #1 { + %arg1.load = load i32, i32 addrspace(5)* %arg1, align 4 %add0 = add i32 %arg0, %arg1.load ret i32 %add0 } @@ -98,9 +99,9 @@ define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32* byval align 4 %arg1) ; GCN: s_swappc_b64 ; GCN-NOT: v_readlane_b32 s32 ; GCN: s_setpc_b64 -define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32* byval %b.byval, i32 %c) #1 { +define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32 addrspace(5)* byval %b.byval, i32 %c) #1 { entry: - %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32* %b.byval) + %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* %b.byval) ret i32 %ret } @@ -114,7 +115,7 @@ entry: ; GCN-NEXT: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 { entry: - %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32* inttoptr (i32 16 to i32*)) + %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* inttoptr (i32 16 to i32 addrspace(5)*)) ret i32 %ret } @@ -171,9 +172,9 @@ entry: ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: - %alloca = alloca [16 x i32], align 4 - %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5 - store volatile i32 9, i32* %gep + %alloca = alloca [16 x i32], align 4, addrspace(5) + %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5 + store volatile i32 9, i32 addrspace(5)* %gep %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) ret i32 %ret } @@ -234,9 +235,9 @@ entry: ; GCN: s_setpc_b64 s[6:7] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: - %alloca = alloca [16 x i32], align 4 - %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5 - store volatile i32 9, i32* %gep + %alloca = alloca [16 x i32], align 4, addrspace(5) + %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5 + store volatile i32 9, i32 addrspace(5)* %gep %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) ret i32 %ret } @@ -247,9 +248,9 @@ entry: ; GCN: s_setpc_b64 s[6:7] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 { entry: - %alloca = alloca [16 x i32], align 4 - %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5 - store volatile i32 9, i32* %gep + %alloca = alloca [16 x i32], align 4, addrspace(5) + %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5 + store volatile i32 9, i32 addrspace(5)* %gep %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer) ret i32 %ret } -- 2.7.4