From bae3636f9692a56cbb2d75271766a4d7eca4e8c3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 1 Jul 2019 18:50:50 +0000 Subject: [PATCH] AMDGPU/GlobalISel: Handle more input argument intrinsics llvm-svn: 364836 --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 101 ++++++++++++--------- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 12 +++ .../AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll | 2 +- .../AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll | 18 ++-- .../AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll | 19 ++++ .../AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll | 18 ++++ .../GlobalISel/llvm.amdgcn.implicit.buffer.ptr.ll | 17 ++++ .../AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll | 18 ++++ .../AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll | 2 +- 9 files changed, 155 insertions(+), 52 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.buffer.ptr.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 528a773..adcc904 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -198,6 +198,58 @@ static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, } } +// Allocate special inputs passed in user SGPRs. +static void allocateHSAUserSGPRs(CCState &CCInfo, + MachineIRBuilder &MIRBuilder, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? + if (Info.hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info.hasDispatchPtr()) { + unsigned DispatchPtrReg = Info.addDispatchPtr(TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info.hasQueuePtr()) { + unsigned QueuePtrReg = Info.addQueuePtr(TRI); + MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(QueuePtrReg); + } + + if (Info.hasKernargSegmentPtr()) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register InputPtrReg = Info.addKernargSegmentPtr(TRI); + const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + Register VReg = MRI.createGenericVirtualRegister(P4); + MRI.addLiveIn(InputPtrReg, VReg); + MIRBuilder.getMBB().addLiveIn(InputPtrReg); + MIRBuilder.buildCopy(VReg, InputPtrReg); + CCInfo.AllocateReg(InputPtrReg); + } + + if (Info.hasDispatchID()) { + unsigned DispatchIDReg = Info.addDispatchID(TRI); + MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(DispatchIDReg); + } + + if (Info.hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI); + MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(FlatScratchInitReg); + } + + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. +} + static void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, @@ -272,51 +324,12 @@ bool AMDGPUCallLowering::lowerFormalArguments( SmallVector ArgLocs; CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); - // FIXME: How should these inputs interact with inreg / custom SGPR inputs? - if (Info->hasPrivateSegmentBuffer()) { - Register PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); - MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); - CCInfo.AllocateReg(PrivateSegmentBufferReg); - } - - if (Info->hasDispatchPtr()) { - Register DispatchPtrReg = Info->addDispatchPtr(*TRI); - // FIXME: Need to add reg as live-in - CCInfo.AllocateReg(DispatchPtrReg); - } - - if (Info->hasQueuePtr()) { - Register QueuePtrReg = Info->addQueuePtr(*TRI); - // FIXME: Need to add reg as live-in - CCInfo.AllocateReg(QueuePtrReg); - } - - if (Info->hasKernargSegmentPtr()) { - Register InputPtrReg = Info->addKernargSegmentPtr(*TRI); - const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); - Register VReg = MRI.createGenericVirtualRegister(P2); - MRI.addLiveIn(InputPtrReg, VReg); - MIRBuilder.getMBB().addLiveIn(InputPtrReg); - MIRBuilder.buildCopy(VReg, InputPtrReg); - CCInfo.AllocateReg(InputPtrReg); - } - - if (Info->hasDispatchID()) { - unsigned DispatchIDReg = Info->addDispatchID(*TRI); - // FIXME: Need to add reg as live-in - CCInfo.AllocateReg(DispatchIDReg); - } - - if (Info->hasFlatScratchInit()) { - unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); - // FIXME: Need to add reg as live-in - CCInfo.AllocateReg(FlatScratchInitReg); - } - // The infrastructure for normal calling convention lowering is essentially // useless for kernels. We want to avoid any kind of legalization or argument // splitting. if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) { + allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info); + unsigned i = 0; const unsigned KernArgBaseAlign = 16; const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F); @@ -352,6 +365,12 @@ bool AMDGPUCallLowering::lowerFormalArguments( return true; } + if (Info->hasImplicitBufferPtr()) { + unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); + MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(ImplicitBufferPtrReg); + } + unsigned NumArgs = F.arg_size(); Function::const_arg_iterator CurOrigArg = F.arg_begin(); const AMDGPUTargetLowering &TLI = *getTLI(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 653be65..cada5e5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1237,6 +1237,18 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, case Intrinsic::amdgcn_workgroup_id_z: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_dispatch_ptr: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::DISPATCH_PTR); + case Intrinsic::amdgcn_queue_ptr: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::QUEUE_PTR); + case Intrinsic::amdgcn_implicit_buffer_ptr: + return legalizePreloadedArgIntrin( + MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); + case Intrinsic::amdgcn_dispatch_id: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::DISPATCH_ID); default: return true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll index 2ead7bd..0d11aa9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -stop-after=irtranslator -global-isel %s -o - | FileCheck %s ; Check that we correctly skip over disabled inputs -; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0 +; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2 ; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]](s32), [[S0]](s32), [[S0]](s32), [[V0]](s32) define amdgpu_ps void @ps0(float inreg %arg0, float %psinput0, float %psinput1) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll index 8d87ce1..557ce16 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: name: test_f32_inreg -; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0 +; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]] define amdgpu_vs void @test_f32_inreg(float inreg %arg0) { call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0 @@ -18,7 +18,7 @@ define amdgpu_vs void @test_f32(float %arg0) { } ; CHECK-LABEL: name: test_ptr2_byval -; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 +; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; CHECK: G_LOAD [[S01]] define amdgpu_vs void @test_ptr2_byval(i32 addrspace(4)* byval %arg0) { %tmp0 = load volatile i32, i32 addrspace(4)* %arg0 @@ -26,7 +26,7 @@ define amdgpu_vs void @test_ptr2_byval(i32 addrspace(4)* byval %arg0) { } ; CHECK-LABEL: name: test_ptr2_inreg -; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 +; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; CHECK: G_LOAD [[S01]] define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) { %tmp0 = load volatile i32, i32 addrspace(4)* %arg0 @@ -34,8 +34,8 @@ define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) { } ; CHECK-LABEL: name: test_sgpr_alignment0 -; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0 -; CHECK: [[S23:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 +; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2 +; CHECK: [[S23:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; CHECK: G_LOAD [[S23]] ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]] define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)* inreg %arg1) { @@ -45,8 +45,8 @@ define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)* } ; CHECK-LABEL: name: test_order -; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0 -; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr1 +; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2 +; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[V1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[V0]](s32), [[S0]](s32), [[V1]](s32), [[S1]](s32) @@ -56,8 +56,8 @@ define amdgpu_vs void @test_order(float inreg %arg0, float inreg %arg1, float %a } ; CHECK-LABEL: name: ret_struct -; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0 -; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr1 +; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2 +; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; CHECK: $sgpr0 = COPY [[S0]] ; CHECK: $sgpr1 = COPY [[S1]] ; CHECK: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll new file mode 100644 index 0000000..f0c2067 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll @@ -0,0 +1,19 @@ +; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare i64 @llvm.amdgcn.dispatch.id() #1 + +; GCN-LABEL: {{^}}dispatch_id: +; GCN: .amd_kernel_code_t +; GCN: enable_sgpr_dispatch_id = 1 + +; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6 +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7 +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 { + %tmp0 = call i64 @llvm.amdgcn.dispatch.id() + store i64 %tmp0, i64 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll new file mode 100644 index 0000000..e2e0dcd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll @@ -0,0 +1,18 @@ +; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; FIXME: Error on non-HSA target + +; GCN-LABEL: {{^}}test: +; GCN: enable_sgpr_dispatch_ptr = 1 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %value = load i32, i32 addrspace(4)* %header_ptr + store i32 %value, i32 addrspace(1)* %out + ret void +} + +declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + +attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.buffer.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.buffer.ptr.ll new file mode 100644 index 0000000..2028b8a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.buffer.ptr.ll @@ -0,0 +1,17 @@ +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; FIXME: Dropped parts from original test + +; GCN-LABEL: {{^}}test_ps: +; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0 +define amdgpu_ps i32 @test_ps() #1 { + %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() + %buffer_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)* + %value = load volatile i32, i32 addrspace(4)* %buffer_ptr + ret i32 %value +} + +declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0 + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll new file mode 100644 index 0000000..155a060 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll @@ -0,0 +1,18 @@ +; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; FIXME: Error on non-hsa target + +; GCN-LABEL: {{^}}test: +; GCN: enable_sgpr_queue_ptr = 1 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %value = load i32, i32 addrspace(4)* %header_ptr + store i32 %value, i32 addrspace(1)* %out + ret void +} + +declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll index 3a69ef6..e5e1a44 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; FIXME: Requires stack object to not assert ; GCN-LABEL: {{^}}test_ps: -- 2.7.4