From 9e8e8c60fa1060f6f5fd0ffd367c06b14fee52a3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 1 Jul 2019 18:49:01 +0000 Subject: [PATCH] AMDGPU/GlobalISel: Lower kernarg segment ptr intrinsics llvm-svn: 364835 --- .../Target/AMDGPU/AMDGPUInstructionSelector.cpp | 20 ---- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 48 +++++++- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 4 + .../inst-select-amdgcn.kernarg.segment.ptr.mir | 19 ---- .../GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll | 125 +++++++++++++++++++++ 5 files changed, 173 insertions(+), 43 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.kernarg.segment.ptr.mir create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index bd4c73e..868a742 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -383,26 +383,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I, case Intrinsic::minnum: case Intrinsic::amdgcn_cvt_pkrtz: return selectImpl(I, CoverageInfo); - - case Intrinsic::amdgcn_kernarg_segment_ptr: { - MachineFunction *MF = I.getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const ArgDescriptor *InputPtrReg; - const TargetRegisterClass *RC; - const DebugLoc &DL = I.getDebugLoc(); - - std::tie(InputPtrReg, RC) - = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); - if (!InputPtrReg) - report_fatal_error("missing kernarg segment ptr"); - - BuildMI(*I.getParent(), &I, DL, TII.get(AMDGPU::COPY)) - .add(I.getOperand(0)) - .addReg(MRI.getLiveInVirtReg(InputPtrReg->getRegister())); - I.eraseFromParent(); - return true; - } } return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index f42e00a..653be65 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -82,8 +82,9 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { }; } -AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, - const GCNTargetMachine &TM) { +AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, + const GCNTargetMachine &TM) + : ST(ST_) { using namespace TargetOpcode; auto GetAddrSpacePtr = [&TM](unsigned AS) { @@ -460,7 +461,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, [](const LegalityQuery &Query) { return std::make_pair(0, LLT::scalar(32)); }) - .fewerElementsIf([=, &ST](const LegalityQuery &Query) { + .fewerElementsIf([=](const LegalityQuery &Query) { unsigned MemSize = Query.MMODescrs[0].SizeInBits; return (MemSize == 96) && Query.Types[0].isVector() && @@ -469,7 +470,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, [=](const LegalityQuery &Query) { return std::make_pair(0, V2S32); }) - .legalIf([=, &ST](const LegalityQuery &Query) { + .legalIf([=](const LegalityQuery &Query) { const LLT &Ty0 = Query.Types[0]; unsigned Size = Ty0.getSizeInBits(); @@ -1134,6 +1135,40 @@ bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( return false; } +bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + if (!MFI->isEntryFunction()) { + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); + } + + B.setInstr(MI); + + uint64_t Offset = + ST.getTargetLowering()->getImplicitParameterOffset( + B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); + + const ArgDescriptor *Arg; + const TargetRegisterClass
*RC; + std::tie(Arg, RC) + = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); + if (!Arg) + return false; + + Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); + if (!loadInputValue(KernargPtrReg, B, Arg)) + return false; + + B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -1179,6 +1214,11 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, return false; } + case Intrinsic::amdgcn_kernarg_segment_ptr: + return legalizePreloadedArgIntrin( + MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); + case Intrinsic::amdgcn_implicitarg_ptr: + return legalizeImplicitArgPtr(MI, MRI, B); case Intrinsic::amdgcn_workitem_id_x: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKITEM_ID_X); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 7c566d2..b013108 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -25,6 +25,8 @@ class GCNSubtarget; /// This class provides the information for the target register banks. 
class AMDGPULegalizerInfo : public LegalizerInfo { + const GCNSubtarget &ST; + public: AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM); @@ -57,6 +59,8 @@ public: MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const override; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.kernarg.segment.ptr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.kernarg.segment.ptr.mir deleted file mode 100644 index 7ae6077..0000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.kernarg.segment.ptr.mir +++ /dev/null @@ -1,19 +0,0 @@ -# XFAIL: * -# RUN: llc -march=amdgcn -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN - -# FIXME: This requires additional context for what input registers are special inputs not present in MIR. - ---- - -name: kernarg_segment_Ptr -legalized: true -regBankSelected: true - -body: | - bb.0: - %0:vgpr(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) - %1:sgpr(s32) = G_LOAD %0 :: (load 4) - %2:vgpr(p1) = G_IMPLICIT_DEF - G_STORE %1, %2 :: (store 4) -... 
---- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll new file mode 100644 index 0000000..ca28f6e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll @@ -0,0 +1,125 @@ +; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,OS-MESA3D,MESA,ALL %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-unknown -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,MESA,ALL %s + +; ALL-LABEL: {{^}}test: +; CO-V2: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: kernarg_segment_byte_size = 8 +; HSA: kernarg_segment_alignment = 4 + +; CO-V2: s_load_dword s{{[0-9]+}}, s[4:5], 0xa + +; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa +define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 { + %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + %header.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)* + %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10 + %value = load i32, i32 addrspace(4)* %gep + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; ALL-LABEL: {{^}}test_implicit: +; HSA: kernarg_segment_byte_size = 8 +; OS-MESA3D: kernarg_segment_byte_size = 24 +; CO-V2: kernarg_segment_alignment = 4 + +; 10 + 9 (36 prepended implicit bytes) + 2 (out pointer) = 21 = 0x15 dwords from the kernarg base; on amdgcn-mesa-unknown this is checked below as a 44-byte (9 + 2 dword) add followed by a load at offset 0xa. + +; OS-UNKNOWN: s_add_u32 s[[LO:[0-9]+]], s0, 44 +; OS-UNKNOWN-NEXT: s_addc_u32 s[[HI:[0-9]+]], s1, 0 +; OS-UNKNOWN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO]]:[[HI]]{{\]}}, 0xa +define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 { + %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +
%header.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10 + %value = load i32, i32 addrspace(4)* %gep + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; ALL-LABEL: {{^}}test_implicit_alignment: +; HSA: kernarg_segment_byte_size = 12 +; OS-MESA3D: kernarg_segment_byte_size = 28 +; CO-V2: kernarg_segment_alignment = 4 + + +; OS-UNKNOWN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xc +; HSA: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x4 +; OS-MESA3D: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x3 +; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]] +; ALL: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]] +define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 { + %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %val = load i32, i32 addrspace(4)* %arg.ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; ALL-LABEL: {{^}}opencl_test_implicit_alignment +; HSA: kernarg_segment_byte_size = 64 +; OS-MESA3D: kernarg_segment_byte_size = 28 +; CO-V2: kernarg_segment_alignment = 4 + + +; OS-UNKNOWN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xc +; HSA: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x4 +; OS-MESA3D: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x3 +; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]] +; ALL: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]] +define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #2 { + %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %val = load i32, i32 addrspace(4)* %arg.ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; ALL-LABEL: {{^}}test_no_kernargs: +; CO-V2: 
enable_sgpr_kernarg_segment_ptr = 1 +; HSA: kernarg_segment_byte_size = 0 +; OS-MESA3D: kernarg_segment_byte_size = 16 +; CO-V2: kernarg_segment_alignment = 4 + +; HSA: s_load_dword s{{[0-9]+}}, s[4:5] +define amdgpu_kernel void @test_no_kernargs() #1 { + %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + %header.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)* + %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10 + %value = load i32, i32 addrspace(4)* %gep + store volatile i32 %value, i32 addrspace(1)* undef + ret void +} + +; ALL-LABEL: {{^}}opencl_test_implicit_alignment_no_explicit_kernargs: +; HSA: kernarg_segment_byte_size = 48 +; OS-MESA3D: kernarg_segment_byte_size = 16 +; CO-V2: kernarg_segment_alignment = 4 +define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs() #2 { + %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %val = load volatile i32, i32 addrspace(4)* %arg.ptr + store volatile i32 %val, i32 addrspace(1)* null + ret void +} + +; ALL-LABEL: {{^}}opencl_test_implicit_alignment_no_explicit_kernargs_round_up: +; HSA: kernarg_segment_byte_size = 40 +; OS-MESA3D: kernarg_segment_byte_size = 16 +; CO-V2: kernarg_segment_alignment = 4 +define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs_round_up() #3 { + %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %val = load volatile i32, i32 addrspace(4)* %arg.ptr + store volatile i32 %val, i32 addrspace(1)* null + ret void +} + +declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } +attributes #2 = { nounwind
"amdgpu-implicitarg-num-bytes"="48" } +attributes #3 = { nounwind "amdgpu-implicitarg-num-bytes"="38" } -- 2.7.4