From e2c86cce3a2d1ad88c1f2b37f4b419c887a8d5b9 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 1 Jul 2019 18:45:36 +0000 Subject: [PATCH] AMDGPU/GlobalISel: Legalize workitem ID intrinsics Tests don't cover the masked input path since non-kernel arguments aren't lowered yet. Test is copied directly from the existing test, with 2 additions. llvm-svn: 364833 --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 33 ++++++++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 84 ++++++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 10 +++ .../irtranslator-amdgpu_kernel-system-sgprs.ll | 3 +- .../GlobalISel/irtranslator-fast-math-flags.ll | 2 +- .../AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll | 92 ++++++++++++++++++++++ 6 files changed, 222 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 04a2632..8cbe34a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -166,6 +166,38 @@ static Register findFirstFreeSGPR(CCState &CCInfo) { llvm_unreachable("Cannot allocate sgpr"); } +static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + const LLT S32 = LLT::scalar(32); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + if (Info.hasWorkItemIDX()) { + Register Reg = AMDGPU::VGPR0; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); + } + + if (Info.hasWorkItemIDY()) { + Register Reg = AMDGPU::VGPR1; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); + } + + if (Info.hasWorkItemIDZ()) { + Register Reg = AMDGPU::VGPR2; + 
MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); + } +} + static void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, @@ -288,6 +320,7 @@ bool AMDGPUCallLowering::lowerFormalArguments( ++i; } + allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index d9bdfbb..831e933 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -23,6 +23,8 @@ #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" +#define DEBUG_TYPE "amdgpu-legalinfo" + using namespace llvm; using namespace LegalizeActions; using namespace LegalizeMutations; @@ -1059,6 +1061,79 @@ static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr; } +Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, + Register Reg, LLT Ty) const { + Register LiveIn = MRI.getLiveInVirtReg(Reg); + if (LiveIn) + return LiveIn; + + Register NewReg = MRI.createGenericVirtualRegister(Ty); + MRI.addLiveIn(Reg, NewReg); + return NewReg; +} + +bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, + const ArgDescriptor *Arg) const { + if (!Arg->isRegister()) + return false; // TODO: Handle these + + assert(Arg->getRegister() != 0); + assert(Arg->getRegister().isPhysical()); + + MachineRegisterInfo &MRI = *B.getMRI(); + + LLT Ty = MRI.getType(DstReg); + Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); + + if (Arg->isMasked()) { + // TODO: Should we try to emit this once in the entry block? 
+ const LLT S32 = LLT::scalar(32); + const unsigned Mask = Arg->getMask(); + const unsigned Shift = countTrailingZeros(Mask); + + auto ShiftAmt = B.buildConstant(S32, Shift); + auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt); + B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift)); + } else + B.buildCopy(DstReg, LiveIn); + + // Insert the argument copy if it doesn't already exist. + // FIXME: It seems EmitLiveInCopies isn't called anywhere? + if (!MRI.getVRegDef(LiveIn)) { + MachineBasicBlock &EntryMBB = B.getMF().front(); + EntryMBB.addLiveIn(Arg->getRegister()); + B.setInsertPt(EntryMBB, EntryMBB.begin()); + B.buildCopy(LiveIn, Arg->getRegister()); + } + + return true; +} + +bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( + MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { + B.setInstr(MI); + + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + + const ArgDescriptor *Arg; + const TargetRegisterClass *RC; + std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); + if (!Arg) { + LLVM_DEBUG(dbgs() << "Required arg register missing\n"); + return false; + } + + if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { + MI.eraseFromParent(); + return true; + } + + return false; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -1104,6 +1179,15 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, return false; } + case Intrinsic::amdgcn_workitem_id_x: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::WORKITEM_ID_X); + case Intrinsic::amdgcn_workitem_id_y: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::WORKITEM_ID_Y); + case Intrinsic::amdgcn_workitem_id_z: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::WORKITEM_ID_Z); default: return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index b5e12f4..7c566d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "AMDGPUArgumentUsageInfo.h" namespace llvm { @@ -47,6 +48,15 @@ public: bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, bool Signed) const; + Register getLiveInRegister(MachineRegisterInfo &MRI, + Register Reg, LLT Ty) const; + + bool loadInputValue(Register DstReg, MachineIRBuilder &B, + const ArgDescriptor *Arg) const; + bool legalizePreloadedArgIntrin( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const override; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll index 00b1264..4446d1c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll @@ -3,7 +3,8 @@ ; HSA-LABEL: name: default_kernel ; HSA: liveins: ; HSA-NEXT: - { reg: '$sgpr0_sgpr1_sgpr2_sgpr3', virtual-reg: '%0' } -; HSA-NEXT: - { reg: '$sgpr4', virtual-reg: '%1' } +; HSA-NEXT: - { reg: '$vgpr0', virtual-reg: '%1' } +; HSA-NEXT: - { reg: '$sgpr4', virtual-reg: '%2' } ; HSA-NEXT: frameInfo: define amdgpu_kernel void @default_kernel() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fast-math-flags.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fast-math-flags.ll index dc660bf..a3ffff2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fast-math-flags.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fast-math-flags.ll @@ -20,7 +20,7 @@ define amdgpu_kernel void @fma_fast(float %arg0, float %arg1, float %arg2) { ; Check flags are preserved for an arbitrarry target intrinsic ; CHECK-LABEL: name: rcp_nsz -; CHECK: = nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %8(s32) +; CHECK: = nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %{{[0-9]+}}(s32) define amdgpu_kernel void @rcp_nsz(float %arg0) { %res = call nsz float @llvm.amdgcn.rcp.f32 (float %arg0) store float %res, float addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll new file mode 100644 index 0000000..efd6cc7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll @@ -0,0 +1,92 @@ +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA %s +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=SI-MESA %s +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 
@llvm.amdgcn.workitem.id.y() #0 +declare i32 @llvm.amdgcn.workitem.id.z() #0 + +; MESA: .section .AMDGPU.config +; MESA: .long 47180 +; MESA-NEXT: .long 132{{$}} + +; ALL-LABEL {{^}}test_workitem_id_x: +; CO-V2: enable_vgpr_workitem_id = 0 + +; ALL-NOT: v0 +; ALL: {{buffer|flat}}_store_dword {{.*}}v0 +define amdgpu_kernel void @test_workitem_id_x(i32 addrspace(1)* %out) #1 { + %id = call i32 @llvm.amdgcn.workitem.id.x() + store i32 %id, i32 addrspace(1)* %out + ret void +} + +; MESA: .section .AMDGPU.config +; MESA: .long 47180 +; MESA-NEXT: .long 2180{{$}} + +; ALL-LABEL {{^}}test_workitem_id_y: +; CO-V2: enable_vgpr_workitem_id = 1 + +; ALL-NOT: v1 +; ALL: {{buffer|flat}}_store_dword {{.*}}v1 +define amdgpu_kernel void @test_workitem_id_y(i32 addrspace(1)* %out) #1 { + %id = call i32 @llvm.amdgcn.workitem.id.y() + store i32 %id, i32 addrspace(1)* %out + ret void +} + +; MESA: .section .AMDGPU.config +; MESA: .long 47180 +; MESA-NEXT: .long 4228{{$}} + +; ALL-LABEL {{^}}test_workitem_id_z: +; CO-V2: enable_vgpr_workitem_id = 2 + +; ALL-NOT: v2 +; ALL: {{buffer|flat}}_store_dword {{.*}}v2 +define amdgpu_kernel void @test_workitem_id_z(i32 addrspace(1)* %out) #1 { + %id = call i32 @llvm.amdgcn.workitem.id.z() + store i32 %id, i32 addrspace(1)* %out + ret void +} + +; ALL-LABEL: {{^}}test_workitem_id_x_usex2: +; ALL-NOT: v0 +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; ALL-NOT: v0 +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +define amdgpu_kernel void @test_workitem_id_x_usex2(i32 addrspace(1)* %out) #1 { + %id0 = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %id0, i32 addrspace(1)* %out + + %id1 = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %id1, i32 addrspace(1)* %out + ret void +} + +; ALL-LABEL: {{^}}test_workitem_id_x_use_outside_entry: +; ALL-NOT: v0 +; ALL: flat_store_dword +; ALL-NOT: v0 +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +define amdgpu_kernel void @test_workitem_id_x_use_outside_entry(i32 
addrspace(1)* %out, i32 %arg) #1 { +bb0: + store volatile i32 0, i32 addrspace(1)* %out + %cond = icmp eq i32 %arg, 0 + br i1 %cond, label %bb1, label %bb2 + +bb1: + %id = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %id, i32 addrspace(1)* %out + br label %bb2 + +bb2: + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } -- 2.7.4