From 4b3847e865b0b7ae020e80dfab2ea7ac3a2f9626 Mon Sep 17 00:00:00 2001 From: Konstantin Zhuravlyov Date: Thu, 6 Apr 2017 23:02:33 +0000 Subject: [PATCH] AMDGPU/GFX9: Fix shared and private aperture queries Differential Revision: https://reviews.llvm.org/D31786 llvm-svn: 299727 --- llvm/lib/Target/AMDGPU/SIDefines.h | 11 ++++++++-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 34 +++++++++++++++++++++---------- llvm/lib/Target/AMDGPU/SIISelLowering.h | 4 +++- llvm/test/CodeGen/AMDGPU/addrspacecast.ll | 18 +++++++++++++--- 4 files changed, 50 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 4829d44..3dd372b 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -248,6 +248,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_LDS_ALLOC = 6, ID_IB_STS = 7, ID_SYMBOLIC_LAST_ = 8, + ID_MEM_BASES = 15, ID_SHIFT_ = 0, ID_WIDTH_ = 6, ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) @@ -257,14 +258,20 @@ enum Offset { // Offset, (5) [10:6] OFFSET_DEFAULT_ = 0, OFFSET_SHIFT_ = 6, OFFSET_WIDTH_ = 5, - OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_) + OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_), + + OFFSET_SRC_SHARED_BASE = 16, + OFFSET_SRC_PRIVATE_BASE = 0 }; enum WidthMinusOne { // WidthMinusOne, (5) [15:11] WIDTH_M1_DEFAULT_ = 31, WIDTH_M1_SHIFT_ = 11, WIDTH_M1_WIDTH_ = 5, - WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_) + WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_), + + WIDTH_M1_SRC_SHARED_BASE = 15, + WIDTH_M1_SRC_PRIVATE_BASE = 15 }; } // namespace Hwreg diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e8b0626..4aa2c8a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2340,16 +2340,28 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);; } -SDValue SITargetLowering::getSegmentAperture(unsigned AS, +SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const { - - if (Subtarget->hasApertureRegs()) { // Read from Aperture Registers directly. - unsigned RegNo = (AS == AMDGPUASI.LOCAL_ADDRESS) ? AMDGPU::SRC_SHARED_BASE : - AMDGPU::SRC_PRIVATE_BASE; - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, RegNo, MVT::i32); + // FIXME: Use inline constants (src_{shared, private}_base) instead. + if (Subtarget->hasApertureRegs()) { + unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ? + AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : + AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; + unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ? + AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : + AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; + unsigned Encoding = + AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | + Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | + WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; + + SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16); + SDValue ApertureReg = SDValue( + DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0); + SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32); + return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount); } - SDLoc SL; MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo(); unsigned UserSGPR = Info->getQueuePtrUserSGPR(); @@ -2362,8 +2374,8 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, // private_segment_aperture_base_hi. uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44; - SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr, - DAG.getConstant(StructOffset, SL, MVT::i64)); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr, + DAG.getConstant(StructOffset, DL, MVT::i64)); // TODO: Use custom target PseudoSourceValue. // TODO: We should use the value from the IR intrinsic call, but it might not @@ -2372,7 +2384,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, AMDGPUASI.CONSTANT_ADDRESS)); MachinePointerInfo PtrInfo(V, StructOffset); - return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo, + return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, MinAlign(64, StructOffset), MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); @@ -2417,7 +2429,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); - SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG); + SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG); SDValue CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 550e8a2..ded0242 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -58,7 +58,9 @@ class SITargetLowering final : public AMDGPUTargetLowering { /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16. SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const; + SDValue getSegmentAperture(unsigned AS, const SDLoc &DL, + SelectionDAG &DAG) const; + SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 52b7029..6ec93c7 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -12,7 +12,11 @@ ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} -; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base +; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16) +; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 +; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]] + +; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base ; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] @@ -45,7 +49,11 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} -; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base +; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(15, 0, 16) +; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16 +; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]] + +; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base ; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] @@ -153,7 +161,11 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* ; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast: ; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10 ; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] -; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base +; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16) +; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 +; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]] + +; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} -- 2.7.4