From fd63e46941fc48d4cc777ef94e185637898d0adb Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 16 Jun 2020 14:52:14 -0400 Subject: [PATCH] AMDGPU/GlobalISel: Apply load bitcast to s.buffer.load intrinsic Should also apply this to the non-scalar buffer loads. --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 68 ++++++++++------- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 4 +- .../legalize-llvm.amdgcn.s.buffer.load.mir | 85 ++++++++++++++++++++-- 3 files changed, 121 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index cc97e11..b408700 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -122,20 +122,23 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { }; } +static LLT getBitcastRegisterType(const LLT Ty) { + const unsigned Size = Ty.getSizeInBits(); + + LLT CoercedTy; + if (Size <= 32) { + // <2 x s8> -> s16 + // <4 x s8> -> s32 + return LLT::scalar(Size); + } + + return LLT::scalarOrVector(Size / 32, 32); +} + static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; - unsigned Size = Ty.getSizeInBits(); - - LLT CoercedTy; - if (Size <= 32) { - // <2 x s8> -> s16 - // <4 x s8> -> s32 - CoercedTy = LLT::scalar(Size); - } else - CoercedTy = LLT::scalarOrVector(Size / 32, 32); - - return std::make_pair(TypeIdx, CoercedTy); + return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); }; } @@ -335,6 +338,20 @@ static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, !loadStoreBitcastWorkaround(Ty); } +/// Return true if a load or store of the type should be lowered with a bitcast +/// to a different type. +static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, + const unsigned MemSizeInBits) { + const unsigned Size = Ty.getSizeInBits(); + if (Size != MemSizeInBits) + return Size <= 32 && Ty.isVector(); + + if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) + return true; + return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && + !isRegisterVectorElementType(Ty.getElementType()); +} + AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const GCNTargetMachine &TM) : ST(ST_) { @@ -1048,16 +1065,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // 16-bit vector parts. Actions.bitcastIf( [=](const LegalityQuery &Query) -> bool { - const LLT Ty = Query.Types[0]; - const unsigned Size = Ty.getSizeInBits(); - - if (Size != Query.MMODescrs[0].SizeInBits) - return Size <= 32 && Ty.isVector(); - - if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) - return true; - return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && - !isRegisterVectorElementType(Ty.getElementType()); + return shouldBitcastLoadStoreType(ST, Query.Types[0], + Query.MMODescrs[0].SizeInBits); }, bitcastToRegisterType(0)); Actions @@ -4137,8 +4146,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( } bool AMDGPULegalizerInfo::legalizeSBufferLoad( - MachineInstr &MI, MachineIRBuilder &B, - GISelChangeObserver &Observer) const { + LegalizerHelper &Helper, MachineInstr &MI) const { + MachineIRBuilder &B = Helper.MIRBuilder; + GISelChangeObserver &Observer = Helper.Observer; + Register Dst = MI.getOperand(0).getReg(); LLT Ty = B.getMRI()->getType(Dst); unsigned Size = Ty.getSizeInBits(); @@ -4146,6 +4157,13 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( Observer.changingInstr(MI); + if (shouldBitcastLoadStoreType(ST, Ty, Size)) { + Ty = getBitcastRegisterType(Ty); + Helper.bitcastDst(MI, Ty, 0); + Dst = MI.getOperand(0).getReg(); + B.setInsertPt(B.getMBB(), MI); + } + // FIXME: We don't really need this intermediate instruction. The intrinsic // should be fixed to have a memory operand. Since it's readnone, we're not // allowed to add one. @@ -4167,8 +4185,6 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( // always be legal. We may need to restore this to a 96-bit result if it turns // out this needs to be converted to a vector load during RegBankSelect. if (!isPowerOf2_32(Size)) { - LegalizerHelper Helper(MF, *this, Observer, B); - if (Ty.isVector()) Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); else @@ -4360,7 +4376,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return true; } case Intrinsic::amdgcn_s_buffer_load: - return legalizeSBufferLoad(MI, B, Helper.Observer); + return legalizeSBufferLoad(Helper, MI); case Intrinsic::amdgcn_raw_buffer_store: case Intrinsic::amdgcn_struct_buffer_store: return legalizeBufferStore(MI, MRI, B, false, false); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index fe4e17d..332d675 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -167,9 +167,7 @@ public: GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const; - bool legalizeSBufferLoad( - MachineInstr &MI, MachineIRBuilder &B, - GISelChangeObserver &Observer) const; + bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const; bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B, bool IsInc) const; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir index 8860ca6..9aee145 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir @@ -67,9 +67,10 @@ body: | ; GCN-LABEL: name: s_buffer_load_v6s16 ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<8 x s16>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4) - ; GCN: [[EXTRACT:%[0-9]+]]:_(<6 x s16>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<8 x s16>), 0 - ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<6 x s16>) + ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4) + ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0 + ; GCN: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[EXTRACT]](<3 x s32>) + ; GCN: S_ENDPGM 0, implicit [[BITCAST]](<6 x s16>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<6 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 @@ -124,13 +125,83 @@ body: | ; GCN-LABEL: name: s_buffer_load_v12s8 ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<16 x s8>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4) - ; GCN: [[EXTRACT:%[0-9]+]]:_(<12 x s8>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<16 x s8>), 0 - ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<12 x s8>) + ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4) + ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0 + ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GCN: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GCN: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) + ; GCN: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GCN: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C3]](s32) + ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GCN: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) + ; GCN: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32) + ; GCN: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32) + ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; GCN: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32) + ; GCN: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32) + ; GCN: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32) + ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) + ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; GCN: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C4]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C4]] + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GCN: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; GCN: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; GCN: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C4]] + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GCN: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32) + ; GCN: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C4]] + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY6]](s32) + ; GCN: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C4]] + ; GCN: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GCN: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GCN: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY7]](s32) + ; GCN: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C4]] + ; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32) + ; GCN: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C4]] + ; GCN: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; GCN: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GCN: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32) + ; GCN: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C4]] + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32) + ; GCN: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C4]] + ; GCN: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32) + ; GCN: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; GCN: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32) + ; GCN: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C4]] + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GCN: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C4]] + ; GCN: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32) + ; GCN: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; GCN: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; GCN: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<12 x s8>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 - S_ENDPGM 0, implicit %2 + %3:_(<12 x s16>) = G_ANYEXT %2 + S_ENDPGM 0, implicit %3 ... -- 2.7.4