From 944f429b210d0d94171dc5df3068d4f1e8fb5d80 Mon Sep 17 00:00:00 2001
From: Konstantina Mitropoulou
Date: Fri, 17 Feb 2023 14:16:39 -0800
Subject: [PATCH] [AMDGPU] Improve the lowering of raw_buffer_load_{i8,i16}
 and struct_buffer_load_{i8,i16} intrinsics

Currently, the raw_buffer_load_{i8,i16} and struct_buffer_load_{i8,i16}
intrinsics are lowered to buffer_load_{u8,u16} followed by a separate
sign-extension instruction. This patch combines each buffer_load_{u8,u16}
with the sign extension that consumes it in order to generate
buffer_load_{i8,i16} instructions.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D144313
---
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td            | 10 ++++-
 .../Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp  | 50 +++++++++++++++++++++-
 .../GlobalISel/llvm.amdgcn.raw.buffer.load.ll      | 10 ++---
 .../GlobalISel/llvm.amdgcn.struct.buffer.load.ll   | 10 ++---
 4 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 539b566..e5a53f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -104,6 +104,14 @@ def foldable_fneg : GICombineRule<
          [{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]),
   (apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>;
 
+def sign_exension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">;
+
+def sign_extension_in_reg : GICombineRule<
+  (defs root:$sign_inreg, sign_exension_in_reg_matchdata:$matchinfo),
+  (match (wip_match_opcode G_SEXT_INREG):$sign_inreg,
+         [{ return PostLegalizerHelper.matchCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }]),
+  (apply [{ PostLegalizerHelper.applyCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }])>;
+
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
@@ -119,7 +127,7 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
   "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines,
   uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
-  rcp_sqrt_to_rsq]> {
+  rcp_sqrt_to_rsq, sign_extension_in_reg]> {
   let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
   let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
   let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index d3e6d3a..3acbd6d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -36,12 +36,16 @@ protected:
   MachineIRBuilder &B;
   MachineFunction &MF;
   MachineRegisterInfo &MRI;
+  const GCNSubtarget &Subtarget;
+  const SIInstrInfo &TII;
   AMDGPUCombinerHelper &Helper;
 
 public:
   AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
                                     AMDGPUCombinerHelper &Helper)
-      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
+      : B(B), MF(B.getMF()), MRI(*B.getMRI()),
+        Subtarget(MF.getSubtarget<GCNSubtarget>()),
+        TII(*Subtarget.getInstrInfo()), Helper(Helper){};
 
   struct FMinFMaxLegacyInfo {
     Register LHS;
@@ -74,6 +78,11 @@ public:
                              const CvtF32UByteMatchInfo &MatchInfo);
 
   bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
+
+  // Combine an unsigned buffer load and a sign-extension instruction to
+  // generate a signed buffer load instruction.
+  bool matchCombineSignExtendInReg(MachineInstr &MI, MachineInstr *&MatchInfo);
+  void applyCombineSignExtendInReg(MachineInstr &MI, MachineInstr *&MatchInfo);
 };
 
 bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
@@ -302,6 +311,45 @@ bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
   return TLI->isCanonicalized(Reg, MF);
 }
 
+// The buffer_load_{i8, i16} intrinsics are initially lowered as
+// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
+// instructions are combined with sign-extension instructions in order to
+// generate buffer_load_{i8, i16} instructions.
+
+// Identify buffer_load_{u8, u16}.
+bool AMDGPUPostLegalizerCombinerHelper::matchCombineSignExtendInReg(
+    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) {
+  Register Op0Reg = MI.getOperand(1).getReg();
+  SubwordBufferLoad = MRI.getVRegDef(Op0Reg);
+
+  if (!MRI.hasOneNonDBGUse(Op0Reg))
+    return false;
+
+  // Check if the first operand of the sign extension is a subword buffer load
+  // instruction.
+  return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE ||
+         SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+}
+
+// Combine buffer_load_{u8, u16} and the sign-extension instruction to generate
+// buffer_load_{i8, i16}.
+void AMDGPUPostLegalizerCombinerHelper::applyCombineSignExtendInReg(
+    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) {
+  // Modify the opcode and the destination of buffer_load_{u8, u16}:
+  // Replace the opcode.
+  unsigned Opc =
+      SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE
+          ? AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE
+          : AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
+  SubwordBufferLoad->setDesc(TII.get(Opc));
+  // Update the destination register of SubwordBufferLoad with the destination
+  // register of the sign extension.
+  Register SignExtendInsnDst = MI.getOperand(0).getReg();
+  SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst);
+  // Remove the sign extension.
+  MI.eraseFromParent();
+}
+
 class AMDGPUPostLegalizerCombinerHelperState {
 protected:
   AMDGPUCombinerHelper &Helper;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
index 8d5dcc1..0a754f3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
@@ -442,9 +442,8 @@ define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffse
   ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK-NEXT:   [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 7)
-  ; CHECK-NEXT:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_OFFEN]], 0, 8, implicit $exec
-  ; CHECK-NEXT:   $vgpr0 = COPY [[V_BFE_I32_e64_]]
+  ; CHECK-NEXT:   [[BUFFER_LOAD_SBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 7)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_OFFEN]]
   ; CHECK-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   %zext = sext i8 %val to i32
@@ -485,9 +484,8 @@ define amdgpu_ps float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 7)
-  ; CHECK-NEXT:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_OFFEN]], 0, 16, implicit $exec
-  ; CHECK-NEXT:   $vgpr0 = COPY [[V_BFE_I32_e64_]]
+  ; CHECK-NEXT:   [[BUFFER_LOAD_SSHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 7)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_OFFEN]]
   ; CHECK-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   %sext = sext i16 %val to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
index ac46716..2aede9c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
@@ -267,9 +267,8 @@ define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK-NEXT:   [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 7)
-  ; CHECK-NEXT:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_BOTHEN]], 0, 8, implicit $exec
-  ; CHECK-NEXT:   $vgpr0 = COPY [[V_BFE_I32_e64_]]
+  ; CHECK-NEXT:   [[BUFFER_LOAD_SBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 7)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_BOTHEN]]
   ; CHECK-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   %ext = sext i8 %val to i32
@@ -314,9 +313,8 @@ define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgp
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK-NEXT:   [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 7)
-  ; CHECK-NEXT:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_BOTHEN]], 0, 16, implicit $exec
-  ; CHECK-NEXT:   $vgpr0 = COPY [[V_BFE_I32_e64_]]
+  ; CHECK-NEXT:   [[BUFFER_LOAD_SSHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 7)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_BOTHEN]]
   ; CHECK-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   %ext = sext i16 %val to i32
-- 
2.7.4
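
For illustration, here is a minimal sketch of the IR pattern this combine
targets, written in the style of the test files above. The function and its
name are hypothetical (not taken from the patch's tests); only the intrinsic
declaration and the sext-of-subword-load shape come from the patch itself.

  ; The sext of the i8 buffer load legalizes to G_SEXT_INREG, the root of the
  ; new combine. Before this patch the pattern selected to buffer_load_ubyte
  ; plus a separate v_bfe_i32; with the combine it selects to a single
  ; buffer_load_sbyte.
  define amdgpu_ps float @load_i8_sext_example(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
    %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
    %sext = sext i8 %val to i32
    %fval = bitcast i32 %sext to float
    ret float %fval
  }

  declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32 immarg)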