From b27d255e1e40bc065e68e39c6e1eaf3a5a16f005 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 25 Mar 2020 10:45:07 -0400 Subject: [PATCH] AMDGPU/GlobalISel: Form CVT_F32_UBYTE0 --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 9 +- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 5 + .../Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 37 +++++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 4 + llvm/lib/Target/AMDGPU/SIInstructions.td | 8 + .../CodeGen/AMDGPU/GlobalISel/combine-itofp.mir | 175 +++++++++++++++++++++ 6 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 4063bb1..ff8a31d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -20,6 +20,13 @@ def fcmp_select_to_fmin_fmax_legacy : GICombineRule< (apply [{ applySelectFCmpToFMinToFMaxLegacy(*${select}, ${matchinfo}); }])>; +def uchar_to_float : GICombineRule< + (defs root:$itofp), + (match (wip_match_opcode G_UITOFP, G_SITOFP):$itofp, + [{ return matchUCharToFloat(*${itofp}, MRI, *MF, Helper); }]), + (apply [{ applyUCharToFloat(*${itofp}); }])>; + + // Combines which should only apply on SI/VI def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; @@ -32,6 +39,6 @@ def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper< def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, - gfx6gfx7_combines]> { + gfx6gfx7_combines, uchar_to_float]> { let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index faf007f..3d6a2c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -153,6 +153,11 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE0, AMDGPUcvt_f32_ubyte0>; +def : 
GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE1, AMDGPUcvt_f32_ubyte1>; +def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>; +def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>; + def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 35299b6..7fe5e48 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -127,6 +127,43 @@ static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI, MI.eraseFromParent(); } +static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineFunction &MF, CombinerHelper &Helper) { + Register DstReg = MI.getOperand(0).getReg(); + + // TODO: We could try to match extracting the higher bytes, which would be + // easier if i8 vectors weren't promoted to i32 vectors, particularly after + // types are legalized. v4i8 -> v4f32 is probably the only case to worry + // about in practice. + LLT Ty = MRI.getType(DstReg); + if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) { + const APInt Mask = APInt::getHighBitsSet(32, 24); + return Helper.getKnownBits()->maskedValueIsZero(MI.getOperand(1).getReg(), + Mask); + } + + return false; +} + +static void applyUCharToFloat(MachineInstr &MI) { + MachineIRBuilder B(MI); + + const LLT S32 = LLT::scalar(32); + + Register DstReg = MI.getOperand(0).getReg(); + LLT Ty = B.getMRI()->getType(DstReg); + + if (Ty == S32) { + B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, + {MI.getOperand(1)}, MI.getFlags()); + } else { + auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, + {MI.getOperand(1)}, MI.getFlags()); + B.buildFPTrunc(DstReg, Cvt0, MI.getFlags()); + } + + MI.eraseFromParent(); +} #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS #include "AMDGPUGenPostLegalizeGICombiner.inc" diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 7a49f76..8070bcb 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3293,6 +3293,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_FMIN_LEGACY: case AMDGPU::G_AMDGPU_FMAX_LEGACY: case AMDGPU::G_AMDGPU_RCP_IFLAG: + case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: + case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: + case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: + case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 030baa3..6e6b2e7 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2297,6 +2297,14 @@ def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction { let hasSideEffects = 0; } +foreach N = 0-3 in { +def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0); + let hasSideEffects = 0; +} +} + // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector // operand Expects a MachineMemOperand in addition to explicit // operands. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir new file mode 100644 index 0000000..2021107 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir @@ -0,0 +1,175 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: uitofp_char_to_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: uitofp_char_to_f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 255 + %2:_(s32) = G_AND %0, %1 + %3:_(s32) = G_UITOFP %2 + $vgpr0 = COPY %3 +... + +--- +name: uitofp_too_many_bits_to_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: uitofp_too_many_bits_to_f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND]](s32) + ; CHECK: $vgpr0 = COPY [[UITOFP]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 256 + %2:_(s32) = G_AND %0, %1 + %3:_(s32) = G_UITOFP %2 + $vgpr0 = COPY %3 +... 
+ +--- +name: sitofp_char_to_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: sitofp_char_to_f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 255 + %2:_(s32) = G_AND %0, %1 + %3:_(s32) = G_SITOFP %2 + $vgpr0 = COPY %3 +... + +--- +name: sitofp_bits127_to_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: sitofp_bits127_to_f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 127 + %2:_(s32) = G_AND %0, %1 + %3:_(s32) = G_SITOFP %2 + $vgpr0 = COPY %3 +... + +--- +name: sitofp_bits128_to_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: sitofp_bits128_to_f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 128 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 128 + %2:_(s32) = G_AND %0, %1 + %3:_(s32) = G_SITOFP %2 + $vgpr0 = COPY %3 +... 
+--- +name: sitofp_too_many_bits_to_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: sitofp_too_many_bits_to_f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[AND]](s32) + ; CHECK: $vgpr0 = COPY [[SITOFP]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 256 + %2:_(s32) = G_AND %0, %1 + %3:_(s32) = G_SITOFP %2 + $vgpr0 = COPY %3 +... + +--- +name: uitofp_char_to_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: uitofp_char_to_f16 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] + ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[AMDGPU_CVT_F32_UBYTE0_]](s32) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 255 + %2:_(s32) = G_AND %0, %1 + %3:_(s16) = G_UITOFP %2 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: sitofp_char_to_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: sitofp_char_to_f16 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] + ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[AMDGPU_CVT_F32_UBYTE0_]](s32) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 255 + %2:_(s32) = G_AND %0, %1 + %3:_(s16) = G_SITOFP %2 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... -- 2.7.4