From: Petar Avramovic Date: Tue, 27 Apr 2021 09:51:22 +0000 (+0200) Subject: AMDGPU/GlobalISel: Add integer med3 combines X-Git-Tag: llvmorg-14-init~8349 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=4a9bc59867b64a24ff628c5a7ab2ca5e727dd9c1;p=platform%2Fupstream%2Fllvm.git AMDGPU/GlobalISel: Add integer med3 combines Add signed and unsigned integer version of med3 combine. Source pattern is min(max(Val, K0), K1) or max(min(Val, K1), K0) where K0 and K1 are constants and K0 <= K1. Destination is med3 that corresponds to signedness of min/max in source. Differential Revision: https://reviews.llvm.org/D90050 --- diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 5faebb2..9b71428 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -45,6 +45,17 @@ def clamp_i64_to_i16 : GICombineRule< [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]), (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>; +def med3_matchdata : GIDefMatchData<"AMDGPURegBankCombinerHelper::Med3MatchInfo">; + +def int_minmax_to_med3 : GICombineRule< + (defs root:$min_or_max, med3_matchdata:$matchinfo), + (match (wip_match_opcode G_SMAX, + G_SMIN, + G_UMAX, + G_UMIN):$min_or_max, + [{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; + // Combines which should only apply on SI/VI def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; @@ -64,6 +75,8 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< } def AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold]> { + "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; + let StateClass = 
"AMDGPURegBankCombinerHelperState"; + let AdditionalArguments = []; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 9fea66b..36e04fc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -167,7 +167,8 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; -def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index c58b15f..3a861f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -143,7 +143,7 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16( auto Bitcast = B.buildBitcast({S32}, CvtPk); auto Med3 = B.buildInstr( - AMDGPU::G_AMDGPU_MED3, {S32}, + AMDGPU::G_AMDGPU_SMED3, {S32}, {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)}, MI.getFlags()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index d644c03..4e12e5c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -13,7 +13,9 @@ #include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" +#include "AMDGPURegisterBankInfo.h" #include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" @@ -27,6 +29,126 @@ using namespace llvm; using namespace MIPatternMatch; +class AMDGPURegBankCombinerHelper { +protected: + MachineIRBuilder &B; + MachineFunction &MF; + MachineRegisterInfo &MRI; + const RegisterBankInfo &RBI; + const TargetRegisterInfo &TRI; + CombinerHelper &Helper; + +public: + AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) + : B(B), 
MF(B.getMF()), MRI(*B.getMRI()), + RBI(*MF.getSubtarget().getRegBankInfo()), + TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){}; + + bool isVgprRegBank(Register Reg); + + struct MinMaxMedOpc { + unsigned Min, Max, Med; + }; + + struct Med3MatchInfo { + unsigned Opc; + Register Val0, Val1, Val2; + }; + + MinMaxMedOpc getMinMaxPair(unsigned Opc); + + template + bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc, + Register &Val, Register &K0, Register &K1); + + bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); +}; + +bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) { + return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID; +} + +AMDGPURegBankCombinerHelper::MinMaxMedOpc +AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Unsupported opcode"); + case AMDGPU::G_SMAX: + case AMDGPU::G_SMIN: + return {AMDGPU::G_SMIN, AMDGPU::G_SMAX, AMDGPU::G_AMDGPU_SMED3}; + case AMDGPU::G_UMAX: + case AMDGPU::G_UMIN: + return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3}; + } +} + +template +bool AMDGPURegBankCombinerHelper::matchMed(MachineInstr &MI, + MachineRegisterInfo &MRI, + MinMaxMedOpc MMMOpc, Register &Val, + Register &K0, Register &K1) { + // 4 operand commutes of: min(max(Val, K0), K1). + // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)). + // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0). + // 4 operand commutes of: max(min(Val, K1), K0). + // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)). + // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1). 
+ return mi_match( + MI, MRI, + m_any_of( + m_CommutativeBinOp( + MMMOpc.Min, m_CommutativeBinOp(MMMOpc.Max, m_Reg(Val), m_Cst(K0)), + m_Cst(K1)), + m_CommutativeBinOp( + MMMOpc.Max, m_CommutativeBinOp(MMMOpc.Min, m_Reg(Val), m_Cst(K1)), + m_Cst(K0)))); +} + +bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3( + MachineInstr &MI, Med3MatchInfo &MatchInfo) { + Register Dst = MI.getOperand(0).getReg(); + if (!isVgprRegBank(Dst)) + return false; + + if (MRI.getType(Dst).isVector()) + return false; + + MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode()); + Register Val, K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1. + if (!matchMed(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + const APInt &K0_Imm = getConstantIntVRegVal(K0, MRI)->getValue(); + const APInt &K1_Imm = getConstantIntVRegVal(K1, MRI)->getValue(); + if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0_Imm.sgt(K1_Imm)) + return false; + if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0_Imm.ugt(K1_Imm)) + return false; + + MatchInfo = {OpcodeTriple.Med, Val, K0, K1}; + return true; +} + +void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, + Med3MatchInfo &MatchInfo) { + B.setInstrAndDebugLoc(MI); + B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)}, + {MatchInfo.Val0, MatchInfo.Val1, MatchInfo.Val2}, MI.getFlags()); + MI.eraseFromParent(); +} + +class AMDGPURegBankCombinerHelperState { +protected: + CombinerHelper &Helper; + AMDGPURegBankCombinerHelper &RegBankHelper; + +public: + AMDGPURegBankCombinerHelperState(CombinerHelper &Helper, + AMDGPURegBankCombinerHelper &RegBankHelper) + : Helper(Helper), RegBankHelper(RegBankHelper) {} +}; #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS #include "AMDGPUGenRegBankGICombiner.inc" @@ -62,9 +184,11 @@ bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const { CombinerHelper Helper(Observer, B, KB, MDT); - 
AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg); + AMDGPURegBankCombinerHelper RegBankHelper(B, Helper); + AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper, + RegBankHelper); - if (Generated.tryCombineAll(Observer, MI, B, Helper)) + if (Generated.tryCombineAll(Observer, MI, B)) return true; return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 0f29c97..88b2f40 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3507,7 +3507,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: - case AMDGPU::G_AMDGPU_MED3: + case AMDGPU::G_AMDGPU_SMED3: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6a4f984..8f6ccd8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2652,7 +2652,13 @@ def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction { let hasSideEffects = 0; } -def G_AMDGPU_MED3 : AMDGPUGenericInstruction { +def G_AMDGPU_SMED3 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); + let hasSideEffects = 0; +} + +def G_AMDGPU_UMED3 : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); let hasSideEffects = 0; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir new file mode 100644 index 0000000..ec4755a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir @@ -0,0 +1,328 @@ +# NOTE: Assertions have been 
autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: test_min_max_ValK0_K1_i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_min_max_ValK0_K1_i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 -12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMAX %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMIN %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: min_max_ValK0_K1_i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: min_max_ValK0_K1_i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 -12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMAX %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMIN %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_min_K1max_ValK0__i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_min_K1max_ValK0__i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 -12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMAX %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMIN %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_min_K1max_K0Val__i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_min_K1max_K0Val__i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 -12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMAX %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMIN %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_min_ValK1_K0_i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_min_ValK1_K0_i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMIN %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 -12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMAX %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_min_K1Val_K0_i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_min_K1Val_K0_i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMIN %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 -12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMAX %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_ValK1__i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_K0min_ValK1__i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMIN %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 -12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMAX %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_K1Val__i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_K0min_K1Val__i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMIN %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 -12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMAX %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_K1Val__v2i16 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_K0min_K1Val__v2i16 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; CHECK: [[SMIN:%[0-9]+]]:vgpr(<2 x s16>) = G_SMIN [[COPY2]], [[COPY]] + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; CHECK: [[SMAX:%[0-9]+]]:vgpr(<2 x s16>) = G_SMAX [[COPY3]], [[SMIN]] + ; CHECK: $vgpr0 = COPY [[SMAX]](<2 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %9:sgpr(s32) = G_CONSTANT i32 17 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %9(s32), %9(s32) + %10:sgpr(s32) = G_CONSTANT i32 -12 + %5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %10(s32), %10(s32) + %11:vgpr(<2 x s16>) = COPY %2(<2 x s16>) + %4:vgpr(<2 x s16>) = G_SMIN %11, %0 + %12:vgpr(<2 x s16>) = COPY %5(<2 x s16>) + %7:vgpr(<2 x s16>) = G_SMAX %12, %4 + $vgpr0 = COPY %7(<2 x s16>) + %8:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %8, implicit $vgpr0 +... 
+ +--- +name: test_uniform_min_max +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $sgpr2 + + ; CHECK-LABEL: name: test_uniform_min_max + ; CHECK: liveins: $sgpr2 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[COPY]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[SMAX]], [[C1]] + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[SMIN]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 + %0:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = G_CONSTANT i32 -12 + %4:sgpr(s32) = G_SMAX %0, %3 + %5:sgpr(s32) = G_CONSTANT i32 17 + %6:sgpr(s32) = G_SMIN %4, %5 + %8:vgpr(s32) = COPY %6(s32) + %7:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %8(s32) + $sgpr0 = COPY %7(s32) + SI_RETURN_TO_EPILOG implicit $sgpr0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir new file mode 100644 index 0000000..cc5131b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir @@ -0,0 +1,329 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: test_min_max_ValK0_K1_u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_min_max_ValK0_K1_u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMAX %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMIN %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: min_max_ValK0_K1_i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: min_max_ValK0_K1_i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMAX %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMIN %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_min_K1max_ValK0__u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_min_K1max_ValK0__u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMAX %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMIN %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_min_K1max_K0Val__u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_min_K1max_K0Val__u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMAX %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMIN %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_min_ValK1_K0_u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_min_ValK1_K0_u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMIN %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMAX %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_min_K1Val_K0_u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_min_K1Val_K0_u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMIN %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMAX %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_ValK1__u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_K0min_ValK1__u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMIN %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMAX %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_K1Val__u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_K0min_K1Val__u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMIN %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMAX %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_K1Val__v2u16 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_K0min_K1Val__v2u16 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; CHECK: [[UMIN:%[0-9]+]]:vgpr(<2 x s16>) = G_UMIN [[COPY2]], [[COPY]] + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; CHECK: [[UMAX:%[0-9]+]]:vgpr(<2 x s16>) = G_UMAX [[COPY3]], [[UMIN]] + ; CHECK: $vgpr0 = COPY [[UMAX]](<2 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %9:sgpr(s32) = G_CONSTANT i32 17 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %9(s32), %9(s32) + %10:sgpr(s32) = G_CONSTANT i32 12 + %5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %10(s32), %10(s32) + %11:vgpr(<2 x s16>) = COPY %2(<2 x s16>) + %4:vgpr(<2 x s16>) = G_UMIN %11, %0 + %12:vgpr(<2 x s16>) = COPY %5(<2 x s16>) + %7:vgpr(<2 x s16>) = G_UMAX %12, %4 + $vgpr0 = COPY %7(<2 x s16>) + %8:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %8, implicit $vgpr0 +... 
+
---
# Uniform (sgpr) variant of the clamp pattern: umin(umax(Val, 12), 17) on a
# scalar-bank value. The CHECK lines show the G_UMAX/G_UMIN pair is kept as
# is — no G_AMDGPU_UMED3 is formed when the whole computation is uniform.
# The result is moved to vgpr and read back via readfirstlane for the return.
name: test_uniform_min_max
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
  bb.1:
    liveins: $sgpr2

    ; CHECK-LABEL: name: test_uniform_min_max
    ; CHECK: liveins: $sgpr2
    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
    ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
    ; CHECK: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[COPY]], [[C]]
    ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
    ; CHECK: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[UMAX]], [[C1]]
    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UMIN]](s32)
    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32)
    ; CHECK: $sgpr0 = COPY [[INT]](s32)
    ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0
    %0:sgpr(s32) = COPY $sgpr2
    ; Lower bound K0 = 12, then upper bound K1 = 17 — same constants as the
    ; divergent tests, but everything stays on the scalar bank here.
    %3:sgpr(s32) = G_CONSTANT i32 12
    %4:sgpr(s32) = G_UMAX %0, %3
    %5:sgpr(s32) = G_CONSTANT i32 17
    %6:sgpr(s32) = G_UMIN %4, %5
    %8:vgpr(s32) = COPY %6(s32)
    %7:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %8(s32)
    $sgpr0 = COPY %7(s32)
    SI_RETURN_TO_EPILOG implicit $sgpr0

...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
new file mode 100644
index 0000000..b42ab8f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+
+define i32 @test_min_max_ValK0_K1_i32(i32 %a) {
+; GFX10-LABEL: test_min_max_ValK0_K1_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %smax = call i32 @llvm.smax.i32(i32 %a, i32 -12)
+  %smed = call i32 @llvm.smin.i32(i32 %smax, i32 17)
+  ret i32 %smed
+}
+
+define i32 @test_min_max_K0Val_K1_i32(i32 %a) {
+; GFX10-LABEL: test_min_max_K0Val_K1_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %smax = call i32 @llvm.smax.i32(i32 -12, i32 %a)
+  %smed = call i32 @llvm.smin.i32(i32 %smax, i32 17)
+  ret i32 %smed
+}
+
+define i32 @test_min_K1max_ValK0__i32(i32 %a) {
+; GFX10-LABEL: test_min_K1max_ValK0__i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %smax = call i32 @llvm.smax.i32(i32 %a, i32 -12)
+  %smed = call i32 @llvm.smin.i32(i32 17, i32 %smax)
+  ret i32 %smed
+}
+
+define i32 @test_min_K1max_K0Val__i32(i32 %a) {
+; GFX10-LABEL: test_min_K1max_K0Val__i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %smax = call i32 @llvm.smax.i32(i32 -12, i32 %a)
+
%smed = call i32 @llvm.smin.i32(i32 17, i32 %smax) + ret i32 %smed +} + +define i32 @test_max_min_ValK1_K0_i32(i32 %a) { +; GFX10-LABEL: test_max_min_ValK1_K0_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_i32 v0, v0, -12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smin = call i32 @llvm.smin.i32(i32 %a, i32 17) + %smed = call i32 @llvm.smax.i32(i32 %smin, i32 -12) + ret i32 %smed +} + +define i32 @test_max_min_K1Val_K0_i32(i32 %a) { +; GFX10-LABEL: test_max_min_K1Val_K0_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_i32 v0, v0, -12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smin = call i32 @llvm.smin.i32(i32 17, i32 %a) + %smed = call i32 @llvm.smax.i32(i32 %smin, i32 -12) + ret i32 %smed +} + +define i32 @test_max_K0min_ValK1__i32(i32 %a) { +; GFX10-LABEL: test_max_K0min_ValK1__i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_i32 v0, v0, -12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smin = call i32 @llvm.smin.i32(i32 %a, i32 17) + %smed = call i32 @llvm.smax.i32(i32 -12, i32 %smin) + ret i32 %smed +} + +define i32 @test_max_K0min_K1Val__i32(i32 %a) { +; GFX10-LABEL: test_max_K0min_K1Val__i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_i32 v0, v0, -12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smin = call i32 @llvm.smin.i32(i32 17, i32 %a) + %smed = call i32 @llvm.smax.i32(i32 -12, i32 %smin) + ret i32 %smed +} + +define <2 x i16> @test_max_K0min_K1Val__v2i16(<2 x i16> %a) { +; GFX10-LABEL: test_max_K0min_K1Val__v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_min_i16 v0, 17, v0 op_sel_hi:[0,1] +; 
GFX10-NEXT:    v_pk_max_i16 v0, -12, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %smin = call <2 x i16> @llvm.smin.v2i16(<2 x i16> <i16 17, i16 17>, <2 x i16> %a)
+  %smed = call <2 x i16> @llvm.smax.v2i16(<2 x i16> <i16 -12, i16 -12>, <2 x i16> %smin)
+  ret <2 x i16> %smed
+}
+
+define amdgpu_ps i32 @test_uniform_min_max(i32 inreg %a) {
+; GFX10-LABEL: test_uniform_min_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_max_i32 s0, s2, -12
+; GFX10-NEXT:    s_min_i32 s0, s0, 17
+; GFX10-NEXT:    ; return to shader part epilog
+  %smax = call i32 @llvm.smax.i32(i32 %a, i32 -12)
+  %smed = call i32 @llvm.smin.i32(i32 %smax, i32 17)
+  ret i32 %smed
+}
+
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
new file mode 100644
index 0000000..08c9ae6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+
+define i32 @test_min_max_ValK0_K1_u32(i32 %a) {
+; GFX10-LABEL: test_min_max_ValK0_K1_u32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %umax = call i32 @llvm.umax.i32(i32 %a, i32 12)
+  %umed = call i32 @llvm.umin.i32(i32 %umax, i32 17)
+  ret i32 %umed
+}
+
+define i32 @test_min_max_K0Val_K1_u32(i32 %a) {
+; GFX10-LABEL: test_min_max_K0Val_K1_u32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %umax = call i32 @llvm.umax.i32(i32 12, i32 %a)
+  %umed = call i32
@llvm.umin.i32(i32 %umax, i32 17) + ret i32 %umed +} + +define i32 @test_min_K1max_ValK0__u32(i32 %a) { +; GFX10-LABEL: test_min_K1max_ValK0__u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_u32 v0, v0, 12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umax = call i32 @llvm.umax.i32(i32 %a, i32 12) + %umed = call i32 @llvm.umin.i32(i32 17, i32 %umax) + ret i32 %umed +} + +define i32 @test_min_K1max_K0Val__u32(i32 %a) { +; GFX10-LABEL: test_min_K1max_K0Val__u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_u32 v0, v0, 12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umax = call i32 @llvm.umax.i32(i32 12, i32 %a) + %umed = call i32 @llvm.umin.i32(i32 17, i32 %umax) + ret i32 %umed +} + +define i32 @test_max_min_ValK1_K0_u32(i32 %a) { +; GFX10-LABEL: test_max_min_ValK1_K0_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_u32 v0, v0, 12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umin = call i32 @llvm.umin.i32(i32 %a, i32 17) + %umed = call i32 @llvm.umax.i32(i32 %umin, i32 12) + ret i32 %umed +} + +define i32 @test_max_min_K1Val_K0_u32(i32 %a) { +; GFX10-LABEL: test_max_min_K1Val_K0_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_u32 v0, v0, 12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umin = call i32 @llvm.umin.i32(i32 17, i32 %a) + %umed = call i32 @llvm.umax.i32(i32 %umin, i32 12) + ret i32 %umed +} + +define i32 @test_max_K0min_ValK1__u32(i32 %a) { +; GFX10-LABEL: test_max_K0min_ValK1__u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_u32 v0, v0, 12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umin = call i32 
@llvm.umin.i32(i32 %a, i32 17)
+  %umed = call i32 @llvm.umax.i32(i32 12, i32 %umin)
+  ret i32 %umed
+}
+
+define i32 @test_max_K0min_K1Val__u32(i32 %a) {
+; GFX10-LABEL: test_max_K0min_K1Val__u32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %umin = call i32 @llvm.umin.i32(i32 17, i32 %a)
+  %umed = call i32 @llvm.umax.i32(i32 12, i32 %umin)
+  ret i32 %umed
+}
+
+define <2 x i16> @test_max_K0min_K1Val__v2u16(<2 x i16> %a) {
+; GFX10-LABEL: test_max_K0min_K1Val__v2u16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_pk_min_u16 v0, 17, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_max_u16 v0, 12, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %umin = call <2 x i16> @llvm.umin.v2i16(<2 x i16> <i16 17, i16 17>, <2 x i16> %a)
+  %umed = call <2 x i16> @llvm.umax.v2i16(<2 x i16> <i16 12, i16 12>, <2 x i16> %umin)
+  ret <2 x i16> %umed
+}
+
+define amdgpu_ps i32 @test_uniform_min_max(i32 inreg %a) {
+; GFX10-LABEL: test_uniform_min_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_max_u32 s0, s2, 12
+; GFX10-NEXT:    s_min_u32 s0, s0, 17
+; GFX10-NEXT:    ; return to shader part epilog
+  %umax = call i32 @llvm.umax.i32(i32 %a, i32 12)
+  %umed = call i32 @llvm.umin.i32(i32 %umax, i32 17)
+  ret i32 %umed
+}
+
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>)