From 15a24e1fdbc446bc3d0fa0bf6c2a1aa44281cc75 Mon Sep 17 00:00:00 2001
From: Jessica Paquette
Date: Tue, 28 Sep 2021 15:04:23 -0700
Subject: [PATCH] [GlobalISel] Combine mulo x, 2 -> addo x, x

Similar to what SDAG does when it sees a smulo/umulo against 2
(see: `DAGCombiner::visitMULO`).

This pattern is fairly common in Swift code AFAICT.

Here's an example extracted from a Swift testcase:

https://godbolt.org/z/6cT8Mesx7

Differential Revision: https://reviews.llvm.org/D110662
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h     |   5 +
 llvm/include/llvm/Target/GlobalISel/Combine.td   |   8 +-
 llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp   |  29 +++++
 .../AArch64/GlobalISel/combine-mulo-with-2.mir   | 117 +++++++++++++++++++++
 4 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-mulo-with-2.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 5e3f371..32d5070 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -609,6 +609,11 @@ public:
   /// and rename: s/bool tryEmit/void emit/
   bool tryEmitMemcpyInline(MachineInstr &MI);
 
+  /// Match:
+  /// (G_UMULO x, 2) -> (G_UADDO x, x)
+  /// (G_SMULO x, 2) -> (G_SADDO x, x)
+  bool matchMulOBy2(MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Given a non-indexed load or store instruction \p MI, find an offset that
   /// can be usefully and legally folded into it as a post-indexing operation.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 09be6db..b7bfdb6 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -702,6 +702,12 @@ def constant_fold : GICombineRule<
    [{ return Helper.matchConstantFold(*${d}, ${matchinfo}); }]),
   (apply [{ Helper.replaceInstWithConstant(*${d}, ${matchinfo}); }])>;
 
+def mulo_by_2: GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_UMULO, G_SMULO):$root,
+   [{ return Helper.matchMulOBy2(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -718,7 +724,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
                                         fneg_fneg_fold, right_identity_one]>;
 
 def const_combines : GICombineGroup<[constant_fp_op, const_ptradd_to_i2p,
-                                     overlapping_and]>;
+                                     overlapping_and, mulo_by_2]>;
 
 def known_bits_simplifications : GICombineGroup<[
   redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 17d256c..0515e44 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4376,6 +4376,35 @@ bool CombinerHelper::matchNarrowBinopFeedingAnd(
   return true;
 }
 
+bool CombinerHelper::matchMulOBy2(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  unsigned Opc = MI.getOpcode();
+  assert(Opc == TargetOpcode::G_UMULO || Opc == TargetOpcode::G_SMULO);
+  // Check for a constant 2 or a splat of 2 on the RHS.
+  auto RHS = MI.getOperand(3).getReg();
+  bool IsVector = MRI.getType(RHS).isVector();
+  if (!IsVector && !mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICst(2)))
+    return false;
+  if (IsVector) {
+    // FIXME: There's no mi_match pattern for this yet.
+    auto *RHSDef = getDefIgnoringCopies(RHS, MRI);
+    if (!RHSDef)
+      return false;
+    auto Splat = getBuildVectorConstantSplat(*RHSDef, MRI);
+    if (!Splat || *Splat != 2)
+      return false;
+  }
+
+  MatchInfo = [=, &MI](MachineIRBuilder &B) {
+    Observer.changingInstr(MI);
+    unsigned NewOpc = Opc == TargetOpcode::G_UMULO ? TargetOpcode::G_UADDO
+                                                   : TargetOpcode::G_SADDO;
+    MI.setDesc(Builder.getTII().get(NewOpc));
+    MI.getOperand(3).setReg(MI.getOperand(2).getReg());
+    Observer.changedInstr(MI);
+  };
+  return true;
+}
+
 bool CombinerHelper::tryCombine(MachineInstr &MI) {
   if (tryCombineCopy(MI))
     return true;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-mulo-with-2.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-mulo-with-2.mir
new file mode 100644
index 0000000..27486b5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-mulo-with-2.mir
@@ -0,0 +1,117 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="mulo_by_2" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+# REQUIRES: asserts
+...
+---
+name: smulo_to_saddo
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: smulo_to_saddo
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %copy:_(s64) = COPY $x0
+    ; CHECK-NEXT: %mul:_(s64), %o:_(s1) = G_SADDO %copy, %copy
+    ; CHECK-NEXT: %overflow_ext:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %overflow_ext(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %copy:_(s64) = COPY $x0
+    %two:_(s64) = G_CONSTANT i64 2
+    %mul:_(s64), %o:_(s1) = G_SMULO %copy, %two
+    %overflow_ext:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %overflow_ext(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+name: umulo_to_uaddo
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: umulo_to_uaddo
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %copy:_(s64) = COPY $x0
+    ; CHECK-NEXT: %mul:_(s64), %o:_(s1) = G_UADDO %copy, %copy
+    ; CHECK-NEXT: %overflow_ext:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %overflow_ext(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %copy:_(s64) = COPY $x0
+    %two:_(s64) = G_CONSTANT i64 2
+    %mul:_(s64), %o:_(s1) = G_UMULO %copy, %two
+    %overflow_ext:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %overflow_ext(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+name: vector
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: vector
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %copy:_(<2 x s32>) = COPY $d0
+    ; CHECK-NEXT: %mul:_(<2 x s32>), %o:_(<2 x s1>) = G_SADDO %copy, %copy
+    ; CHECK-NEXT: %overflow_ext:_(<2 x s32>) = G_ANYEXT %o(<2 x s1>)
+    ; CHECK-NEXT: $d0 = COPY %overflow_ext(<2 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %copy:_(<2 x s32>) = COPY $d0
+    %two:_(s32) = G_CONSTANT i32 2
+    %splat_two:_(<2 x s32>) = G_BUILD_VECTOR %two(s32), %two(s32)
+    %mul:_(<2 x s32>), %o:_(<2 x s1>) = G_SMULO %copy, %splat_two
+    %overflow_ext:_(<2 x s32>) = G_ANYEXT %o(<2 x s1>)
+    $d0 = COPY %overflow_ext(<2 x s32>)
+    RET_ReallyLR implicit $d0
+...
+---
+name: dont_combine_wrong_cst
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: dont_combine_wrong_cst
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %copy:_(s64) = COPY $x0
+    ; CHECK-NEXT: %three:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: %mul:_(s64), %o:_(s1) = G_UMULO %copy, %three
+    ; CHECK-NEXT: %overflow_ext:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %overflow_ext(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %copy:_(s64) = COPY $x0
+    %three:_(s64) = G_CONSTANT i64 3
+    %mul:_(s64), %o:_(s1) = G_UMULO %copy, %three
+    %overflow_ext:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %overflow_ext(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+name: dont_combine_not_cst
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: dont_combine_not_cst
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %copy1:_(s64) = COPY $x0
+    ; CHECK-NEXT: %copy2:_(s64) = COPY $x1
+    ; CHECK-NEXT: %mul:_(s64), %o:_(s1) = G_UMULO %copy1, %copy2
+    ; CHECK-NEXT: %overflow_ext:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %overflow_ext(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %copy1:_(s64) = COPY $x0
+    %copy2:_(s64) = COPY $x1
+    %mul:_(s64), %o:_(s1) = G_UMULO %copy1, %copy2
+    %overflow_ext:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %overflow_ext(s32)
+    RET_ReallyLR implicit $w0
+...
-- 
2.7.4
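
For reference, here is a minimal Swift sketch of the kind of source that
produces a mulo against 2. It is illustrative only: the function name is
made up and it is not the Godbolt testcase linked in the commit message.
Swift's default `*` traps on signed overflow, so the multiply is lowered
through llvm.smul.with.overflow, which GlobalISel imports as G_SMULO; with
this combine, the multiply-by-2 is rewritten to G_SADDO x, x.

  // Hypothetical example: overflow-checked doubling.
  // 'x * 2' becomes llvm.smul.with.overflow(x, 2), i.e. G_SMULO x, 2 in
  // GlobalISel, which this patch turns into G_SADDO x, x.
  func doubled(_ x: Int) -> Int {
      return x * 2
  }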