From 2c3f66519c5e6af2a43e8c7087679f90d4582623 Mon Sep 17 00:00:00 2001
From: Matt Devereau <matthew.devereau@arm.com>
Date: Fri, 1 Apr 2022 03:57:04 +0000
Subject: [PATCH] [SVE] Extend support for folding select + masked gathers

Extend the work done in D106376 to include masked gathers

Differential Revision: https://reviews.llvm.org/D122896
---
 llvm/include/llvm/IR/PatternMatch.h                       |   8 ++
 .../Transforms/InstCombine/InstCombineSelect.cpp          |  28 +++--
 .../Transforms/InstCombine/select-masked_gather.ll        | 124 +++++++++++++++++++++
 3 files changed, 148 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/select-masked_gather.ll

diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index f9f4f16..5009fee 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2117,6 +2117,14 @@ m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
              const Opnd3 &Op3) {
   return m_Intrinsic<Intrinsic::masked_load>(Op0, Op1, Op2, Op3);
 }
 
+/// Matches MaskedGather Intrinsic.
+template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty
+m_MaskedGather(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
+               const Opnd3 &Op3) {
+  return m_Intrinsic<Intrinsic::masked_gather>(Op0, Op1, Op2, Op3);
+}
+
 template <Intrinsic::ID IntrID, typename T0>
 inline typename m_Intrinsic_Ty<T0>::Ty m_Intrinsic(const T0 &Op0) {
   return m_CombineAnd(m_Intrinsic<IntrID>(), m_Argument<0>(Op0));
 }
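
[Illustrative note, not part of the patch: the new m_MaskedGather matcher binds
the four operands of an @llvm.masked.gather call, i.e. the pointer vector, the
alignment, the mask and the passthrough. A hand-written sketch of the IR shape
it matches (all value names here are invented):

  %res = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthru)

As with the existing m_MaskedLoad matcher, passing m_Specific, m_Value or
m_CombineOr patterns for the operands constrains or captures them.]
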
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 06c2de6..1cf6e43 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3031,18 +3031,22 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
   // select(mask, mload(,,mask,0), 0) -> mload(,,mask,0)
   // Load inst is intentionally not checked for hasOneUse()
   if (match(FalseVal, m_Zero()) &&
-      match(TrueVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(CondVal),
-                                  m_CombineOr(m_Undef(), m_Zero())))) {
-    auto *MaskedLoad = cast<IntrinsicInst>(TrueVal);
-    if (isa<UndefValue>(MaskedLoad->getArgOperand(3)))
-      MaskedLoad->setArgOperand(3, FalseVal /* Zero */);
-    return replaceInstUsesWith(SI, MaskedLoad);
+      (match(TrueVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(CondVal),
+                                   m_CombineOr(m_Undef(), m_Zero()))) ||
+       match(TrueVal, m_MaskedGather(m_Value(), m_Value(), m_Specific(CondVal),
+                                     m_CombineOr(m_Undef(), m_Zero()))))) {
+    auto *MaskedInst = cast<IntrinsicInst>(TrueVal);
+    if (isa<UndefValue>(MaskedInst->getArgOperand(3)))
+      MaskedInst->setArgOperand(3, FalseVal /* Zero */);
+    return replaceInstUsesWith(SI, MaskedInst);
   }
 
   Value *Mask;
   if (match(TrueVal, m_Zero()) &&
-      match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask),
-                                   m_CombineOr(m_Undef(), m_Zero()))) &&
+      (match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask),
+                                    m_CombineOr(m_Undef(), m_Zero()))) ||
+       match(FalseVal, m_MaskedGather(m_Value(), m_Value(), m_Value(Mask),
+                                      m_CombineOr(m_Undef(), m_Zero())))) &&
       (CondVal->getType() == Mask->getType())) {
     // We can remove the select by ensuring the load zeros all lanes the
     // select would have.  We determine this by proving there is no overlap
@@ -3053,10 +3057,10 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
       CanMergeSelectIntoLoad = match(V, m_Zero());
 
     if (CanMergeSelectIntoLoad) {
-      auto *MaskedLoad = cast<IntrinsicInst>(FalseVal);
-      if (isa<UndefValue>(MaskedLoad->getArgOperand(3)))
-        MaskedLoad->setArgOperand(3, TrueVal /* Zero */);
-      return replaceInstUsesWith(SI, MaskedLoad);
+      auto *MaskedInst = cast<IntrinsicInst>(FalseVal);
+      if (isa<UndefValue>(MaskedInst->getArgOperand(3)))
+        MaskedInst->setArgOperand(3, TrueVal /* Zero */);
+      return replaceInstUsesWith(SI, MaskedInst);
     }
   }
 
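
[Illustrative note, not part of the patch: stated compactly, the two folds
above now apply to gathers exactly as they already did to masked loads:

  ; select(mask, gather(ptrs, align, mask, undef-or-zero), zero)
  ;   -> gather(ptrs, align, mask, zero)
  ; select(cond, zero, gather(ptrs, align, mask, undef-or-zero))
  ;   -> gather(ptrs, align, mask, zero), but only when (cond & mask)
  ;      simplifies to zero, which is what CanMergeSelectIntoLoad records

In both cases an undef passthrough is first rewritten to the zero value the
select would have produced, after which the select is redundant.]
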
diff --git a/llvm/test/Transforms/InstCombine/select-masked_gather.ll b/llvm/test/Transforms/InstCombine/select-masked_gather.ll
new file mode 100644
index 0000000..22d7e71
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/select-masked_gather.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; Fold zeroing of inactive lanes into the gather's passthrough parameter.
+define <vscale x 2 x float> @masked_gather_and_zero_inactive_1(<vscale x 2 x float*> %ptr, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_1(
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x float> [[GATHER]]
+;
+  %gather = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  %masked = select <vscale x 2 x i1> %mask, <vscale x 2 x float> %gather, <vscale x 2 x float> zeroinitializer
+  ret <vscale x 2 x float> %masked
+}
+
+; As above but reuse the gather's existing passthrough.
+define <vscale x 2 x i32> @masked_gather_and_zero_inactive_2(<vscale x 2 x i32*> %ptr, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_2(
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[GATHER]]
+;
+  %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> zeroinitializer)
+  %masked = select <vscale x 2 x i1> %mask, <vscale x 2 x i32> %gather, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i32> %masked
+}
+
+; No transform when the gather's passthrough cannot be reused or altered.
+define <vscale x 2 x i32> @masked_gather_and_zero_inactive_3(<vscale x 2 x i32*> %ptr, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthrough) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_3(
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i32> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    [[MASKED:%.*]] = select <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> [[GATHER]], <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[MASKED]]
+;
+  %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthrough)
+  %masked = select <vscale x 2 x i1> %mask, <vscale x 2 x i32> %gather, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i32> %masked
+}
+
+; Remove redundant select when its mask doesn't overlap with the gather mask.
+define <vscale x 2 x i32> @masked_gather_and_zero_inactive_4(<vscale x 2 x i32*> %ptr, <vscale x 2 x i1> %inv_mask) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_4(
+; CHECK-NEXT:    [[MASK:%.*]] = xor <vscale x 2 x i1> [[INV_MASK:%.*]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[GATHER]]
+;
+  %splat = shufflevector <vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+  %mask = xor <vscale x 2 x i1> %inv_mask, %splat
+  %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %masked = select <vscale x 2 x i1> %inv_mask, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %gather
+  ret <vscale x 2 x i32> %masked
+}
+
+; As above but reuse the gather's existing passthrough.
+define <vscale x 2 x i32> @masked_gather_and_zero_inactive_5(<vscale x 2 x i32*> %ptr, <vscale x 2 x i1> %inv_mask) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_5(
+; CHECK-NEXT:    [[MASK:%.*]] = xor <vscale x 2 x i1> [[INV_MASK:%.*]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[GATHER]]
+;
+  %splat = shufflevector <vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+  %mask = xor <vscale x 2 x i1> %inv_mask, %splat
+  %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> zeroinitializer)
+  %masked = select <vscale x 2 x i1> %inv_mask, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %gather
+  ret <vscale x 2 x i32> %masked
+}
+
+; No transform when the gather's passthrough cannot be reused or altered.
+define <vscale x 2 x i32> @masked_gather_and_zero_inactive_6(<vscale x 2 x i32*> %ptr, <vscale x 2 x i1> %inv_mask, <vscale x 2 x i32> %passthrough) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_6(
+; CHECK-NEXT:    [[MASK:%.*]] = xor <vscale x 2 x i1> [[INV_MASK:%.*]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    [[MASKED:%.*]] = select <vscale x 2 x i1> [[INV_MASK]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[GATHER]]
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[MASKED]]
+;
+  %splat = shufflevector <vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+  %mask = xor <vscale x 2 x i1> %inv_mask, %splat
+  %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthrough)
+  %masked = select <vscale x 2 x i1> %inv_mask, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %gather
+  ret <vscale x 2 x i32> %masked
+}
+
+; No transform when select and gather masks have no relation.
+define <vscale x 2 x i32> @masked_gather_and_zero_inactive_7(<vscale x 2 x i32*> %ptr, <vscale x 2 x i1> %mask1, <vscale x 2 x i1> %mask2) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_7(
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK1:%.*]], <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[MASKED:%.*]] = select <vscale x 2 x i1> [[MASK2:%.*]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[GATHER]]
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[MASKED]]
+;
+  %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptr, i32 4, <vscale x 2 x i1> %mask1, <vscale x 2 x i32> zeroinitializer)
+  %masked = select <vscale x 2 x i1> %mask2, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %gather
+  ret <vscale x 2 x i32> %masked
+}
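
[Illustrative note, not part of the patch: the unrelated-mask test above must
not fold, because nothing stops the select from zeroing a lane the gather
actually loaded. For example, with fixed-width <2 x i1> masks:

  mask1 = <i1 true, i1 true>    ; gather mask, gather result <a, b>
  mask2 = <i1 true, i1 false>   ; select mask, select result <0, b>

Replacing the select with the gather alone would yield <a, b>, which is wrong
in lane 0; the fold is only legal when (mask2 & mask1) provably simplifies to
zero.]
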
+
+; A more complex case where we can prove the select mask is a subset of the
+; gather's inactive lanes and thus the gather's passthrough takes effect.
+define <vscale x 2 x float> @masked_gather_and_zero_inactive_8(<vscale x 2 x float*> %ptr, <vscale x 2 x i1> %inv_mask, <vscale x 2 x i1> %cond) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_8(
+; CHECK-NEXT:    [[MASK:%.*]] = xor <vscale x 2 x i1> [[INV_MASK:%.*]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[PG:%.*]] = and <vscale x 2 x i1> [[MASK]], [[COND:%.*]]
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[PG]], <vscale x 2 x float> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x float> [[GATHER]]
+;
+  %splat = shufflevector <vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+  %mask = xor <vscale x 2 x i1> %inv_mask, %splat
+  %pg = and <vscale x 2 x i1> %mask, %cond
+  %gather = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptr, i32 4, <vscale x 2 x i1> %pg, <vscale x 2 x float> undef)
+  %masked = select <vscale x 2 x i1> %inv_mask, <vscale x 2 x float> zeroinitializer, <vscale x 2 x float> %gather
+  ret <vscale x 2 x float> %masked
+}
+
+define <vscale x 2 x float> @masked_load_and_scalar_select_cond(<vscale x 2 x float*> %ptr, <vscale x 2 x i1> %mask, i1 %cond) {
+; CHECK-LABEL: @masked_load_and_scalar_select_cond(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> [[PTR:%.*]], i32 32, <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> undef)
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[COND:%.*]], <vscale x 2 x float> zeroinitializer, <vscale x 2 x float> [[TMP0]]
+; CHECK-NEXT:    ret <vscale x 2 x float> [[TMP1]]
+;
+entry:
+  %0 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptr, i32 32, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  %1 = select i1 %cond, <vscale x 2 x float> zeroinitializer, <vscale x 2 x float> %0
+  ret <vscale x 2 x float> %1
+}
+
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
-- 
2.7.4
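
[Illustrative note, not part of the patch: the subset case in
@masked_gather_and_zero_inactive_8 reduces to a one-line mask identity. The
gather runs under %pg = (xor %inv_mask, splat(true)) & %cond, so

  %pg & %inv_mask = (~%inv_mask) & %cond & %inv_mask = 0

Every lane the select zeroes (%inv_mask true) is therefore already inactive in
the gather, and once the undef passthrough is rewritten to zeroinitializer
those lanes are already zero; this is exactly the condition the
CanMergeSelectIntoLoad check in InstCombineSelect.cpp establishes.]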