From b6315aee5b420aa5a0be5f1ea86c3c963760a3f2 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 29 Jun 2020 10:29:59 -0400
Subject: [PATCH] [VectorCombine] try to form vector compare and binop to
 eliminate scalar ops

binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
-->
vcmp = cmp Pred X, VecC
ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0

This is a larger pattern than the existing extractelement folds because we
can't reasonably vectorize the sub-patterns with constants based on cost
model calculations (it doesn't usually make sense to replace a lone
extracted scalar op that has a constant operand with a vector op).

I salvaged as much of the existing logic as I could, but there might be
better ways to share and reduce code.

The motivating case from PR43745:
https://bugs.llvm.org/show_bug.cgi?id=43745
...is the special case of a 2-way reduction. We tried to get SLP to handle
that particular pattern in D59710, but that caused crashing and regressions.
This patch is more general, but hopefully safer.

The v2f64 test with SSE2 surprised me - the cost model accounting looks
like this:
OldCost = 0 (free extract of f64 at index 0)
        + 1 (extract of f64 at index 1)
        + 2 (scalar fcmps)
        + 1 ('and' of bools)
        = 4
NewCost = 2 (vector fcmp)
        + 1 (shuffle)
        + 1 (vector 'and')
        + 1 (extract of bool)
        = 5

Differential Revision: https://reviews.llvm.org/D82474
---
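For illustration, not part of the patch: the SSE2 surprise above reduces to
simple integer arithmetic. This is a hedged sketch, with the per-instruction
costs hard-coded from the v2f64/SSE2 example in the message rather than
queried from TargetTransformInfo, and the tie-breaking rule copied from the
comment in foldExtractedCmps() below.

// Sketch only: costs are assumed from the SSE2 v2f64 example above,
// not real TTI queries.
#include <iostream>

int main() {
  int OldCost = 0   // free extract of f64 at index 0
              + 1   // extract of f64 at index 1
              + 2   // two scalar fcmps
              + 1;  // 'and' of bools            -> 4
  int NewCost = 2   // vector fcmp
              + 1   // shuffle
              + 1   // vector 'and'
              + 1;  // extract of bool           -> 5
  // Same guard as the pass: reject only when the vector form is strictly
  // more expensive; an equal cost still folds, to enable later transforms.
  if (OldCost < NewCost)
    std::cout << "keep scalar ops (the SSE2 case: 4 < 5)\n";
  else
    std::cout << "form vector ops\n";
  return 0;
}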
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp    | 87 ++++++++++++++++++++++
 .../PhaseOrdering/X86/vector-reductions.ll         | 13 ++--
 .../VectorCombine/X86/extract-cmp-binop.ll         | 80 +++++++++++++-------
 3 files changed, 147 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 8698dde..32332ed 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -35,6 +35,7 @@ using namespace llvm::PatternMatch;
 #define DEBUG_TYPE "vector-combine"
 STATISTIC(NumVecCmp, "Number of vector compares formed");
 STATISTIC(NumVecBO, "Number of vector binops formed");
+STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
 STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
 STATISTIC(NumScalarBO, "Number of scalar binops formed");
 STATISTIC(NumScalarCmp, "Number of scalar compares formed");
@@ -77,6 +78,7 @@ private:
   bool foldExtractExtract(Instruction &I);
   bool foldBitcastShuf(Instruction &I);
   bool scalarizeBinopOrCmp(Instruction &I);
+  bool foldExtractedCmps(Instruction &I);
 };
 
 static void replaceValue(Value &Old, Value &New) {
@@ -518,6 +520,90 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
   return true;
 }
 
+/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
+/// a vector into vector operations followed by extract. Note: The SLP pass
+/// may miss this pattern because of implementation problems.
+bool VectorCombine::foldExtractedCmps(Instruction &I) {
+  // We are looking for a scalar binop of booleans.
+  // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
+  if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
+    return false;
+
+  // The compare predicates should match, and each compare should have a
+  // constant operand.
+  // TODO: Relax the one-use constraints.
+  Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
+  Instruction *I0, *I1;
+  Constant *C0, *C1;
+  CmpInst::Predicate P0, P1;
+  if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
+      !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
+      P0 != P1)
+    return false;
+
+  // The compare operands must be extracts of the same vector with constant
+  // extract indexes.
+  // TODO: Relax the one-use constraints.
+  Value *X;
+  uint64_t Index0, Index1;
+  if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
+      !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
+    return false;
+
+  auto *Ext0 = cast<ExtractElementInst>(I0);
+  auto *Ext1 = cast<ExtractElementInst>(I1);
+  ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
+  if (!ConvertToShuf)
+    return false;
+
+  // The original scalar pattern is:
+  // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
+  CmpInst::Predicate Pred = P0;
+  unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
+                                                    : Instruction::ICmp;
+  auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
+  if (!VecTy)
+    return false;
+
+  int OldCost = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+  OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+  OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
+  OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
+
+  // The proposed vector pattern is:
+  // vcmp = cmp Pred X, VecC
+  // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
+  int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
+  int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
+  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
+  int NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
+  NewCost +=
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy);
+  NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
+  NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
+
+  // Aggressively form vector ops if the cost is equal because the transform
+  // may enable further optimization.
+  // Codegen can reverse this transform (scalarize) if it was not profitable.
+  if (OldCost < NewCost)
+    return false;
+
+  // Create a vector constant from the 2 scalar constants.
+  SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
+                                   UndefValue::get(VecTy->getElementType()));
+  CmpC[Index0] = C0;
+  CmpC[Index1] = C1;
+  Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
+
+  Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
+  Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
+                                        VCmp, Shuf);
+  Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
+  replaceValue(I, *NewExt);
+  ++NumVecCmpBO;
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -540,6 +626,7 @@ bool VectorCombine::run() {
       MadeChange |= foldExtractExtract(I);
      MadeChange |= foldBitcastShuf(I);
       MadeChange |= scalarizeBinopOrCmp(I);
+      MadeChange |= foldExtractedCmps(I);
     }
   }
 
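For illustration, not part of the patch: the shuffle created by
createShiftShuffle() above is a single-source shuffle whose mask is undef in
every lane except the cheap one, which reads from the expensive lane. A
standalone sketch, with -1 standing in for an undef mask element:

// Sketch of the shift-shuffle mask construction used by the new fold.
#include <iostream>
#include <vector>

std::vector<int> shiftShuffleMask(unsigned NumElts, unsigned ExpensiveIndex,
                                  unsigned CheapIndex) {
  std::vector<int> Mask(NumElts, -1);  // -1 models an undef lane
  Mask[CheapIndex] = ExpensiveIndex;   // route the expensive lane over
  return Mask;
}

int main() {
  // The @icmp_xor_v4i32 test below extracts lanes 3 and 1; lane 3 is the
  // expensive one, so it is shuffled into lane 1:
  for (int M : shiftShuffleMask(4, /*ExpensiveIndex=*/3, /*CheapIndex=*/1))
    std::cout << M << ' ';  // prints: -1 3 -1 -1, i.e. <undef, 3, undef, undef>
  std::cout << '\n';
  return 0;
}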
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index 3ec7036..cf01ead 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -294,16 +294,15 @@ define i1 @cmp_lt_gt(double %a, double %b, double %c) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> undef, double [[MUL]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
-; CHECK-NEXT:    [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i1> [[TMP8]], [[SHIFT]]
+; CHECK-NEXT:    [[OR_COND:%.*]] = extractelement <2 x i1> [[TMP9]], i64 0
 ; CHECK-NEXT:    br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
 ; CHECK:       lor.lhs.false:
 ; CHECK-NEXT:    [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D>
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP10]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT:    [[TMP11:%.*]] = or <2 x i1> [[TMP10]], [[SHIFT]]
+; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <2 x i1> [[TMP10]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[TMP11:%.*]] = or <2 x i1> [[TMP10]], [[SHIFT2]]
 ; CHECK-NEXT:    [[NOT_OR_COND1:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
 ; CHECK-NEXT:    ret i1 [[NOT_OR_COND1]]
 ; CHECK:       cleanup:
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
index fa7df75..211936b 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
@@ -3,13 +3,20 @@
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX
 
 define i1 @fcmp_and_v2f64(<2 x double> %a) {
-; CHECK-LABEL: @fcmp_and_v2f64(
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <2 x double> [[A]], i32 1
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
-; CHECK-NEXT:    [[R:%.*]] = and i1 [[CMP1]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[R]]
+; SSE-LABEL: @fcmp_and_v2f64(
+; SSE-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
+; SSE-NEXT:    [[E2:%.*]] = extractelement <2 x double> [[A]], i32 1
+; SSE-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
+; SSE-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
+; SSE-NEXT:    [[R:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; SSE-NEXT:    ret i1 [[R]]
+;
+; AVX-LABEL: @fcmp_and_v2f64(
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp olt <2 x double> [[A:%.*]], <double 4.200000e+01, double -8.000000e+00>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[SHIFT]]
+; AVX-NEXT:    [[R:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
+; AVX-NEXT:    ret i1 [[R]]
 ;
   %e1 = extractelement <2 x double> %a, i32 0
   %e2 = extractelement <2 x double> %a, i32 1
@@ -20,13 +27,20 @@ define i1 @fcmp_and_v2f64(<2 x double> %a) {
 }
 
 define i1 @fcmp_or_v4f64(<4 x double> %a) {
-; CHECK-LABEL: @fcmp_or_v4f64(
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x double> [[A]], i64 2
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
-; CHECK-NEXT:    [[R:%.*]] = or i1 [[CMP1]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[R]]
+; SSE-LABEL: @fcmp_or_v4f64(
+; SSE-NEXT:    [[E1:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SSE-NEXT:    [[E2:%.*]] = extractelement <4 x double> [[A]], i64 2
+; SSE-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
+; SSE-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
+; SSE-NEXT:    [[R:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; SSE-NEXT:    ret i1 [[R]]
+;
+; AVX-LABEL: @fcmp_or_v4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp olt <4 x double> [[A:%.*]], <double 4.200000e+01, double undef, double -8.000000e+00, double undef>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = or <4 x i1> [[TMP1]], [[SHIFT]]
+; AVX-NEXT:    [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0
+; AVX-NEXT:    ret i1 [[R]]
 ;
   %e1 = extractelement <4 x double> %a, i32 0
   %e2 = extractelement <4 x double> %a, i64 2
@@ -38,11 +52,10 @@ define i1 @fcmp_or_v4f64(<4 x double> %a) {
 
 define i1 @icmp_xor_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: @icmp_xor_v4i32(
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 3
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[A]], i32 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[E1]], 42
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[E2]], -8
-; CHECK-NEXT:    [[R:%.*]] = xor i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[A:%.*]], <i32 undef, i32 -8, i32 undef, i32 42>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], [[SHIFT]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %e1 = extractelement <4 x i32> %a, i32 3
@@ -56,13 +69,20 @@ define i1 @icmp_xor_v4i32(<4 x i32> %a) {
 
 ; add is not canonical (should be xor), but that is ok.
 define i1 @icmp_add_v8i32(<8 x i32> %a) {
-; CHECK-LABEL: @icmp_add_v8i32(
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 7
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[E1]], 42
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[E2]], -8
-; CHECK-NEXT:    [[R:%.*]] = add i1 [[CMP1]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[R]]
+; SSE-LABEL: @icmp_add_v8i32(
+; SSE-NEXT:    [[E1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 7
+; SSE-NEXT:    [[E2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[E1]], 42
+; SSE-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[E2]], -8
+; SSE-NEXT:    [[R:%.*]] = add i1 [[CMP1]], [[CMP2]]
+; SSE-NEXT:    ret i1 [[R]]
+;
+; AVX-LABEL: @icmp_add_v8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i32> [[A:%.*]], <i32 undef, i32 undef, i32 -8, i32 undef, i32 undef, i32 undef, i32 undef, i32 42>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> undef, <8 x i32> <i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = add <8 x i1> [[TMP1]], [[SHIFT]]
+; AVX-NEXT:    [[R:%.*]] = extractelement <8 x i1> [[TMP2]], i64 2
+; AVX-NEXT:    ret i1 [[R]]
 ;
   %e1 = extractelement <8 x i32> %a, i32 7
   %e2 = extractelement <8 x i32> %a, i32 2
@@ -72,6 +92,8 @@ define i1 @icmp_add_v8i32(<8 x i32> %a) {
   ret i1 %r
 }
 
+; Negative test - this could CSE/simplify.
+
 define i1 @same_extract_index(<4 x i32> %a) {
 ; CHECK-LABEL: @same_extract_index(
 ; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 2
@@ -89,6 +111,8 @@ define i1 @same_extract_index(<4 x i32> %a) {
   ret i1 %r
 }
 
+; Negative test - need identical predicates.
+
 define i1 @different_preds(<4 x i32> %a) {
 ; CHECK-LABEL: @different_preds(
 ; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 1
@@ -106,6 +130,8 @@ define i1 @different_preds(<4 x i32> %a) {
   ret i1 %r
 }
 
+; Negative test - need 1 source vector.
+
 define i1 @different_source_vec(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @different_source_vec(
 ; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 1
@@ -123,6 +149,8 @@ define i1 @different_source_vec(<4 x i32> %a, <4 x i32> %b) {
   ret i1 %r
 }
 
+; Negative test - don't try this with scalable vectors.
+
 define i1 @scalable(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: @scalable(
 ; CHECK-NEXT:    [[E1:%.*]] = extractelement <vscale x 4 x i32> [[A:%.*]], i32 3
-- 
2.7.4
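For illustration, not part of the patch: an assumed source-level shape for
the 2-way reduction this fold targets (the exact PR43745 reproducer is in
the bug report linked above). With both compares safe to speculate, the
optimizer can produce exactly the scalar pattern that foldExtractedCmps()
rewrites.

// Hypothetical source (illustrative only, not the PR43745 test case).
// The constants match the fcmp_and_v2f64 test: two lane reads, two
// scalar compares, and one boolean 'and' that the new fold can turn into
// a vector compare + shuffle + vector 'and' + single extract.
bool both_lanes_below(const double v[2]) {
  return v[0] < 42.0 && v[1] < -8.0;
}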