[SLP] Improve vectorization of phi nodes by trying wider vectors.

author Alexey Bataev <a.bataev@outlook.com>

Wed, 25 Aug 2021 22:38:36 +0000 (15:38 -0700)

committer Alexey Bataev <a.bataev@outlook.com>

Tue, 28 Sep 2021 14:20:36 +0000 (07:20 -0700)
author Alexey Bataev <a.bataev@outlook.com>
Wed, 25 Aug 2021 22:38:36 +0000 (15:38 -0700)
committer Alexey Bataev <a.bataev@outlook.com>
Tue, 28 Sep 2021 14:20:36 +0000 (07:20 -0700)
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h

index 5e8c299..cd605aa 100644 (file)
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -94,8 +94,11 @@ private:
    bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
  
    /// Try to vectorize a list of operands.
+  /// \param LimitForRegisterSize If true, vectorize only with the maximal
+  /// allowed register size; narrower vector widths are not tried.
    /// \returns true if a value was vectorized.
-  bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R);
+  bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
+                          bool LimitForRegisterSize = false);
  
    /// Try to vectorize a chain that may start at the operands of \p I.
    bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

index 4ea8224..6988926 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7681,7 +7681,8 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
    return tryToVectorizeList(VL, R);
  }
  
-bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
+bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+                                           bool LimitForRegisterSize) {
    if (VL.size() < 2)
      return false;
  
@@ -7753,7 +7754,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
        if (!isPowerOf2_32(OpsWidth))
          continue;
  
-      if ((VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))
+      if ((LimitForRegisterSize && OpsWidth < MaxVF) ||
+          (VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))
          break;
  
        ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
@@ -9106,21 +9108,49 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
        // So allow tryToVectorizeList to reorder them if it is beneficial. This
        // is done when there are exactly two elements since tryToVectorizeList
        // asserts that there are only two values when AllowReorder is true.
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
+      // The vectorization is a 3-stage attempt:
+      // 1. Try to vectorize PHIs with the same/alternate opcodes using the
+      // maximal register size first.
+      // 2. Try to vectorize remaining PHIs with the same type, if possible.
+      // This may give better vectorization results than trying to vectorize
+      // only PHIs with the same/alternate opcodes.
+      // 3. Make a final attempt to vectorize all PHIs with the same/alternate
+      // opcodes only; this may result in some extra final vectorization.
+      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
+                                            /*LimitForRegisterSize=*/true)) {
          // Success start over because instructions might have been changed.
          HaveVectorizedPhiNodes = true;
          Changed = true;
-      } else if (NumElts < 4 &&
+      } else if (NumElts * R.getVectorElementSize(*IncIt) <
+                     R.getMaxVecRegSize() &&
                   (Candidates.empty() ||
                    Candidates.front()->getType() == (*IncIt)->getType())) {
          Candidates.append(IncIt, std::next(IncIt, NumElts));
        }
        // Final attempt to vectorize phis with the same types.
-      if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) {
-        if (Candidates.size() > 1 && tryToVectorizeList(Candidates, R)) {
+      if (Candidates.size() > 1 &&
+          (SameTypeIt == E ||
+           (*SameTypeIt)->getType() != (*IncIt)->getType())) {
+        if (tryToVectorizeList(Candidates, R)) {
            // Success start over because instructions might have been changed.
            HaveVectorizedPhiNodes = true;
            Changed = true;
+        } else {
+          // Try to vectorize using small vectors.
+          for (SmallVector<Value *, 4>::iterator It = Candidates.begin(),
+                                                 End = Candidates.end();
+               It != End;) {
+            SmallVector<Value *, 4>::iterator SameTypeIt = It;
+            while (SameTypeIt != End && AreCompatiblePHIs(*SameTypeIt, *It))
+              ++SameTypeIt;
+            unsigned NumElts = (SameTypeIt - It);
+            if (NumElts > 1 &&
+                tryToVectorizeList(makeArrayRef(It, NumElts), R)) {
+              HaveVectorizedPhiNodes = true;
+              Changed = true;
+            }
+            It = SameTypeIt;
+          }
          }
          Candidates.clear();
        }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll

index d2e3053..87709a8 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
@@ -8,33 +8,32 @@ define void @foo() {
  ; CHECK-NEXT:    [[SUB:%.*]] = fsub float 6.553500e+04, undef
  ; CHECK-NEXT:    br label [[BB1:%.*]]
  ; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> poison, float [[SUB]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1
  ; CHECK-NEXT:    br label [[BB2:%.*]]
  ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ [[SUB]], [[BB1]] ], [ [[TMP9:%.*]], [[BB3:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[CONV]], [[BB1]] ], [ [[TMP10:%.*]], [[BB3]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB1]] ], [ [[TMP11:%.*]], [[BB3]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP18:%.*]], [[BB3:%.*]] ]
  ; CHECK-NEXT:    [[TMP3:%.*]] = load double, double* undef, align 8
  ; CHECK-NEXT:    br i1 undef, label [[BB3]], label [[BB4:%.*]]
  ; CHECK:       bb4:
-; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
-; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <2 x double> undef, [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x float> [[TMP2]], <2 x float> undef
-; CHECK-NEXT:    [[EXT3:%.*]] = fpext float [[TMP1]] to double
  ; CHECK-NEXT:    [[CONV2:%.*]] = uitofp i16 undef to double
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd double [[TMP3]], [[CONV2]]
-; CHECK-NEXT:    [[CMP3:%.*]] = fcmp ogt double [[ADD1]], [[EXT3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fptrunc double [[ADD1]] to float
-; CHECK-NEXT:    [[SEL3:%.*]] = select i1 [[CMP3]], float [[TMP1]], float [[TMP7]]
-; CHECK-NEXT:    [[EXT4:%.*]] = fpext float [[TMP0]] to double
-; CHECK-NEXT:    [[SUB1:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[CMP4:%.*]] = fcmp ogt double [[SUB1]], [[EXT4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fptrunc double [[SUB1]] to float
-; CHECK-NEXT:    [[SEL4:%.*]] = select i1 [[CMP4]], float [[TMP0]], float [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[CONV2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = fcmp ogt <4 x double> [[TMP13]], [[TMP4]]
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP16:%.*]] = fptrunc <4 x double> [[TMP15]] to <4 x float>
+; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP2]], <4 x float> [[TMP16]]
  ; CHECK-NEXT:    br label [[BB3]]
  ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP9]] = phi float [ [[SEL4]], [[BB4]] ], [ [[TMP0]], [[BB2]] ]
-; CHECK-NEXT:    [[TMP10]] = phi float [ [[SEL3]], [[BB4]] ], [ [[TMP1]], [[BB2]] ]
-; CHECK-NEXT:    [[TMP11]] = phi <2 x float> [ [[TMP6]], [[BB4]] ], [ [[TMP2]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP18]] = phi <4 x float> [ [[TMP17]], [[BB4]] ], [ [[TMP2]], [[BB2]] ]
  ; CHECK-NEXT:    br label [[BB2]]
  ;
  entry:
author	Alexey Bataev <a.bataev@outlook.com>
	Wed, 25 Aug 2021 22:38:36 +0000 (15:38 -0700)
committer	Alexey Bataev <a.bataev@outlook.com>
	Tue, 28 Sep 2021 14:20:36 +0000 (07:20 -0700)
llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h		patch \| blob \| history
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll		patch \| blob \| history