The tryToVectorize() method implements one of the search paths for vectorizable tree roots in the
SLP vectorizer, specifically for binary and comparison operations. The order in which the various
scalar pairs were probed was defined by the implementation: first the instruction's operands, then
climbing over one operand if the instruction is its sole user, and then the same actions for the
other operand if the previous attempts failed. The problem with this approach is that these options
may contain more than one vectorizable tree candidate, and the best one is not necessarily the one
that is encountered first.
Trying to build a vectorizable tree for each possible combination just for evaluation is expensive.
But we already have the look-ahead heuristics mechanism, which we use for finding the best pick among
the operands of commutative instructions. It calculates a cumulative score for candidates in two
consecutive lanes. This patch introduces the use of these heuristics for choosing the best pair among
several combinations. We only try the one that looks most promising for vectorization.
An additional benefit is that we reduce the total number of vectorization trees built for probes,
because we skip early those that look non-profitable.
Reviewed By: Alexey Bataev (ABataev), Vasileios Porpodas (vporpo)
Differential Revision: https://reviews.llvm.org/D124309
"slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
cl::desc("The maximum look-ahead depth for operand reordering scores"));
+// The maximum depth that the look-ahead score heuristic will explore
+// when it is probing among candidates for vectorization tree roots.
+// The higher this value, the higher the compilation time overhead, but unlike
+// the similar limit for operand ordering this one is used less frequently,
+// hence the impact of a higher value is less noticeable.
+static cl::opt<int> RootLookAheadMaxDepth(
+ "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
+ cl::desc("The maximum look-ahead depth for searching best rooting option"));
+
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
#endif
};
+ /// Evaluate each pair in \p Candidates and return the index into \p Candidates
+ /// of the pair with the highest score, i.e. the one deemed to have the best
+ /// chance to form the root of a profitable tree to vectorize. Return None if
+ /// no candidate scored above LookAheadHeuristics::ScoreFail.
+ Optional<int>
+ findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates) {
+ LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,
+ RootLookAheadMaxDepth);
+ int BestScore = LookAheadHeuristics::ScoreFail; // A candidate must beat this to be picked.
+ Optional<int> Index = None;
+ for (int I : seq<int>(0, Candidates.size())) {
+ int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
+ Candidates[I].second,
+ /*U1=*/nullptr, /*U2=*/nullptr,
+ /*Level=*/1, None);
+ if (Score > BestScore) { // Strict '>' keeps the earliest candidate on ties.
+ BestScore = Score;
+ Index = I;
+ }
+ }
+ return Index;
+ }
+
/// Checks if the instruction is marked for deletion.
bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
if (!I)
return false;
- if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
+ if ((!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) ||
+ isa<VectorType>(I->getType()))
return false;
Value *P = I->getParent();
if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
return false;
- // Try to vectorize V.
- if (tryToVectorizePair(Op0, Op1, R))
- return true;
+ // First collect all possible candidates
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates;
+ Candidates.emplace_back(Op0, Op1);
auto *A = dyn_cast<BinaryOperator>(Op0);
auto *B = dyn_cast<BinaryOperator>(Op1);
// Try to skip B.
- if (B && B->hasOneUse()) {
+ if (A && B && B->hasOneUse()) {
auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
- return true;
- if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
- return true;
+ if (B0 && B0->getParent() == P)
+ Candidates.emplace_back(A, B0);
+ if (B1 && B1->getParent() == P)
+ Candidates.emplace_back(A, B1);
}
-
// Try to skip A.
- if (A && A->hasOneUse()) {
+ if (B && A && A->hasOneUse()) {
auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
- return true;
- if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
- return true;
+ if (A0 && A0->getParent() == P)
+ Candidates.emplace_back(A0, B);
+ if (A1 && A1->getParent() == P)
+ Candidates.emplace_back(A1, B);
}
- return false;
+
+ if (Candidates.size() == 1)
+ return tryToVectorizePair(Op0, Op1, R);
+
+ // We have multiple options. Try to pick the single best.
+ Optional<int> BestCandidate = R.findBestRootPair(Candidates);
+ if (!BestCandidate)
+ return false;
+ return tryToVectorizePair(Candidates[*BestCandidate].first,
+ Candidates[*BestCandidate].second, R);
}
namespace {
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -S -pass-remarks-missed=slp-vectorizer 2>&1 | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
; This test checks that the SLP vectorizer does not try to vectorize instructions that are already vectorized.
-; CHECK: remark: <unknown>:0:0: Cannot SLP vectorize list: type <16 x i8> is unsupported by vectorizer
define void @vector() {
+; CHECK-LABEL: @vector(
+; CHECK-NEXT: [[LOAD0:%.*]] = tail call <16 x i8> @vector.load(<16 x i8>* undef, i32 1)
+; CHECK-NEXT: [[LOAD1:%.*]] = tail call <16 x i8> @vector.load(<16 x i8>* undef, i32 2)
+; CHECK-NEXT: [[ADD:%.*]] = add <16 x i8> [[LOAD1]], [[LOAD0]]
+; CHECK-NEXT: tail call void @vector.store(<16 x i8> [[ADD]], <16 x i8>* undef, i32 1)
+; CHECK-NEXT: ret void
+;
%load0 = tail call <16 x i8> @vector.load(<16 x i8> *undef, i32 1)
%load1 = tail call <16 x i8> @vector.load(<16 x i8> *undef, i32 2)
%add = add <16 x i8> %load1, %load0
; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
-; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
-; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1
; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]])
; MINTREESIZE-NEXT: ret <4 x float> undef
; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
-; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
-; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1
; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]])
; MINTREESIZE-NEXT: ret <4 x float> undef
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = fdiv fast <2 x double> [[TMP2]], <double 7.000000e+00, double 5.000000e+00>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT: [[I11:%.*]] = fmul fast double [[TMP4]], undef
+; CHECK-NEXT: [[I09:%.*]] = fmul fast double [[TMP4]], undef
+; CHECK-NEXT: [[I10:%.*]] = fsub fast double undef, [[I09]]
; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP3]], <double 3.000000e+00, double undef>
-; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x double> undef, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> undef, [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[I11]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x double> [[TMP12]], <double 4.000000e+00, double 1.100000e+01>
-; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <2 x double> [[TMP12]], <double 4.000000e+00, double 1.100000e+01>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP16:%.*]] = fdiv fast <2 x double> [[TMP15]], <double 1.400000e+00, double 1.200000e+01>
-; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <2 x double> [[TMP15]], <double 1.400000e+00, double 1.200000e+01>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> [[TMP17]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP19:%.*]] = fadd fast <2 x double> [[TMP18]], <double undef, double 1.400000e+00>
-; CHECK-NEXT: [[TMP20:%.*]] = fdiv fast <2 x double> [[TMP18]], <double undef, double 1.400000e+00>
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP20]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[TMP21]], i32 0
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[TMP21]], i32 1
-; CHECK-NEXT: [[I16:%.*]] = fadd fast double [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[I10]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP7]], <double undef, double 1.100000e+01>
+; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[TMP8]], <double 4.000000e+00, double 1.200000e+01>
+; CHECK-NEXT: [[TMP10:%.*]] = fdiv fast <2 x double> [[TMP9]], <double 1.400000e+00, double 1.400000e+00>
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 0
+; CHECK-NEXT: [[I07:%.*]] = fadd fast double undef, [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP10]], i32 1
+; CHECK-NEXT: [[I16:%.*]] = fadd fast double [[I07]], [[TMP12]]
; CHECK-NEXT: [[I17:%.*]] = fadd fast double [[I16]], [[C:%.*]]
; CHECK-NEXT: [[I18:%.*]] = fadd fast double [[I17]], [[D:%.*]]
; CHECK-NEXT: ret double [[I18]]