/// Attempt to vectorize the tree found by
/// matchAssociativeReduction.
- bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI, bool Try2WayRdx) {
if (ReducedVals.empty())
return false;
// to a nearby power-of-2. Can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
unsigned NumReducedVals = ReducedVals.size();
- if (NumReducedVals < 4)
+ if (Try2WayRdx && NumReducedVals != 2)
+ return false;
+ unsigned MinRdxVals = Try2WayRdx ? 2 : 4;
+ if (NumReducedVals < MinRdxVals)
return false;
unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
-
+ unsigned MinRdxWidth = Log2_32(MinRdxVals);
Value *VectorizedTree = nullptr;
// FIXME: Fast-math-flags should be set based on the instructions in the
SmallVector<Value *, 16> IgnoreList;
for (auto &V : ReductionOps)
IgnoreList.append(V.begin(), V.end());
- while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
+ while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > MinRdxWidth) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
Optional<ArrayRef<unsigned>> Order = V.bestOrder();
/// performed.
static bool tryToVectorizeHorReductionOrInstOperands(
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI,
+ TargetTransformInfo *TTI, bool Try2WayRdx,
const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
if (BI || SI) {
HorizontalReduction HorRdx;
if (HorRdx.matchAssociativeReduction(P, Inst)) {
- if (HorRdx.tryToReduce(R, TTI)) {
+ if (HorRdx.tryToReduce(R, TTI, Try2WayRdx)) {
Res = true;
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI) {
+ TargetTransformInfo *TTI,
+ bool Try2WayRdx) {
if (!V)
return false;
auto *I = dyn_cast<Instruction>(V);
auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
return tryToVectorize(I, R);
};
- return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
+ return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, Try2WayRdx,
ExtraVectorization);
}
PostProcessInstructions.push_back(&*it);
}
+ // Make a final attempt to match a 2-way reduction if nothing else worked.
+ // We do not try this above because it may interfere with other vectorization
+ // attempts.
+ // TODO: The constraints are copied from the above call to
+ // vectorizeRootInstruction(), but they might be too restrictive here.
+ BasicBlock::iterator LastInst = --BB->end();
+ if (!Changed && LastInst->use_empty() &&
+ (LastInst->getType()->isVoidTy() || isa<CallInst>(LastInst) ||
+ isa<InvokeInst>(LastInst))) {
+ if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(LastInst)) {
+ for (auto *V : LastInst->operand_values()) {
+ Changed |= vectorizeRootInstruction(nullptr, V, BB, R, TTI,
+ /* Try2WayRdx */ true);
+ }
+ }
+ }
+
return Changed;
}
define i1 @two_wide_fcmp_reduction(<2 x double> %a0) {
; CHECK-LABEL: @two_wide_fcmp_reduction(
; CHECK-NEXT: [[A:%.*]] = fcmp ogt <2 x double> [[A0:%.*]], <double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT: [[B:%.*]] = extractelement <2 x i1> [[A]], i32 0
-; CHECK-NEXT: [[C:%.*]] = extractelement <2 x i1> [[A]], i32 1
-; CHECK-NEXT: [[D:%.*]] = and i1 [[B]], [[C]]
-; CHECK-NEXT: ret i1 [[D]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[A]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[A]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i1 [[TMP1]]
;
%a = fcmp ogt <2 x double> %a0, <double 1.0, double 1.0>
%b = extractelement <2 x i1> %a, i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[MUL]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
-; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
-; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]]
-; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D>
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[TMP8]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0
+; CHECK-NEXT: br i1 [[TMP9]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
; CHECK: lor.lhs.false:
; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], <double 1.000000e+00, double 1.000000e+00>
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0