for (ReductionOpsType &RdxOp : ReductionOps)
IgnoreList.append(RdxOp.begin(), RdxOp.end());
+ unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
+ if (NumReducedVals > ReduxWidth) {
+ // In the loop below, we are building a tree based on a window of
+ // 'ReduxWidth' values.
+ // If the operands of those values have common traits (compare predicate,
+ // constant operand, etc), then we want to group those together to
+ // minimize the cost of the reduction.
+
+ // TODO: This should be extended to count common operands for
+ // compares and binops.
+
+ // Step 1: Count the number of times each compare predicate occurs.
+ SmallDenseMap<unsigned, unsigned> PredCountMap;
+ for (Value *RdxVal : ReducedVals) {
+ CmpInst::Predicate Pred;
+ if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value())))
+ ++PredCountMap[Pred];
+ }
+ // Step 2: Sort the values so the most common predicates come first.
+ stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) {
+ CmpInst::Predicate PredA, PredB;
+ if (match(A, m_Cmp(PredA, m_Value(), m_Value())) &&
+ match(B, m_Cmp(PredB, m_Value(), m_Value()))) {
+ return PredCountMap[PredA] > PredCountMap[PredB];
+ }
+ return false;
+ });
+ }
+
Value *VectorizedTree = nullptr;
unsigned i = 0;
- unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
ArrayRef<Value *> VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
define float @merge_anyof_v4f32_wrong_first(<4 x float> %x) {
; CHECK-LABEL: @merge_anyof_v4f32_wrong_first(
-; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
-; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x float> [[X]], i32 1
-; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x float> [[X]], i32 2
-; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x float> [[X]], i32 3
-; CHECK-NEXT: [[CMP3WRONG:%.*]] = fcmp olt float [[X3]], 4.200000e+01
-; CHECK-NEXT: [[CMP0:%.*]] = fcmp ogt float [[X0]], 1.000000e+00
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt float [[X1]], 1.000000e+00
-; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[X2]], 1.000000e+00
-; CHECK-NEXT: [[CMP3:%.*]] = fcmp ogt float [[X3]], 1.000000e+00
-; CHECK-NEXT: [[OR03:%.*]] = or i1 [[CMP0]], [[CMP3WRONG]]
-; CHECK-NEXT: [[OR031:%.*]] = or i1 [[OR03]], [[CMP1]]
-; CHECK-NEXT: [[OR0312:%.*]] = or i1 [[OR031]], [[CMP2]]
-; CHECK-NEXT: [[OR03123:%.*]] = or i1 [[OR0312]], [[CMP3]]
-; CHECK-NEXT: [[R:%.*]] = select i1 [[OR03123]], float -1.000000e+00, float 1.000000e+00
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; CHECK-NEXT: [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]]
+; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP4]], float -1.000000e+00, float 1.000000e+00
; CHECK-NEXT: ret float [[R]]
;
%x0 = extractelement <4 x float> %x, i32 0
define i32 @merge_anyof_v4i32_wrong_middle(<4 x i32> %x) {
; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle(
-; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[X3]], 42
-; CHECK-NEXT: [[CMP0:%.*]] = icmp sgt i32 [[X0]], 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[X1]], 1
-; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[X2]], 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[X3]], 1
-; CHECK-NEXT: [[OR03:%.*]] = or i1 [[CMP0]], [[CMP3]]
-; CHECK-NEXT: [[OR033:%.*]] = or i1 [[OR03]], [[CMP3WRONG]]
-; CHECK-NEXT: [[OR0332:%.*]] = or i1 [[OR033]], [[CMP2]]
-; CHECK-NEXT: [[OR03321:%.*]] = or i1 [[OR0332]], [[CMP1]]
-; CHECK-NEXT: [[R:%.*]] = select i1 [[OR03321]], i32 -1, i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
+; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP1]], 42
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]]
+; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP4]], i32 -1, i32 1
; CHECK-NEXT: ret i32 [[R]]
;
%x0 = extractelement <4 x i32> %x, i32 0
ret i32 %r
}
+; Operand/predicate swapping allows forming a reduction, but the
+; ideal reduction groups all of the original 'sgt' ops together.
+
define i32 @merge_anyof_v4i32_wrong_middle_better_rdx(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle_better_rdx(
-; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; CHECK-NEXT: [[Y0:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 0
-; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i32> [[Y]], i32 1
-; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[Y]], i32 2
-; CHECK-NEXT: [[Y3:%.*]] = extractelement <4 x i32> [[Y]], i32 3
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[X1]], [[Y1]]
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X3]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y3]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X2]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[Y0]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[Y3]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[X3]], i32 2
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[Y2]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP9]])
-; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP10]], [[CMP1]]
-; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP11]], i32 -1, i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
+; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[CMP3WRONG]]
+; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
; CHECK-NEXT: ret i32 [[R]]
;
%x0 = extractelement <4 x i32> %x, i32 0