unsigned BundleWidth = VectorizableTree[0].Scalars.size();
- for (TreeEntry &TE : VectorizableTree) {
+ for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+ TreeEntry &TE = VectorizableTree[I];
+
+ // We create duplicate tree entries for gather sequences that have multiple
+ // uses. However, we should not compute the cost of duplicate sequences.
+ // For example, if we have a build vector (i.e., insertelement sequence)
+ // that is used by more than one vector instruction, we only need to
+ // compute the cost of the insertelement instructions once. The redundant
+ // instructions will be eliminated by CSE.
+ //
+ // We should consider not creating duplicate tree entries for gather
+ // sequences, and instead add additional edges to the tree representing
+ // their uses. Since such an approach results in fewer total entries,
+ // existing heuristics based on tree size may yield different results.
+ //
+ if (TE.NeedToGather &&
+ std::any_of(std::next(VectorizableTree.begin(), I + 1),
+ VectorizableTree.end(), [TE](TreeEntry &Entry) {
+ return Entry.NeedToGather && Entry.isSame(TE.Scalars);
+ }))
+ continue;
+
int C = getEntryCost(&TE);
DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
<< *TE.Scalars[0] << ".\n");
target triple = "aarch64--linux-gnu"
; CHECK-LABEL: @gather_multiple_use(
-; CHECK-NEXT: [[TMP00:%.*]] = lshr i32 [[A:%.*]], 15
-; CHECK-NEXT: [[TMP01:%.*]] = and i32 [[TMP00]], 65537
-; CHECK-NEXT: [[TMP02:%.*]] = mul nuw i32 [[TMP01]], 65535
-; CHECK-NEXT: [[TMP03:%.*]] = add i32 [[TMP02]], [[A]]
-; CHECK-NEXT: [[TMP04:%.*]] = xor i32 [[TMP03]], [[TMP02]]
-; CHECK-NEXT: [[TMP05:%.*]] = lshr i32 [[C:%.*]], 15
-; CHECK-NEXT: [[TMP06:%.*]] = and i32 [[TMP05]], 65537
-; CHECK-NEXT: [[TMP07:%.*]] = mul nuw i32 [[TMP06]], 65535
-; CHECK-NEXT: [[TMP08:%.*]] = add i32 [[TMP07]], [[C]]
-; CHECK-NEXT: [[TMP09:%.*]] = xor i32 [[TMP08]], [[TMP07]]
-; CHECK-NEXT: [[TMP10:%.*]] = lshr i32 [[B:%.*]], 15
-; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 65537
-; CHECK-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 65535
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[B]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[D:%.*]], 15
-; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65537
-; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP16]], 65535
-; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], [[D]]
-; CHECK-NEXT: [[TMP19:%.*]] = xor i32 [[TMP18]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP09]], [[TMP04]]
-; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], [[TMP14]]
-; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP19]]
-; CHECK-NEXT: ret i32 [[TMP22]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], <i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP4]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP9]])
+; CHECK-NEXT: ret i32 [[TMP10]]
;
; REMARK-LABEL: Function: gather_multiple_use
; REMARK: Args:
-; REMARK-NEXT: - String: Vectorizing horizontal reduction is possible
-; REMARK-NEXT: - String: 'but not beneficial with cost '
-; REMARK-NEXT: - Cost: '2'
+; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost '
+; REMARK-NEXT: - Cost: '-7'
;
define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
%tmp00 = lshr i32 %a, 15
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
+; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X]], [[X]]
-; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
-; CHECK-NEXT: [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = sdiv i8 [[TMP2]], [[TMP5]]
-; CHECK-NEXT: ret i8 [[TMP6]]
+; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
+; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i8 [[TMP3]]
;
%x0 = extractelement <4 x i8> %x, i32 0
br label %bb1