An early run of VectorCombine was added with D102496 specifically to
deal with unnecessary vector ops produced with the C matrix extension.
This patch is proposing to try those folds in general and add a pair
of load folds to the menu.
The load transform will partly solve (see PhaseOrdering diffs) a
longstanding vectorization perf bug by removing redundant loads via GVN:
issue #17113
The main reason for not enabling the extra pass generally in the initial
patch was compile-time cost. The cost of VectorCombine was significantly
(surprisingly) improved with:
87debdadaf18
https://llvm-compile-time-tracker.com/compare.php?from=
ffe05b8f57d97bc4340f791cb386c8d00e0739f2&to=
87debdadaf18f8a5c7e5d563889e10731dc3554d&stat=instructions:u
...so the extra run is going to cost very little now - the total cost of
the 2 runs should be less than the 1 run before that micro-optimization:
https://llvm-compile-time-tracker.com/compare.php?from=
5e8c2026d10e8e2c93c038c776853bed0e7c8fc1&to=
2c4b68eab5ae969811f422714e0eba44c5f7eefb&stat=instructions:u
It may be possible to reduce the cost slightly more with a few more
earlier-exits like that, but it's probably in the noise based on timing
experiments.
Differential Revision: https://reviews.llvm.org/D138353
// Delete small array after loop unroll.
FPM.addPass(SROAPass());
- // The matrix extension can introduce large vector operations early, which can
- // benefit from running vector-combine early on.
- if (EnableMatrix)
- FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true));
+ // Try vectorization/scalarization transforms that are both improvements
+ // themselves and can allow further folds with GVN and InstCombine.
+ FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true));
// Eliminate redundancies.
FPM.addPass(MergedLoadStoreMotionPass());
Builder.SetInsertPoint(&I);
if (!TryEarlyFoldsOnly) {
if (isa<FixedVectorType>(I.getType())) {
- MadeChange |= vectorizeLoadInsert(I);
- MadeChange |= widenSubvectorLoad(I);
MadeChange |= foldInsExtFNeg(I);
MadeChange |= foldBitcastShuf(I);
MadeChange |= foldShuffleOfBinops(I);
}
}
if (isa<FixedVectorType>(I.getType())) {
+ MadeChange |= vectorizeLoadInsert(I);
+ MadeChange |= widenSubvectorLoad(I);
MadeChange |= scalarizeBinopOrCmp(I);
MadeChange |= scalarizeLoadExtract(I);
}
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
; CHECK-EP-LOOP-END-NEXT: Running pass: NoOpLoopPass
; CHECK-O-NEXT: Running pass: SROAPass on foo
-; CHECK-MATRIX: Running pass: VectorCombinePass
+; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
; CHECK-O23SZ-NEXT: Running pass: GVNPass
; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
; CHECK-O-NEXT: Running pass: LoopDeletionPass
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
; CHECK-O-NEXT: Running pass: SROAPass on foo
+; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
; CHECK-O23SZ-NEXT: Running pass: GVNPass
; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
; CHECK-O-NEXT: Running pass: LoopDeletionPass
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
; CHECK-O-NEXT: Running pass: SROAPass on foo
+; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
; CHECK-O23SZ-NEXT: Running pass: GVNPass
; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
; CHECK-O-NEXT: Running pass: LoopDeletionPass
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
; CHECK-O-NEXT: Running pass: SROAPass on foo
+; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
; CHECK-O23SZ-NEXT: Running pass: GVNPass
; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
; CHECK-O-NEXT: Running pass: LoopDeletionPass
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
; CHECK-O-NEXT: Running pass: SROAPass on foo
+; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
; CHECK-O23SZ-NEXT: Running pass: GVNPass
; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
; CHECK-O-NEXT: Running pass: IndVarSimplifyPass
; CHECK-O-NEXT: Running pass: LoopDeletionPass
; CHECK-O-NEXT: Running pass: SROAPass on foo
+; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
; CHECK-O23SZ-NEXT: Running pass: GVNPass
; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP0]], i64 0, i64 1
-; SSE-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
-; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
-; SSE-NEXT: ret <4 x float> [[TMP7]]
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT: ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP0]], i64 0, i64 2
-; AVX-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 8
-; AVX-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i64 2
-; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP4]], i64 3
-; AVX-NEXT: ret <4 x float> [[TMP6]]
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT: ret <4 x float> [[TMP3]]
;
%2 = alloca ptr, align 8
%3 = alloca <4 x float>, align 16