From 163bb6d64e5f1220777c3ec2a8b58c0666a74d91 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 21 Nov 2022 13:16:56 -0500 Subject: [PATCH] [Passes][VectorCombine] enable early run generally and try load folds An early run of VectorCombine was added with D102496 specifically to deal with unnecessary vector ops produced with the C matrix extension. This patch is proposing to try those folds in general and add a pair of load folds to the menu. The load transform will partly solve (see PhaseOrdering diffs) a longstanding vectorization perf bug by removing redundant loads via GVN: issue #17113 The main reason for not enabling the extra pass generally in the initial patch was compile-time cost. The cost of VectorCombine was significantly (surprisingly) improved with: 87debdadaf18 https://llvm-compile-time-tracker.com/compare.php?from=ffe05b8f57d97bc4340f791cb386c8d00e0739f2&to=87debdadaf18f8a5c7e5d563889e10731dc3554d&stat=instructions:u ...so the extra run is going to cost very little now - the total cost of the 2 runs should be less than the 1 run before that micro-optimization: https://llvm-compile-time-tracker.com/compare.php?from=5e8c2026d10e8e2c93c038c776853bed0e7c8fc1&to=2c4b68eab5ae969811f422714e0eba44c5f7eefb&stat=instructions:u It may be possible to reduce the cost slightly more with a few more earlier-exits like that, but it's probably in the noise based on timing experiments. Differential Revision: https://reviews.llvm.org/D138353 --- llvm/lib/Passes/PassBuilderPipelines.cpp | 7 +++---- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 4 ++-- llvm/test/Other/new-pm-defaults.ll | 2 +- llvm/test/Other/new-pm-thinlto-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll | 1 + .../Other/new-pm-thinlto-postlink-samplepgo-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll | 1 + .../Other/new-pm-thinlto-prelink-samplepgo-defaults.ll | 1 + .../test/Transforms/PhaseOrdering/X86/vec-load-combine.ll | 15 ++++----------- 9 files changed, 15 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 2183681..285645b 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -615,10 +615,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // Delete small array after loop unroll. FPM.addPass(SROAPass()); - // The matrix extension can introduce large vector operations early, which can - // benefit from running vector-combine early on. - if (EnableMatrix) - FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true)); + // Try vectorization/scalarization transforms that are both improvements + // themselves and can allow further folds with GVN and InstCombine. + FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true)); // Eliminate redundancies. FPM.addPass(MergedLoadStoreMotionPass()); diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 7bd9ee9..7769aaf 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1700,8 +1700,6 @@ bool VectorCombine::run() { Builder.SetInsertPoint(&I); if (!TryEarlyFoldsOnly) { if (isa(I.getType())) { - MadeChange |= vectorizeLoadInsert(I); - MadeChange |= widenSubvectorLoad(I); MadeChange |= foldInsExtFNeg(I); MadeChange |= foldBitcastShuf(I); MadeChange |= foldShuffleOfBinops(I); @@ -1713,6 +1711,8 @@ bool VectorCombine::run() { } } if (isa(I.getType())) { + MadeChange |= vectorizeLoadInsert(I); + MadeChange |= widenSubvectorLoad(I); MadeChange |= scalarizeBinopOrCmp(I); MadeChange |= scalarizeLoadExtract(I); } diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 303e164..ddb84d0 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -185,7 +185,7 @@ ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass ; CHECK-EP-LOOP-END-NEXT: Running pass: NoOpLoopPass ; CHECK-O-NEXT: Running pass: SROAPass on foo -; CHECK-MATRIX: Running pass: VectorCombinePass +; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass ; CHECK-O23SZ-NEXT: Running pass: GVNPass ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll index 42d553c..997f707 100644 --- a/llvm/test/Other/new-pm-thinlto-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-defaults.ll @@ -158,6 +158,7 @@ ; CHECK-O-NEXT: Running pass: LoopDeletionPass ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass ; CHECK-O-NEXT: Running pass: SROAPass on foo +; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass ; CHECK-O23SZ-NEXT: Running pass: GVNPass ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index e67b23d..51a0b45 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -119,6 +119,7 @@ ; CHECK-O-NEXT: Running pass: LoopDeletionPass ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass ; CHECK-O-NEXT: Running pass: SROAPass on foo +; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass ; CHECK-O23SZ-NEXT: Running pass: GVNPass ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 01204ec..0390d64 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -128,6 +128,7 @@ ; CHECK-O-NEXT: Running pass: LoopDeletionPass ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass ; CHECK-O-NEXT: Running pass: SROAPass on foo +; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass ; CHECK-O23SZ-NEXT: Running pass: GVNPass ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll index ccd7345..b47d19a 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -157,6 +157,7 @@ ; CHECK-O-NEXT: Running pass: LoopDeletionPass ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass ; CHECK-O-NEXT: Running pass: SROAPass on foo +; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass ; CHECK-O23SZ-NEXT: Running pass: GVNPass ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll index dcaf0ff..87d2109 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -122,6 +122,7 @@ ; CHECK-O-NEXT: Running pass: IndVarSimplifyPass ; CHECK-O-NEXT: Running pass: LoopDeletionPass ; CHECK-O-NEXT: Running pass: SROAPass on foo +; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass ; CHECK-O23SZ-NEXT: Running pass: GVNPass ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll index 1ed7229..d37638a 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll @@ -12,20 +12,13 @@ $getAt = comdat any define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 { ; SSE-LABEL: @ConvertVectors_ByRef( ; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16 -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP0]], i64 0, i64 1 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[TMP3]], align 4 -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP5]], <4 x i32> -; SSE-NEXT: ret <4 x float> [[TMP7]] +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; SSE-NEXT: ret <4 x float> [[TMP3]] ; ; AVX-LABEL: @ConvertVectors_ByRef( ; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16 -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP0]], i64 0, i64 2 -; AVX-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 8 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i64 2 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP4]], i64 3 -; AVX-NEXT: ret <4 x float> [[TMP6]] +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; AVX-NEXT: ret <4 x float> [[TMP3]] ; %2 = alloca ptr, align 8 %3 = alloca <4 x float>, align 16 -- 2.7.4