From d8d1cc647d87b453a2dbb8242c75e3bccc443bbd Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 6 Nov 2020 09:39:55 +0000 Subject: [PATCH] [SLP] Also try to vectorize incoming values of PHIs . Currently we do not consider incoming values of PHIs as roots for SLP vectorization. This means we miss scenarios like the one in the test case and PR47670. It appears quite straight-forward to consider incoming values of PHIs as roots for vectorization, but I might be missing something that makes this problematic. In terms of vectorized instructions, this applies to quite a few benchmarks across MultiSource/SPEC2000/SPEC2006 on X86 with -O3 -flto Same hash: 185 (filtered out) Remaining: 52 Metric: SLP.NumVectorInstructions Program base patch diff test-suite...ProxyApps-C++/HPCCG/HPCCG.test 9.00 27.00 200.0% test-suite...C/CFP2000/179.art/179.art.test 8.00 22.00 175.0% test-suite...T2006/458.sjeng/458.sjeng.test 14.00 30.00 114.3% test-suite...ce/Benchmarks/PAQ8p/paq8p.test 11.00 18.00 63.6% test-suite...s/FreeBench/neural/neural.test 12.00 18.00 50.0% test-suite...rimaran/enc-3des/enc-3des.test 65.00 95.00 46.2% test-suite...006/450.soplex/450.soplex.test 63.00 89.00 41.3% test-suite...ProxyApps-C++/CLAMR/CLAMR.test 177.00 250.00 41.2% test-suite...nchmarks/McCat/18-imp/imp.test 13.00 18.00 38.5% test-suite.../Applications/sgefa/sgefa.test 26.00 35.00 34.6% test-suite...pplications/oggenc/oggenc.test 100.00 133.00 33.0% test-suite...6/482.sphinx3/482.sphinx3.test 103.00 134.00 30.1% test-suite...oxyApps-C++/miniFE/miniFE.test 169.00 213.00 26.0% test-suite.../Benchmarks/Olden/tsp/tsp.test 59.00 73.00 23.7% test-suite...TimberWolfMC/timberwolfmc.test 503.00 622.00 23.7% test-suite...T2006/456.hmmer/456.hmmer.test 65.00 79.00 21.5% test-suite...libquantum/462.libquantum.test 58.00 68.00 17.2% test-suite...ternal/HMMER/hmmcalibrate.test 84.00 98.00 16.7% test-suite...ications/JM/ldecod/ldecod.test 351.00 401.00 14.2% test-suite...arks/VersaBench/dbms/dbms.test 52.00 57.00 9.6% test-suite...ce/Benchmarks/Olden/bh/bh.test 118.00 128.00 8.5% test-suite.../Benchmarks/Bullet/bullet.test 6355.00 6880.00 8.3% test-suite...nsumer-lame/consumer-lame.test 480.00 519.00 8.1% test-suite...000/183.equake/183.equake.test 226.00 244.00 8.0% test-suite...chmarks/Olden/power/power.test 105.00 113.00 7.6% test-suite...6/471.omnetpp/471.omnetpp.test 92.00 99.00 7.6% test-suite...ications/JM/lencod/lencod.test 1173.00 1261.00 7.5% test-suite...0/253.perlbmk/253.perlbmk.test 55.00 59.00 7.3% test-suite...oxyApps-C/miniAMR/miniAMR.test 92.00 98.00 6.5% test-suite...chmarks/MallocBench/gs/gs.test 446.00 473.00 6.1% test-suite.../CINT2006/403.gcc/403.gcc.test 464.00 491.00 5.8% test-suite...6/464.h264ref/464.h264ref.test 998.00 1055.00 5.7% test-suite...006/453.povray/453.povray.test 5711.00 6007.00 5.2% test-suite...FreeBench/distray/distray.test 102.00 107.00 4.9% test-suite...:: External/Povray/povray.test 4184.00 4378.00 4.6% test-suite...DOE-ProxyApps-C/CoMD/CoMD.test 112.00 117.00 4.5% test-suite...T2006/445.gobmk/445.gobmk.test 104.00 108.00 3.8% test-suite...CI_Purple/SMG2000/smg2000.test 789.00 819.00 3.8% test-suite...yApps-C++/PENNANT/PENNANT.test 233.00 241.00 3.4% test-suite...marks/7zip/7zip-benchmark.test 417.00 428.00 2.6% test-suite...arks/mafft/pairlocalalign.test 627.00 643.00 2.6% test-suite.../Benchmarks/nbench/nbench.test 259.00 265.00 2.3% test-suite...006/447.dealII/447.dealII.test 4641.00 4732.00 2.0% test-suite...lications/ClamAV/clamscan.test 106.00 108.00 1.9% test-suite...CFP2000/177.mesa/177.mesa.test 1639.00 1664.00 1.5% test-suite...oxyApps-C/RSBench/rsbench.test 66.00 65.00 -1.5% test-suite.../CINT2000/252.eon/252.eon.test 3416.00 3444.00 0.8% test-suite...CFP2000/188.ammp/188.ammp.test 1846.00 1861.00 0.8% test-suite.../CINT2000/176.gcc/176.gcc.test 152.00 153.00 0.7% test-suite...CFP2006/444.namd/444.namd.test 3528.00 3544.00 0.5% test-suite...T2006/473.astar/473.astar.test 98.00 98.00 0.0% test-suite...frame_layout/frame_layout.test NaN 39.00 nan% On ARM64, there appears to be a slight regression on SPEC2006, which might be interesting to investigate: test-suite...T2006/473.astar/473.astar.test 0.9% Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D88735 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 29 +++++--- .../Transforms/SLPVectorizer/X86/horizontal.ll | 80 +++++++++++++++++----- 2 files changed, 82 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0940199..0d77787 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7626,16 +7626,27 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Try to vectorize reductions that use PHINodes. if (PHINode *P = dyn_cast(it)) { // Check that the PHI is a reduction PHI. - if (P->getNumIncomingValues() != 2) - return Changed; + if (P->getNumIncomingValues() == 2) { + // Try to match and vectorize a horizontal reduction. + if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, + TTI)) { + Changed = true; + it = BB->begin(); + e = BB->end(); + continue; + } + } + // Try to vectorize the incoming values of the PHI, to catch reductions + // that feed into PHIs. + for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) { + // Skip if the incoming block is the current BB for now. + // TODO: Collect the skipped incoming values and try to vectorize them + // after processing BB. + if (BB == P->getIncomingBlock(I)) + continue; - // Try to match and vectorize a horizontal reduction. - if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, - TTI)) { - Changed = true; - it = BB->begin(); - e = BB->end(); - continue; + Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I), + P->getIncomingBlock(I), R, TTI); } continue; } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll index 5e6e30e..f2f3bd0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -1632,38 +1632,82 @@ define i32 @reduction_result_used_in_phi(i32* nocapture readonly %data, i1 zeroe ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[L_0:%.*]] = load i32, i32* [[DATA:%.*]], align 4 -; CHECK-NEXT: [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 1 -; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[IDX_1]], align 4 -; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[L_1]], [[L_0]] +; CHECK-NEXT: [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1 ; CHECK-NEXT: [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2 -; CHECK-NEXT: [[L_2:%.*]] = load i32, i32* [[IDX_2]], align 4 -; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[L_2]], [[ADD_1]] ; CHECK-NEXT: [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3 -; CHECK-NEXT: [[L_3:%.*]] = load i32, i32* [[IDX_3]], align 4 -; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[L_3]], [[ADD_2]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_3]], [[BB]] ] +; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ] ; CHECK-NEXT: ret i32 [[SUM_1]] ; ; STORE-LABEL: @reduction_result_used_in_phi( ; STORE-NEXT: entry: ; STORE-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]] ; STORE: bb: -; STORE-NEXT: [[L_0:%.*]] = load i32, i32* [[DATA:%.*]], align 4 -; STORE-NEXT: [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 1 -; STORE-NEXT: [[L_1:%.*]] = load i32, i32* [[IDX_1]], align 4 -; STORE-NEXT: [[ADD_1:%.*]] = add i32 [[L_1]], [[L_0]] +; STORE-NEXT: [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1 ; STORE-NEXT: [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2 -; STORE-NEXT: [[L_2:%.*]] = load i32, i32* [[IDX_2]], align 4 -; STORE-NEXT: [[ADD_2:%.*]] = add i32 [[L_2]], [[ADD_1]] ; STORE-NEXT: [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3 -; STORE-NEXT: [[L_3:%.*]] = load i32, i32* [[IDX_3]], align 4 -; STORE-NEXT: [[ADD_3:%.*]] = add i32 [[L_3]], [[ADD_2]] +; STORE-NEXT: [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>* +; STORE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; STORE-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) ; STORE-NEXT: br label [[EXIT]] ; STORE: exit: -; STORE-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_3]], [[BB]] ] +; STORE-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ] +; STORE-NEXT: ret i32 [[SUM_1]] +; +entry: + br i1 %b, label %bb, label %exit + +bb: + %l.0 = load i32, i32* %data, align 4 + %idx.1 = getelementptr inbounds i32, i32* %data, i64 1 + %l.1 = load i32, i32* %idx.1, align 4 + %add.1 = add i32 %l.1, %l.0 + %idx.2 = getelementptr inbounds i32, i32* %data, i64 2 + %l.2 = load i32, i32* %idx.2, align 4 + %add.2 = add i32 %l.2, %add.1 + %idx.3 = getelementptr inbounds i32, i32* %data, i64 3 + %l.3 = load i32, i32* %idx.3, align 4 + %add.3 = add i32 %l.3, %add.2 + br label %exit + +exit: + %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb] + ret i32 %sum.1 +} + +define i32 @reduction_result_used_in_phi_loop(i32* nocapture readonly %data, i1 zeroext %b) { +; CHECK-LABEL: @reduction_result_used_in_phi_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]] +; CHECK: bb: +; CHECK-NEXT: [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1 +; CHECK-NEXT: [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2 +; CHECK-NEXT: [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ] +; CHECK-NEXT: ret i32 [[SUM_1]] +; +; STORE-LABEL: @reduction_result_used_in_phi_loop( +; STORE-NEXT: entry: +; STORE-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]] +; STORE: bb: +; STORE-NEXT: [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1 +; STORE-NEXT: [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2 +; STORE-NEXT: [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3 +; STORE-NEXT: [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>* +; STORE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; STORE-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) +; STORE-NEXT: br label [[EXIT]] +; STORE: exit: +; STORE-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ] ; STORE-NEXT: ret i32 [[SUM_1]] ; entry: -- 2.7.4