From d8d1cc647d87b453a2dbb8242c75e3bccc443bbd Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 6 Nov 2020 09:39:55 +0000
Subject: [PATCH] [SLP] Also try to vectorize incoming values of PHIs .

Currently we do not consider incoming values of PHIs as roots for SLP
vectorization. This means we miss scenarios like the one in the test
case and PR47670.

It appears quite straight-forward to consider incoming values of PHIs as
roots for vectorization, but I might be missing something that makes
this problematic.

In terms of vectorized instructions, this applies to quite a few
benchmarks across MultiSource/SPEC2000/SPEC2006 on X86 with -O3 -flto

    Same hash: 185 (filtered out)
    Remaining: 52
    Metric: SLP.NumVectorInstructions

    Program                                        base    patch   diff
     test-suite...ProxyApps-C++/HPCCG/HPCCG.test     9.00   27.00  200.0%
     test-suite...C/CFP2000/179.art/179.art.test     8.00   22.00  175.0%
     test-suite...T2006/458.sjeng/458.sjeng.test    14.00   30.00  114.3%
     test-suite...ce/Benchmarks/PAQ8p/paq8p.test    11.00   18.00  63.6%
     test-suite...s/FreeBench/neural/neural.test    12.00   18.00  50.0%
     test-suite...rimaran/enc-3des/enc-3des.test    65.00   95.00  46.2%
     test-suite...006/450.soplex/450.soplex.test    63.00   89.00  41.3%
     test-suite...ProxyApps-C++/CLAMR/CLAMR.test   177.00  250.00  41.2%
     test-suite...nchmarks/McCat/18-imp/imp.test    13.00   18.00  38.5%
     test-suite.../Applications/sgefa/sgefa.test    26.00   35.00  34.6%
     test-suite...pplications/oggenc/oggenc.test   100.00  133.00  33.0%
     test-suite...6/482.sphinx3/482.sphinx3.test   103.00  134.00  30.1%
     test-suite...oxyApps-C++/miniFE/miniFE.test   169.00  213.00  26.0%
     test-suite.../Benchmarks/Olden/tsp/tsp.test    59.00   73.00  23.7%
     test-suite...TimberWolfMC/timberwolfmc.test   503.00  622.00  23.7%
     test-suite...T2006/456.hmmer/456.hmmer.test    65.00   79.00  21.5%
     test-suite...libquantum/462.libquantum.test    58.00   68.00  17.2%
     test-suite...ternal/HMMER/hmmcalibrate.test    84.00   98.00  16.7%
     test-suite...ications/JM/ldecod/ldecod.test   351.00  401.00  14.2%
     test-suite...arks/VersaBench/dbms/dbms.test    52.00   57.00   9.6%
     test-suite...ce/Benchmarks/Olden/bh/bh.test   118.00  128.00   8.5%
     test-suite.../Benchmarks/Bullet/bullet.test   6355.00 6880.00  8.3%
     test-suite...nsumer-lame/consumer-lame.test   480.00  519.00   8.1%
     test-suite...000/183.equake/183.equake.test   226.00  244.00   8.0%
     test-suite...chmarks/Olden/power/power.test   105.00  113.00   7.6%
     test-suite...6/471.omnetpp/471.omnetpp.test    92.00   99.00   7.6%
     test-suite...ications/JM/lencod/lencod.test   1173.00 1261.00  7.5%
     test-suite...0/253.perlbmk/253.perlbmk.test    55.00   59.00   7.3%
     test-suite...oxyApps-C/miniAMR/miniAMR.test    92.00   98.00   6.5%
     test-suite...chmarks/MallocBench/gs/gs.test   446.00  473.00   6.1%
     test-suite.../CINT2006/403.gcc/403.gcc.test   464.00  491.00   5.8%
     test-suite...6/464.h264ref/464.h264ref.test   998.00  1055.00  5.7%
     test-suite...006/453.povray/453.povray.test   5711.00 6007.00  5.2%
     test-suite...FreeBench/distray/distray.test   102.00  107.00   4.9%
     test-suite...:: External/Povray/povray.test   4184.00 4378.00  4.6%
     test-suite...DOE-ProxyApps-C/CoMD/CoMD.test   112.00  117.00   4.5%
     test-suite...T2006/445.gobmk/445.gobmk.test   104.00  108.00   3.8%
     test-suite...CI_Purple/SMG2000/smg2000.test   789.00  819.00   3.8%
     test-suite...yApps-C++/PENNANT/PENNANT.test   233.00  241.00   3.4%
     test-suite...marks/7zip/7zip-benchmark.test   417.00  428.00   2.6%
     test-suite...arks/mafft/pairlocalalign.test   627.00  643.00   2.6%
     test-suite.../Benchmarks/nbench/nbench.test   259.00  265.00   2.3%
     test-suite...006/447.dealII/447.dealII.test   4641.00 4732.00  2.0%
     test-suite...lications/ClamAV/clamscan.test   106.00  108.00   1.9%
     test-suite...CFP2000/177.mesa/177.mesa.test   1639.00 1664.00  1.5%
     test-suite...oxyApps-C/RSBench/rsbench.test    66.00   65.00  -1.5%
     test-suite.../CINT2000/252.eon/252.eon.test   3416.00 3444.00  0.8%
     test-suite...CFP2000/188.ammp/188.ammp.test   1846.00 1861.00  0.8%
     test-suite.../CINT2000/176.gcc/176.gcc.test   152.00  153.00   0.7%
     test-suite...CFP2006/444.namd/444.namd.test   3528.00 3544.00  0.5%
     test-suite...T2006/473.astar/473.astar.test    98.00   98.00   0.0%
     test-suite...frame_layout/frame_layout.test    NaN     39.00   nan%

On ARM64, there appears to be a slight regression on SPEC2006, which
might be interesting to investigate:

   test-suite...T2006/473.astar/473.astar.test   0.9%

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D88735
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp    | 29 +++++---
 .../Transforms/SLPVectorizer/X86/horizontal.ll     | 80 +++++++++++++++++-----
 2 files changed, 82 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0940199..0d77787 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7626,16 +7626,27 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
     // Try to vectorize reductions that use PHINodes.
     if (PHINode *P = dyn_cast<PHINode>(it)) {
       // Check that the PHI is a reduction PHI.
-      if (P->getNumIncomingValues() != 2)
-        return Changed;
+      if (P->getNumIncomingValues() == 2) {
+        // Try to match and vectorize a horizontal reduction.
+        if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+                                     TTI)) {
+          Changed = true;
+          it = BB->begin();
+          e = BB->end();
+          continue;
+        }
+      }
+      // Try to vectorize the incoming values of the PHI, to catch reductions
+      // that feed into PHIs.
+      for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
+        // Skip if the incoming block is the current BB for now.
+        // TODO: Collect the skipped incoming values and try to vectorize them
+        // after processing BB.
+        if (BB == P->getIncomingBlock(I))
+          continue;
 
-      // Try to match and vectorize a horizontal reduction.
-      if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
-                                   TTI)) {
-        Changed = true;
-        it = BB->begin();
-        e = BB->end();
-        continue;
+        Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
+                                            P->getIncomingBlock(I), R, TTI);
       }
       continue;
     }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
index 5e6e30e..f2f3bd0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -1632,38 +1632,82 @@ define i32 @reduction_result_used_in_phi(i32* nocapture readonly %data, i1 zeroe
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[L_0:%.*]] = load i32, i32* [[DATA:%.*]], align 4
-; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i32, i32* [[IDX_1]], align 4
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[L_1]], [[L_0]]
+; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
 ; CHECK-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
-; CHECK-NEXT:    [[L_2:%.*]] = load i32, i32* [[IDX_2]], align 4
-; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[L_2]], [[ADD_1]]
 ; CHECK-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
-; CHECK-NEXT:    [[L_3:%.*]] = load i32, i32* [[IDX_3]], align 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[L_3]], [[ADD_2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_3]], [[BB]] ]
+; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_1]]
 ;
 ; STORE-LABEL: @reduction_result_used_in_phi(
 ; STORE-NEXT:  entry:
 ; STORE-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
 ; STORE:       bb:
-; STORE-NEXT:    [[L_0:%.*]] = load i32, i32* [[DATA:%.*]], align 4
-; STORE-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 1
-; STORE-NEXT:    [[L_1:%.*]] = load i32, i32* [[IDX_1]], align 4
-; STORE-NEXT:    [[ADD_1:%.*]] = add i32 [[L_1]], [[L_0]]
+; STORE-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
 ; STORE-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
-; STORE-NEXT:    [[L_2:%.*]] = load i32, i32* [[IDX_2]], align 4
-; STORE-NEXT:    [[ADD_2:%.*]] = add i32 [[L_2]], [[ADD_1]]
 ; STORE-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
-; STORE-NEXT:    [[L_3:%.*]] = load i32, i32* [[IDX_3]], align 4
-; STORE-NEXT:    [[ADD_3:%.*]] = add i32 [[L_3]], [[ADD_2]]
+; STORE-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
+; STORE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; STORE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
 ; STORE-NEXT:    br label [[EXIT]]
 ; STORE:       exit:
-; STORE-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_3]], [[BB]] ]
+; STORE-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
+; STORE-NEXT:    ret i32 [[SUM_1]]
+;
+entry:
+  br i1 %b, label %bb, label %exit
+
+bb:
+  %l.0 = load i32, i32* %data, align 4
+  %idx.1 = getelementptr inbounds i32, i32* %data, i64 1
+  %l.1 = load i32, i32* %idx.1, align 4
+  %add.1 = add i32 %l.1, %l.0
+  %idx.2 = getelementptr inbounds i32, i32* %data, i64 2
+  %l.2 = load i32, i32* %idx.2, align 4
+  %add.2 = add i32 %l.2, %add.1
+  %idx.3 = getelementptr inbounds i32, i32* %data, i64 3
+  %l.3 = load i32, i32* %idx.3, align 4
+  %add.3 = add i32 %l.3, %add.2
+  br label %exit
+
+exit:
+  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb]
+  ret i32 %sum.1
+}
+
+define i32 @reduction_result_used_in_phi_loop(i32* nocapture readonly %data, i1 zeroext %b) {
+; CHECK-LABEL: @reduction_result_used_in_phi_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
+; CHECK-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
+; CHECK-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
+; CHECK-NEXT:    ret i32 [[SUM_1]]
+;
+; STORE-LABEL: @reduction_result_used_in_phi_loop(
+; STORE-NEXT:  entry:
+; STORE-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
+; STORE:       bb:
+; STORE-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
+; STORE-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
+; STORE-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
+; STORE-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
+; STORE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; STORE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; STORE-NEXT:    br label [[EXIT]]
+; STORE:       exit:
+; STORE-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
 ; STORE-NEXT:    ret i32 [[SUM_1]]
 ;
 entry:
-- 
2.7.4