From 0aca09fc6cdcda65a44dc084b20d7056912977ef Mon Sep 17 00:00:00 2001
From: Adam Nemet <anemet@apple.com>
Date: Thu, 11 May 2017 17:06:17 +0000
Subject: [PATCH] [SLP] Emit optimization remarks

The approach I followed was to emit the remark after getTreeCost concludes
that SLP is profitable.  I initially tried emitting them after the
vectorizeRootInstruction calls in vectorizeChainsInBlock but I vaguely
remember missing a few cases for example in HorizontalReduction::tryToReduce.

ORE is placed in BoUpSLP so that it's available from everywhere (notably
HorizontalReduction::tryToReduce).

We use the first instruction in the root bundle as the locator for the remark.
In order to get a sense how far the tree is spanning I've include the size of
the tree in the remark.  This is not perfect of course but it gives you at
least a rough idea about the tree.  Then you can follow up with -view-slp-tree
to really see the actual tree.

llvm-svn: 302811
---
 .../llvm/Transforms/Vectorize/SLPVectorizer.h      |  4 +-
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp    | 42 ++++++++++++++++++---
 .../SLPVectorizer/AArch64/getelementptr.ll         | 43 ++++++++++++++++++++--
 .../Transforms/SLPVectorizer/AArch64/horizontal.ll | 33 ++++++++++++++++-
 .../Transforms/SLPVectorizer/AArch64/remarks.ll    | 32 ++++++++++++++++
 5 files changed, 143 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll

diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 10338f7..c514db4 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -24,6 +24,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
@@ -59,7 +60,8 @@ public:
   // Glue for old PM.
   bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
                TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_,
-               DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_);
+               DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_,
+               OptimizationRemarkEmitter *ORE_);
 
 private:
   /// \brief Collect store and getelementptr instructions and organize them
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 960019f..f6334eb 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -299,10 +299,10 @@ public:
   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
           TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
           DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
-          const DataLayout *DL)
+          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
       : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
         SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
-        DL(DL), Builder(Se->getContext()) {
+        DL(DL), ORE(ORE), Builder(Se->getContext()) {
     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
     // Use the vector register size specified by the target unless overridden
     // by a command-line option.
@@ -361,6 +361,8 @@ public:
     MinBWs.clear();
   }
 
+  unsigned getTreeSize() const { return VectorizableTree.size(); }
+
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
 
@@ -399,6 +401,8 @@ public:
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable();
 
+  OptimizationRemarkEmitter *getORE() { return ORE; }
+
 private:
   struct TreeEntry;
 
@@ -928,6 +932,8 @@ private:
   AssumptionCache *AC;
   DemandedBits *DB;
   const DataLayout *DL;
+  OptimizationRemarkEmitter *ORE;
+
   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
   unsigned MinVecRegSize; // Set by cl::opt (default: 128).
   /// Instruction builder to construct the vectorized tree.
@@ -3772,8 +3778,9 @@ struct SLPVectorizer : public FunctionPass {
     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
 
-    return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+    return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -3785,6 +3792,7 @@ struct SLPVectorizer : public FunctionPass {
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<DemandedBitsWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
@@ -3803,8 +3811,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
   auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
   auto *AC = &AM.getResult<AssumptionAnalysis>(F);
   auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+  auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 
-  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
   if (!Changed)
     return PreservedAnalyses::all();
 
@@ -3819,7 +3828,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                 TargetTransformInfo *TTI_,
                                 TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
                                 LoopInfo *LI_, DominatorTree *DT_,
-                                AssumptionCache *AC_, DemandedBits *DB_) {
+                                AssumptionCache *AC_, DemandedBits *DB_,
+                                OptimizationRemarkEmitter *ORE_) {
   SE = SE_;
   TTI = TTI_;
   TLI = TLI_;
@@ -3847,7 +3857,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
 
   // Use the bottom up slp vectorizer to construct chains that start with
   // store instructions.
-  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
+  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
 
   // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
   // delete instructions.
@@ -3936,6 +3946,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
     DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
     if (Cost < -SLPCostThreshold) {
       DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+      using namespace ore;
+      R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+                                          cast<StoreInst>(Chain[i]))
+                       << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+                       << " and with tree size "
+                       << NV("TreeSize", R.getTreeSize()));
+
       R.vectorizeTree();
 
       // Move to the next bundle.
@@ -4149,6 +4166,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
 
       if (Cost < -SLPCostThreshold) {
         DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
+                                            cast<Instruction>(Ops[0]))
+                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+                         << " and with tree size "
+                         << ore::NV("TreeSize", R.getTreeSize()));
+
         Value *VectorizedRoot = R.vectorizeTree();
 
         // Reconstruct the build vector by extracting the vectorized root. This
@@ -4492,6 +4515,12 @@ public:
 
       DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                    << ". (HorRdx)\n");
+      auto *I0 = cast<Instruction>(VL[0]);
+      V.getORE()->emit(
+          OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+          << "Vectorized horizontal reduction with cost "
+          << ore::NV("Cost", Cost) << " and with tree size "
+          << ore::NV("TreeSize", V.getTreeSize()));
 
       // Vectorize a tree.
       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
@@ -5146,6 +5175,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
 
 namespace llvm {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index e9b7196..962a6c3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -1,4 +1,5 @@
-; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine < %s | FileCheck %s
+; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -23,7 +24,25 @@ target triple = "aarch64--linux-gnu"
 ; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32>
 ; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]]
 ; CHECK: sext i32 [[X]] to i64
-;
+
+; YAML:      Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedList
+; YAML-NEXT: Function:        getelementptr_4x32
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '11'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '5'
+
+; YAML:      Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedList
+; YAML-NEXT: Function:        getelementptr_4x32
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '16'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '3'
+
 define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
 entry:
   %cmp31 = icmp sgt i32 %n, 0
@@ -69,7 +88,25 @@ for.body:
 ; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32>
 ; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]]
 ; CHECK: sext i32 [[X]] to i64
-;
+
+; YAML:      Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedList
+; YAML-NEXT: Function:        getelementptr_2x32
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '11'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '5'
+
+; YAML:      Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedList
+; YAML-NEXT: Function:        getelementptr_2x32
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '6'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '3'
+
 define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
 entry:
   %cmp31 = icmp sgt i32 %n, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index 8f8bf26..1a6a2fb 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -1,4 +1,5 @@
-; RUN: opt -slp-vectorizer -slp-threshold=-6 -S <  %s | FileCheck %s
+; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t <  %s | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 ; FIXME: The threshold is changed to keep this test case a bit smaller.
 ; The AArch64 cost model should not give such high costs to select statements.
@@ -10,6 +11,16 @@ target triple = "aarch64--linux"
 ; CHECK: load <4 x i32>
 ; CHECK: load <4 x i32>
 ; CHECK: select <4 x i1>
+
+; YAML:      Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedHorizontalReduction
+; YAML-NEXT: Function:        test_select
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
+; YAML-NEXT:   - Cost:            '4'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '8'
+
 define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) {
 entry:
   %cmp.22 = icmp sgt i32 %h, 0
@@ -93,6 +104,16 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia
 ; CHECK: load <4 x i32>
 ; CHECK: load <4 x i32>
 ; CHECK: mul nsw <4 x i32>
+
+; YAML:      Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedHorizontalReduction
+; YAML-NEXT: Function:        reduction_with_br
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
+; YAML-NEXT:   - Cost:            '1'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '3'
+
 entry:
   %cmp.16 = icmp sgt i32 %h, 0
   br i1 %cmp.16, label %for.body.lr.ph, label %for.end
@@ -150,6 +171,16 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; CHECK: load <8 x i8>
 ; CHECK: load <8 x i8>
 ; CHECK: select <8 x i1>
+
+; YAML:      Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedHorizontalReduction
+; YAML-NEXT: Function:        test_unrolled_select
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
+; YAML-NEXT:   - Cost:            '-33'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '10'
+
 define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 {
 entry:
   %cmp.43 = icmp sgt i32 %h, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
new file mode 100644
index 0000000..e8c3751
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -pass-remarks=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s
+
+define void @f(double* %r, double* %w) {
+  %r0 = getelementptr inbounds double, double* %r, i64 0
+  %r1 = getelementptr inbounds double, double* %r, i64 1
+  %f0 = load double, double* %r0
+  %f1 = load double, double* %r1
+  %add0 = fadd double %f0, %f0
+  %add1 = fadd double %f1, %f1
+  %w0 = getelementptr inbounds double, double* %w, i64 0
+  %w1 = getelementptr inbounds double, double* %w, i64 1
+; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3
+  store double %add0, double* %w0, !dbg !9
+  store double %add1, double* %w1
+  ret void
+}
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)"}
+!7 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, isOptimized: true, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 5, column: 10, scope: !7)
-- 
2.7.4