From 15869f86d84927ceb965eb5eaecabf523d8c7eb9 Mon Sep 17 00:00:00 2001
From: Matthew Simpson <mssimpso@codeaurora.org>
Date: Wed, 21 Sep 2016 16:50:24 +0000
Subject: [PATCH] [LV] Don't emit unused scalars for uniform instructions

If we identify an instruction as uniform after vectorization, we know that we
should only use the value corresponding to the first vector lane of each unroll
iteration. However, when scalarizing such instructions, we still produce values
for the other vector lanes. This patch prevents us from generating the unused
scalars.

Differential Revision: https://reviews.llvm.org/D24275

llvm-svn: 282087
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    | 72 +++++++++++++++++-----
 llvm/test/Transforms/LoopVectorize/induction.ll    |  8 ---
 .../Transforms/LoopVectorize/reverse_induction.ll  | 30 ---------
 3 files changed, 58 insertions(+), 52 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5f1a902..0b5d735 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2281,11 +2281,28 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
   assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
          "Val and Step should have the same integer type");
 
+  auto scalarUserIsUniform = [&](User *U) -> bool {
+    auto *I = cast<Instruction>(U);
+    return !OrigLoop->contains(I) || !Legal->isScalarAfterVectorization(I) ||
+           Legal->isUniformAfterVectorization(I);
+  };
+
+  // Determine the number of scalars we need to generate for each unroll
+  // iteration. If EntryVal is uniform or all its scalar users are uniform, we
+  // only need to generate the first lane. Otherwise, we generate all VF
+  // values. We are essentially determining if the induction variable has no
+  // "multi-scalar" (non-uniform scalar) users.
+  unsigned Lanes =
+      Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ||
+              all_of(EntryVal->users(), scalarUserIsUniform)
+          ? 1
+          : VF;
+
   // Compute the scalar steps and save the results in VectorLoopValueMap.
   ScalarParts Entry(UF);
   for (unsigned Part = 0; Part < UF; ++Part) {
     Entry[Part].resize(VF);
-    for (unsigned Lane = 0; Lane < VF; ++Lane) {
+    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
       auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane);
       auto *Mul = Builder.CreateMul(StartIdx, Step);
       auto *Add = Builder.CreateAdd(ScalarIV, Mul);
@@ -2332,6 +2349,9 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
     // Initialize a new vector map entry.
     VectorParts Entry(UF);
 
+    // If we've scalarized a value, that value should be an instruction.
+    auto *I = cast<Instruction>(V);
+
     // If we aren't vectorizing, we can just copy the scalar map values over to
     // the vector map.
     if (VF == 1) {
@@ -2340,9 +2360,12 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
       return VectorLoopValueMap.initVector(V, Entry);
     }
 
-    // Get the last scalarized instruction. This corresponds to the instruction
-    // we created for the last vector lane on the last unroll iteration.
-    auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, VF - 1));
+    // Get the last scalar instruction we generated for V. If the value is
+    // known to be uniform after vectorization, this corresponds to lane zero
+    // of the last unroll iteration. Otherwise, the last instruction is the one
+    // we created for the last vector lane of the last unroll iteration.
+    unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1;
+    auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));
 
     // Set the insert point after the last scalarized instruction. This ensures
     // the insertelement sequence will directly follow the scalar definitions.
@@ -2350,15 +2373,24 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
     auto NewIP = std::next(BasicBlock::iterator(LastInst));
     Builder.SetInsertPoint(&*NewIP);
 
-    // However, if we are vectorizing, we need to construct the vector values
-    // using insertelement instructions. Since the resulting vectors are stored
-    // in VectorLoopValueMap, we will only generate the insertelements once.
+    // However, if we are vectorizing, we need to construct the vector values.
+    // If the value is known to be uniform after vectorization, we can just
+    // broadcast the scalar value corresponding to lane zero for each unroll
+    // iteration. Otherwise, we construct the vector values using insertelement
+    // instructions. Since the resulting vectors are stored in
+    // VectorLoopValueMap, we will only generate the insertelements once.
     for (unsigned Part = 0; Part < UF; ++Part) {
-      Value *Insert = UndefValue::get(VectorType::get(V->getType(), VF));
-      for (unsigned Lane = 0; Lane < VF; ++Lane)
-        Insert = Builder.CreateInsertElement(
-            Insert, getScalarValue(V, Part, Lane), Builder.getInt32(Lane));
-      Entry[Part] = Insert;
+      Value *VectorValue = nullptr;
+      if (Legal->isUniformAfterVectorization(I)) {
+        VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));
+      } else {
+        VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
+        for (unsigned Lane = 0; Lane < VF; ++Lane)
+          VectorValue = Builder.CreateInsertElement(
+              VectorValue, getScalarValue(V, Part, Lane),
+              Builder.getInt32(Lane));
+      }
+      Entry[Part] = VectorValue;
     }
     Builder.restoreIP(OldIP);
     return VectorLoopValueMap.initVector(V, Entry);
@@ -2378,6 +2410,9 @@ Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
   if (OrigLoop->isLoopInvariant(V))
     return V;
 
+  assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V))
+                  : true && "Uniform values only have lane zero");
+
   // If the value from the original loop has not been vectorized, it is
   // represented by UF x VF scalar values in the new loop. Return the requested
   // scalar value.
@@ -2884,11 +2919,16 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
   if (IfPredicateInstr)
     Cond = createBlockInMask(Instr->getParent());
 
+  // Determine the number of scalars we need to generate for each unroll
+  // iteration. If the instruction is uniform, we only need to generate the
+  // first lane. Otherwise, we generate all VF values.
+  unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF;
+
   // For each vector unroll 'part':
   for (unsigned Part = 0; Part < UF; ++Part) {
     Entry[Part].resize(VF);
     // For each scalar that we create:
-    for (unsigned Lane = 0; Lane < VF; ++Lane) {
+    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
 
       // Start if-block.
       Value *Cmp = nullptr;
@@ -4398,12 +4438,16 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
     // This is the normalized GEP that starts counting at zero.
     Value *PtrInd = Induction;
     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
+    // Determine the number of scalars we need to generate for each unroll
+    // iteration. If the instruction is uniform, we only need to generate the
+    // first lane. Otherwise, we generate all VF values.
+    unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF;
     // These are the scalar results. Notice that we don't generate vector GEPs
     // because scalar GEPs result in better code.
     ScalarParts Entry(UF);
     for (unsigned Part = 0; Part < UF; ++Part) {
       Entry[Part].resize(VF);
-      for (unsigned Lane = 0; Lane < VF; ++Lane) {
+      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
         Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 8adb6a35..1866263 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -78,21 +78,15 @@ loopexit:
 ; CHECK: vector.body:
 ; CHECK:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK:   %[[i0:.+]] = add i64 %index, 0
-; CHECK:   %[[i1:.+]] = add i64 %index, 1
 ; CHECK:   getelementptr inbounds i64, i64* %a, i64 %[[i0]]
-; CHECK:   getelementptr inbounds i64, i64* %a, i64 %[[i1]]
 ;
 ; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_01(
 ; UNROLL-NO-IC: vector.body:
 ; UNROLL-NO-IC:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; UNROLL-NO-IC:   %[[i0:.+]] = add i64 %index, 0
-; UNROLL-NO-IC:   %[[i1:.+]] = add i64 %index, 1
 ; UNROLL-NO-IC:   %[[i2:.+]] = add i64 %index, 2
-; UNROLL-NO-IC:   %[[i3:.+]] = add i64 %index, 3
 ; UNROLL-NO-IC:   getelementptr inbounds i64, i64* %a, i64 %[[i0]]
-; UNROLL-NO-IC:   getelementptr inbounds i64, i64* %a, i64 %[[i1]]
 ; UNROLL-NO-IC:   getelementptr inbounds i64, i64* %a, i64 %[[i2]]
-; UNROLL-NO-IC:   getelementptr inbounds i64, i64* %a, i64 %[[i3]]
 ;
 ; IND-LABEL: @scalarize_induction_variable_01(
 ; IND:     vector.body:
@@ -611,9 +605,7 @@ exit:
 ; CHECK:   %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
 ; CHECK:   %offset.idx = add i32 %i, %index
 ; CHECK:   %[[A1:.*]] = add i32 %offset.idx, 0
-; CHECK:   %[[A2:.*]] = add i32 %offset.idx, 1
 ; CHECK:   %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A1]]
-; CHECK:   %[[G2:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A2]]
 ; CHECK:   %[[G3:.*]] = getelementptr i32, i32* %[[G1]], i32 0
 ; CHECK:   %[[B1:.*]] = bitcast i32* %[[G3]] to <2 x i32>*
 ; CHECK:   store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
diff --git a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
index ae8f9b3..ce81e1f 100644
--- a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
@@ -8,13 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK: %offset.idx = sub i64 %startval, %index
 ; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
-; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
-; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
-; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
 ; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
-; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
-; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
-; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
 
 define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) {
 entry:
@@ -40,13 +34,7 @@ loopend:
 ; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK: %offset.idx = sub i128 %startval, %index
 ; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0
-; CHECK: %[[a1:.+]] = add i128 %offset.idx, -1
-; CHECK: %[[a2:.+]] = add i128 %offset.idx, -2
-; CHECK: %[[a3:.+]] = add i128 %offset.idx, -3
 ; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4
-; CHECK: %[[a5:.+]] = add i128 %offset.idx, -5
-; CHECK: %[[a6:.+]] = add i128 %offset.idx, -6
-; CHECK: %[[a7:.+]] = add i128 %offset.idx, -7
 
 define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) {
 entry:
@@ -72,13 +60,7 @@ loopend:
 ; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK: %offset.idx = sub i16 %startval, {{.*}}
 ; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0
-; CHECK: %[[a1:.+]] = add i16 %offset.idx, -1
-; CHECK: %[[a2:.+]] = add i16 %offset.idx, -2
-; CHECK: %[[a3:.+]] = add i16 %offset.idx, -3
 ; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4
-; CHECK: %[[a5:.+]] = add i16 %offset.idx, -5
-; CHECK: %[[a6:.+]] = add i16 %offset.idx, -6
-; CHECK: %[[a7:.+]] = add i16 %offset.idx, -7
 
 define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) {
 entry:
@@ -121,13 +103,7 @@ loopend:
 ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK: %offset.idx = sub i64 1023, %index
 ; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
-; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
-; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
-; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
 ; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
-; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
-; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
-; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
 
 define void @reverse_forward_induction_i64_i8() {
 entry:
@@ -153,13 +129,7 @@ while.end:
 ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK: %offset.idx = sub i64 1023, %index
 ; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
-; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
-; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
-; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
 ; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
-; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
-; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
-; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
 
 define void @reverse_forward_induction_i64_i8_signed() {
 entry:
-- 
2.7.4