From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 21 Oct 2012 05:52:51 +0000 (+0000)
Subject: Add support for reduction variables that do not start at zero.
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c1679a95b6b46a48bf5951cda1373a123b19c1d3;p=platform%2Fupstream%2Fllvm.git

Add support for reduction variables that do not start at zero.
This is important for nested-loop reductions such as the one below.

In the innermost loop, the reduction variable does not start at zero:

for (i = 0 .. n)
 for (j = 0 .. m)
  sum += ...

llvm-svn: 166387
---

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f32b66d..5a79c33 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -179,20 +179,36 @@ public:
   TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
 
   /// This represents the kinds of reductions that we support.
+  /// We use the enum values to hold the 'identity' value for
+  /// each operation. This value does not change the result if applied.
   enum ReductionKind {
-    IntegerAdd, /// Sum of numbers.
-    IntegerMult, /// Product of numbers.
-    NoReduction /// Not a reduction.
+    NoReduction = -1, /// Not a reduction.
+    IntegerAdd  = 0,  /// Sum of numbers.
+    IntegerMult = 1  /// Product of numbers.
   };
 
-  // Holds a pairing of reduction instruction and the reduction kind.
-  typedef std::pair<Instruction*, ReductionKind> ReductionPair;
+  /// This POD struct holds information about reduction variables.
+  struct ReductionDescriptor {
+    // Default C'tor
+    ReductionDescriptor():
+    StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
+
+    // C'tor.
+    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
+    StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+
+    // The starting value of the reduction.
+    // It does not have to be zero!
+    Value *StartValue;
+    // The instruction whose value is used outside the loop.
+    Instruction *LoopExitInstr;
+    // The kind of the reduction.
+    ReductionKind Kind;
+  };
 
-  /// ReductionList contains the reduction variables
-  /// as well as a single EXIT (from the block) value and the kind of
-  /// reduction variable..
-  /// Notice that the EXIT instruction can also be the PHI itself.
-  typedef DenseMap<PHINode*, ReductionPair> ReductionList;
+  /// ReductionList contains the reduction descriptors for all
+  /// of the reductions that were found in the loop.
+  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
 
   /// Returns the maximum vectorization factor that we *can* use to vectorize
   /// this loop. This does not mean that it is profitable to vectorize this
@@ -229,9 +245,6 @@ private:
   /// Returns True, if 'Phi' is the kind of reduction variable for type
   /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
   bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
-  /// Checks if a constant matches the reduction kind.
-  /// Sums starts with zero. Products start at one.
-  bool isReductionConstant(Value *V, ReductionKind Kind);
   /// Returns true if the instruction I can be a reduction variable of type
   /// 'Kind'.
   bool isReductionInstr(Instruction *I, ReductionKind Kind);
@@ -628,6 +641,8 @@ void
 SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   typedef SmallVector<PHINode*, 4> PhiVector;
   BasicBlock &BB = *Orig->getHeader();
+  Constant *Zero = ConstantInt::get(
+    IntegerType::getInt32Ty(BB.getContext()), 0);
 
   // In order to support reduction variables we need to be able to vectorize
   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
@@ -803,29 +818,42 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
     assert(RdxPhi && "Unable to recover vectorized PHI");
 
-    // Find the reduction variable.
+    // Find the reduction variable descriptor.
     assert(Legal->getReductionVars()->count(RdxPhi) &&
            "Unable to find the reduction variable");
-    LoopVectorizationLegality::ReductionPair ReductionVar =
+    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
       (*Legal->getReductionVars())[RdxPhi];
 
+    // We need to generate a reduction vector from the incoming scalar.
+    // To do so, we need to generate the 'identity' vector and override
+    // one of the elements with the incoming scalar reduction. We need
+    // to do it in the vector-loop preheader.
+    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
+
     // This is the vector-clone of the value that leaves the loop.
-    Value *VectorExit = getVectorValue(ReductionVar.first);
+    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
     Type *VecTy = VectorExit->getType();
 
-    // This is the kind of reduction.
-    LoopVectorizationLegality::ReductionKind RdxKind = ReductionVar.second;
-    // Find the reduction identity variable.
-    // Zero for addition. One for Multiplication.
-    unsigned IdentitySclr =
-      (RdxKind == LoopVectorizationLegality::IntegerAdd ? 0 : 1);
-    Constant *Identity = getUniformVector(IdentitySclr, VecTy->getScalarType());
+    // Find the reduction identity variable. The value of the enum is the
+    // identity. Zero for addition. One for Multiplication.
+    unsigned IdentitySclr =  RdxDesc.Kind;
+    Constant *Identity = getUniformVector(IdentitySclr,
+                                          VecTy->getScalarType());
+
+    // This vector is the Identity vector where the first element is the
+    // incoming scalar reduction.
+    Value *VectorStart = Builder.CreateInsertElement(Identity,
+                                                    RdxDesc.StartValue, Zero);
+
 
     // Fix the vector-loop phi.
     // We created the induction variable so we know that the
     // preheader is the first entry.
     BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
-    VecRdxPhi->addIncoming(Identity, VecPreheader);
+
+    // Reductions do not have to start at zero. They can start with
+    // any loop invariant values.
+    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
     unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
     Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
     VecRdxPhi->addIncoming(Val, LoopVectorBody);
@@ -837,10 +865,10 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
 
     // This PHINode contains the vectorized reduction variable, or
-    // the identity vector, if we bypass the vector loop.
+    // the initial value vector, if we bypass the vector loop.
     PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
-    NewPhi->addIncoming(Identity, LoopBypassBlock);
-    NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody);
+    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
+    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
 
     // Extract the first scalar.
     Value *Scalar0 =
@@ -849,7 +877,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     for (unsigned i=1; i < VF; ++i) {
       Value *Scalar1 =
         Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
-      if (RdxKind == LoopVectorizationLegality::IntegerAdd) {
+      if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) {
         Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
       } else {
         Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
@@ -865,11 +893,13 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
       PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
       if (!LCSSAPhi) continue;
 
-      // All PHINodes need to have a single entry edge, or two if we already fixed them.
+      // All PHINodes need to have a single entry edge, or two if
+      // we already fixed them.
       assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
 
-      // We found our reduction value exit-PHI. Update it with the incoming bypass edge.
-      if (LCSSAPhi->getIncomingValue(0) == ReductionVar.first) {
+      // We found our reduction value exit-PHI. Update it with the
+      // incoming bypass edge.
+      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
         // Add an edge coming from the bypass.
         LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
         break;
@@ -881,7 +911,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
     (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
-    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, ReductionVar.first);
+    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
   }// end of for each redux variable.
 }
 
@@ -1157,7 +1187,7 @@ bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
 }
 
 bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
-                                                    ReductionKind Kind) {
+                                                ReductionKind Kind) {
   if (Phi->getNumIncomingValues() != 2)
     return false;
 
@@ -1167,10 +1197,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
   Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
 
-  // We must have a constant that starts the reduction.
-  if (!isReductionConstant(RdxStart, Kind))
-    return false;
-
   // ExitInstruction is the single value which is used outside the loop.
   // We only allow for a single reduction value to be used outside the loop.
   // This includes users of the reduction, variables (which form a cycle
@@ -1228,26 +1254,16 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
    if (FoundStartPHI && ExitInstruction) {
      // This instruction is allowed to have out-of-loop users.
      AllowedExit.insert(ExitInstruction);
-     // Mark this as a reduction var.
-     Reductions[Phi] = std::make_pair(ExitInstruction, Kind);
+
+     // Save the description of this reduction variable.
+     ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+     Reductions[Phi] = RD;
      return true;
    }
   }
 }
 
 bool
-LoopVectorizationLegality::isReductionConstant(Value *V, ReductionKind Kind) {
-  ConstantInt *CI = dyn_cast<ConstantInt>(V);
-  if (!CI)
-    return false;
-  if (Kind == IntegerMult && CI->isOne())
-    return true;
-  if (Kind == IntegerAdd && CI->isZero())
-    return true;
-  return false;
-}
-
-bool
 LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                             ReductionKind Kind) {
     switch (I->getOpcode()) {
diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll
index 257f661..3e871b2 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction.ll
@@ -93,16 +93,16 @@ define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
   ret i32 %sum.0.lcssa
 }
 
-;CHECK: @reduction_bad
-;CHECK-NOT: <4 x i32>
+;CHECK: @reduction_mul
+;CHECK: mul <4 x i32>
 ;CHECK: ret i32
-define i32 @reduction_bad(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
 
 .lr.ph:                                           ; preds = %0, %.lr.ph
   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ]
   %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
   %3 = load i32* %2, align 4
   %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
@@ -120,3 +120,33 @@ define i32 @reduction_bad(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
   %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
   ret i32 %sum.0.lcssa
 }
+
+;CHECK: @start_at_non_zero
+;CHECK: phi <4 x i32>
+;CHECK: <i32 120, i32 0, i32 0, i32 0>
+;CHECK: ret i32
+define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]
+  %arrayidx = getelementptr inbounds i32* %in, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32* %coeff, i64 %indvars.iv
+  %1 = load i32* %arrayidx2, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %sum.09
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+