From: Nadav Rotem Date: Sun, 21 Oct 2012 05:52:51 +0000 (+0000) Subject: Add support for reduction variables that do not start at zero. X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c1679a95b6b46a48bf5951cda1373a123b19c1d3;p=platform%2Fupstream%2Fllvm.git Add support for reduction variables that do not start at zero. This is important for nested-loop reductions such as the one below. In the innermost loop, the reduction variable (sum) does not start at zero: for (i = 0 .. n) for (j = 0 .. m) sum += ... llvm-svn: 166387 --- diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f32b66d..5a79c33 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -179,20 +179,36 @@ public: TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } /// This represents the kinds of reductions that we support. + /// We use the enum values to hold the 'identity' value for + /// each operation. This value does not change the result if applied. enum ReductionKind { - IntegerAdd, /// Sum of numbers. - IntegerMult, /// Product of numbers. - NoReduction /// Not a reduction. + NoReduction = -1, /// Not a reduction. + IntegerAdd = 0, /// Sum of numbers. + IntegerMult = 1 /// Product of numbers. }; - // Holds a pairing of reduction instruction and the reduction kind. - typedef std::pair<Instruction*, ReductionKind> ReductionPair; + /// This POD struct holds information about reduction variables. + struct ReductionDescriptor { + // Default C'tor + ReductionDescriptor(): + StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} + + // C'tor. + ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): + StartValue(Start), LoopExitInstr(Exit), Kind(K) {} + + // The starting value of the reduction. + // It does not have to be zero! + Value *StartValue; + // The instruction whose value is used outside the loop. + Instruction *LoopExitInstr; + // The kind of the reduction. 
+ ReductionKind Kind; + }; - /// ReductionList contains the reduction variables - /// as well as a single EXIT (from the block) value and the kind of - /// reduction variable.. - /// Notice that the EXIT instruction can also be the PHI itself. - typedef DenseMap<PHINode*, ReductionPair> ReductionList; + /// ReductionList contains the reduction descriptors for all + /// of the reductions that were found in the loop. + typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; /// Returns the maximum vectorization factor that we *can* use to vectorize /// this loop. This does not mean that it is profitable to vectorize this @@ -229,9 +245,6 @@ private: /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. bool AddReductionVar(PHINode *Phi, ReductionKind Kind); - /// Checks if a constant matches the reduction kind. - /// Sums starts with zero. Products start at one. - bool isReductionConstant(Value *V, ReductionKind Kind); /// Returns true if the instruction I can be a reduction variable of type /// 'Kind'. bool isReductionInstr(Instruction *I, ReductionKind Kind); @@ -628,6 +641,8 @@ void SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { typedef SmallVector PhiVector; BasicBlock &BB = *Orig->getHeader(); + Constant *Zero = ConstantInt::get( + IntegerType::getInt32Ty(BB.getContext()), 0); // In order to support reduction variables we need to be able to vectorize // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two @@ -803,29 +818,42 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]); assert(RdxPhi && "Unable to recover vectorized PHI"); - // Find the reduction variable. + // Find the reduction variable descriptor. 
assert(Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable"); - LoopVectorizationLegality::ReductionPair ReductionVar = + LoopVectorizationLegality::ReductionDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; + // We need to generate a reduction vector from the incoming scalar. + // To do so, we need to generate the 'identity' vector and override + // one of the elements with the incoming scalar reduction. We need + // to do it in the vector-loop preheader. + Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); + // This is the vector-clone of the value that leaves the loop. - Value *VectorExit = getVectorValue(ReductionVar.first); + Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); Type *VecTy = VectorExit->getType(); - // This is the kind of reduction. - LoopVectorizationLegality::ReductionKind RdxKind = ReductionVar.second; - // Find the reduction identity variable. - // Zero for addition. One for Multiplication. - unsigned IdentitySclr = - (RdxKind == LoopVectorizationLegality::IntegerAdd ? 0 : 1); - Constant *Identity = getUniformVector(IdentitySclr, VecTy->getScalarType()); + // Find the reduction identity variable. The value of the enum is the + // identity. Zero for addition. One for Multiplication. + unsigned IdentitySclr = RdxDesc.Kind; + Constant *Identity = getUniformVector(IdentitySclr, + VecTy->getScalarType()); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + Value *VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + // Fix the vector-loop phi. // We created the induction variable so we know that the // preheader is the first entry. BasicBlock *VecPreheader = Induction->getIncomingBlock(0); - VecRdxPhi->addIncoming(Identity, VecPreheader); + + // Reductions do not have to start at zero. They can start with + // any loop-invariant value. 
+ VecRdxPhi->addIncoming(VectorStart, VecPreheader); unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); VecRdxPhi->addIncoming(Val, LoopVectorBody); @@ -837,10 +865,10 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); // This PHINode contains the vectorized reduction variable, or - // the identity vector, if we bypass the vector loop. + // the initial value vector, if we bypass the vector loop. PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); - NewPhi->addIncoming(Identity, LoopBypassBlock); - NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody); + NewPhi->addIncoming(VectorStart, LoopBypassBlock); + NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); // Extract the first scalar. Value *Scalar0 = @@ -849,7 +877,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { for (unsigned i=1; i < VF; ++i) { Value *Scalar1 = Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); - if (RdxKind == LoopVectorizationLegality::IntegerAdd) { + if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) { Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); } else { Scalar0 = Builder.CreateMul(Scalar0, Scalar1); @@ -865,11 +893,13 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); if (!LCSSAPhi) continue; - // All PHINodes need to have a single entry edge, or two if we already fixed them. + // All PHINodes need to have a single entry edge, or two if + // we already fixed them. assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); - // We found our reduction value exit-PHI. Update it with the incoming bypass edge. - if (LCSSAPhi->getIncomingValue(0) == ReductionVar.first) { + // We found our reduction value exit-PHI. 
Update it with the + // incoming bypass edge. + if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { // Add an edge coming from the bypass. LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); break; @@ -881,7 +911,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); - (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, ReductionVar.first); + (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. } @@ -1157,7 +1187,7 @@ bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) { } bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, - ReductionKind Kind) { + ReductionKind Kind) { if (Phi->getNumIncomingValues() != 2) return false; @@ -1167,10 +1197,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry. Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx); - // We must have a constant that starts the reduction. - if (!isReductionConstant(RdxStart, Kind)) - return false; - // ExitInstruction is the single value which is used outside the loop. // We only allow for a single reduction value to be used outside the loop. // This includes users of the reduction, variables (which form a cycle @@ -1228,26 +1254,16 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, if (FoundStartPHI && ExitInstruction) { // This instruction is allowed to have out-of-loop users. AllowedExit.insert(ExitInstruction); - // Mark this as a reduction var. - Reductions[Phi] = std::make_pair(ExitInstruction, Kind); + + // Save the description of this reduction variable. 
+ ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); + Reductions[Phi] = RD; return true; } } } bool -LoopVectorizationLegality::isReductionConstant(Value *V, ReductionKind Kind) { - ConstantInt *CI = dyn_cast<ConstantInt>(V); - if (!CI) - return false; - if (Kind == IntegerMult && CI->isOne()) - return true; - if (Kind == IntegerAdd && CI->isZero()) - return true; - return false; -} - -bool LoopVectorizationLegality::isReductionInstr(Instruction *I, ReductionKind Kind) { switch (I->getOpcode()) { diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll index 257f661..3e871b2 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction.ll @@ -93,16 +93,16 @@ define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt ret i32 %sum.0.lcssa } -;CHECK: @reduction_bad -;CHECK-NOT: <4 x i32> +;CHECK: @reduction_mul +;CHECK: mul <4 x i32> ;CHECK: ret i32 -define i32 @reduction_bad(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { +define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge .lr.ph: ; preds = %0, %.lr.ph %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] - %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ] %2 = getelementptr inbounds i32* %A, i64 %indvars.iv %3 = load i32* %2, align 4 %4 = getelementptr inbounds i32* %B, i64 %indvars.iv @@ -120,3 +120,33 @@ define i32 @reduction_bad(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] ret i32 %sum.0.lcssa } + +;CHECK: @start_at_non_zero +;CHECK: phi <4 x i32> +;CHECK: <i32 120, i32 0, i32 0, i32 0> +;CHECK: ret i32 +define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind 
uwtable readonly ssp { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ] + %arrayidx = getelementptr inbounds i32* %in, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %coeff, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, %sum.09 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ] + ret i32 %sum.0.lcssa +} + +