}
// Try to hoist some of the scalarization code to the preheader.
- if (BBChanged) hoistGatherSequence(LI, BB, R);
+ if (BBChanged) {
+ hoistGatherSequence(LI, BB, R);
+ Changed |= vectorizeUsingGatherHints(R.getGatherSeqInstructions());
+ }
Changed |= BBChanged;
}
/// \brief Try to vectorize a chain that starts at two arithmetic instrs.
bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
- /// \brief Try to vectorize a list of operands.
- bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R);
+ /// \brief Try to vectorize a list of operands. If \p NeedExtracts is true
+ /// then we calculate the cost of extracting the scalars from the vector.
+ /// \returns true if a value was vectorized.
+ bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool NeedExtracts);
/// \brief Try to vectorize a chain that may start at the operands of \V;
bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
/// all of the sources are loop invariant.
void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R);
+ /// \brief Try to vectorize additional sequences in different basic blocks
+ /// based on values that we gathered in previous blocks. The list \p Gathers
+ /// holds the gather InsertElement instructions that were generated during
+ /// vectorization.
+ /// \returns true if some code was vectorized.
+ bool vectorizeUsingGatherHints(BoUpSLP::InstrList &Gathers);
+
/// \brief Scan the basic block and look for patterns that are likely to start
/// a vectorization chain.
bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
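As a side note on the new NeedExtracts parameter documented above: when the scalars in VL have users outside the tree, the vectorized code must extract each lane back out, so that cost is charged on top of the tree cost before comparing against the threshold. A minimal standalone sketch of that decision rule (the cost numbers and the threshold value below are invented for illustration; in the pass they come from getTreeCost(), getScalarizationCost() and SLPCostThreshold):

  #include <cstdio>

  // Mirrors: if ((Cost + ExtrCost) >= -SLPCostThreshold) return false;
  static bool profitable(int TreeCost, int ExtractCost, bool NeedExtracts,
                         int Threshold) {
    int Total = TreeCost + (NeedExtracts ? ExtractCost : 0);
    return Total < -Threshold;
  }

  int main() {
    // The vector form saves 4 units; pulling both lanes back out costs 3,
    // so it is still a win. With a larger extract cost it no longer is.
    std::printf("%d\n", profitable(-4, 3, /*NeedExtracts=*/true, 0));  // 1
    std::printf("%d\n", profitable(-4, 5, /*NeedExtracts=*/true, 0));  // 0
    std::printf("%d\n", profitable(-4, 5, /*NeedExtracts=*/false, 0)); // 1
  }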
bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B) return false;
Value *VL[] = { A, B };
- return tryToVectorizeList(VL, R);
+ return tryToVectorizeList(VL, R, true);
}
-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+ bool NeedExtracts) {
if (VL.size() < 2)
return false;
}
int Cost = R.getTreeCost(VL);
- int ExtrCost = R.getScalarizationCost(VL);
+ int ExtrCost = NeedExtracts ? R.getScalarizationCost(VL) : 0;
DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost <<
" Cost of extract:" << ExtrCost << ".\n");
if ((Cost+ExtrCost) >= -SLPCostThreshold) return false;
}
if (Incoming.size() > 1)
- Changed |= tryToVectorizeList(Incoming, R);
+ Changed |= tryToVectorizeList(Incoming, R, true);
}
return Changed;
return Changed;
}
+bool SLPVectorizer::vectorizeUsingGatherHints(BoUpSLP::InstrList &Gathers) {
+ SmallVector<Value*, 4> Seq;
+ bool Changed = false;
+ for (int i = 0, e = Gathers.size(); i < e; ++i) {
+ InsertElementInst *IEI = dyn_cast_or_null<InsertElementInst>(Gathers[i]);
+
+ if (IEI) {
+ if (Instruction *I = dyn_cast<Instruction>(IEI->getOperand(1)))
+ Seq.push_back(I);
+ } else {
+
+ if (Seq.empty())
+ continue;
+
+ Instruction *I = cast<Instruction>(Seq[0]);
+ BasicBlock *BB = I->getParent();
+
+ DEBUG(dbgs()<<"SLP: Inspecting a gather list of size " << Seq.size() <<
+ " in " << BB->getName() << ".\n");
+
+ // Check if the gathered values have multiple uses. If they only have one
+ // user then we know that the insert/extract pair will go away.
+ bool HasMultipleUsers = false;
+ for (int j = 0, es = Seq.size(); j < es; ++j) {
+ if (!Seq[j]->hasOneUse()) {
+ HasMultipleUsers = true;
+ break;
+ }
+ }
+
+ BoUpSLP BO(BB, SE, DL, TTI, AA, LI->getLoopFor(BB));
+
+ if (tryToVectorizeList(Seq, BO, HasMultipleUsers)) {
+ DEBUG(dbgs()<<"SLP: Vectorized a gather list of len " << Seq.size() <<
+ " in " << BB->getName() << ".\n");
+ Changed = true;
+ }
+
+ Seq.clear();
+ }
+ }
+
+ return Changed;
+}
+
void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB,
BoUpSLP &R) {
// Check if this block is inside a loop.
// Mark the insertion point for the block.
Instruction *Location = PreHeader->getTerminator();
- BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions();
- for (BoUpSLP::ValueList::iterator it = Gathers.begin(), e = Gathers.end();
+ BoUpSLP::InstrList &Gathers = R.getGatherSeqInstructions();
+ for (BoUpSLP::InstrList::iterator it = Gathers.begin(), e = Gathers.end();
it != e; ++it) {
- InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
+ InsertElementInst *Insert = dyn_cast_or_null<InsertElementInst>(*it);
// The InsertElement sequence can be simplified into a constant.
+ // Also ignore NULL pointers; they are only used to separate the
+ // different gather sequences.
if (!Insert)
continue;
/// Bottom Up SLP vectorization utility class.
struct BoUpSLP {
typedef SmallVector<Value*, 8> ValueList;
+ typedef SmallVector<Instruction*, 16> InstrList;
typedef SmallPtrSet<Value*, 16> ValueSet;
typedef SmallVector<StoreInst*, 8> StoreList;
static const int max_cost = 1<<20;
/// \returns the list of new instructions that were added in order to collect
/// scalars into vectors. This list can be used to further optimize the gather
/// sequences.
- ValueList &getGatherSeqInstructions() {return GatherInstructions; }
+ InstrList &getGatherSeqInstructions() {return GatherInstructions; }
private:
/// \brief This method contains the recursive part of getTreeCost.
/// A list of instructions that are used when gathering scalars into vectors.
/// In many cases these instructions can be hoisted outside of the BB.
/// Iterating over this list is faster than calling LICM.
- ValueList GatherInstructions;
+ /// Note: we insert NULL pointers to separate the different gather
+ /// sequences.
+ InstrList GatherInstructions;
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
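The NULL entries mentioned above are the only structure the gather list has: each gather sequence is recorded back to back and terminated by a NULL, and vectorizeUsingGatherHints recovers the per-block groups by splitting on those NULLs. A standalone sketch of that convention (hypothetical Instr type and helper names, not the LLVM API):

  #include <cstdio>
  #include <vector>

  struct Instr { int Id; };   // stand-in for an instruction

  // Record one gather sequence followed by a NULL delimiter.
  static void appendSequence(std::vector<Instr *> &List,
                             const std::vector<Instr *> &Seq) {
    List.insert(List.end(), Seq.begin(), Seq.end());
    List.push_back(nullptr);  // end-of-sequence marker
  }

  // Walk the flat list and hand each delimited group to a callback,
  // the same way the hints pass splits GatherInstructions.
  template <typename Fn>
  static void forEachSequence(const std::vector<Instr *> &List, Fn Visit) {
    std::vector<Instr *> Seq;
    for (Instr *I : List) {
      if (I) { Seq.push_back(I); continue; }
      if (!Seq.empty()) { Visit(Seq); Seq.clear(); }
    }
  }

  int main() {
    Instr A{0}, B{1}, C{2};
    std::vector<Instr *> Gathers;
    appendSequence(Gathers, {&A, &B});
    appendSequence(Gathers, {&C});
    forEachSequence(Gathers, [](const std::vector<Instr *> &S) {
      std::printf("sequence of %zu instructions\n", S.size());
    });
    return 0;
  }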
--- /dev/null
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; int foo(double *A, float *B, int g) {
+; float B0 = B[0];
+; float B1 = B[1]; <----- BasicBlock #1
+; B0 += 5;
+; B1 += 8;
+;
+; if (g) bar();
+;
+; A[0] += B0; <------- BasicBlock #3
+; A[1] += B1;
+; }
+
+
+;CHECK: @foo
+;CHECK: load <2 x float>
+;CHECK: fadd <2 x float>
+;CHECK: call i32
+;CHECK: load <2 x double>
+;CHECK: fadd <2 x double>
+;CHECK: store <2 x double>
+;CHECK: ret
+define i32 @foo(double* nocapture %A, float* nocapture %B, i32 %g) {
+entry:
+ %0 = load float* %B, align 4
+ %arrayidx1 = getelementptr inbounds float* %B, i64 1
+ %1 = load float* %arrayidx1, align 4
+ %add = fadd float %0, 5.000000e+00
+ %add2 = fadd float %1, 8.000000e+00
+ %tobool = icmp eq i32 %g, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ %call = tail call i32 (...)* @bar()
+ br label %if.end
+
+if.end:
+ %conv = fpext float %add to double
+ %2 = load double* %A, align 8
+ %add4 = fadd double %conv, %2
+ store double %add4, double* %A, align 8
+ %conv5 = fpext float %add2 to double
+ %arrayidx6 = getelementptr inbounds double* %A, i64 1
+ %3 = load double* %arrayidx6, align 8
+ %add7 = fadd double %conv5, %3
+ store double %add7, double* %arrayidx6, align 8
+ ret i32 undef
+}
+
+declare i32 @bar(...)
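For anyone regenerating or extending this test, the pseudo-code in the comment is nearly compilable as is; a complete translation is sketched below (the bar declaration and the explicit return are additions here, and the exact IR a given clang produces may differ from the hand-written function above):

  int bar();                    // the test declares i32 @bar(...)

  int foo(double *A, float *B, int g) {
    float B0 = B[0];            // loaded (and gathered) in the entry block
    float B1 = B[1];
    B0 += 5;
    B1 += 8;

    if (g) bar();

    A[0] += B0;                 // reused in the if.end block
    A[1] += B1;
    return 0;                   // the IR above returns undef instead
  }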