From 560545b85f933520c7af38dbbc357f2ff9194c5f Mon Sep 17 00:00:00 2001 From: Hal Finkel Date: Thu, 1 Nov 2012 21:50:12 +0000 Subject: [PATCH] BBVectorize: Use target costs for incoming and outgoing values instead of the depth heuristic. When target cost information is available, compute explicit costs of inserting and extracting values from vectors. At this point, all costs are estimated using the target information, and the chain-depth heuristic is not needed. As a result, it is now, by default, disabled when using target costs. llvm-svn: 167256 --- llvm/lib/Transforms/Vectorize/BBVectorize.cpp | 200 ++++++++++++++++++++- .../Transforms/BBVectorize/X86/simple-ldstr.ll | 29 +++ llvm/test/Transforms/BBVectorize/X86/simple.ll | 81 +++++++-- 3 files changed, 290 insertions(+), 20 deletions(-) create mode 100644 llvm/test/Transforms/BBVectorize/X86/simple-ldstr.ll diff --git a/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/llvm/lib/Transforms/Vectorize/BBVectorize.cpp index 6e36ff7..4653a7d 100644 --- a/llvm/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/BBVectorize.cpp @@ -58,6 +58,11 @@ static cl::opt ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, cl::desc("The required chain depth for vectorization")); +static cl::opt +UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), + cl::Hidden, cl::desc("Use the chain depth requirement with" + " target information")); + static cl::opt SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, cl::desc("The maximum search distance for instruction pairs")); @@ -227,6 +232,9 @@ namespace { DenseMap &CandidatePairCostSavings, std::vector &PairableInsts, bool NonPow2Len); + // FIXME: The current implementation does not account for pairs that + // are connected in multiple ways. For example: + // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) enum PairConnectionType { PairConnectionDirect, PairConnectionSwap, @@ -1665,20 +1673,39 @@ namespace { int EffSize = 0; if (VTTI) { + DenseSet PrunedTreeInstrs; + for (DenseSet::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) { + PrunedTreeInstrs.insert(S->first); + PrunedTreeInstrs.insert(S->second); + } + + // The set of pairs that have already contributed to the total cost. + DenseSet IncomingPairs; + // The node weights represent the cost savings associated with // fusing the pair of instructions. for (DenseSet::iterator S = PrunedTree.begin(), E = PrunedTree.end(); S != E; ++S) { - if (getDepthFactor(S->first)) - EffSize += CandidatePairCostSavings.find(*S)->second; + bool FlipOrder = false; + + if (getDepthFactor(S->first)) { + int ESContrib = CandidatePairCostSavings.find(*S)->second; + DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" + << *S->first << " <-> " << *S->second << "} = " << + ESContrib << "\n"); + EffSize += ESContrib; + } - // The edge weights contribute in a negative sense: they represent - // the cost of shuffles. + // The edge weights contribute in a negative sense: they represent + // the cost of shuffles. VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S); if (IP.first != ConnectedPairDeps.end()) { unsigned NumDepsDirect = 0, NumDepsSwap = 0; for (std::multimap::iterator Q = IP.first; Q != IP.second; ++Q) { + if (!PrunedTree.count(Q->second)) + continue; DenseMap::iterator R = PairConnectionTypes.find(VPPair(Q->second, Q->first)); assert(R != PairConnectionTypes.end() && @@ -1692,12 +1719,14 @@ namespace { // If there are more swaps than direct connections, then // the pair order will be flipped during fusion. So the real // number of swaps is the minimum number. - bool FlipOrder = !FixedOrderPairs.count(*S) && + FlipOrder = !FixedOrderPairs.count(*S) && ((NumDepsSwap > NumDepsDirect) || FixedOrderPairs.count(ValuePair(S->second, S->first))); for (std::multimap::iterator Q = IP.first; Q != IP.second; ++Q) { + if (!PrunedTree.count(Q->second)) + continue; DenseMap::iterator R = PairConnectionTypes.find(VPPair(Q->second, Q->first)); assert(R != PairConnectionTypes.end() && @@ -1707,9 +1736,161 @@ namespace { Type *VTy = getVecTypeForPair(Ty1, Ty2); if ((R->second == PairConnectionDirect && FlipOrder) || (R->second == PairConnectionSwap && !FlipOrder) || - R->second == PairConnectionSplat) - EffSize -= (int) getInstrCost(Instruction::ShuffleVector, - VTy, VTy); + R->second == PairConnectionSplat) { + int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, VTy); + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *Q->second.first << " <-> " << *Q->second.second << + "} -> {" << + *S->first << " <-> " << *S->second << "} = " << + ESContrib << "\n"); + EffSize -= ESContrib; + } + } + } + + // Compute the cost of outgoing edges. We assume that edges outgoing + // to shuffles, inserts or extracts can be merged, and so contribute + // no additional cost. + if (!S->first->getType()->isVoidTy()) { + Type *Ty1 = S->first->getType(), + *Ty2 = S->second->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + + bool NeedsExtraction = false; + for (Value::use_iterator I = S->first->use_begin(), + IE = S->first->use_end(); I != IE; ++I) { + if (isa(*I) || + isa(*I) || + isa(*I)) + continue; + if (PrunedTreeInstrs.count(*I)) + continue; + NeedsExtraction = true; + break; + } + + if (NeedsExtraction) { + int ESContrib; + if (Ty1->isVectorTy()) + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + Ty1, VTy); + else + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::ExtractElement, VTy, 0); + + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *S->first << "} = " << ESContrib << "\n"); + EffSize -= ESContrib; + } + + NeedsExtraction = false; + for (Value::use_iterator I = S->second->use_begin(), + IE = S->second->use_end(); I != IE; ++I) { + if (isa(*I) || + isa(*I) || + isa(*I)) + continue; + if (PrunedTreeInstrs.count(*I)) + continue; + NeedsExtraction = true; + break; + } + + if (NeedsExtraction) { + int ESContrib; + if (Ty2->isVectorTy()) + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + Ty2, VTy); + else + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::ExtractElement, VTy, 1); + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *S->second << "} = " << ESContrib << "\n"); + EffSize -= ESContrib; + } + } + + // Compute the cost of incoming edges. + if (!isa(S->first) && !isa(S->first)) { + Instruction *S1 = cast(S->first), + *S2 = cast(S->second); + for (unsigned o = 0; o < S1->getNumOperands(); ++o) { + Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); + + // Combining constants into vector constants (or small vector + // constants into larger ones are assumed free). + if (isa(O1) && isa(O2)) + continue; + + if (FlipOrder) + std::swap(O1, O2); + + ValuePair VP = ValuePair(O1, O2); + ValuePair VPR = ValuePair(O2, O1); + + // Internal edges are not handled here. + if (PrunedTree.count(VP) || PrunedTree.count(VPR)) + continue; + + Type *Ty1 = O1->getType(), + *Ty2 = O2->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + + // Combining vector operations of the same type is also assumed + // folded with other operations. + if (Ty1 == Ty2 && + (isa(O1) || + isa(O1) || + isa(O1)) && + (isa(O2) || + isa(O2) || + isa(O2))) + continue; + + int ESContrib; + // This pair has already been formed. + if (IncomingPairs.count(VP)) { + continue; + } else if (IncomingPairs.count(VPR)) { + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, VTy); + } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, VTy, 0); + ESContrib += (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, VTy, 1); + } else if (!Ty1->isVectorTy()) { + // O1 needs to be inserted into a vector of size O2, and then + // both need to be shuffled together. + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, Ty2, 0); + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + VTy, Ty2); + } else if (!Ty2->isVectorTy()) { + // O2 needs to be inserted into a vector of size O1, and then + // both need to be shuffled together. + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, Ty1, 0); + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + VTy, Ty1); + } else { + Type *TyBig = Ty1, *TySmall = Ty2; + if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) + std::swap(TyBig, TySmall); + + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, TyBig); + if (TyBig != TySmall) + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + TyBig, TySmall); + } + + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" + << *O1 << " <-> " << *O2 << "} = " << + ESContrib << "\n"); + EffSize -= ESContrib; + IncomingPairs.insert(VP); } } } @@ -1724,7 +1905,8 @@ namespace { << *J->first << " <-> " << *J->second << "} of depth " << MaxDepth << " and size " << PrunedTree.size() << " (effective size: " << EffSize << ")\n"); - if (MaxDepth >= Config.ReqChainDepth && + if (((VTTI && !UseChainDepthWithTI) || + MaxDepth >= Config.ReqChainDepth) && EffSize > 0 && EffSize > BestEffSize) { BestMaxDepth = MaxDepth; BestEffSize = EffSize; diff --git a/llvm/test/Transforms/BBVectorize/X86/simple-ldstr.ll b/llvm/test/Transforms/BBVectorize/X86/simple-ldstr.ll new file mode 100644 index 0000000..0124399 --- /dev/null +++ b/llvm/test/Transforms/BBVectorize/X86/simple-ldstr.ll @@ -0,0 +1,29 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +; Simple 3-pair chain with loads and stores +define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly { +entry: + %i0 = load double* %a, align 8 + %i1 = load double* %b, align 8 + %mul = fmul double %i0, %i1 + %arrayidx3 = getelementptr inbounds double* %a, i64 1 + %i3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %b, i64 1 + %i4 = load double* %arrayidx4, align 8 + %mul5 = fmul double %i3, %i4 + store double %mul, double* %c, align 8 + %arrayidx5 = getelementptr inbounds double* %c, i64 1 + store double %mul5, double* %arrayidx5, align 8 + ret void +; CHECK: @test1 +; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* +; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* +; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8 +; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8 +; CHECK: %mul = fmul <2 x double> %i0, %i1 +; CHECK: %0 = bitcast double* %c to <2 x double>* +; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8 +; CHECK: ret void +} + diff --git a/llvm/test/Transforms/BBVectorize/X86/simple.ll b/llvm/test/Transforms/BBVectorize/X86/simple.ll index d11c9b9..0113e38 100644 --- a/llvm/test/Transforms/BBVectorize/X86/simple.ll +++ b/llvm/test/Transforms/BBVectorize/X86/simple.ll @@ -3,25 +3,84 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Basic depth-3 chain define double @test1(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %R = fmul double %Z1, %Z2 + ret double %R ; CHECK: @test1 +; CHECK-NOT: fmul <2 x double> +; CHECK: ret double %R +} + +; Basic chain +define double @test1a(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %W1 = fadd double %Y1, %Z1 + %W2 = fadd double %Y2, %Z2 + %V1 = fadd double %W1, %Z1 + %V2 = fadd double %W2, %Z2 + %Q1 = fadd double %W1, %V1 + %Q2 = fadd double %W2, %V2 + %S1 = fadd double %W1, %Q1 + %S2 = fadd double %W2, %Q2 + %R = fmul double %S1, %S2 + ret double %R +; CHECK: @test1a ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 ; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 +; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 +; CHECK: %W1 = fadd <2 x double> %Y1, %Z1 +; CHECK: %V1 = fadd <2 x double> %W1, %Z1 +; CHECK: %Q1 = fadd <2 x double> %W1, %V1 +; CHECK: %S1 = fadd <2 x double> %W1, %Q1 +; CHECK: %S1.v.r1 = extractelement <2 x double> %S1, i32 0 +; CHECK: %S1.v.r2 = extractelement <2 x double> %S1, i32 1 +; CHECK: %R = fmul double %S1.v.r1, %S1.v.r2 +; CHECK: ret double %R +} + +; Basic depth-3 chain (last pair permuted) +define double @test2(double %A1, double %A2, double %B1, double %B2) { %X1 = fsub double %A1, %B1 %X2 = fsub double %A2, %B2 -; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 %Y1 = fmul double %X1, %A1 %Y2 = fmul double %X2, %A2 -; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 - %Z1 = fadd double %Y1, %B1 - %Z2 = fadd double %Y2, %B2 -; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 + %Z1 = fadd double %Y2, %B1 + %Z2 = fadd double %Y1, %B2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test2 +; CHECK-NOT: fmul <2 x double> +; CHECK: ret double %R +} + +; Basic depth-4 chain (internal permutation) +define double @test4(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 + %Z1 = fadd double %Y2, %B1 + %Z2 = fadd double %Y1, %B2 + %W1 = fadd double %Y2, %Z1 + %W2 = fadd double %Y1, %Z2 %R = fmul double %Z1, %Z2 -; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 -; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 -; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 ret double %R +; CHECK: @test4 +; CHECK-NOT: fmul <2 x double> ; CHECK: ret double %R } @@ -37,8 +96,8 @@ define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) { %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> %R = mul <8 x i8> %Q1, %Q2 ret <8 x i8> %R -; CHECK-TI: @test6 -; CHECK-TI-NOT: sub <16 x i8> -; CHECK-TI: ret <8 x i8> +; CHECK: @test6 +; CHECK-NOT: sub <16 x i8> +; CHECK: ret <8 x i8> } -- 2.7.4