calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);
private:
+ /// The vectorization cost is a combination of the cost itself and a boolean
+ /// indicating whether any of the contributing operations will actually
+ /// operate on vector values after type legalization in the backend. If this
+ /// latter value is false, then all operations will be scalarized (i.e. no
+ /// vectorization has actually taken place).
+ typedef std::pair<unsigned, bool> VectorizationCostTy;
+
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width.
- unsigned expectedCost(unsigned VF);
+ VectorizationCostTy expectedCost(unsigned VF);
/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
- unsigned getInstructionCost(Instruction *I, unsigned VF);
+ VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
+
+ /// The cost-computation logic from getInstructionCost which provides
+ /// the vector type as an output parameter. \p VectorTy is set to the
+ /// vector type of \p I's result at vectorization factor \p VF; the
+ /// two-argument overload uses it to decide whether the operation will
+ /// really be vectorized after type legalization.
+ unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
/// Returns whether the instruction is a load or store and will be a emitted
/// as a vector operation.
return Factor;
}
- float Cost = expectedCost(1);
+ // expectedCost now returns a (cost, produces-vector-code) pair; only the
+ // numeric cost (.first) matters for the scalar (VF = 1) baseline.
+ float Cost = expectedCost(1).first;
#ifndef NDEBUG
const float ScalarCost = Cost;
#endif /* NDEBUG */
// Ignore scalar width, because the user explicitly wants vectorization.
if (ForceVectorization && VF > 1) {
Width = 2;
- Cost = expectedCost(Width) / (float)Width;
+ Cost = expectedCost(Width).first / (float)Width;
}
for (unsigned i=2; i <= VF; i*=2) {
// Notice that the vector loop needs to be executed less times, so
// we need to divide the cost of the vector loops by the width of
// the vector elements.
- float VectorCost = expectedCost(i) / (float)i;
+ VectorizationCostTy C = expectedCost(i);
+ float VectorCost = C.first / (float)i;
DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
(int)VectorCost << ".\n");
+ // C.second is false when no instruction in the loop would remain a vector
+ // operation after type legalization; such a "vector" loop is entirely
+ // scalarized code, so do not consider this width unless the user
+ // explicitly forced vectorization.
+ if (!C.second && !ForceVectorization) {
+ DEBUG(dbgs() << "LV: Not considering vector loop of width " << i <<
+ " because it will not generate any vector instructions.\n");
+ continue;
+ }
if (VectorCost < Cost) {
Cost = VectorCost;
Width = i;
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0)
- LoopCost = expectedCost(VF);
+ LoopCost = expectedCost(VF).first;
// Clamp the calculated IC to be between the 1 and the max interleave count
// that the target allows.
return RUs;
}
-unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
- unsigned Cost = 0;
+LoopVectorizationCostModel::VectorizationCostTy
+LoopVectorizationCostModel::expectedCost(unsigned VF) {
+ // Value-initialized pair: zero cost and, so far, no vector instructions.
+ VectorizationCostTy Cost;
// For each block.
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
- unsigned BlockCost = 0;
+ VectorizationCostTy BlockCost;
BasicBlock *BB = *bb;
// For each instruction in the old loop.
if (ValuesToIgnore.count(&*it))
continue;
- unsigned C = getInstructionCost(&*it, VF);
+ VectorizationCostTy C = getInstructionCost(&*it, VF);
// Check if we should override the cost.
if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+ // Only the numeric cost is overridden; the is-vector flag is kept.
- C = ForceTargetInstructionCost;
+ C.first = ForceTargetInstructionCost;
- BlockCost += C;
- DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
- VF << " For instruction: " << *it << '\n');
+ BlockCost.first += C.first;
+ // The block produces real vector code if any instruction in it does.
+ BlockCost.second |= C.second;
+ DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first <<
+ " for VF " << VF << " For instruction: " << *it << '\n');
}
// We assume that if-converted blocks have a 50% chance of being executed.
// When the code is scalar then some of the blocks are avoided due to CF.
// When the code is vectorized we execute all code paths.
if (VF == 1 && Legal->blockNeedsPredication(*bb))
- BlockCost /= 2;
+ BlockCost.first /= 2;
- Cost += BlockCost;
+ Cost.first += BlockCost.first;
+ Cost.second |= BlockCost.second;
}
return Cost;
Legal->hasStride(I->getOperand(1));
}
-unsigned
+LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// If we know that this instruction will remain uniform, check the cost of
// the scalar version.
if (Legal->isUniformAfterVectorization(I))
VF = 1;
+ Type *VectorTy;
+ unsigned C = getInstructionCost(I, VF, VectorTy);
+
+ // The instruction only yields real vector code when legalization splits
+ // VectorTy into fewer than VF parts; VF (or more) parts means one scalar
+ // operation per lane, i.e. full scalarization.
+ bool TypeNotScalarized = VF > 1 && !VectorTy->isVoidTy() &&
+ TTI.getNumberOfParts(VectorTy) < VF;
+ return VectorizationCostTy(C, TypeNotScalarized);
+}
+
+unsigned
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF,
+ Type *&VectorTy) {
Type *RetTy = I->getType();
if (VF > 1 && MinBWs.count(I))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
- Type *VectorTy = ToVectorTy(RetTy, VF);
+ // Report the vector type back to the caller instead of keeping it local.
+ VectorTy = ToVectorTy(RetTy, VF);
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
--- /dev/null
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+;
+; Both loops below operate only on i32 values. The a2q target enables QPX
+; and disables altivec/vsx (see the attributes), so integer vector
+; operations are presumably not legal on it and any "vectorized" i32 loop
+; would be fully scalarized. The CHECK-NOT verifies that the vectorizer
+; therefore emits no <N x i32> vector types at all.
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+; Function Attrs: nounwind
+define zeroext i32 @test() #0 {
+; CHECK-LABEL: @test
+; CHECK-NOT: x i32>
+
+entry:
+ %a = alloca [1600 x i32], align 4
+ %c = alloca [1600 x i32], align 4
+ %0 = bitcast [1600 x i32]* %a to i8*
+ call void @llvm.lifetime.start(i64 6400, i8* %0) #3
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ %1 = bitcast [1600 x i32]* %c to i8*
+ call void @llvm.lifetime.start(i64 6400, i8* %1) #3
+ %arraydecay = getelementptr inbounds [1600 x i32], [1600 x i32]* %a, i64 0, i64 0
+ %arraydecay1 = getelementptr inbounds [1600 x i32], [1600 x i32]* %c, i64 0, i64 0
+ %call = call signext i32 @bar(i32* %arraydecay, i32* %arraydecay1) #3
+ br label %for.body6
+
+; First loop: stores the (truncated) induction variable into %a.
+for.body: ; preds = %for.body, %entry
+ %indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.body ]
+ %arrayidx = getelementptr inbounds [1600 x i32], [1600 x i32]* %a, i64 0, i64 %indvars.iv25
+ %2 = trunc i64 %indvars.iv25 to i32
+ store i32 %2, i32* %arrayidx, align 4
+ %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
+ %exitcond27 = icmp eq i64 %indvars.iv.next26, 1600
+ br i1 %exitcond27, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup5: ; preds = %for.body6
+ call void @llvm.lifetime.end(i64 6400, i8* nonnull %1) #3
+ call void @llvm.lifetime.end(i64 6400, i8* %0) #3
+ ret i32 %add
+
+; Second loop: i32 sum reduction over %c.
+for.body6: ; preds = %for.body6, %for.cond.cleanup
+ %indvars.iv = phi i64 [ 0, %for.cond.cleanup ], [ %indvars.iv.next, %for.body6 ]
+ %s.022 = phi i32 [ 0, %for.cond.cleanup ], [ %add, %for.body6 ]
+ %arrayidx8 = getelementptr inbounds [1600 x i32], [1600 x i32]* %c, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %arrayidx8, align 4
+ %add = add i32 %3, %s.022
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1600
+ br i1 %exitcond, label %for.cond.cleanup5, label %for.body6
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+declare signext i32 @bar(i32*, i32*) #2
+
+attributes #0 = { nounwind "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" }
+attributes #3 = { nounwind }
+
--- /dev/null
+; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s
+;
+; The target features request SSE1 only (-sse2, -avx, -avx2), so integer
+; vector operations are presumably not legal and "vectorizing" this i32 sum
+; reduction would only produce scalarized code. The CHECK-NOT verifies that
+; the vectorizer emits no <N x i32> vector types.
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+define i32 @accum(i32* nocapture readonly %x, i32 %N) #0 {
+entry:
+; CHECK-LABEL: @accum
+; CHECK-NOT: x i32>
+
+ %cmp1 = icmp sgt i32 %N, 0
+ br i1 %cmp1, label %for.inc.preheader, label %for.end
+
+for.inc.preheader:
+ br label %for.inc
+
+; Loop body: i32 sum reduction over x[0..N).
+for.inc:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.inc.preheader ]
+ %sum.02 = phi i32 [ %add, %for.inc ], [ 0, %for.inc.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, %sum.02
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.inc
+
+for.end.loopexit:
+ %add.lcssa = phi i32 [ %add, %for.inc ]
+ br label %for.end
+
+for.end:
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret i32 %sum.0.lcssa
+
+; CHECK: ret i32
+}
+
+attributes #0 = { "target-cpu"="core2" "target-features"="+sse,-avx,-avx2,-sse2" }
+