[LV] Don't attempt to type-shrink scalarized instructions

author Matthew Simpson <mssimpso@codeaurora.org>

Fri, 16 Dec 2016 16:52:35 +0000 (16:52 +0000)

committer Matthew Simpson <mssimpso@codeaurora.org>

Fri, 16 Dec 2016 16:52:35 +0000 (16:52 +0000)
author Matthew Simpson <mssimpso@codeaurora.org>
Fri, 16 Dec 2016 16:52:35 +0000 (16:52 +0000)
committer Matthew Simpson <mssimpso@codeaurora.org>
Fri, 16 Dec 2016 16:52:35 +0000 (16:52 +0000)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

index e434ca2..f52b27a 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1917,6 +1917,13 @@ public:
      return Scalars->second.count(I);
    }
  
+  /// \returns True only if \p VF > 1, \p I has a MinBWs entry, and \p I is
+  /// neither profitable to scalarize at \p VF nor scalar after vectorization.
+  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
+    return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
+           !Legal->isScalarAfterVectorization(I); // scalars keep original type
+  }
+
  private:
    /// The vectorization cost is a combination of the cost itself and a boolean
    /// indicating whether any of the contributing operations will actually
@@ -3725,6 +3732,11 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
    //
    SmallPtrSet<Value *, 4> Erased;
    for (const auto &KV : Cost->getMinimalBitwidths()) {
+    // If the value wasn't vectorized, we must maintain the original scalar
+    // type. The absence of the value from VectorLoopValueMap indicates that it
+    // wasn't vectorized.
+    if (!VectorLoopValueMap.hasVector(KV.first))
+      continue;
      VectorParts &Parts = VectorLoopValueMap.getVector(KV.first);
      for (Value *&I : Parts) {
        if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
@@ -3817,6 +3829,11 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  
    // We'll have created a bunch of ZExts that are now parentless. Clean up.
    for (const auto &KV : Cost->getMinimalBitwidths()) {
+    // If the value wasn't vectorized, we must maintain the original scalar
+    // type. The absence of the value from VectorLoopValueMap indicates that it
+    // wasn't vectorized.
+    if (!VectorLoopValueMap.hasVector(KV.first))
+      continue;
      VectorParts &Parts = VectorLoopValueMap.getVector(KV.first);
      for (Value *&I : Parts) {
        ZExtInst *Inst = dyn_cast<ZExtInst>(I);
@@ -6837,7 +6854,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                          unsigned VF,
                                                          Type *&VectorTy) {
    Type *RetTy = I->getType();
-  if (VF > 1 && MinBWs.count(I))
+  if (canTruncateToMinimalBitwidth(I, VF))
      RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
    VectorTy = ToVectorTy(RetTy, VF);
    auto SE = PSE.getSE();
@@ -6958,9 +6975,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
    case Instruction::FCmp: {
      Type *ValTy = I->getOperand(0)->getType();
      Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
-    auto It = MinBWs.find(Op0AsInstruction);
-    if (VF > 1 && It != MinBWs.end())
-      ValTy = IntegerType::get(ValTy->getContext(), It->second);
+    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
+      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
      VectorTy = ToVectorTy(ValTy, VF);
      return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
    }
@@ -7108,7 +7124,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
  
      Type *SrcScalarTy = I->getOperand(0)->getType();
      Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
-    if (VF > 1 && MinBWs.count(I)) {
+    if (canTruncateToMinimalBitwidth(I, VF)) {
        // This cast is going to be shrunk. This may remove the cast or it might
        // turn it into slightly different cast. For example, if MinBW == 16,
        // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll

index 3912375..c29af06 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -131,3 +131,56 @@ for.inc26:
    %iNewChunks.1.lcssa = phi i32 [ undef, %for.body9 ], [ %iNewChunks.2, %for.inc23 ]
    unreachable
  }
+
+; VEC-LABEL: @minimal_bit_widths(
+;
+; In the test below, it's more profitable for the expression feeding the
+; conditional store to remain scalar. Since we can only type-shrink vector
+; types, we shouldn't try to represent the expression in a smaller type.
+;
+; VEC: vector.body:
+; VEC:   %wide.load = load <2 x i8>, <2 x i8>* {{.*}}, align 1
+; VEC:   br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; VEC: [[IF0]]:
+; VEC:   %[[E0:.+]] = extractelement <2 x i8> %wide.load, i32 0
+; VEC:   %[[Z0:.+]] = zext i8 %[[E0]] to i32
+; VEC:   %[[T0:.+]] = trunc i32 %[[Z0]] to i8
+; VEC:   store i8 %[[T0]], i8* {{.*}}, align 1
+; VEC:   br label %[[CONT0]]
+; VEC: [[CONT0]]:
+; VEC:   br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; VEC: [[IF1]]:
+; VEC:   %[[E1:.+]] = extractelement <2 x i8> %wide.load, i32 1
+; VEC:   %[[Z1:.+]] = zext i8 %[[E1]] to i32
+; VEC:   %[[T1:.+]] = trunc i32 %[[Z1]] to i8
+; VEC:   store i8 %[[T1]], i8* {{.*}}, align 1
+; VEC:   br label %[[CONT1]]
+; VEC: [[CONT1]]:
+; VEC:   br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @minimal_bit_widths(i1 %c) {
+entry:
+  br label %for.body
+
+for.body:
+  %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ]
+  %tmp1 = phi i64 [ %tmp7, %for.inc ], [ undef, %entry ]
+  %tmp2 = getelementptr i8, i8* undef, i64 %tmp0
+  %tmp3 = load i8, i8* %tmp2, align 1
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp4 = zext i8 %tmp3 to i32
+  %tmp5 = trunc i32 %tmp4 to i8
+  store i8 %tmp5, i8* %tmp2, align 1
+  br label %for.inc
+
+for.inc:
+  %tmp6 = add nuw nsw i64 %tmp0, 1
+  %tmp7 = add i64 %tmp1, -1
+  %tmp8 = icmp eq i64 %tmp7, 0
+  br i1 %tmp8, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
author	Matthew Simpson <mssimpso@codeaurora.org>
	Fri, 16 Dec 2016 16:52:35 +0000 (16:52 +0000)
committer	Matthew Simpson <mssimpso@codeaurora.org>
	Fri, 16 Dec 2016 16:52:35 +0000 (16:52 +0000)
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/if-pred-stores.ll		patch \| blob \| history