[SLP]Improve calculations of the cost for reused/reordered scalars.

author Alexey Bataev <a.bataev@outlook.com>

Thu, 15 Jul 2021 12:12:17 +0000 (05:12 -0700)

committer Alexey Bataev <a.bataev@outlook.com>

Fri, 16 Jul 2021 20:40:15 +0000 (13:40 -0700)
author Alexey Bataev <a.bataev@outlook.com>
Thu, 15 Jul 2021 12:12:17 +0000 (05:12 -0700)
committer Alexey Bataev <a.bataev@outlook.com>
Fri, 16 Jul 2021 20:40:15 +0000 (13:40 -0700)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

index e09e3e9..b88e5bb 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3621,6 +3621,27 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
    return Cost;
  }
  
+/// Shuffles \p Mask in accordance with the given \p SubMask, in place: the new Mask[I] is the old Mask[SubMask[I]] (or UndefMaskElem when rejected below) and the result has SubMask.size() elements.
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
+  if (SubMask.empty()) // Nothing to apply; Mask is left untouched.
+    return;
+  if (Mask.empty()) { // No mask accumulated yet: SubMask itself becomes the mask.
+    Mask.append(SubMask.begin(), SubMask.end());
+    return;
+  }
+  SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size()); // Fill value is only a placeholder; the loop below assigns every element.
+  int TermValue = std::min(Mask.size(), SubMask.size()); // Bound applied to both the lookup index and the looked-up value.
+  for (int I = 0, E = SubMask.size(); I < E; ++I) {
+    if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || // Short-circuit: both index checks run before Mask is indexed.
+        Mask[SubMask[I]] >= TermValue) {
+      NewMask[I] = UndefMaskElem; // Rejected lane: undef index, or index/value not below TermValue.
+      continue;
+    }
+    NewMask[I] = Mask[SubMask[I]];
+  }
+  Mask.swap(NewMask); // Replace the caller's mask with the composed one.
+}
+
  InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                        ArrayRef<Value *> VectorizedVals) {
    ArrayRef<Value*> VL = E->Scalars;
@@ -3633,6 +3654,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
    else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
      ScalarTy = IE->getOperand(1)->getType();
    auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+  auto *FinalVecTy = VecTy;
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  
    // If we have computed a smaller type for the expression, update VecTy so
@@ -3643,12 +3665,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
  
    unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
    bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
-  InstructionCost ReuseShuffleCost = 0;
-  if (NeedToShuffleReuses) {
-    ReuseShuffleCost =
-        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
-                            E->ReuseShuffleIndices);
-  }
+  if (NeedToShuffleReuses)
+    FinalVecTy =
+        FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers);
    // FIXME: it tries to fix a problem with MSVC buildbots.
    TargetTransformInfo &TTIRef = *TTI;
    auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
@@ -3737,6 +3756,10 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
              dbgs()
              << "SLP: perfect diamond match for gather bundle that starts with "
              << *VL.front() << ".\n");
+        if (NeedToShuffleReuses)
+          GatherCost =
+              TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  FinalVecTy, E->ReuseShuffleIndices);
        } else {
          LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
                            << " entries for bundle that starts with "
@@ -3744,16 +3767,15 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
          // Detected that instead of gather we can emit a shuffle of single/two
          // previously vectorized nodes. Add the cost of the permutation rather
          // than gather.
-        GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
+        ::addMask(Mask, E->ReuseShuffleIndices);
+        GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask);
        }
-      return ReuseShuffleCost + GatherCost;
+      return GatherCost;
      }
      if (isSplat(VL)) {
        // Found the broadcasting of the single scalar, calculate the cost as the
        // broadcast.
-      return ReuseShuffleCost +
-             TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
-                                 0);
+      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
      }
      if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
          allSameBlock(VL) &&
@@ -3771,11 +3793,36 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
          InstructionCost Cost =
              computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
          AdjustExtractsCost(Cost, /*IsGather=*/true);
-        return ReuseShuffleCost + Cost;
+        if (NeedToShuffleReuses)
+          Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                      FinalVecTy, E->ReuseShuffleIndices);
+        return Cost;
        }
      }
+    InstructionCost ReuseShuffleCost = 0;
+    if (NeedToShuffleReuses)
+      ReuseShuffleCost = TTI->getShuffleCost(
+          TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
      return ReuseShuffleCost + getGatherCost(VL);
    }
+  InstructionCost CommonCost = 0;
+  SmallVector<int> Mask;
+  if (!E->ReorderIndices.empty()) {
+    SmallVector<int> NewMask;
+    if (E->getOpcode() == Instruction::Store) {
+      // For stores the order is actually a mask.
+      NewMask.resize(E->ReorderIndices.size());
+      copy(E->ReorderIndices, NewMask.begin());
+    } else {
+      inversePermutation(E->ReorderIndices, NewMask);
+    }
+    ::addMask(Mask, NewMask);
+  }
+  if (NeedToShuffleReuses)
+    ::addMask(Mask, E->ReuseShuffleIndices);
+  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
+    CommonCost =
+        TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::ScatterVectorize) &&
           "Unhandled state");
@@ -3797,12 +3844,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
          for (unsigned I : E->ReuseShuffleIndices) {
            if (ShuffleOrOp == Instruction::ExtractElement) {
              auto *EE = cast<ExtractElementInst>(VL[I]);
-            ReuseShuffleCost -= TTI->getVectorInstrCost(
-                Instruction::ExtractElement, EE->getVectorOperandType(),
-                *getExtractIndex(EE));
+            CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  EE->getVectorOperandType(),
+                                                  *getExtractIndex(EE));
            } else {
-            ReuseShuffleCost -= TTI->getVectorInstrCost(
-                Instruction::ExtractElement, VecTy, Idx);
+            CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  VecTy, Idx);
              ++Idx;
            }
          }
@@ -3810,21 +3857,15 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
          for (Value *V : VL) {
            if (ShuffleOrOp == Instruction::ExtractElement) {
              auto *EE = cast<ExtractElementInst>(V);
-            ReuseShuffleCost += TTI->getVectorInstrCost(
-                Instruction::ExtractElement, EE->getVectorOperandType(),
-                *getExtractIndex(EE));
+            CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  EE->getVectorOperandType(),
+                                                  *getExtractIndex(EE));
            } else {
              --Idx;
-            ReuseShuffleCost += TTI->getVectorInstrCost(
-                Instruction::ExtractElement, VecTy, Idx);
+            CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  VecTy, Idx);
            }
          }
-        CommonCost = ReuseShuffleCost;
-      } else if (!E->ReorderIndices.empty()) {
-        SmallVector<int> NewMask;
-        inversePermutation(E->ReorderIndices, NewMask);
-        CommonCost = TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
        }
        if (ShuffleOrOp == Instruction::ExtractValue) {
          for (unsigned I = 0, E = VL.size(); I < E; ++I) {
@@ -3915,7 +3956,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
            TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
                                  TTI::getCastContextHint(VL0), CostKind, VL0);
        if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
        }
  
        // Calculate the cost of this instruction.
@@ -3925,12 +3966,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        InstructionCost VecCost = 0;
        // Check if the values are candidates to demote.
        if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
-        VecCost =
-            ReuseShuffleCost +
-            TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
-                                  TTI::getCastContextHint(VL0), CostKind, VL0);
+        VecCost = CommonCost + TTI->getCastInstrCost(
+                                   E->getOpcode(), VecTy, SrcVecTy,
+                                   TTI::getCastContextHint(VL0), CostKind, VL0);
        }
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
        return VecCost - ScalarCost;
      }
      case Instruction::FCmp:
@@ -3941,7 +3981,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
            TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
                                    CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
        if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
        }
        auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
        InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
@@ -3982,8 +4022,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                        CmpInst::BAD_ICMP_PREDICATE, CostKind);
          VecCost = std::min(VecCost, IntrinsicCost);
        }
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+      return CommonCost + VecCost - ScalarCost;
      }
      case Instruction::FNeg:
      case Instruction::Add:
@@ -4046,14 +4086,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
            TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
                                        Op2VK, Op1VP, Op2VP, Operands, VL0);
        if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
        }
        InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
        InstructionCost VecCost =
            TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
                                        Op2VK, Op1VP, Op2VP, Operands, VL0);
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+      return CommonCost + VecCost - ScalarCost;
      }
      case Instruction::GetElementPtr: {
        TargetTransformInfo::OperandValueKind Op1VK =
@@ -4064,13 +4104,13 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
            Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
        if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
        }
        InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
        InstructionCost VecCost = TTI->getArithmeticInstrCost(
            Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+      return CommonCost + VecCost - ScalarCost;
      }
      case Instruction::Load: {
        // Cost of wide load - cost of scalar loads.
@@ -4078,7 +4118,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
            Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
        if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
        }
        InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
        InstructionCost VecLdCost;
@@ -4095,14 +4135,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
              Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
              /*VariableMask=*/false, Alignment, CostKind, VL0);
        }
-      if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) {
-        SmallVector<int> NewMask;
-        inversePermutation(E->ReorderIndices, NewMask);
-        VecLdCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
-      }
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
-      return ReuseShuffleCost + VecLdCost - ScalarLdCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
+      return CommonCost + VecLdCost - ScalarLdCost;
      }
      case Instruction::Store: {
        // We know that we can merge the stores. Calculate the cost.
@@ -4115,14 +4149,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
        InstructionCost VecStCost = TTI->getMemoryOpCost(
            Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
-      if (IsReorder) {
-        SmallVector<int> NewMask;
-        inversePermutation(E->ReorderIndices, NewMask);
-        VecStCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
-      }
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
-      return VecStCost - ScalarStCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
+      return CommonCost + VecStCost - ScalarStCost;
      }
      case Instruction::Call: {
        CallInst *CI = cast<CallInst>(VL0);
@@ -4133,7 +4161,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        InstructionCost ScalarEltCost =
            TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
        if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
        }
        InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
  
@@ -4145,7 +4173,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                          << " (" << VecCallCost << "-" << ScalarCallCost << ")"
                          << " for " << *CI << "\n");
  
-      return ReuseShuffleCost + VecCallCost - ScalarCallCost;
+      return CommonCost + VecCallCost - ScalarCallCost;
      }
      case Instruction::ShuffleVector: {
        assert(E->isAltShuffle() &&
@@ -4158,11 +4186,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        if (NeedToShuffleReuses) {
          for (unsigned Idx : E->ReuseShuffleIndices) {
            Instruction *I = cast<Instruction>(VL[Idx]);
-          ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
+          CommonCost -= TTI->getInstructionCost(I, CostKind);
          }
          for (Value *V : VL) {
            Instruction *I = cast<Instruction>(V);
-          ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
+          CommonCost += TTI->getInstructionCost(I, CostKind);
          }
        }
        for (Value *V : VL) {
@@ -4196,8 +4224,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
        }
        VecCost +=
            TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0);
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+      return CommonCost + VecCost - ScalarCost;
      }
      default:
        llvm_unreachable("Unknown instruction");
@@ -4929,25 +4957,7 @@ public:
      addMask(NewMask);
    }
  
-  void addMask(ArrayRef<int> SubMask) {
-    if (SubMask.empty())
-      return;
-    if (Mask.empty()) {
-      Mask.append(SubMask.begin(), SubMask.end());
-      return;
-    }
-    SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size());
-    int TermValue = std::min(Mask.size(), SubMask.size());
-    for (int I = 0, E = SubMask.size(); I < E; ++I) {
-      if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
-          Mask[SubMask[I]] >= TermValue) {
-        NewMask[I] = UndefMaskElem;
-        continue;
-      }
-      NewMask[I] = Mask[SubMask[I]];
-    }
-    Mask.swap(NewMask);
-  }
+  void addMask(ArrayRef<int> SubMask) { ::addMask(Mask, SubMask); } // Forwards to the global-scope ::addMask with Mask (presumably this class's member; class not visible here) and SubMask.
  
    Value *finalize(Value *V) {
      IsFinalized = true;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll

index 5c2896b..3e794b4 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -1,6 +1,6 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -mtriple=x86_64--                 -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -mattr=avx512vl -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64--                 -S | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -mattr=avx512vl -S | FileCheck %s
  
  declare void @use1(i1)
  
@@ -334,54 +334,31 @@ define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) {
  }
  
  define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) {
-; SSE-LABEL: @logical_and_icmp_clamp_v8i32(
-; SSE-NEXT:    [[X0:%.*]] = extractelement <8 x i32> [[X:%.*]], i32 0
-; SSE-NEXT:    [[X1:%.*]] = extractelement <8 x i32> [[X]], i32 1
-; SSE-NEXT:    [[X2:%.*]] = extractelement <8 x i32> [[X]], i32 2
-; SSE-NEXT:    [[X3:%.*]] = extractelement <8 x i32> [[X]], i32 3
-; SSE-NEXT:    [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0
-; SSE-NEXT:    [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
-; SSE-NEXT:    [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
-; SSE-NEXT:    [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[X2]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[X3]], i32 3
-; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[Y0]], i32 4
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y1]], i32 5
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[Y2]], i32 6
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[Y3]], i32 7
-; SSE-NEXT:    [[TMP9:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], [[TMP8]]
-; SSE-NEXT:    [[TMP10:%.*]] = freeze <8 x i1> [[TMP9]]
-; SSE-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP10]])
-; SSE-NEXT:    ret i1 [[TMP11]]
-;
-; AVX512-LABEL: @logical_and_icmp_clamp_v8i32(
-; AVX512-NEXT:    [[X0:%.*]] = extractelement <8 x i32> [[X:%.*]], i32 0
-; AVX512-NEXT:    [[X1:%.*]] = extractelement <8 x i32> [[X]], i32 1
-; AVX512-NEXT:    [[X2:%.*]] = extractelement <8 x i32> [[X]], i32 2
-; AVX512-NEXT:    [[X3:%.*]] = extractelement <8 x i32> [[X]], i32 3
-; AVX512-NEXT:    [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0
-; AVX512-NEXT:    [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
-; AVX512-NEXT:    [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
-; AVX512-NEXT:    [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
-; AVX512-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
-; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2
-; AVX512-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X3]], i32 3
-; AVX512-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 42>
-; AVX512-NEXT:    [[D0:%.*]] = icmp slt i32 [[X0]], [[Y0]]
-; AVX512-NEXT:    [[D1:%.*]] = icmp slt i32 [[X1]], [[Y1]]
-; AVX512-NEXT:    [[D2:%.*]] = icmp slt i32 [[X2]], [[Y2]]
-; AVX512-NEXT:    [[D3:%.*]] = icmp slt i32 [[X3]], [[Y3]]
-; AVX512-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
-; AVX512-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
-; AVX512-NEXT:    [[S4:%.*]] = select i1 [[TMP7]], i1 [[D0]], i1 false
-; AVX512-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
-; AVX512-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; AVX512-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; AVX512-NEXT:    ret i1 [[S7]]
+; CHECK-LABEL: @logical_and_icmp_clamp_v8i32(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <8 x i32> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <8 x i32> [[X]], i32 1
+; CHECK-NEXT:    [[X2:%.*]] = extractelement <8 x i32> [[X]], i32 2
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <8 x i32> [[X]], i32 3
+; CHECK-NEXT:    [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
+; CHECK-NEXT:    [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X3]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 42>
+; CHECK-NEXT:    [[D0:%.*]] = icmp slt i32 [[X0]], [[Y0]]
+; CHECK-NEXT:    [[D1:%.*]] = icmp slt i32 [[X1]], [[Y1]]
+; CHECK-NEXT:    [[D2:%.*]] = icmp slt i32 [[X2]], [[Y2]]
+; CHECK-NEXT:    [[D3:%.*]] = icmp slt i32 [[X3]], [[Y3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT:    [[S4:%.*]] = select i1 [[TMP7]], i1 [[D0]], i1 false
+; CHECK-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
+; CHECK-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; CHECK-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; CHECK-NEXT:    ret i1 [[S7]]
  ;
    %x0 = extractelement <8 x i32> %x, i32 0
    %x1 = extractelement <8 x i32> %x, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll

index 3b36098..65f0277 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -272,18 +272,21 @@ define void @tiny_vector_gather(i32 *%a, i32 *%v1, i32 *%v2) {
  ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[V1:%.*]], align 4
  ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[V2:%.*]], align 4
  ; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR0]], align 16
  ; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR1]], align 4
  ; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR2]], align 8
  ; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR3]], align 4
  ; CHECK-NEXT:    [[PTR4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR4]], align 16
  ; CHECK-NEXT:    [[PTR5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 5
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR5]], align 4
  ; CHECK-NEXT:    [[PTR6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 6
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR6]], align 8
  ; CHECK-NEXT:    [[PTR7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 7
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[PTR0]] to <8 x i32>*
-; CHECK-NEXT:    store <8 x i32> [[SHUFFLE]], <8 x i32>* [[TMP5]], align 16
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR7]], align 4
  ; CHECK-NEXT:    ret void
  ;
    %1 = load i32, i32* %v1, align 4
author	Alexey Bataev <a.bataev@outlook.com>
	Thu, 15 Jul 2021 12:12:17 +0000 (05:12 -0700)
committer	Alexey Bataev <a.bataev@outlook.com>
	Fri, 16 Jul 2021 20:40:15 +0000 (13:40 -0700)
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll		patch \| blob \| history