/// Enable matching of interleaved access groups.
bool enableInterleavedAccessVectorization() const;
+ /// Enable matching of interleaved access groups that contain predicated
+ /// accesses and are vectorized using masked vector loads/stores.
+ bool enableMaskedInterleavedAccessVectorization() const;
+
/// Indicate that it is potentially unsafe to automatically vectorize
/// floating-point operations because the semantics of vector and scalar
/// floating-point operations may differ. For example, ARM NEON v7 SIMD math
/// load allows gaps)
/// \p Alignment is the alignment of the memory operation
/// \p AddressSpace is address space of the pointer.
+ /// \p IsMasked indicates if the memory access is predicated.
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace) const;
+ unsigned AddressSpace,
+ bool IsMasked = false) const;
/// Calculate the cost of performing a vector reduction.
///
virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
bool IsZeroCmp) const = 0;
virtual bool enableInterleavedAccessVectorization() = 0;
+ virtual bool enableMaskedInterleavedAccessVectorization() = 0;
virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
unsigned BitWidth,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) = 0;
+ unsigned AddressSpace,
+ bool IsMasked = false) = 0;
virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) = 0;
virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
bool enableInterleavedAccessVectorization() override {
return Impl.enableInterleavedAccessVectorization();
}
+ bool enableMaskedInterleavedAccessVectorization() override {
+ return Impl.enableMaskedInterleavedAccessVectorization();
+ }
bool isFPVectorizationPotentiallyUnsafe() override {
return Impl.isFPVectorizationPotentiallyUnsafe();
}
}
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace) override {
+ unsigned AddressSpace, bool IsMasked) override {
return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace, IsMasked);
}
int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) override {
bool enableInterleavedAccessVectorization() { return false; }
+ bool enableMaskedInterleavedAccessVectorization() { return false; }
+
bool isFPVectorizationPotentiallyUnsafe() { return false; }
bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned Alignment, unsigned AddressSpace,
+ bool IsMasked = false) {
return 1;
}
/// This function always sets a (possibly null) value for each K in Kinds.
Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
+/// Create a mask with replicated elements.
+///
+/// This function creates a shuffle mask for replicating each of the \p VF
+/// elements in a vector \p ReplicationFactor times. It can be used to
+/// transform a mask of \p VF elements into a mask of
+/// \p VF * \p ReplicationFactor elements used by a predicated
+/// interleaved-group of loads/stores whose Interleaved-factor ==
+/// \p ReplicationFactor.
+///
+/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
+///
+/// <0,0,0,1,1,1,2,2,2,3,3,3>
+Constant *createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor,
+ unsigned VF);
+
/// Create an interleave shuffle mask.
///
/// This function creates a shuffle mask for interleaving \p NumVecs vectors of
InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
DominatorTree *DT, LoopInfo *LI,
const LoopAccessInfo *LAI)
: PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
~InterleavedAccessInfo() {
SmallPtrSet<InterleaveGroup *, 4> DelSet;
/// Analyze the interleaved accesses and collect them in interleave
/// groups. Substitute symbolic strides using \p Strides.
- void analyzeInterleaving();
+ /// Consider also predicated loads/stores in the analysis if
+ /// \p EnablePredicatedInterleavedMemAccesses is true.
+ void analyzeInterleaving(bool EnablePredicatedInterleavedMemAccesses);
/// Check if \p Instr belongs to any interleave group.
bool isInterleaved(Instruction *Instr) const {
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned Alignment, unsigned AddressSpace,
+ bool IsMasked = false) {
VectorType *VT = dyn_cast<VectorType>(VecTy);
assert(VT && "Expect a vector type for interleaved memory op");
VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
// Firstly, the cost of load/store operation.
- unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
- Opcode, VecTy, Alignment, AddressSpace);
+ unsigned Cost;
+ if (IsMasked)
+ Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
+ Opcode, VecTy, Alignment, AddressSpace);
+ else
+ Cost = static_cast<T *>(this)->getMemoryOpCost(Opcode, VecTy, Alignment,
+ AddressSpace);
// Legalize the vector type, and get the legalized and unlegalized type
// sizes.
->getVectorInstrCost(Instruction::InsertElement, VT, i);
}
+ if (!IsMasked)
+ return Cost;
+
+ Type *I8Type = Type::getInt8Ty(VT->getContext());
+ VectorType *MaskVT = VectorType::get(I8Type, NumElts);
+ SubVT = VectorType::get(I8Type, NumSubElts);
+
+ // The mask shuffling cost is estimated as the cost of extracting all the
+ // elements of the mask and inserting each of them Factor times into the
+ // wide vector:
+ //
+ // E.g. an interleaved group with factor 3:
+ // %mask = icmp ult <8 x i32> %vec1, %vec2
+ // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
+ // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
+ // The cost is estimated as extracting all mask elements from the <8xi1>
+ // mask vector and inserting them Factor times into the <24xi1> shuffled
+ // mask vector.
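+ // I.e. for the example above: 8 extracts from the <8xi1> mask vector plus
+ // 24 inserts into the <24xi1> shuffled mask vector.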
+ for (unsigned i = 0; i < NumSubElts; i++)
+ Cost += static_cast<T *>(this)->getVectorInstrCost(
+ Instruction::ExtractElement, SubVT, i);
+
+ for (unsigned i = 0; i < NumElts; i++)
+ Cost += static_cast<T *>(this)->getVectorInstrCost(
+ Instruction::InsertElement, MaskVT, i);
+
return Cost;
}
return TTIImpl->enableInterleavedAccessVectorization();
}
+bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const {
+ return TTIImpl->enableMaskedInterleavedAccessVectorization();
+}
+
bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {
return TTIImpl->isFPVectorizationPotentiallyUnsafe();
}
int TargetTransformInfo::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace) const {
- int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ unsigned Alignment, unsigned AddressSpace, bool IsMasked) const {
+ int Cost = TTIImpl->getInterleavedMemoryOpCost(
+ Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
return Inst;
}
+Constant *llvm::createReplicatedMask(IRBuilder<> &Builder,
+ unsigned ReplicationFactor, unsigned VF) {
+ SmallVector<Constant *, 16> MaskVec;
+ for (unsigned i = 0; i < VF; i++)
+ for (unsigned j = 0; j < ReplicationFactor; j++)
+ MaskVec.push_back(Builder.getInt32(i));
+
+ return ConstantVector::get(MaskVec);
+}
+
Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
unsigned NumVecs) {
SmallVector<Constant *, 16> Mask;
// this group because it and (2) are dependent. However, (1) can be grouped
// with other accesses that may precede it in program order. Note that a
// bottom-up order does not imply that WAW dependences should not be checked.
-void InterleavedAccessInfo::analyzeInterleaving() {
+void InterleavedAccessInfo::analyzeInterleaving(
+ bool EnablePredicatedInterleavedMemAccesses) {
LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
const ValueToValueMap &Strides = LAI->getSymbolicStrides();
// create a group for B, we continue with the bottom-up algorithm to ensure
// we don't break any of B's dependences.
InterleaveGroup *Group = nullptr;
- // TODO: Ignore B if it is in a predicated block. This restriction can be
- // relaxed in the future once we handle masked interleaved groups.
- if (isStrided(DesB.Stride) && !isPredicated(B->getParent())) {
+ if (isStrided(DesB.Stride) &&
+ (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) {
Group = getInterleaveGroup(B);
if (!Group) {
LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
if (DistanceToB % static_cast<int64_t>(DesB.Size))
continue;
- // Ignore A if either A or B is in a predicated block. Although we
- // currently prevent group formation for predicated accesses, we may be
- // able to relax this limitation in the future once we handle more
- // complicated blocks.
- if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
+ // All members of a predicated interleave-group must have the same predicate,
+ // and currently must reside in the same BB.
+ BasicBlock *BlockA = A->getParent();
+ BasicBlock *BlockB = B->getParent();
+ if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
+ (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
continue;
// The index of A is the index of B plus A's distance to B in multiples
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace,
+ bool IsMasked) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
- if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+ if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace, IsMasked);
}
int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, bool IsMasked = false);
bool
shouldConsiderAddressTypePromotion(const Instruction &I,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace,
+ bool IsMasked) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
// vldN/vstN doesn't support vector types of i64/f64 element.
bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
- if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
+ if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
+ !IsMasked) {
unsigned NumElts = VecTy->getVectorNumElements();
auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace, IsMasked);
}
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, bool IsMasked);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace) {
- if (Indices.size() != Factor)
+ unsigned Alignment, unsigned AddressSpace, bool IsMasked) {
+ if (Indices.size() != Factor || IsMasked)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace, IsMasked);
return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr);
}
bool VariableMask, unsigned Alignment);
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, bool IsMasked);
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I);
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace,
+ bool IsMasked) {
+ if (IsMasked)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, IsMasked);
+
assert(isa<VectorType>(VecTy) &&
"Expect a vector type for interleaved memory op");
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace,
+ bool IsMasked = false);
/// @}
};
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace,
+ bool IsMasked) {
+ if (IsMasked)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, IsMasked);
assert(isa<VectorType>(VecTy) &&
"Expect a vector type for interleaved memory op");
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, bool IsMasked = false);
/// @}
};
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace,
+ bool IsMasked) {
+
+ if (IsMasked)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, IsMasked);
// We currently support only fully-interleaved groups, with no gaps.
// TODO: Support also strided loads (interleaved-groups with gaps).
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace,
+ bool IsMasked) {
+
+ if (IsMasked)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, IsMasked);
// VecTy for interleave memop is <VF*Factor x Elt>.
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace,
+ bool IsMasked) {
auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
Type *EltTy = VecTy->getVectorElementType();
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
};
if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace, IsMasked);
if (ST->hasAVX2())
return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace, IsMasked);
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace, IsMasked);
}
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace);
+ unsigned Alignment, unsigned AddressSpace,
+ bool IsMasked = false);
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace);
+ unsigned Alignment, unsigned AddressSpace,
+ bool IsMasked = false);
int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace);
+ unsigned Alignment, unsigned AddressSpace,
+ bool IsMasked = false);
int getIntImmCost(int64_t);
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
+static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
+ "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+ cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
+
/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;
/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
- /// Try to vectorize the interleaved access group that \p Instr belongs to.
- void vectorizeInterleaveGroup(Instruction *Instr);
+ /// Try to vectorize the interleaved access group that \p Instr belongs to,
+ /// optionally masking the vector operations if \p BlockInMask is non-null.
+ void vectorizeInterleaveGroup(Instruction *Instr,
+ VectorParts *BlockInMask = nullptr);
/// Vectorize Load and Store instructions, optionally masking the vector
/// operations if \p BlockInMask is non-null.
/// access that can be widened.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+ /// Returns true if \p I is a memory instruction in an interleaved-group
+ /// of memory accesses that can be vectorized with wide vector loads/stores
+ /// and shuffles.
+ bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
+
/// Check if \p Instr belongs to any interleaved access group.
bool isAccessInterleaved(Instruction *Instr) {
return InterleaveInfo.isInterleaved(Instr);
// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
+void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
+ VectorParts *BlockInMask) {
const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
assert(Group && "Fail to get an interleaved access group.");
SmallVector<Value *, 2> NewPtrs;
unsigned Index = Group->getIndex(Instr);
+ VectorParts Mask;
+ bool IsMaskRequired = BlockInMask;
+ if (IsMaskRequired) {
+ Mask = *BlockInMask;
+ // TODO: extend the masked interleaved-group support to reversed access.
+ assert(!Group->isReverse() && "Reversed masked interleave-group "
+ "not supported.");
+ }
+
// If the group is reverse, adjust the index to refer to the last vector lane
// instead of the first. We adjust the index from the first vector lane,
// rather than directly getting the pointer for lane VF - 1, because the
// For each unroll part, create a wide load for the group.
SmallVector<Value *, 2> NewLoads;
for (unsigned Part = 0; Part < UF; Part++) {
- auto *NewLoad = Builder.CreateAlignedLoad(
- NewPtrs[Part], Group->getAlignment(), "wide.vec");
+ Instruction *NewLoad;
+ if (IsMaskRequired) {
+ auto *Undefs = UndefValue::get(Mask[Part]->getType());
+ auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
+ Mask[Part], Undefs, RepMask, "interleaved.mask");
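+ // The shuffled mask replicates each bit of the block mask
+ // InterleaveFactor times, e.g. for VF=8 and InterleaveFactor=2 an
+ // <8 x i1> mask <m0,...,m7> becomes the <16 x i1> mask
+ // <m0,m0,m1,m1,...,m7,m7>.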
+ NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
+ ShuffledMask, UndefVec,
+ "wide.masked.vec");
+ } else
+ NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part],
+ Group->getAlignment(), "wide.vec");
Group->addMetadata(NewLoad);
NewLoads.push_back(NewLoad);
}
Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
"interleaved.vec");
- Instruction *NewStoreInstr =
- Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
+ Instruction *NewStoreInstr;
+ if (IsMaskRequired) {
+ auto *Undefs = UndefValue::get(Mask[Part]->getType());
+ auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
+ Mask[Part], Undefs, RepMask, "interleaved.mask");
+ NewStoreInstr = Builder.CreateMaskedStore(
+ IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
+ } else
+ NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
+ Group->getAlignment());
Group->addMetadata(NewStoreInstr);
}
return false;
}
+static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
+ // If an override option has been passed in for masked interleaved
+ // accesses, use it.
+ if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
+ return EnableMaskedInterleavedMemAccesses;
+
+ return TTI.enableMaskedInterleavedAccessVectorization();
+}
+
+bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
+ unsigned VF) {
+ assert(isAccessInterleaved(I) && "Expecting interleaved access.");
+ assert(getWideningDecision(I, VF) == CM_Unknown &&
+ "Decision should not be set yet.");
+
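+ // Unpredicated accesses in an interleave group can always be widened;
+ // predicated accesses additionally require target support for masked
+ // interleaved groups and a legal masked load/store of the value type.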
+ if (!Legal->blockNeedsPredication(I->getParent()) ||
+ !Legal->isMaskRequired(I))
+ return true;
+
+ if (!useMaskedInterleavedAccesses(TTI))
+ return false;
+
+ auto *Ty = getMemInstValueType(I);
+ return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
+ : TTI.isLegalMaskedStore(Ty);
+}
+
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
unsigned VF) {
// Get and ensure we have a valid memory instruction.
}
// Calculate the cost of the whole interleaved group.
- unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
- Group->getFactor(), Indices,
- Group->getAlignment(), AS);
-
- if (Group->isReverse())
+ unsigned Cost = TTI.getInterleavedMemoryOpCost(
+ I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
+ Group->getAlignment(), AS, Legal->isMaskRequired(I));
+
+ if (Group->isReverse()) {
+ // TODO: Add support for reversed masked interleaved access.
+ assert(!Legal->isMaskRequired(I) &&
+ "Reverse masked interleaved access not supported.");
Cost += Group->getNumMembers() *
TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ }
return Cost;
}
continue;
NumAccesses = Group->getNumMembers();
- InterleaveCost = getInterleaveGroupCost(&I, VF);
+ if (interleavedAccessCanBeWidened(&I, VF))
+ InterleaveCost = getInterleaveGroupCost(&I, VF);
}
unsigned GatherScatterCost =
}
VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
- VFRange &Range) {
+ VFRange &Range,
+ VPlanPtr &Plan) {
const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I);
if (!IG)
return nullptr;
assert(I == IG->getInsertPos() &&
"Generating a recipe for an adjunct member of an interleave group");
- return new VPInterleaveRecipe(IG);
+ VPValue *Mask = nullptr;
+ if (Legal->isMaskRequired(I))
+ Mask = createBlockInMask(I->getParent(), Plan);
+
+ return new VPInterleaveRecipe(IG, Mask);
}
VPWidenMemoryInstructionRecipe *
VPRecipeBase *Recipe = nullptr;
// Check if Instr should belong to an interleave memory recipe, or already
// does. In the latter case Instr is irrelevant.
- if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
+ if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
VPBB->appendRecipe(Recipe);
return true;
}
O << " +\n"
<< Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
+ if (User) {
+ O << ", ";
+ User->getOperand(0)->printAsOperand(O);
+ }
O << "\\l\"";
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *I = IG->getMember(i))
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.");
- State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+ if (!User)
+ return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+
+ // Last (and currently only) operand is a mask.
+ InnerLoopVectorizer::VectorParts MaskValues(State.UF);
+ VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ MaskValues[Part] = State.get(Mask, Part);
+ State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
}
void VPReplicateRecipe::execute(VPTransformState &State) {
// Analyze interleaved memory accesses.
if (UseInterleaved) {
- IAI.analyzeInterleaving();
+ IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
}
// Use the cost model.
/// \return value is <true, nullptr>, as it is handled by another recipe.
/// \p Range.End may be decreased to ensure same decision from \p Range.Start
/// to \p Range.End.
- VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+ VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range,
+ VPlanPtr &Plan);
/// Check if \I is a memory instruction to be widened for \p Range.Start and
/// potentially masked. Such instructions are handled by a recipe that takes
class VPInterleaveRecipe : public VPRecipeBase {
private:
const InterleaveGroup *IG;
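+ // VPUser wrapping the mask operand, if present; it registers this recipe
+ // as a user of the mask.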
+ std::unique_ptr<VPUser> User;
+
public:
- VPInterleaveRecipe(const InterleaveGroup *IG)
- : VPRecipeBase(VPInterleaveSC), IG(IG) {}
+ VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask)
+ : VPRecipeBase(VPInterleaveSC), IG(IG) {
+ if (Mask) // Create a VPUser to register as a user of the mask.
+ User.reset(new VPUser({Mask}));
+ }
~VPInterleaveRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
--- /dev/null
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; When masked-interleaved-groups are disabled:
+; Check that the predicated load is not vectorized as an
+; interleaved-group but rather as scalarized accesses.
+; (For SKX, gather is not supported by the compiler for chars, therefore
+; the only remaining alternative is to scalarize).
+; When masked-interleaved-groups are enabled we expect to find the proper
+; mask shuffling code, feeding the wide masked load for an interleave-group
+; (with a single member).
+;
+; void masked_strided1(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard) {
+; for(int ix=0; ix < 1024; ++ix) {
+; if (ix > guard) {
+; char t = p[2*ix];
+; q[ix] = t;
+; }
+; }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.09, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.09, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+ store i8 %0, i8* %arrayidx3, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.09, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; Check also a scenario with full interleave-groups (no gaps) as well as both
+; load and store groups. We check that when masked-interleaved-groups are
+; disabled the predicated loads (and stores) are not vectorized as
+; interleaved-groups but rather as four separate scalarized accesses.
+; (For SKX, gather/scatter is not supported by the compiler for chars,
+; therefore the only remaining alternative is to scalarize).
+; When masked-interleaved-groups are enabled we expect to find the proper
+; mask shuffling code, feeding the wide masked loads/stores of the two
+; interleave-groups.
+;
+; void masked_strided2(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard) {
+; for(int ix=0; ix < 1024; ++ix) {
+; if (ix > guard) {
+; char left = p[2*ix];
+; char right = p[2*ix + 1];
+; char max = max(left, right);
+; q[2*ix] = max;
+; q[2*ix+1] = 0 - max;
+; }
+; }
+;}
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+;ENABLED_MASKED_STRIDED: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask)
+
+; Function Attrs: norecurse nounwind
+define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.024, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.024, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %add = or i32 %mul, 1
+ %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+ %1 = load i8, i8* %arrayidx4, align 1
+ %cmp.i = icmp slt i8 %0, %1
+ %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+ %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 %spec.select.i, i8* %arrayidx6, align 1
+ %sub = sub i8 0, %spec.select.i
+ %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 %sub, i8* %arrayidx11, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.024, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
--- /dev/null
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; We test here that the loop-vectorizer forms an interleave-groups from
+; predicated memory accesses only if they are both in the same (predicated)
+; block (first scenario below).
+; If the accesses are not in the same predicated block, an interleave-group
+; is not formed (scenarios 2,3 below).
+
+; Scenario 1: Check the case where it is legal to create masked interleave-
+; groups. Altogether two groups are created (one for loads and one for stores)
+; when masked-interleaved-accesses are enabled. When masked-interleaved-accesses
+; are disabled we do not create any interleave-group.
+;
+; void masked_strided1(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard) {
+; for(int ix=0; ix < 1024; ++ix) {
+; if (ix > guard) {
+; char left = p[2*ix];
+; char right = p[2*ix + 1];
+; char max = max(left, right);
+; q[2*ix] = max;
+; q[2*ix+1] = 0 - max;
+; }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1"
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided1"
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted: store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: into the interleave group with store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted: %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: into the interleave group with %{{.*}} = load i8, i8* %{{.*}}, align 1
+
+; Scenario 2: Check the case where it is illegal to create a masked interleave-
+; group because one of the two accesses is predicated and the other isn't
+; (in the IR below, the store of 1 is unconditional and the store of 2 is
+; predicated).
+; We therefore create a separate interleave-group with gaps for each of the
+; stores (if masked-interleaved-accesses are enabled) and these are later
+; invalidated because interleave-groups of stores with gaps are not supported.
+; If masked-interleaved-accesses are not enabled we create only one interleave
+; group of stores (for the non-predicated store) and it is later invalidated
+; due to gaps.
+;
+; void masked_strided2(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard) {
+; for(int ix=0; ix < 1024; ++ix) {
+;     q[2*ix] = 1;
+;     if (ix > guard) {
+;         q[2*ix+1] = 2;
+;     }
+; }
+;}
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2"
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1
+; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided2"
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; Scenario 3: Check the case where it is illegal to create a masked interleave-
+; group because the two accesses are in separate predicated blocks.
+; We therefore create a separate interleave-group with gaps for each of the
+; accesses (which are later invalidated because interleave-groups of stores
+; with gaps are not supported).
+; If masked-interleaved-accesses are not enabled we don't create any
+; interleave-group because all accesses are predicated.
+;
+; void masked_strided3(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard1,
+; unsigned char guard2) {
+; for(int ix=0; ix < 1024; ++ix) {
+; if (ix > guard1) {
+; q[2*ix] = 1;
+; }
+; if (ix > guard2) {
+; q[2*ix+1] = 2;
+; }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3"
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided3"
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+target triple = "i386-unknown-linux-gnu"
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.024, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.024, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %add = or i32 %mul, 1
+ %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+ %1 = load i8, i8* %arrayidx4, align 1
+ %cmp.i = icmp slt i8 %0, %1
+ %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+ %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 %spec.select.i, i8* %arrayidx6, align 1
+ %sub = sub i8 0, %spec.select.i
+ %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 %sub, i8* %arrayidx11, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.024, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+
+define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %mul = shl nuw nsw i32 %ix.012, 1
+ %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 1, i8* %arrayidx, align 1
+ %cmp1 = icmp ugt i32 %ix.012, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %add = or i32 %mul, 1
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 2, i8* %arrayidx3, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.012, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+
+define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 {
+entry:
+ %conv = zext i8 %guard1 to i32
+ %conv3 = zext i8 %guard2 to i32
+ br label %for.body
+
+for.body:
+ %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %mul = shl nuw nsw i32 %ix.018, 1
+ %cmp1 = icmp ugt i32 %ix.018, %conv
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+ %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 1, i8* %arrayidx, align 1
+ br label %if.end
+
+if.end:
+ %cmp4 = icmp ugt i32 %ix.018, %conv3
+ br i1 %cmp4, label %if.then6, label %for.inc
+
+if.then6:
+ %add = or i32 %mul, 1
+ %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 2, i8* %arrayidx7, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.018, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+attributes #0 = { "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
%pair = type { i64, i64 }