From 2cd34bb5857b74561277cc146a2558c2fb93e91c Mon Sep 17 00:00:00 2001
From: Hao Liu <Hao.Liu@arm.com>
Date: Fri, 26 Jun 2015 02:45:36 +0000
Subject: [PATCH] [ARM] Lower interleaved memory accesses to vldN/vstN
 intrinsics. This patch also adds a function to calculate the cost of
 interleaved memory accesses.

E.g. Lower an interleaved load:
        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>
        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>
     into:
        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1

E.g. Lower an interleaved store:
        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
     into:
        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)

Differential Revision: http://reviews.llvm.org/D10533

llvm-svn: 240755
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp           | 161 +++++++++++++++++
 llvm/lib/Target/ARM/ARMISelLowering.h             |   9 +
 llvm/lib/Target/ARM/ARMTargetMachine.cpp          |   4 +
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp    |  25 +++
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h      |   5 +
 llvm/test/CodeGen/ARM/arm-interleaved-accesses.ll | 204 ++++++++++++++++++++++
 6 files changed, 408 insertions(+)
 create mode 100644 llvm/test/CodeGen/ARM/arm-interleaved-accesses.ll
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index ac4233c..4b2105b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -11404,6 +11404,167 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
               Addr});
 }
 
+/// \brief Lower an interleaved load into a vldN intrinsic.
+///
+/// E.g. Lower an interleaved load (Factor = 2):
+///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
+///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
+///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
+///
+///      Into:
+///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
+///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
+///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
+bool ARMTargetLowering::lowerInterleavedLoad(
+    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    ArrayRef<unsigned> Indices, unsigned Factor) const {
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+  assert(!Shuffles.empty() && "Empty shufflevector input");
+  assert(Shuffles.size() == Indices.size() &&
+         "Unmatched number of shufflevectors and indices");
+
+  VectorType *VecTy = Shuffles[0]->getType();
+  Type *EltTy = VecTy->getVectorElementType();
+
+  const DataLayout *DL = getDataLayout();
+  unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy);
+  bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64;
+
+  // Skip illegal vector types and vector types of i64/f64 element (vldN doesn't
+  // support i64/f64 element).
+  if ((VecSize != 64 && VecSize != 128) || EltIs64Bits)
+    return false;
+
+  // A pointer vector can not be the return type of the ldN intrinsics. Need to
+  // load integer vectors first and then convert to pointer vectors.
+  if (EltTy->isPointerTy())
+    VecTy = VectorType::get(DL->getIntPtrType(EltTy),
+                            VecTy->getVectorNumElements());
+
+  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
+                                            Intrinsic::arm_neon_vld3,
+                                            Intrinsic::arm_neon_vld4};
+
+  Function *VldnFunc =
+      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy);
+
+  IRBuilder<> Builder(LI);
+  SmallVector<Value *, 2> Ops;
+
+  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+  Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
+  Ops.push_back(Builder.getInt32(LI->getAlignment()));
+
+  CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+
+  // Replace uses of each shufflevector with the corresponding vector loaded
+  // by ldN.
+  for (unsigned i = 0; i < Shuffles.size(); i++) {
+    ShuffleVectorInst *SV = Shuffles[i];
+    unsigned Index = Indices[i];
+
+    Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+
+    // Convert the integer vector to pointer vector if the element is pointer.
+    if (EltTy->isPointerTy())
+      SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+
+    SV->replaceAllUsesWith(SubVec);
+  }
+
+  return true;
+}
+
+/// \brief Get a mask consisting of sequential integers starting from \p Start.
+///
+/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
+static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
+                                   unsigned NumElts) {
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned i = 0; i < NumElts; i++)
+    Mask.push_back(Builder.getInt32(Start + i));
+
+  return ConstantVector::get(Mask);
+}
+
+/// \brief Lower an interleaved store into a vstN intrinsic.
+///
+/// E.g. Lower an interleaved store (Factor = 3):
+///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
+///
+///      Into:
+///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
+///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
+///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
+///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
+///
+/// Note that the new shufflevectors will be removed and we'll only generate one
+/// vst3 instruction in CodeGen.
+bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+                                              ShuffleVectorInst *SVI,
+                                              unsigned Factor) const {
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+
+  VectorType *VecTy = SVI->getType();
+  assert(VecTy->getVectorNumElements() % Factor == 0 &&
+         "Invalid interleaved store");
+
+  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+  Type *EltTy = VecTy->getVectorElementType();
+  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+
+  const DataLayout *DL = getDataLayout();
+  unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy);
+  bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64;
+
+  // Skip illegal sub vector types and vector types of i64/f64 element (vstN
+  // doesn't support i64/f64 element).
+  if ((SubVecSize != 64 && SubVecSize != 128) || EltIs64Bits)
+    return false;
+
+  Value *Op0 = SVI->getOperand(0);
+  Value *Op1 = SVI->getOperand(1);
+  IRBuilder<> Builder(SI);
+
+  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
+  // vectors to integer vectors.
+  if (EltTy->isPointerTy()) {
+    Type *IntTy = DL->getIntPtrType(EltTy);
+
+    // Convert to the corresponding integer vector.
+    Type *IntVecTy =
+        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
+    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
+    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
+
+    SubVecTy = VectorType::get(IntTy, NumSubElts);
+  }
+
+  static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
+                                       Intrinsic::arm_neon_vst3,
+                                       Intrinsic::arm_neon_vst4};
+  Function *VstNFunc = Intrinsic::getDeclaration(
+      SI->getModule(), StoreInts[Factor - 2], SubVecTy);
+
+  SmallVector<Value *, 6> Ops;
+
+  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
+
+  // Split the shufflevector operands into sub vectors for the new vstN call.
+  for (unsigned i = 0; i < Factor; i++)
+    Ops.push_back(Builder.CreateShuffleVector(
+        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+
+  Ops.push_back(Builder.getInt32(SI->getAlignment()));
+  Builder.CreateCall(VstNFunc, Ops);
+  return true;
+}
+
 enum HABaseType {
   HA_UNKNOWN = 0,
   HA_FLOAT,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index c0b329c..7439639 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -433,6 +433,15 @@ namespace llvm {
     Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
                            bool IsStore, bool IsLoad) const override;
 
+    unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+
+    bool lowerInterleavedLoad(LoadInst *LI,
+                              ArrayRef<ShuffleVectorInst *> Shuffles,
+                              ArrayRef<unsigned> Indices,
+                              unsigned Factor) const override;
+    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+                               unsigned Factor) const override;
+
     bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
     bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
     TargetLoweringBase::AtomicRMWExpansionKind
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 104a34f..6e81bd2 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -332,6 +332,10 @@ void ARMPassConfig::addIRPasses() {
     }));
 
   TargetPassConfig::addIRPasses();
+
+  // Match interleaved memory accesses to ldN/stN intrinsics.
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createInterleavedAccessPass(TM));
 }
 
 bool ARMPassConfig::addPreISel() {
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 4e1b371..f4901fc 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -478,3 +478,28 @@ unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
   }
   return LT.first;
 }
+
+unsigned ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                                unsigned Factor,
+                                                ArrayRef<unsigned> Indices,
+                                                unsigned Alignment,
+                                                unsigned AddressSpace) {
+  assert(Factor >= 2 && "Invalid interleave factor");
+  assert(isa<VectorType>(VecTy) && "Expect a vector type");
+
+  // vldN/vstN doesn't support vector types of i64/f64 element.
+  bool EltIs64Bits = DL->getTypeAllocSizeInBits(VecTy->getScalarType()) == 64;
+
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
+    unsigned NumElts = VecTy->getVectorNumElements();
+    Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+    unsigned SubVecSize = TLI->getDataLayout()->getTypeAllocSize(SubVecTy);
+
+    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
+    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
+      return Factor;
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 9479d76..f2e5db6 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -126,6 +126,11 @@ public:
   unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                            unsigned AddressSpace);
 
+  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                      unsigned Factor,
+                                      ArrayRef<unsigned> Indices,
+                                      unsigned Alignment,
+                                      unsigned AddressSpace);
   /// @}
 };
 
diff --git a/llvm/test/CodeGen/ARM/arm-interleaved-accesses.ll b/llvm/test/CodeGen/ARM/arm-interleaved-accesses.ll
new file mode 100644
index 0000000..9a9885c
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/arm-interleaved-accesses.ll
@@ -0,0 +1,204 @@
+; RUN: llc -mtriple=arm-eabi -mattr=+neon -lower-interleaved-accesses=true < %s | FileCheck %s
+
+; CHECK-LABEL: load_factor2:
+; CHECK: vld2.8 {d16, d17}, [r0]
+define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
+  %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
+  %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %add = add nsw <8 x i8> %strided.v0, %strided.v1
+  ret <8 x i8> %add
+}
+
+; CHECK-LABEL: load_factor3:
+; CHECK: vld3.32 {d16, d17, d18}, [r0]
+define <2 x i32> @load_factor3(i32* %ptr) {
+  %base = bitcast i32* %ptr to <6 x i32>*
+  %wide.vec = load <6 x i32>, <6 x i32>* %base, align 4
+  %strided.v2 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
+  %strided.v1 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
+  %add = add nsw <2 x i32> %strided.v2, %strided.v1
+  ret <2 x i32> %add
+}
+
+; CHECK-LABEL: load_factor4:
+; CHECK: vld4.32 {d16, d18, d20, d22}, [r0]!
+; CHECK: vld4.32 {d17, d19, d21, d23}, [r0]
+define <4 x i32> @load_factor4(i32* %ptr) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
+  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %add = add nsw <4 x i32> %strided.v0, %strided.v2
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: store_factor2:
+; CHECK: vst2.8 {d16, d17}, [r0]
+define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
+  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_factor3:
+; CHECK: vst3.32 {d16, d18, d20}, [r0]!
+; CHECK: vst3.32 {d17, d19, d21}, [r0]
+define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_factor4:
+; CHECK: vst4.32 {d16, d18, d20, d22}, [r0]!
+; CHECK: vst4.32 {d17, d19, d21, d23}, [r0]
+define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
+  ret void
+}
+
+; The following cases test that interleaved access of pointer vectors can be
+; matched to ldN/stN instruction.
+
+; CHECK-LABEL: load_ptrvec_factor2:
+; CHECK: vld2.32 {d16, d17}, [r0]
+define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
+  %base = bitcast i32** %ptr to <4 x i32*>*
+  %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
+  %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
+  ret <2 x i32*> %strided.v0
+}
+
+; CHECK-LABEL: load_ptrvec_factor3:
+; CHECK: vld3.32 {d16, d17, d18}, [r0]
+define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
+  %base = bitcast i32** %ptr to <6 x i32*>*
+  %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
+  %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
+  store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
+  %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
+  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
+  ret void
+}
+
+; CHECK-LABEL: load_ptrvec_factor4:
+; CHECK: vld4.32 {d16, d17, d18, d19}, [r0]
+define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
+  %base = bitcast i32** %ptr to <8 x i32*>*
+  %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
+  %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
+  %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
+  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
+  store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
+  ret void
+}
+
+; CHECK-LABEL: store_ptrvec_factor2:
+; CHECK: vst2.32 {d16, d17}, [r0]
+define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
+  %base = bitcast i32** %ptr to <4 x i32*>*
+  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_ptrvec_factor3:
+; CHECK: vst3.32 {d16, d17, d18}, [r0]
+define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
+  %base = bitcast i32** %ptr to <6 x i32*>*
+  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_ptrvec_factor4:
+; CHECK: vst4.32 {d16, d17, d18, d19}, [r0]
+define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
+  %base = bitcast i32* %ptr to <8 x i32*>*
+  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
+  ret void
+}
+
+; Following cases check that shuffle maskes with undef indices can be matched
+; into ldN/stN instruction.
+
+; CHECK-LABEL: load_undef_mask_factor2:
+; CHECK: vld2.32 {d16, d17, d18, d19}, [r0]
+define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
+  %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
+  %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
+  %add = add nsw <4 x i32> %strided.v0, %strided.v1
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: load_undef_mask_factor3:
+; CHECK: vld3.32 {d16, d18, d20}, [r0]!
+; CHECK: vld3.32 {d17, d19, d21}, [r0]
+define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
+  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %add = add nsw <4 x i32> %strided.v2, %strided.v1
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: load_undef_mask_factor4:
+; CHECK: vld4.32 {d16, d18, d20, d22}, [r0]!
+; CHECK: vld4.32 {d17, d19, d21, d23}, [r0]
+define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
+  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
+  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
+  %add = add nsw <4 x i32> %strided.v0, %strided.v2
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: store_undef_mask_factor2:
+; CHECK: vst2.32 {d16, d17, d18, d19}, [r0]
+define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_undef_mask_factor3:
+; CHECK: vst3.32 {d16, d18, d20}, [r0]!
+; CHECK: vst3.32 {d17, d19, d21}, [r0]
+define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_undef_mask_factor4:
+; CHECK: vst4.32 {d16, d18, d20, d22}, [r0]!
+; CHECK: vst4.32 {d17, d19, d21, d23}, [r0]
+define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
+  ret void
+}
-- 
2.7.4