#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
/// replacements are also performed.
bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
ArrayRef<ShuffleVectorInst *> Shuffles);
+
+ /// Given a number of shuffles of the form shuffle(binop(x,y)), convert them
+ /// to binop(shuffle(x), shuffle(y)) to allow the formation of an
+ /// interleaving load. Any newly created shuffles that operate on \p LI will
+ /// be added to \p Shuffles.
+ bool tryReplaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
+ SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
+ LoadInst *LI);
};
} // end anonymous namespace.
if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
return false;
+ // Check if all users of this load are shufflevectors. If we encounter any
+ // users that are extractelement instructions or binary operators, we save
+ // them to later check if they can be modified to extract from one of the
+ // shufflevectors instead of the load.
+
SmallVector<ShuffleVectorInst *, 4> Shuffles;
SmallVector<ExtractElementInst *, 4> Extracts;
+ // BinOpShuffles need to be handled a single time in case both operands of the
+ // binop are the same load.
+ SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;
- // Check if all users of this load are shufflevectors. If we encounter any
- // users that are extractelement instructions, we save them to later check if
- // they can be modifed to extract from one of the shufflevectors instead of
- // the load.
- for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
- auto *Extract = dyn_cast<ExtractElementInst>(*UI);
+ for (auto *User : LI->users()) {
+ auto *Extract = dyn_cast<ExtractElementInst>(User);
if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
Extracts.push_back(Extract);
continue;
}
- ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
+ auto *BI = dyn_cast<BinaryOperator>(User);
+ if (BI && BI->hasOneUse()) {
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(*BI->user_begin())) {
+ BinOpShuffles.insert(SVI);
+ continue;
+ }
+ }
+ auto *SVI = dyn_cast<ShuffleVectorInst>(User);
if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
return false;
Shuffles.push_back(SVI);
}
- if (Shuffles.empty())
+ if (Shuffles.empty() && BinOpShuffles.empty())
return false;
unsigned Factor, Index;
unsigned NumLoadElements =
cast<FixedVectorType>(LI->getType())->getNumElements();
+ auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
// Check if the first shufflevector is DE-interleave shuffle.
- if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index,
- MaxFactor, NumLoadElements))
+ if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
+ NumLoadElements))
return false;
// Holds the corresponding index for each DE-interleave shuffle.
SmallVector<unsigned, 4> Indices;
- Indices.push_back(Index);
- Type *VecTy = Shuffles[0]->getType();
+ Type *VecTy = FirstSVI->getType();
// Check if other shufflevectors are also DE-interleaved of the same type
// and factor as the first shufflevector.
- for (unsigned i = 1; i < Shuffles.size(); i++) {
- if (Shuffles[i]->getType() != VecTy)
+ for (auto *Shuffle : Shuffles) {
+ if (Shuffle->getType() != VecTy)
return false;
-
- if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
+ if (!isDeInterleaveMaskOfFactor(Shuffle->getShuffleMask(), Factor,
Index))
return false;
Indices.push_back(Index);
}
+ for (auto *Shuffle : BinOpShuffles) {
+ if (Shuffle->getType() != VecTy)
+ return false;
+ if (!isDeInterleaveMaskOfFactor(Shuffle->getShuffleMask(), Factor,
+ Index))
+ return false;
+
+ if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LI)
+ Indices.push_back(Index);
+ if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LI)
+ Indices.push_back(Index);
+ }
// Try and modify users of the load that are extractelement instructions to
// use the shufflevector instructions instead of the load.
if (!tryReplaceExtracts(Extracts, Shuffles))
return false;
+ if (!tryReplaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI))
+ return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
return true;
}
+// Rewrite each shuffle of the form shuffle(binop(X, Y), undef, Mask) into
+// binop(shuffle(X, undef, Mask), shuffle(Y, undef, Mask)), so that shuffles
+// feeding directly off the load \p LI become visible and can later be matched
+// as an interleaved (de-interleaving) load. Any new shuffle whose first
+// operand is \p LI is appended to \p Shuffles for the caller to lower.
+// Always returns true (the transform cannot fail once a candidate is chosen).
+bool InterleavedAccess::tryReplaceBinOpShuffles(
+ ArrayRef<ShuffleVectorInst *> BinOpShuffles,
+ SmallVectorImpl<ShuffleVectorInst *> &Shuffles, LoadInst *LI) {
+ for (auto *SVI : BinOpShuffles) {
+ // The caller only collects shuffles whose operand 0 is a BinaryOperator,
+ // so this cast is guaranteed to succeed.
+ BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
+ ArrayRef<int> Mask = SVI->getShuffleMask();
+
+ // Apply the original de-interleave mask to each binop operand separately,
+ // inserting the new shuffles immediately before the old one (SVI).
+ auto *NewSVI1 = new ShuffleVectorInst(
+ BI->getOperand(0), UndefValue::get(BI->getOperand(0)->getType()), Mask,
+ SVI->getName(), SVI);
+ auto *NewSVI2 = new ShuffleVectorInst(
+ BI->getOperand(1), UndefValue::get(BI->getOperand(1)->getType()), Mask,
+ SVI->getName(), SVI);
+ // Recreate the binop on the narrowed (shuffled) operands and replace the
+ // original shuffle's uses with it.
+ Value *NewBI = BinaryOperator::Create(BI->getOpcode(), NewSVI1, NewSVI2,
+ BI->getName(), SVI);
+ SVI->replaceAllUsesWith(NewBI);
+ LLVM_DEBUG(dbgs() << " Replaced: " << *BI << "\n And : " << *SVI
+ << "\n With : " << *NewSVI1 << "\n And : "
+ << *NewSVI2 << "\n And : " << *NewBI << "\n");
+ // SVI is now dead; this also cleans up BI when SVI was its sole user
+ // (the caller checked hasOneUse() before collecting — see call site).
+ RecursivelyDeleteTriviallyDeadInstructions(SVI);
+ // Only shuffles reading directly from LI participate in forming the
+ // interleaved load; a binop operand may be a different value entirely.
+ if (NewSVI1->getOperand(0) == LI)
+ Shuffles.push_back(NewSVI1);
+ if (NewSVI2->getOperand(0) == LI)
+ Shuffles.push_back(NewSVI2);
+ }
+ return true;
+}
+
bool InterleavedAccess::tryReplaceExtracts(
ArrayRef<ExtractElementInst *> Extracts,
ArrayRef<ShuffleVectorInst *> Shuffles) {
if (!SI->isSimple())
return false;
- ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
+ auto *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
return false;
bool Changed = false;
for (auto &I : instructions(F)) {
- if (LoadInst *LI = dyn_cast<LoadInst>(&I))
+ if (auto *LI = dyn_cast<LoadInst>(&I))
Changed |= lowerInterleavedLoad(LI, DeadInsts);
- if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+ if (auto *SI = dyn_cast<StoreInst>(&I))
Changed |= lowerInterleavedStore(SI, DeadInsts);
}
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q1, q0, [x0], #32
-; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT: uzp1 v2.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp2 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: str q0, [x1, x8]
+; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x0], #32
+; CHECK-NEXT: fmul v2.4s, v0.4s, v0.4s
+; CHECK-NEXT: fmla v2.4s, v1.4s, v1.4s
+; CHECK-NEXT: str q2, [x1, x8]
; CHECK-NEXT: add x8, x8, #16 // =16
; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q2, q0, [x0, #16]
-; CHECK-NEXT: ldr q1, [x0], #48
-; CHECK-NEXT: fmul v2.4s, v2.4s, v2.4s
-; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT: ext v3.16b, v2.16b, v1.16b, #8
-; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: ext v5.16b, v1.16b, v3.16b, #12
-; CHECK-NEXT: ext v3.16b, v3.16b, v2.16b, #4
-; CHECK-NEXT: dup v4.4s, v0.s[1]
-; CHECK-NEXT: mov v2.s[0], v1.s[2]
-; CHECK-NEXT: dup v1.4s, v0.s[2]
-; CHECK-NEXT: mov v0.s[2], v0.s[0]
-; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #12
-; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT: ext v0.16b, v0.16b, v2.16b, #8
-; CHECK-NEXT: mov v5.s[3], v4.s[3]
-; CHECK-NEXT: mov v3.s[3], v1.s[3]
-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: fadd v1.4s, v3.4s, v5.4s
-; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: str q0, [x1, x8]
+; CHECK-NEXT: ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
+; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s
+; CHECK-NEXT: fmla v3.4s, v1.4s, v1.4s
+; CHECK-NEXT: fmla v3.4s, v2.4s, v2.4s
+; CHECK-NEXT: str q3, [x1, x8]
; CHECK-NEXT: add x8, x8, #16 // =16
; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT: b.ne .LBB1_1
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q2, q3, [x0, #32]
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
; CHECK-NEXT: add x9, x1, x8
; CHECK-NEXT: add x8, x8, #32 // =32
-; CHECK-NEXT: fmul v3.4s, v3.4s, v3.4s
-; CHECK-NEXT: fmul v2.4s, v2.4s, v2.4s
-; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: zip1 v5.4s, v2.4s, v3.4s
-; CHECK-NEXT: trn2 v7.4s, v2.4s, v3.4s
-; CHECK-NEXT: zip1 v4.4s, v0.4s, v1.4s
-; CHECK-NEXT: trn2 v6.4s, v0.4s, v1.4s
-; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8
-; CHECK-NEXT: ext v7.16b, v2.16b, v7.16b, #8
-; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: ext v4.16b, v5.16b, v4.16b, #8
-; CHECK-NEXT: zip2 v5.4s, v2.4s, v3.4s
-; CHECK-NEXT: ext v0.16b, v6.16b, v0.16b, #8
-; CHECK-NEXT: ext v6.16b, v7.16b, v6.16b, #8
-; CHECK-NEXT: mov v2.s[3], v3.s[2]
-; CHECK-NEXT: ext v0.16b, v5.16b, v0.16b, #8
-; CHECK-NEXT: ext v3.16b, v4.16b, v4.16b, #8
-; CHECK-NEXT: ext v4.16b, v6.16b, v6.16b, #8
-; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #8
-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: fadd v2.4s, v4.4s, v3.4s
-; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192
-; CHECK-NEXT: fadd v3.4s, v0.4s, v1.4s
-; CHECK-NEXT: add x0, x0, #64 // =64
-; CHECK-NEXT: st2 { v2.4s, v3.4s }, [x9]
+; CHECK-NEXT: fmul v4.4s, v0.4s, v0.4s
+; CHECK-NEXT: fmla v4.4s, v1.4s, v1.4s
+; CHECK-NEXT: fmul v5.4s, v2.4s, v2.4s
+; CHECK-NEXT: fmla v5.4s, v3.4s, v3.4s
+; CHECK-NEXT: st2 { v4.4s, v5.4s }, [x9]
; CHECK-NEXT: b.ne .LBB2_1
; CHECK-NEXT: // %bb.2: // %while.end
; CHECK-NEXT: ret
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x9, x0, x8
; CHECK-NEXT: add x10, x1, x8
-; CHECK-NEXT: ldp q0, q1, [x9]
-; CHECK-NEXT: ldp q3, q2, [x10]
+; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x9]
+; CHECK-NEXT: ld2 { v2.4s, v3.4s }, [x10]
; CHECK-NEXT: add x8, x8, #32 // =32
; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192
-; CHECK-NEXT: fmul v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: fmul v0.4s, v3.4s, v0.4s
-; CHECK-NEXT: uzp1 v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: str q0, [x2], #16
+; CHECK-NEXT: fmul v4.4s, v2.4s, v0.4s
+; CHECK-NEXT: fmla v4.4s, v1.4s, v3.4s
+; CHECK-NEXT: str q4, [x2], #16
; CHECK-NEXT: b.ne .LBB3_1
; CHECK-NEXT: // %bb.2: // %while.end
; CHECK-NEXT: ret
define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: arm_cmplx_mag_squared_f16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: beq.w .LBB0_9
+; CHECK-NEXT: beq .LBB0_8
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
; CHECK-NEXT: cmp r2, #8
-; CHECK-NEXT: blo.w .LBB0_6
+; CHECK-NEXT: blo .LBB0_9
; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
; CHECK-NEXT: add.w r3, r0, r2, lsl #2
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: itt hi
; CHECK-NEXT: addhi.w r3, r1, r2, lsl #1
; CHECK-NEXT: cmphi r3, r0
-; CHECK-NEXT: bhi .LBB0_6
+; CHECK-NEXT: bhi .LBB0_9
; CHECK-NEXT: @ %bb.3: @ %vector.ph
-; CHECK-NEXT: bic r5, r2, #7
-; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: sub.w r3, r5, #8
-; CHECK-NEXT: and r8, r2, #7
-; CHECK-NEXT: add.w r12, r1, r5, lsl #1
-; CHECK-NEXT: add.w r3, r4, r3, lsr #3
-; CHECK-NEXT: mov r7, r3
-; CHECK-NEXT: add.w r3, r0, r5, lsl #2
+; CHECK-NEXT: bic r4, r2, #7
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: sub.w r12, r4, #8
+; CHECK-NEXT: and r7, r2, #7
+; CHECK-NEXT: add.w r3, r3, r12, lsr #3
+; CHECK-NEXT: add.w r12, r1, r4, lsl #1
+; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: add.w r3, r0, r4, lsl #2
; CHECK-NEXT: .LBB0_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u16 q0, [r0], #32
-; CHECK-NEXT: mov lr, r7
+; CHECK-NEXT: vld20.16 {q0, q1}, [r0]
+; CHECK-NEXT: mov lr, r5
; CHECK-NEXT: subs.w lr, lr, #1
-; CHECK-NEXT: vmul.f16 q1, q0, q0
-; CHECK-NEXT: mov r7, lr
-; CHECK-NEXT: vmovx.f16 s0, s5
-; CHECK-NEXT: vmovx.f16 s8, s6
-; CHECK-NEXT: vmov r4, s0
-; CHECK-NEXT: vmovx.f16 s0, s4
-; CHECK-NEXT: vmov r6, s0
-; CHECK-NEXT: vmov.16 q0[0], r6
-; CHECK-NEXT: vmov r6, s4
-; CHECK-NEXT: vmov.16 q0[1], r4
-; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: vmovx.f16 s8, s7
-; CHECK-NEXT: vmov.16 q0[2], r4
-; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT: vmov.16 q0[3], r4
-; CHECK-NEXT: vmul.f16 q2, q2, q2
-; CHECK-NEXT: vmovx.f16 s12, s8
-; CHECK-NEXT: vmov r4, s12
-; CHECK-NEXT: vmovx.f16 s12, s9
-; CHECK-NEXT: vmov.16 q0[4], r4
-; CHECK-NEXT: vmov r4, s12
-; CHECK-NEXT: vmovx.f16 s12, s10
-; CHECK-NEXT: vmov.16 q0[5], r4
-; CHECK-NEXT: vmov r4, s12
-; CHECK-NEXT: vmov.16 q3[0], r6
-; CHECK-NEXT: vmov.16 q0[6], r4
-; CHECK-NEXT: vmov r4, s5
-; CHECK-NEXT: vmov.16 q3[1], r4
-; CHECK-NEXT: vmov r4, s6
-; CHECK-NEXT: vmov.16 q3[2], r4
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: vmov.16 q3[3], r4
-; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: vmov.16 q3[4], r4
-; CHECK-NEXT: vmov r4, s9
-; CHECK-NEXT: vmov.16 q3[5], r4
-; CHECK-NEXT: vmov r4, s10
-; CHECK-NEXT: vmov.16 q3[6], r4
-; CHECK-NEXT: vmov r4, s11
-; CHECK-NEXT: vmovx.f16 s4, s11
-; CHECK-NEXT: vmov.16 q3[7], r4
-; CHECK-NEXT: vmov r4, s4
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vadd.f16 q0, q0, q3
-; CHECK-NEXT: vstrb.8 q0, [r1], #16
+; CHECK-NEXT: vld21.16 {q0, q1}, [r0]!
+; CHECK-NEXT: mov r5, lr
+; CHECK-NEXT: vmul.f16 q2, q0, q0
+; CHECK-NEXT: vfma.f16 q2, q1, q1
+; CHECK-NEXT: vstrb.8 q2, [r1], #16
; CHECK-NEXT: bne .LBB0_4
; CHECK-NEXT: b .LBB0_5
; CHECK-NEXT: .LBB0_5: @ %middle.block
-; CHECK-NEXT: cmp r5, r2
-; CHECK-NEXT: mov lr, r8
-; CHECK-NEXT: bne .LBB0_7
-; CHECK-NEXT: b .LBB0_9
-; CHECK-NEXT: .LBB0_6:
-; CHECK-NEXT: mov r3, r0
-; CHECK-NEXT: mov r12, r1
-; CHECK-NEXT: mov lr, r2
-; CHECK-NEXT: .LBB0_7: @ %while.body.preheader26
+; CHECK-NEXT: cmp r4, r2
+; CHECK-NEXT: mov lr, r7
+; CHECK-NEXT: it eq
+; CHECK-NEXT: popeq {r4, r5, r7, pc}
+; CHECK-NEXT: .LBB0_6: @ %while.body.preheader26
; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: .LBB0_8: @ %while.body
+; CHECK-NEXT: .LBB0_7: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr.16 s0, [r3]
; CHECK-NEXT: vldr.16 s2, [r3, #2]
; CHECK-NEXT: vfma.f16 s0, s2, s2
; CHECK-NEXT: vstr.16 s0, [r12]
; CHECK-NEXT: add.w r12, r12, #2
-; CHECK-NEXT: le lr, .LBB0_8
-; CHECK-NEXT: .LBB0_9: @ %while.end
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: le lr, .LBB0_7
+; CHECK-NEXT: .LBB0_8: @ %while.end
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: .LBB0_9:
+; CHECK-NEXT: mov r3, r0
+; CHECK-NEXT: mov r12, r1
+; CHECK-NEXT: mov lr, r2
+; CHECK-NEXT: b .LBB0_6
entry:
%cmp.not11 = icmp eq i32 %numSamples, 0
br i1 %cmp.not11, label %while.end, label %while.body.preheader
; CHECK-NEXT: cmphi r3, r0
; CHECK-NEXT: bhi .LBB1_9
; CHECK-NEXT: @ %bb.3: @ %vector.ph
-; CHECK-NEXT: bic r5, r2, #3
-; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: subs r3, r5, #4
+; CHECK-NEXT: bic r4, r2, #3
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: sub.w r12, r4, #4
; CHECK-NEXT: and r7, r2, #3
-; CHECK-NEXT: add.w r12, r1, r5, lsl #2
-; CHECK-NEXT: add.w r3, r4, r3, lsr #2
-; CHECK-NEXT: mov r4, r3
-; CHECK-NEXT: add.w r3, r0, r5, lsl #3
+; CHECK-NEXT: add.w r3, r3, r12, lsr #2
+; CHECK-NEXT: add.w r12, r1, r4, lsl #2
+; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: add.w r3, r0, r4, lsl #3
; CHECK-NEXT: .LBB1_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q1, [r0], #32
-; CHECK-NEXT: mov lr, r4
-; CHECK-NEXT: vmul.f32 q1, q1, q1
-; CHECK-NEXT: vmul.f32 q0, q0, q0
-; CHECK-NEXT: vmov.f64 d4, d2
+; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
+; CHECK-NEXT: mov lr, r5
; CHECK-NEXT: subs.w lr, lr, #1
-; CHECK-NEXT: mov r4, lr
-; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmov.f32 s9, s6
-; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmov.f32 s10, s0
-; CHECK-NEXT: vmov.f32 s14, s1
-; CHECK-NEXT: vmov.f32 s11, s2
-; CHECK-NEXT: vmov.f32 s15, s3
-; CHECK-NEXT: vadd.f32 q0, q3, q2
-; CHECK-NEXT: vstrb.8 q0, [r1], #16
+; CHECK-NEXT: vld21.32 {q0, q1}, [r0]!
+; CHECK-NEXT: mov r5, lr
+; CHECK-NEXT: vmul.f32 q2, q0, q0
+; CHECK-NEXT: vfma.f32 q2, q1, q1
+; CHECK-NEXT: vstrb.8 q2, [r1], #16
; CHECK-NEXT: bne .LBB1_4
; CHECK-NEXT: b .LBB1_5
; CHECK-NEXT: .LBB1_5: @ %middle.block
-; CHECK-NEXT: cmp r5, r2
+; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: mov lr, r7
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, r5, r7, pc}
define <4 x float> @vld2(<8 x float>* %pSrc) {
; CHECK-LABEL: @vld2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[PSRC:%.*]], align 4
-; CHECK-NEXT: [[L2:%.*]] = fmul fast <8 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L3:%.*]] = shufflevector <8 x float> [[L2]], <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[L4:%.*]] = fmul fast <8 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L5:%.*]] = shufflevector <8 x float> [[L4]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[L6:%.*]] = fadd fast <4 x float> [[L5]], [[L3]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float>* [[PSRC:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[L26:%.*]] = fmul <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[L43:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[L6:%.*]] = fadd fast <4 x float> [[L43]], [[L26]]
; CHECK-NEXT: ret <4 x float> [[L6]]
;
entry:
define <4 x float> @vld3(<12 x float>* %pSrc) {
; CHECK-LABEL: @vld3(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x float>, <12 x float>* [[PSRC:%.*]], align 4
-; CHECK-NEXT: [[L2:%.*]] = fmul fast <12 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L3:%.*]] = shufflevector <12 x float> [[L2]], <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT: [[L4:%.*]] = fmul fast <12 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L5:%.*]] = shufflevector <12 x float> [[L4]], <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT: [[L6:%.*]] = fadd fast <4 x float> [[L5]], [[L3]]
-; CHECK-NEXT: [[L7:%.*]] = fmul fast <12 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L8:%.*]] = shufflevector <12 x float> [[L7]], <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; CHECK-NEXT: [[L9:%.*]] = fadd fast <4 x float> [[L6]], [[L8]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <12 x float>* [[PSRC:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[L29:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[L46:%.*]] = fmul <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[L6:%.*]] = fadd fast <4 x float> [[L46]], [[L29]]
+; CHECK-NEXT: [[L73:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[L9:%.*]] = fadd fast <4 x float> [[L6]], [[L73]]
; CHECK-NEXT: ret <4 x float> [[L9]]
;
entry:
define <4 x float> @vld4(<16 x float>* %pSrc) {
; CHECK-LABEL: @vld4(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x float>, <16 x float>* [[PSRC:%.*]], align 4
-; CHECK-NEXT: [[L3:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L4:%.*]] = shufflevector <16 x float> [[L3]], <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[L5:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L6:%.*]] = shufflevector <16 x float> [[L5]], <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT: [[L7:%.*]] = fadd fast <4 x float> [[L6]], [[L4]]
-; CHECK-NEXT: [[L8:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L9:%.*]] = shufflevector <16 x float> [[L8]], <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT: [[L10:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L11:%.*]] = shufflevector <16 x float> [[L10]], <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: [[L12:%.*]] = fadd fast <4 x float> [[L11]], [[L9]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x float>* [[PSRC:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[L312:%.*]] = fmul <4 x float> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[L59:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[L7:%.*]] = fadd fast <4 x float> [[L59]], [[L312]]
+; CHECK-NEXT: [[L86:%.*]] = fmul <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[L103:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[L12:%.*]] = fadd fast <4 x float> [[L103]], [[L86]]
; CHECK-NEXT: ret <4 x float> [[L12]]
;
entry:
define <4 x float> @twosrc(<8 x float>* %pSrc1, <8 x float>* %pSrc2) {
; CHECK-LABEL: @twosrc(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[PSRC1:%.*]], align 4
-; CHECK-NEXT: [[WIDE_VEC26:%.*]] = load <8 x float>, <8 x float>* [[PSRC2:%.*]], align 4
-; CHECK-NEXT: [[L4:%.*]] = fmul fast <8 x float> [[WIDE_VEC26]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L5:%.*]] = shufflevector <8 x float> [[L4]], <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[L6:%.*]] = fmul fast <8 x float> [[WIDE_VEC26]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L7:%.*]] = shufflevector <8 x float> [[L6]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[L8:%.*]] = fadd fast <4 x float> [[L7]], [[L5]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float>* [[PSRC1:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x float>* [[PSRC2:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN7:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN7]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN7]], 1
+; CHECK-NEXT: [[L46:%.*]] = fmul <4 x float> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[L63:%.*]] = fmul <4 x float> [[TMP5]], [[TMP1]]
+; CHECK-NEXT: [[L8:%.*]] = fadd fast <4 x float> [[L63]], [[L46]]
; CHECK-NEXT: ret <4 x float> [[L8]]
;
entry:
define <4 x float> @twosrc2(<8 x float>* %pSrc1, <8 x float>* %pSrc2) {
; CHECK-LABEL: @twosrc2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[PSRC1:%.*]], align 4
-; CHECK-NEXT: [[WIDE_VEC26:%.*]] = load <8 x float>, <8 x float>* [[PSRC2:%.*]], align 4
-; CHECK-NEXT: [[L4:%.*]] = fmul fast <8 x float> [[WIDE_VEC26]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L5:%.*]] = shufflevector <8 x float> [[L4]], <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[WIDE_VEC26]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[L6:%.*]] = fmul fast <4 x float> [[S1]], [[S2]]
-; CHECK-NEXT: [[L8:%.*]] = fadd fast <4 x float> [[L6]], [[L5]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float>* [[PSRC1:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x float>* [[PSRC2:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN4:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN4]], 1
+; CHECK-NEXT: [[L43:%.*]] = fmul <4 x float> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[L6:%.*]] = fmul fast <4 x float> [[TMP5]], [[TMP1]]
+; CHECK-NEXT: [[L8:%.*]] = fadd fast <4 x float> [[L6]], [[L43]]
; CHECK-NEXT: ret <4 x float> [[L8]]
;
entry: