/// the bundle
void setInsertPointAfterBundle(const TreeEntry *E);
- /// \returns a vector from a collection of scalars in \p VL.
- Value *gather(ArrayRef<Value *> VL);
+ /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
+ /// specified, the starting vector value is poison.
+ Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
LocalVF = SVOpTy->getNumElements();
SmallVector<int> ExtMask(Mask.size(), UndefMaskElem);
for (auto [Idx, I] : enumerate(Mask)) {
- if (I == UndefMaskElem)
- continue;
- ExtMask[Idx] = SV->getMaskValue(I);
+ if (I == UndefMaskElem ||
+ static_cast<unsigned>(I) >= SV->getShuffleMask().size())
+ continue;
+ ExtMask[Idx] = SV->getMaskValue(I);
}
bool IsOp1Undef =
isUndefVector(SV->getOperand(0),
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
-Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
+Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
// List of instructions/lanes from current block and/or the blocks which are
// part of the current loop. These instructions will be inserted at the end to
// make it possible to optimize loops and hoist invariant instructions out of
for (int I = 0, E = VL.size(); I < E; ++I) {
if (auto *Inst = dyn_cast<Instruction>(VL[I]))
if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
- getTreeEntry(Inst) || (L && (L->contains(Inst)))) &&
+ getTreeEntry(Inst) ||
+ (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
PostponedIndices.insert(I).second)
PostponedInsts.emplace_back(Inst, I);
}
Value *Val0 =
isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
- Value *Vec = PoisonValue::get(VecTy);
+ Value *Vec = Root ? Root : PoisonValue::get(VecTy);
SmallVector<int> NonConsts;
// Insert constant values at first.
for (int I = 0, E = VL.size(); I < E; ++I) {
NonConsts.push_back(I);
continue;
}
+ if (Root) {
+ if (!isa<UndefValue>(VL[I])) {
+ NonConsts.push_back(I);
+ continue;
+ }
+ if (isa<PoisonValue>(VL[I]))
+ continue;
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
+ if (SV->getMaskValue(I) == UndefMaskElem)
+ continue;
+ }
+ }
Vec = CreateInsertElement(Vec, VL[I], I);
}
// Insert non-constant values.
add(V1, NewMask);
}
/// Finalize emission of the shuffles.
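+ /// \param VF the vector length the intermediate value is padded to (if it is
+ /// shorter) before \p Action runs; must be non-zero when \p Action is given.
+ /// \param Action the action (if any) to be performed before the final
+ /// application of the \p ExtMask mask.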
Value *
- finalize(ArrayRef<int> ExtMask = std::nullopt) {
+ finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
+ function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
IsFinalized = true;
+ if (Action) {
+ Value *Vec = InVectors.front();
+ if (InVectors.size() == 2) {
+ Vec = createShuffle(Vec, InVectors.back(), CommonMask);
+ InVectors.pop_back();
+ } else {
+ Vec = createShuffle(Vec, nullptr, CommonMask);
+ }
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+ if (CommonMask[Idx] != UndefMaskElem)
+ CommonMask[Idx] = Idx;
+ assert(VF > 0 &&
+ "Expected vector length for the final value before action.");
+ unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
+ if (VecVF < VF) {
+ SmallVector<int> ResizeMask(VF, UndefMaskElem);
+ std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
+ Vec = createShuffle(Vec, nullptr, ResizeMask);
+ }
+ Action(Vec, CommonMask);
+ InVectors.front() = Vec;
+ }
if (!ExtMask.empty()) {
if (CommonMask.empty()) {
CommonMask.assign(ExtMask.begin(), ExtMask.end());
inversePermutation(E->ReorderIndices, ReorderMask);
if (!ReorderMask.empty())
reorderScalars(GatheredScalars, ReorderMask);
-
+ auto FindReusedSplat = [&](SmallVectorImpl<int> &Mask) { // Rewrites Mask in place and returns true if every check below passes; returns false with Mask untouched otherwise.
+ if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) { // Bail out unless isSplat() accepts the scalars and at least one of them is undef but not poison.
+ return isa<UndefValue>(V) && !isa<PoisonValue>(V);
+ }))
+ return false;
+ TreeEntry *UserTE = E->UserTreeIndices.back().UserTE; // NOTE(review): back() assumes UserTreeIndices is non-empty - confirm at the call sites.
+ unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
+ if (UserTE->getNumOperands() != 2) // Only users with exactly two operands are considered.
+ return false;
+ auto *It = // Look for any tree entry attached to the same user through a different edge (operand index).
+ find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
+ return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
+ return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
+ }) != TE->UserTreeIndices.end();
+ });
+ if (It == VectorizableTree.end()) // No such entry was found.
+ return false;
+ unsigned I = // First mask element that is not UndefMaskElem.
+ *find_if_not(Mask, [](int Idx) { return Idx == UndefMaskElem; }); // NOTE(review): dereferenced unchecked - assumes Mask has at least one defined element; confirm.
+ int Sz = Mask.size();
+ if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) && // Identity mask with all indices below 2 * Sz: replace it with the full 0..Sz-1 sequence.
+ ShuffleVectorInst::isIdentityMask(Mask))
+ std::iota(Mask.begin(), Mask.end(), 0);
+ else
+ std::fill(Mask.begin(), Mask.end(), I); // Otherwise every lane selects element I.
+ return true;
+ };
ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
Value *Vec = nullptr;
SmallVector<int> Mask;
}
}
}
- if ((ExtractShuffle || GatherShuffle) &&
- all_of(GatheredScalars, PoisonValue::classof)) {
+ if (ExtractShuffle || GatherShuffle) {
+ bool IsNonPoisoned = true;
+ bool IsUsedInExpr = false;
Value *Vec1 = nullptr;
if (ExtractShuffle) {
// Gather of extractelements can be represented as just a shuffle of
Vec2 = EI->getVectorOperand();
}
}
- if (Vec2)
+ if (Vec2) {
ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
- else if (Vec1)
+ } else if (Vec1) {
ShuffleBuilder.add(Vec1, ExtractMask);
- else
+ IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
+ IsUsedInExpr = FindReusedSplat(ExtractMask);
+ } else {
ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
ScalarTy, GatheredScalars.size())),
ExtractMask);
+ }
}
if (GatherShuffle) {
- if (Entries.size() == 1)
+ if (Entries.size() == 1) {
ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
- else
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(Entries.front()->VectorizedValue);
+ IsUsedInExpr = FindReusedSplat(Mask);
+ } else {
ShuffleBuilder.add(Entries.front()->VectorizedValue,
Entries.back()->VectorizedValue, Mask);
+ }
}
+ // Try to figure out the best way to combine the values: build a shuffle and
+ // insert elements, or just build several shuffles.
+ // Insert non-constant scalars.
+ SmallVector<Value *> NonConstants(GatheredScalars);
+ int EMSz = ExtractMask.size();
+ int MSz = Mask.size();
+ // Try to build a constant vector and shuffle with it only if we currently
+ // have a single permutation and more than one scalar constant.
+ bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle;
+ bool IsIdentityShuffle =
+ (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc &&
+ none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
+ ShuffleVectorInst::isIdentityMask(ExtractMask)) ||
+ (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
+ TTI::SK_PermuteSingleSrc &&
+ none_of(Mask, [&](int I) { return I >= MSz; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask));
+ bool EnoughConstsForShuffle =
+ IsSingleShuffle &&
+ (none_of(GatheredScalars,
+ [](Value *V) {
+ return isa<UndefValue>(V) && !isa<PoisonValue>(V);
+ }) ||
+ any_of(GatheredScalars,
+ [](Value *V) {
+ return isa<Constant>(V) && !isa<UndefValue>(V);
+ })) &&
+ (!IsIdentityShuffle ||
+ (GatheredScalars.size() == 2 &&
+ any_of(GatheredScalars,
+ [](Value *V) { return !isa<UndefValue>(V); })) ||
+ count_if(GatheredScalars, [](Value *V) {
+ return isa<Constant>(V) && !isa<PoisonValue>(V);
+ }) > 1);
+ // The NonConstants array contains just the non-constant values; GatheredScalars
+ // contains only the constants used to build the final vector and then shuffle.
+ for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
+ if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
+ NonConstants[I] = PoisonValue::get(ScalarTy);
+ else
+ GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ }
+ // Generate constants for final shuffle and build a mask for them.
+ if (!all_of(GatheredScalars, PoisonValue::classof)) {
+ SmallVector<int> BVMask(GatheredScalars.size(), UndefMaskElem);
+ Value *BV = gather(GatheredScalars);
+ for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
+ if (!isa<PoisonValue>(GatheredScalars[I]))
+ BVMask[I] = I;
+ }
+ ShuffleBuilder.add(BV, BVMask);
+ }
+ if (all_of(NonConstants, [=](Value *V) {
+ return isa<PoisonValue>(V) ||
+ (IsSingleShuffle && ((IsIdentityShuffle &&
+ IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
+ }))
+ Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+ else
+ Vec = ShuffleBuilder.finalize(
+ E->ReuseShuffleIndices, E->Scalars.size(),
+ [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
+ Vec = gather(NonConstants, Vec);
+ for (unsigned I = 0, Sz = Mask.size(); I < Sz; ++I)
+ if (!isa<PoisonValue>(NonConstants[I]))
+ Mask[I] = I;
+ });
} else if (!allConstant(E->Scalars)) {
// TODO: remove this code once able to combine shuffled vectors and build
// vector elements.
// Gather unique scalars and all constants.
Vec = gather(GatheredScalars);
ShuffleBuilder.add(Vec, ReuseMask);
+ Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
} else {
// Gather all constants.
Vec = gather(E->Scalars);
ShuffleBuilder.add(Vec, ReuseMask);
+ Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
}
- Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
if (NeedFreeze)
Vec = Builder.CreateFreeze(Vec);
return Vec;
; CHECK-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00
; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0
; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[CALL]], i32 0
+; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1
; CHECK-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]]
; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP32:%.*]] = fdiv <2 x double> [[TMP30]], [[SHUFFLE]]
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[TMP32]], i32 1
-; CHECK-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP33]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP32]], i32 0
-; CHECK-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D
+; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1
+; CHECK-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0
+; CHECK-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false
; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]]
; CHECK: lor.lhs.false:
-; CHECK-NEXT: [[TMP35:%.*]] = fcmp ule <2 x double> [[TMP32]], <double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP35]], i32 0
-; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP35]], i32 1
-; CHECK-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP37]], i1 true, i1 [[TMP36]]
+; CHECK-NEXT: [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1
+; CHECK-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]]
; CHECK-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32
; CHECK-NEXT: br label [[CLEANUP]]
; CHECK: cleanup:
; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer
; AVX-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[B:%.*]], i32 1
-; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[C]], i32 2
-; AVX-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
-; AVX-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP5]], [[TMP8]]
-; AVX-NEXT: store <4 x i32> [[TMP9]], ptr @cle32, align 16
+; AVX-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 undef, i32 4, i32 0>
+; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[B:%.*]], i32 1
+; AVX-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP5]], [[TMP7]]
+; AVX-NEXT: store <4 x i32> [[TMP8]], ptr @cle32, align 16
; AVX-NEXT: ret void
;
%add1 = add i32 %c, %a
define i1 @foo() {
; CHECK-LABEL: @foo(
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT: br i1 false, label [[TMP15:%.*]], label [[TMP2:%.*]]
+; CHECK-NEXT: br i1 false, label [[TMP11:%.*]], label [[TMP2:%.*]]
; CHECK: 2:
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> zeroinitializer, i64 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> zeroinitializer, <4 x float> [[TMP7]], <4 x float> [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = fsub <4 x float> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = fadd <4 x float> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: br label [[TMP15]]
-; CHECK: 15:
-; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x float> [ [[TMP14]], [[TMP2]] ], [ zeroinitializer, [[TMP0:%.*]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> <float undef, float 0.000000e+00>, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> zeroinitializer, <4 x float> [[TMP5]], <4 x float> [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = fsub <4 x float> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT: br label [[TMP11]]
+; CHECK: 11:
+; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP10]], [[TMP2]] ], [ zeroinitializer, [[TMP0:%.*]] ]
; CHECK-NEXT: ret i1 false
;
%1 = load float, ptr null, align 4
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @c, align 4
; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP2]], 7
; CHECK-NEXT: store i32 [[AND]], ptr @a, align 4
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> <i32 poison, i32 0>, <2 x i32> [[TMP0]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: switch i32 [[AND]], label [[IF_END:%.*]] [
; CHECK-NEXT: i32 7, label [[SAVE_STATE_AND_RETURN]]
; CHECK-NEXT: i32 0, label [[SAVE_STATE_AND_RETURN]]
; CHECK-NEXT: br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
; CHECK: if.then38:
; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> <double 6.000000e-01, double poison>, double 6.000000e-02, i32 1
-; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> <double 5.000000e-01, double 8.000000e-01>, [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> <double 2.400000e-02, double 0.000000e+00>, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> <double 9.000000e-01, double 9.100000e-01>, [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> <double 9.200000e-01, double 9.300000e-01>, [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> <double 0x3FEE147AE147AE14, double 0x3FEE666666666666>, [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> <double 0x3FEEB851EB851EB8, double 0x3FEF0A3D70A3D70A>, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> <double 0x3FEF5C28F5C28F5C, double 0x3FEFAE147AE147AE>, [[TMP6]]
-; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8
+; CHECK-NEXT: store <2 x double> <double 0x3FFA356C1D8A7F76, double 0x3FFDC4F38B38BEF4>, ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8
; CHECK-NEXT: br label [[IF_THEN78]]
; CHECK: if.then78:
; CHECK-NEXT: ret void
; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[TMP0]], <double 4.000000e+00, double 3.000000e+00>
; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 1.000000e+00, double 6.000000e+00>
; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[G]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, ptr [[G]], i64 2
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
-; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP4]], 4.000000e+00
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL11]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], <double 7.000000e+00, double 8.000000e+00>
-; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[ARRAYIDX9]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
+; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP1]], double [[MUL11]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], <double 7.000000e+00, double 8.000000e+00>
+; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[ARRAYIDX9]], align 8
; CHECK-NEXT: ret i32 undef
;
entry:
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> <i32 8, i32 poison, i32 ptrtoint (ptr @fn1 to i32), i32 poison>, <4 x i32> [[TMP0]], <4 x i32> <i32 0, i32 5, i32 2, i32 undef>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[SHUFFLE]], <4 x i32> <i32 0, i32 6, i32 0, i32 0>
-; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], ptr @a, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> <i32 8, i32 poison, i32 ptrtoint (ptr @fn1 to i32), i32 ptrtoint (ptr @fn1 to i32)>, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 6, i32 0, i32 0>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr @a, align 4
; CHECK-NEXT: ret i32 0
;
entry:
; SSE-LABEL: @foo(
; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1
; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
-; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> <i32 undef, i32 1>
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 6
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 7
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 9
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 undef, i32 undef, i32 poison>, i32 [[SUB86_1]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[ADD78_1]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_1]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB102_3]], i32 12
-; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12>
-; CHECK-NEXT: [[TMP10:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
-; CHECK-NEXT: [[TMP13:%.*]] = lshr <16 x i32> [[TMP12]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP13]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP15:%.*]] = mul nuw <16 x i32> [[TMP14]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP16:%.*]] = add <16 x i32> [[TMP15]], [[TMP12]]
-; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP17]])
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP18]], 16
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 6, i32 5, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[SUB102_3]], i32 12
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_3]], i32 15
+; CHECK-NEXT: [[TMP9:%.*]] = add nsw <16 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i32> [[TMP11]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[TMP12]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <16 x i32> [[TMP13]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP15]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP16]])
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP17]], 16
; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 undef, [[SHR]]
; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
; CHECK-NEXT: ret i32 [[SHR120]]
; CHECK-NEXT: br i1 [[C_1:%.*]], label [[BB16:%.*]], label [[BB6:%.*]]
; CHECK: bb6:
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[ARG1:%.*]], i32 3
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, float [[ARG2:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[ARG:%.*]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[ARG:%.*]], <2 x float> poison, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[ARG2:%.*]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], zeroinitializer
; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP8]], align 4
; CHECK-NEXT: ret void
; CHECK-NEXT: [[TMP38:%.*]] = fadd float 0.000000e+00, [[TMP37]]
; CHECK-NEXT: store float [[TMP38]], ptr [[TMP35]], align 4
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[ARG4]], i64 1
-; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[TMP39]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> zeroinitializer, [[TMP7]]
-; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[TMP39]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[TMP39]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP39]], align 4
; CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[ARG3:%.*]], align 4
; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[ARG4]], align 4
; CHECK-NEXT: [[TMP46:%.*]] = fadd float 0.000000e+00, [[TMP45]]
define void @shuffle_operands1(ptr noalias %from, ptr noalias %to, double %v1, double %v2) {
; CHECK-LABEL: @shuffle_operands1(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i64 1
-; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TO:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @shuffle_operands1(
-; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
-; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i64 1
-; SSE2-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; SSE2-NEXT: store <2 x double> [[TMP5]], ptr [[TO:%.*]], align 4
+; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
+; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
+; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: ret void
;
%from_1 = getelementptr double, ptr %from, i64 1
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
+; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void
; SSE2-NEXT: br label [[LP:%.*]]
; SSE2: lp:
; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0
-; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
-; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
+; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 undef, i32 0>
+; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
+; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; SSE2: ext:
; SSE2-NEXT: ret void
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
-; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
+; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void
; SSE2-NEXT: br label [[LP:%.*]]
; SSE2: lp:
; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0
-; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
-; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
-; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
+; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 undef, i32 0>
+; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
+; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; SSE2: ext:
; SSE2-NEXT: ret void
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
-; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
+; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void
; SSE2-NEXT: br label [[LP:%.*]]
; SSE2: lp:
; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0
-; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
-; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
-; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
+; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 undef, i32 0>
+; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
+; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; SSE2: ext:
; SSE2-NEXT: ret void
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; SSE2-NEXT: br label [[LP:%.*]]
; SSE2: lp:
; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1
-; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]]
+; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; SSE2: ext:
; SSE2-NEXT: br label [[LP:%.*]]
; SSE2: lp:
; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1
-; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]]
+; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; SSE2: ext:
; SSE2-NEXT: br label [[LP:%.*]]
; SSE2: lp:
; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1
-; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP2]]
+; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; SSE2: ext:
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @a, align 16
; CHECK-NEXT: br label [[FOR_BODY3:%.*]]
; CHECK: for.body3:
-; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1
; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4
; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 4, i32 5, i32 6>
-; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
-; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 undef, i32 0, i32 1, i32 2>
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]]
+; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP13]]
-; CHECK-NEXT: [[TMP14]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 3
-; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]]
+; CHECK-NEXT: [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3
+; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]]
; CHECK-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
; CHECK: for.end:
; CHECK-NEXT: ret void
; SSE2-NEXT: [[TMP0:%.*]] = load float, ptr @a, align 16
; SSE2-NEXT: br label [[FOR_BODY3:%.*]]
; SSE2: for.body3:
-; SSE2-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ]
+; SSE2-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
; SSE2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
; SSE2-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; SSE2-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1
; SSE2-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; SSE2-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4
; SSE2-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
-; SSE2-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
-; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
-; SSE2-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 4, i32 5, i32 6>
-; SSE2-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
-; SSE2-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
+; SSE2-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
+; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 undef, i32 0, i32 1, i32 2>
+; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0
+; SSE2-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]]
+; SSE2-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4
; SSE2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; SSE2-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP13]]
-; SSE2-NEXT: [[TMP14]] = load float, ptr [[ARRAYIDX41]], align 4
-; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 3
-; SSE2-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]]
+; SSE2-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]]
+; SSE2-NEXT: [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4
+; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3
+; SSE2-NEXT: [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]]
; SSE2-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
-; SSE2-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; SSE2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995
+; SSE2-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; SSE2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
; SSE2-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
; SSE2: for.end:
; SSE2-NEXT: ret void
define void @load_reorder_double(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){
; CHECK-LABEL: @load_reorder_double(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @load_reorder_double(
-; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
-; SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
-; SSE2-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; SSE2-NEXT: store <2 x double> [[TMP5]], ptr [[C:%.*]], align 4
+; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
+; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4
; SSE2-NEXT: ret void
;
%1 = load double, ptr %a
define void @load_reorder_float(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){
; CHECK-LABEL: @load_reorder_float(
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @load_reorder_float(
-; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
-; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
-; SSE2-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; SSE2-NEXT: store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4
+; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
+; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4
; SSE2-NEXT: ret void
;
%1 = load float, ptr %a
define void @opcode_reorder(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c,ptr noalias nocapture readonly %d) {
; CHECK-LABEL: @opcode_reorder(
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[TMP5]]
-; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]]
+; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @opcode_reorder(
-; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
-; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
-; SSE2-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; SSE2-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
-; SSE2-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[TMP5]]
-; SSE2-NEXT: store <4 x float> [[TMP8]], ptr [[A:%.*]], align 4
+; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
+; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
+; SSE2-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]]
+; SSE2-NEXT: store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4
; SSE2-NEXT: ret void
;
%1 = load float, ptr %b
; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_ELSE:%.*]], label [[IF_END:%.*]]
; CHECK: if.else:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 10
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: br label [[IF_END]]
; CHECK: if.end:
-; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[IF_ELSE]] ], [ <double 3.000000e+00, double 5.000000e+00>, [[ENTRY:%.*]] ]
-; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[A]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x double> [ [[TMP0]], [[IF_ELSE]] ], [ <double 3.000000e+00, double 5.000000e+00>, [[ENTRY:%.*]] ]
+; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[A]], align 8
; CHECK-NEXT: ret i32 undef
;
entry:
define i32 @foo2(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 %n, i32 %m) #0 {
; CHECK-LABEL: @foo2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_019:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+01, double 1.000000e+01>
-; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
-; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x double> [ [[TMP0]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 1.000000e+01, double 1.000000e+01>
+; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT: [[TMP4]] = fadd <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
; CHECK-NEXT: [[INC]] = add nsw i32 [[I_019]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
-; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[B:%.*]], align 8
; CHECK-NEXT: ret i32 0
;
entry:
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A:%.*]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 0
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP16:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
-; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP8]], 7.000000e+00
+; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP3]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 7.000000e+00
; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]]
-; CHECK-NEXT: [[TMP9:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT: [[TMP12]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP12]], <4 x i32> <i32 1, i32 undef, i32 2, i32 3>
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP10]], i32 1
-; CHECK-NEXT: [[TMP15:%.*]] = fmul <4 x float> [[TMP14]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
-; CHECK-NEXT: [[TMP16]] = fadd <4 x float> [[TMP6]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP17]], 121
+; CHECK-NEXT: [[TMP9]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP9]], <4 x i32> <i32 1, i32 undef, i32 2, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x float> [[TMP11]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
+; CHECK-NEXT: [[TMP13]] = fadd <4 x float> [[TMP4]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; CHECK: for.end:
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP16]], i32 0
-; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP18]]
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP16]], i32 1
-; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP19]]
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP16]], i32 2
-; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP20]]
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP16]], i32 3
-; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP21]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP13]], i32 0
+; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP13]], i32 1
+; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP13]], i32 2
+; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP13]], i32 3
+; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP18]]
; CHECK-NEXT: ret float [[ADD31]]
;
entry:
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer
; SSE-NEXT: store <2 x i64> [[TMP5]], ptr undef, align 1
-; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0
-; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP5]], <2 x i32> <i32 0, i32 3>
-; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
-; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
-; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
-; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP5]], <i64 6, i64 6>
-; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]]
-; SSE-NEXT: store <2 x i64> [[TMP12]], ptr [[ARRAYIDX2_2]], align 1
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 0
+; SSE-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], <i64 2, i64 2>
+; SSE-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], <i64 20, i64 20>
+; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], <i64 6, i64 6>
+; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]]
+; SSE-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1
; SSE-NEXT: ret void
;
; AVX-LABEL: @pr35497(
; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
; AVX-NEXT: store <2 x i64> [[TMP4]], ptr undef, align 1
-; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0
-; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; AVX-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], <i64 2, i64 2>
-; AVX-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], <i64 20, i64 20>
-; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
-; AVX-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
-; AVX-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]]
-; AVX-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1
+; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[ADD]], i32 0
+; AVX-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP5]], <i64 2, i64 2>
+; AVX-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], <i64 20, i64 20>
+; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; AVX-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
+; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]]
+; AVX-NEXT: store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1
; AVX-NEXT: ret void
;
entry:
define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: @logical_and_icmp_clamp_v8i32(
-; CHECK-NEXT: [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0
-; CHECK-NEXT: [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
-; CHECK-NEXT: [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
-; CHECK-NEXT: [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[X:%.*]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[Y0]], i32 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[Y1]], i32 5
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[Y2]], i32 6
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[Y3]], i32 7
-; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[TMP1]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]])
-; CHECK-NEXT: ret i1 [[TMP8]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[Y:%.*]], <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 poison, i32 poison, i32 poison, i32 poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]])
+; CHECK-NEXT: ret i1 [[TMP5]]
;
%x0 = extractelement <8 x i32> %x, i32 0
%x1 = extractelement <8 x i32> %x, i32 1
; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0
; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FNEG]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr> poison, ptr [[I250]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[I242]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[I245]], i32 2
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x ptr> [[TMP7]], ptr [[I248]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP8]], <8 x ptr> poison, <8 x i32> <i32 2, i32 0, i32 1, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x ptr> <ptr poison, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x ptr> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <8 x ptr> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP4]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP13]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP14]], false
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 2, i32 0, i32 1, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP13]], false
; CHECK-NEXT: ret i1 [[OP_RDX]]
;
bb:
; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison)
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> <i32 4, i32 3, i32 0, i32 1, i32 2, i32 0, i32 1, i32 2, i32 0, i32 2, i32 5, i32 6, i32 7, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <16 x float> [[TMP8]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> poison, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 9, i32 0, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: store <16 x float> [[TMP11]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[T13:%.*]] = and <2 x i32> [[TMP4]], zeroinitializer
; CHECK-NEXT: br label [[ELSE1:%.*]]
; CHECK: else1:
-; CHECK-NEXT: [[T20:%.*]] = extractelement <2 x i32> [[T13]], i64 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[BF_CAST162]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[T20]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[T13]], <2 x i32> poison, <2 x i32> <i32 undef, i32 0>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[BF_CAST162]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt <2 x i32> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
; CHECK-NEXT: ret i1 [[TMP8]]
; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> <float 0.000000e+00, float poison>, <2 x float> [[TMP0]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]])
; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
; CHECK: bb2:
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ]
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP1]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[TMP12]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = fsub <2 x float> [[TMP13]], zeroinitializer
-; CHECK-NEXT: store <2 x float> [[TMP14]], ptr [[ARRAYIDX21_I]], align 16
+; CHECK-NEXT: [[TMP15:%.*]] = fsub <2 x float> [[TMP14]], zeroinitializer
+; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[ARRAYIDX21_I]], align 16
; CHECK-NEXT: ret void
;
entry: