TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
E->ReuseShuffleIndices);
}
+ auto &&AdjustExtractsCost = [this, CostKind, VL, VecTy](InstructionCost &Cost,
+ bool IsGather) {
+ DenseMap<Value *, int> ExtractVectorsTys;
+ for (auto *V : VL) {
+ // If all users of instruction are going to be vectorized and this
+ // instruction itself is not going to be vectorized, consider this
+ // instruction as dead and remove its cost from the final cost of the
+ // vectorized tree.
+ if (IsGather && (!areAllUsersVectorized(cast<Instruction>(V)) ||
+ ScalarToTreeEntry.count(V)))
+ continue;
+ auto *EE = cast<ExtractElementInst>(V);
+ unsigned Idx = *getExtractIndex(EE);
+ if (TTI->getNumberOfParts(VecTy) !=
+ TTI->getNumberOfParts(EE->getVectorOperandType())) {
+ auto It =
+ ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
+ It->getSecond() = std::min<int>(It->second, Idx);
+ }
+ // Take credit for instruction that will become dead.
+ if (EE->hasOneUse()) {
+ Instruction *Ext = EE->user_back();
+ if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ all_of(Ext->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ Cost -=
+ TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+ EE->getVectorOperandType(), Idx);
+ // Add back the cost of s|zext which is subtracted separately.
+ Cost += TTI->getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), EE->getType(),
+ TTI::getCastContextHint(Ext), CostKind, Ext);
+ continue;
+ }
+ }
+ Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+ EE->getVectorOperandType(), Idx);
+ }
+ // Add a cost for subvector extracts/inserts if required.
+ for (const auto &Data : ExtractVectorsTys) {
+ auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
+ unsigned NumElts = VecTy->getNumElements();
+ if (TTI->getNumberOfParts(EEVTy) > TTI->getNumberOfParts(VecTy))
+ Cost +=
+ TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, EEVTy,
+ None, (Data.second / NumElts) * NumElts, VecTy);
+ else
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
+ VecTy, None, 0, EEVTy);
+ }
+ };
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
return 0;
if (ShuffleKind.hasValue()) {
InstructionCost Cost =
computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
- for (auto *V : VL) {
- // If all users of instruction are going to be vectorized and this
- // instruction itself is not going to be vectorized, consider this
- // instruction as dead and remove its cost from the final cost of the
- // vectorized tree.
- if (areAllUsersVectorized(cast<Instruction>(V)) &&
- !ScalarToTreeEntry.count(V)) {
- auto *IO = cast<ConstantInt>(
- cast<ExtractElementInst>(V)->getIndexOperand());
- Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
- IO->getZExtValue());
- }
- }
+ AdjustExtractsCost(Cost, /*IsGather=*/true);
return ReuseShuffleCost + Cost;
}
}
unsigned Idx = 0;
for (unsigned I : E->ReuseShuffleIndices) {
if (ShuffleOrOp == Instruction::ExtractElement) {
- auto *IO = cast<ConstantInt>(
- cast<ExtractElementInst>(VL[I])->getIndexOperand());
- Idx = IO->getZExtValue();
+ auto *EE = cast<ExtractElementInst>(VL[I]);
ReuseShuffleCost -= TTI->getVectorInstrCost(
- Instruction::ExtractElement, VecTy, Idx);
+ Instruction::ExtractElement, EE->getVectorOperandType(),
+ *getExtractIndex(EE));
} else {
ReuseShuffleCost -= TTI->getVectorInstrCost(
Instruction::ExtractElement, VecTy, Idx);
Idx = ReuseShuffleNumbers;
for (Value *V : VL) {
if (ShuffleOrOp == Instruction::ExtractElement) {
- auto *IO = cast<ConstantInt>(
- cast<ExtractElementInst>(V)->getIndexOperand());
- Idx = IO->getZExtValue();
+ auto *EE = cast<ExtractElementInst>(V);
+ ReuseShuffleCost += TTI->getVectorInstrCost(
+ Instruction::ExtractElement, EE->getVectorOperandType(),
+ *getExtractIndex(EE));
} else {
--Idx;
+ ReuseShuffleCost += TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
}
- ReuseShuffleCost +=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
}
CommonCost = ReuseShuffleCost;
} else if (!E->ReorderIndices.empty()) {
CommonCost = TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
}
- for (unsigned I = 0, E = VL.size(); I < E; ++I) {
- Instruction *EI = cast<Instruction>(VL[I]);
- // If all users are going to be vectorized, instruction can be
- // considered as dead.
- // The same, if have only one user, it will be vectorized for sure.
- if (areAllUsersVectorized(EI)) {
+ if (ShuffleOrOp == Instruction::ExtractValue) {
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = cast<Instruction>(VL[I]);
// Take credit for instruction that will become dead.
if (EI->hasOneUse()) {
Instruction *Ext = EI->user_back();
CommonCost -=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
}
+ } else {
+ AdjustExtractsCost(CommonCost, /*IsGather=*/false);
}
return CommonCost;
}
define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
-; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
-; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
-; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
-; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0
+; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
-; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP2_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1_0]], i32 0
; CHECK-NEXT: [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
-; CHECK-NEXT: [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
-; CHECK-NEXT: [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
+; CHECK-NEXT: [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: ret <4 x i32> [[TMP2_3]]
;
%v0.0 = extractelement <2 x i32> %v0, i32 0
define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
-; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
-; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
-; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
-; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0
+; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
-; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0
; CHECK-NEXT: [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
-; CHECK-NEXT: [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
-; CHECK-NEXT: [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
+; CHECK-NEXT: [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: ret <4 x i32> [[TMP2_3]]
;
%v0.0 = extractelement <2 x i32> %v0, i32 0
; CHECK-LABEL: @noop_extracts_first_2_lanes(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8
+; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0
+; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <2 x double> [[V_1]], i32 1
; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
; CHECK-NEXT: [[V2_LANE_3:%.*]] = extractelement <4 x double> [[V_2]], i32 3
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_3]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[V_1]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
-; CHECK-NEXT: [[A_INS_0:%.*]] = insertelement <2 x double> undef, double [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
-; CHECK-NEXT: [[A_INS_1:%.*]] = insertelement <2 x double> [[A_INS_0]], double [[TMP4]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[V_1]], i32 0
-; CHECK-NEXT: call void @use(double [[TMP5]])
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[V_1]], i32 1
-; CHECK-NEXT: call void @use(double [[TMP6]])
+; CHECK-NEXT: [[A_LANE_0:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_2]]
+; CHECK-NEXT: [[A_LANE_1:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_3]]
+; CHECK-NEXT: [[A_INS_0:%.*]] = insertelement <2 x double> undef, double [[A_LANE_0]], i32 0
+; CHECK-NEXT: [[A_INS_1:%.*]] = insertelement <2 x double> [[A_INS_0]], double [[A_LANE_1]], i32 1
+; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
+; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
; CHECK-NEXT: store <2 x double> [[A_INS_1]], <2 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
;
; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[V1_LANE_2]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[V1_LANE_3]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[V1_LANE_0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[V1_LANE_1]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[V2_LANE_0]], i32 1
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP3]], [[SHUFFLE]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP6]], i32 0
-; CHECK-NEXT: [[A_INS_0:%.*]] = insertelement <9 x double> undef, double [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP6]], i32 1
-; CHECK-NEXT: [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP6]], i32 2
-; CHECK-NEXT: [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[TMP9]], i32 2
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x double> [[TMP6]], i32 3
-; CHECK-NEXT: [[A_INS_3:%.*]] = insertelement <9 x double> [[A_INS_2]], double [[TMP10]], i32 3
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_2]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_3]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[A_LANE_2:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_2]]
+; CHECK-NEXT: [[A_LANE_3:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_0]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT: [[A_INS_0:%.*]] = insertelement <9 x double> undef, double [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT: [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[TMP6]], i32 1
+; CHECK-NEXT: [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[A_LANE_2]], i32 2
+; CHECK-NEXT: [[A_INS_3:%.*]] = insertelement <9 x double> [[A_INS_2]], double [[A_LANE_3]], i32 3
; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
; CHECK-NEXT: [[A_LANE_0:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_2]]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_2]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_1]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[A_LANE_1:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]]
+; CHECK-NEXT: [[A_LANE_2:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_2]]
; CHECK-NEXT: [[A_LANE_3:%.*]] = fmul double [[V1_LANE_3]], [[V2_LANE_0]]
; CHECK-NEXT: [[A_INS_0:%.*]] = insertelement <9 x double> undef, double [[A_LANE_0]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
-; CHECK-NEXT: [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
-; CHECK-NEXT: [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[TMP6]], i32 2
+; CHECK-NEXT: [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[A_LANE_1]], i32 1
+; CHECK-NEXT: [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[A_LANE_2]], i32 2
; CHECK-NEXT: [[A_INS_3:%.*]] = insertelement <9 x double> [[A_INS_2]], double [[A_LANE_3]], i32 3
; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
; AVX1-LABEL: @ashr_shl_v8i32(
; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
-; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
-; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
-; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT: [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]]
; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 undef, i32 undef>
+; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
; AVX1-NEXT: ret <8 x i32> [[R7]]
;
; AVX2-LABEL: @ashr_shl_v8i32(
; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4
; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8
; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[AB2]], i32 2
-; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[AB3]], i32 3
-; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5
; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
; CHECK-NEXT: ret <8 x i32> [[R7]]
; AVX1-LABEL: @ashr_shl_v8i32(
; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
-; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
-; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
-; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT: [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]]
; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT: [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 undef, i32 undef>
+; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
; AVX1-NEXT: ret <8 x i32> [[R7]]
;
; AVX2-LABEL: @ashr_shl_v8i32(
; CHECK-NEXT: ret float [[X0]]
;
; THRESH1-LABEL: @f_used_out_of_tree(
-; THRESH1-NEXT: [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
-; THRESH1-NEXT: [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1
-; THRESH1-NEXT: [[X0X0:%.*]] = fmul float [[X0]], [[X0]]
-; THRESH1-NEXT: [[X1X1:%.*]] = fmul float [[X1]], [[X1]]
-; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[X0X0]], [[X1X1]]
+; THRESH1-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
+; THRESH1-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[X]], [[X]]
+; THRESH1-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; THRESH1-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]]
; THRESH1-NEXT: store float [[ADD]], float* @a, align 4
-; THRESH1-NEXT: ret float [[X0]]
+; THRESH1-NEXT: ret float [[TMP1]]
;
; THRESH2-LABEL: @f_used_out_of_tree(
; THRESH2-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0