We had previously limited the shuffle(HORIZOP,HORIZOP) combine to binary shuffles, but we can often merge unary shuffles just as well, folding UNDEF/ZERO values into the 64-bit half lanes.
For the (P)HADD/HSUB cases this is limited to fast-horizontal targets, but the PACKSS/PACKUS combine applies in all cases.
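For illustration only (a minimal sketch, not taken from the patch's test files), the kind of unary pattern this now folds is a shuffle that only reuses 64-bit halves of a single horizontal op, e.g.:

; Unary shuffle that repeats the low 64-bit half of a horizontal add.
; On fast-hadd targets this can now fold to a single haddps of %a with itself.
define <4 x float> @unary_hadd_example(<4 x float> %a, <4 x float> %b) {
  %h = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a, <4 x float> %b)
  %s = shufflevector <4 x float> %h, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %s
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)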
SmallVector<int, 64> TargetMask;
SmallVector<SDValue, 2> TargetOps;
if (isTargetShuffle(Opcode))
- getTargetShuffleMask(N.getNode(), VT, false, TargetOps, TargetMask, IsUnary);
+ getTargetShuffleMask(N.getNode(), VT, true, TargetOps, TargetMask, IsUnary);
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
// single instruction. Attempt to match a v2X64 repeating shuffle pattern that
// represents the LHS/RHS inputs for the lower/upper halves.
SmallVector<int, 16> TargetMask128;
- if (!TargetMask.empty() && TargetOps.size() == 2 &&
- is128BitLaneRepeatedShuffleMask(VT, TargetMask, TargetMask128)) {
+ if (!TargetMask.empty() && 0 < TargetOps.size() && TargetOps.size() <= 2 &&
+ isRepeatedTargetShuffleMask(128, VT, TargetMask, TargetMask128)) {
SmallVector<int, 16> WidenedMask128 = TargetMask128;
while (WidenedMask128.size() > 2) {
SmallVector<int, 16> WidenedMask;
if (!canWidenShuffleElements(WidenedMask128, WidenedMask))
break;
WidenedMask128 = std::move(WidenedMask);
}
- if (WidenedMask128.size() == 2 && isInRange(WidenedMask128, 0, 4)) {
- SDValue BC0 = peekThroughBitcasts(TargetOps[0]);
- SDValue BC1 = peekThroughBitcasts(TargetOps[1]);
+ if (WidenedMask128.size() == 2) {
+ assert(isUndefOrZeroOrInRange(WidenedMask128, 0, 4) && "Illegal shuffle");
+ SDValue BC0 = peekThroughBitcasts(TargetOps.front());
+ SDValue BC1 = peekThroughBitcasts(TargetOps.back());
EVT VT0 = BC0.getValueType();
EVT VT1 = BC1.getValueType();
unsigned Opcode0 = BC0.getOpcode();
unsigned Opcode1 = BC1.getOpcode();
+ bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
+ Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
if (Opcode0 == Opcode1 && VT0 == VT1 &&
- (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
- Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
- Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
- SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1;
- SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1;
- Lo = Lo.getOperand(WidenedMask128[0] & 1);
- Hi = Hi.getOperand(WidenedMask128[1] & 1);
- SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
- return DAG.getBitcast(VT, Horiz);
+ (isHoriz || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
+ bool SingleOp = (TargetOps.size() == 1);
+ if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
+ SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1;
+ SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1;
+ Lo = Lo.getOperand(WidenedMask128[0] & 1);
+ Hi = Hi.getOperand(WidenedMask128[1] & 1);
+ if (SingleOp) {
+ MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
+ SDValue Undef = DAG.getUNDEF(SrcVT);
+ SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
+ Lo = (WidenedMask128[0] == SM_SentinelZero ? Zero : Lo);
+ Hi = (WidenedMask128[1] == SM_SentinelZero ? Zero : Hi);
+ Lo = (WidenedMask128[0] == SM_SentinelUndef ? Undef : Lo);
+ Hi = (WidenedMask128[1] == SM_SentinelUndef ? Undef : Hi);
+ }
+ SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
+ return DAG.getBitcast(VT, Horiz);
+ }
}
}
}
}
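As a rough trace of the new unary path (illustrative, using the sketch above): the target shuffle mask is {0,1,0,1}, which is already a per-128-bit-lane mask, and the widening loop collapses it to the two 64-bit half selectors {0,0}. Both Lo and Hi therefore resolve to operand 0 of the single FHADD, and the node is rebuilt as FHADD(%a,%a), guarded by shouldUseHorizontalOp in the single-op case. If a half selector is SM_SentinelZero or SM_SentinelUndef instead, the SingleOp block substitutes a zero vector or UNDEF for that input.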
define <8 x float> @hadd_v8f32a(<8 x float> %a) {
-; SSSE3-LABEL: hadd_v8f32a:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movaps %xmm0, %xmm2
-; SSSE3-NEXT: haddps %xmm1, %xmm2
-; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
-; SSSE3-NEXT: movaps %xmm2, %xmm1
-; SSSE3-NEXT: retq
+; SSSE3_SLOW-LABEL: hadd_v8f32a:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm2
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
+; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm1
+; SSSE3_SLOW-NEXT: retq
;
-; AVX1-LABEL: hadd_v8f32a:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
+; SSSE3_FAST-LABEL: hadd_v8f32a:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: movaps %xmm0, %xmm2
+; SSSE3_FAST-NEXT: haddps %xmm1, %xmm2
+; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0
+; SSSE3_FAST-NEXT: movaps %xmm2, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hadd_v8f32a:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hadd_v8f32a:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm1
+; AVX1_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_FAST-NEXT: retq
;
; AVX2-LABEL: hadd_v8f32a:
; AVX2: # %bb.0:
}
define <8 x float> @hsub_v8f32a(<8 x float> %a) {
-; SSSE3-LABEL: hsub_v8f32a:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movaps %xmm0, %xmm2
-; SSSE3-NEXT: hsubps %xmm1, %xmm2
-; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
-; SSSE3-NEXT: movaps %xmm2, %xmm1
-; SSSE3-NEXT: retq
+; SSSE3_SLOW-LABEL: hsub_v8f32a:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: hsubps %xmm1, %xmm2
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
+; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm1
+; SSSE3_SLOW-NEXT: retq
;
-; AVX1-LABEL: hsub_v8f32a:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vhsubps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
+; SSSE3_FAST-LABEL: hsub_v8f32a:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: movaps %xmm0, %xmm2
+; SSSE3_FAST-NEXT: hsubps %xmm1, %xmm2
+; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0
+; SSSE3_FAST-NEXT: movaps %xmm2, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hsub_v8f32a:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1_SLOW-NEXT: vhsubps %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hsub_v8f32a:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1_FAST-NEXT: vhsubps %xmm1, %xmm0, %xmm1
+; AVX1_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_FAST-NEXT: retq
;
; AVX2-LABEL: hsub_v8f32a:
; AVX2: # %bb.0:
}
define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
-; SSSE3-LABEL: hadd_v8i32a:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: phaddd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: retq
+; SSSE3_SLOW-LABEL: hadd_v8i32a:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: phaddd %xmm1, %xmm2
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
+; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1
+; SSSE3_SLOW-NEXT: retq
;
-; AVX1-LABEL: hadd_v8i32a:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
+; SSSE3_FAST-LABEL: hadd_v8i32a:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2
+; SSSE3_FAST-NEXT: phaddd %xmm1, %xmm2
+; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hadd_v8i32a:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1_SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hadd_v8i32a:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1_FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm1
+; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_FAST-NEXT: retq
;
; AVX2-LABEL: hadd_v8i32a:
; AVX2: # %bb.0:
}
define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
-; SSSE3-LABEL: hsub_v8i32a:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: phsubd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: retq
+; SSSE3_SLOW-LABEL: hsub_v8i32a:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: phsubd %xmm1, %xmm2
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
+; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1
+; SSSE3_SLOW-NEXT: retq
;
-; AVX1-LABEL: hsub_v8i32a:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vphsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
+; SSSE3_FAST-LABEL: hsub_v8i32a:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2
+; SSSE3_FAST-NEXT: phsubd %xmm1, %xmm2
+; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0
+; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hsub_v8i32a:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1_SLOW-NEXT: vphsubd %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hsub_v8i32a:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1_FAST-NEXT: vphsubd %xmm1, %xmm0, %xmm1
+; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_FAST-NEXT: retq
;
; AVX2-LABEL: hsub_v8i32a:
; AVX2: # %bb.0:
}
define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
-; SSSE3-LABEL: hadd_v16i16a:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: phaddw %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: retq
+; SSSE3_SLOW-LABEL: hadd_v16i16a:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: phaddw %xmm1, %xmm2
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
+; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1
+; SSSE3_SLOW-NEXT: retq
;
-; AVX1-LABEL: hadd_v16i16a:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
+; SSSE3_FAST-LABEL: hadd_v16i16a:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2
+; SSSE3_FAST-NEXT: phaddw %xmm1, %xmm2
+; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hadd_v16i16a:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1_SLOW-NEXT: vphaddw %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hadd_v16i16a:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1_FAST-NEXT: vphaddw %xmm1, %xmm0, %xmm1
+; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_FAST-NEXT: retq
;
; AVX2-LABEL: hadd_v16i16a:
; AVX2: # %bb.0:
}
define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
-; SSSE3-LABEL: hsub_v16i16a:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: phsubw %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: retq
+; SSSE3_SLOW-LABEL: hsub_v16i16a:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: phsubw %xmm1, %xmm2
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
+; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1
+; SSSE3_SLOW-NEXT: retq
;
-; AVX1-LABEL: hsub_v16i16a:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vphsubw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
+; SSSE3_FAST-LABEL: hsub_v16i16a:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2
+; SSSE3_FAST-NEXT: phsubw %xmm1, %xmm2
+; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0
+; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hsub_v16i16a:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1_SLOW-NEXT: vphsubw %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hsub_v16i16a:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1_FAST-NEXT: vphsubw %xmm1, %xmm0, %xmm1
+; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_FAST-NEXT: retq
;
; AVX2-LABEL: hsub_v16i16a:
; AVX2: # %bb.0:
;
; SSE-FAST-LABEL: test11_undef:
; SSE-FAST: # %bb.0:
+; SSE-FAST-NEXT: movaps %xmm3, %xmm1
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
-; SSE-FAST-NEXT: haddps %xmm3, %xmm3
-; SSE-FAST-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
+; SSE-FAST-NEXT: haddps %xmm3, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX-LABEL: test11_undef:
; AVX-FAST-LABEL: add_pd_010:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-FAST-NEXT: retq
%l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
%add = fadd <2 x double> %l, %x
;
; SSE-FAST-LABEL: test15_undef:
; SSE-FAST: # %bb.0:
+; SSE-FAST-NEXT: movdqa %xmm3, %xmm1
; SSE-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSE-FAST-NEXT: phaddd %xmm3, %xmm3
-; SSE-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
+; SSE-FAST-NEXT: phaddd %xmm3, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: test15_undef:
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpacksswb %xmm1, %xmm4, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi)
; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
+; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm8
; AVX1-NEXT: vpsrad $31, %xmm8, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9
+; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT: vpmuldq %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm3, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm9
+; AVX1-NEXT: vpxor %xmm1, %xmm9, %xmm1
+; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmuldq %xmm4, %xmm7, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3],xmm6[4,5],xmm1[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpsrad $31, %xmm4, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuldq %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm9, %xmm0, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpxor %xmm0, %xmm9, %xmm0
+; AVX1-NEXT: vpackssdw %xmm5, %xmm0, %xmm5
+; AVX1-NEXT: vpacksswb %xmm1, %xmm5, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-NEXT: vpacksswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi)
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
+; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpacksswb %xmm1, %xmm4, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi)
; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
+; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm6
+; AVX1-NEXT: vpacksswb %xmm1, %xmm6, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi)
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
+; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5
; AVX1-NEXT: vpackssdw %xmm13, %xmm5, %xmm5
-; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm5
+; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm7
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm6
-; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm7, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpacksswb %xmm5, %xmm11, %xmm1
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,0,1]
-; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
; AVX1-NEXT: vmovdqa %xmm6, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi)
; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vpacksswb %xmm0, %xmm5, %xmm5
; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpmovsxbd %xmm4, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
+; AVX2-NEXT: vpmovsxbd %xmm5, %ymm0
+; AVX2-NEXT: vpacksswb %xmm0, %xmm4, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
+; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm6
+; AVX1-NEXT: vpacksswb %xmm1, %xmm6, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi)
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
+; AVX2-NEXT: vpacksswb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)