(V)PHMINPOSUW determines the UMIN element in an v8i16 input, with suitable bit flipping it can also be used for SMAX/SMIN/UMAX cases as well.
This patch matches vXi16 SMAX/SMIN/UMAX/UMIN horizontal reductions and reduces the input down to a v8i16 vector before calling (V)PHMINPOSUW.
A later patch will use this for v16i8 reductions as well (PR32841).
Differential Revision: https://reviews.llvm.org/D39729
llvm-svn: 318917
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
+ case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
case X86ISD::ADD: return "X86ISD::ADD";
case X86ISD::SUB: return "X86ISD::SUB";
case X86ISD::ADC: return "X86ISD::ADC";
return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
+// Attempt to replace an min/max v8i16 horizontal reduction with PHMINPOSUW.
+static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Bail without SSE41.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ EVT ExtractVT = Extract->getValueType(0);
+ if (ExtractVT != MVT::i16)
+ return SDValue();
+
+ // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
+ unsigned BinOp;
+ SDValue Src = matchBinOpReduction(
+ Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
+ if (!Src)
+ return SDValue();
+
+ EVT SrcVT = Src.getValueType();
+ EVT SrcSVT = SrcVT.getScalarType();
+ if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)
+ return SDValue();
+
+ SDLoc DL(Extract);
+ SDValue MinPos = Src;
+
+ // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
+ while (SrcVT.getSizeInBits() > 128) {
+ unsigned NumElts = SrcVT.getVectorNumElements();
+ unsigned NumSubElts = NumElts / 2;
+ SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
+ unsigned SubSizeInBits = SrcVT.getSizeInBits();
+ SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
+ SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
+ MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
+ }
+ assert(SrcVT == MVT::v8i16 && "Unexpected value type");
+
+ // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
+ // to flip the value accordingly.
+ SDValue Mask;
+ if (BinOp == ISD::SMAX)
+ Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);
+ else if (BinOp == ISD::SMIN)
+ Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);
+ else if (BinOp == ISD::UMAX)
+ Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);
+
+ if (Mask)
+ MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
+
+ MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);
+
+ if (Mask)
+ MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
+ DAG.getIntPtrConstant(0, DL));
+}
+
// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SelectionDAG &DAG,
if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
return Cmp;
+ // Attempt to replace min/max v8i16 reductions with PHMINPOSUW.
+ if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
+ return MinMax;
+
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
if (SrcVT != MVT::v4i32)
// Vector integer comparisons, the result is in a mask vector.
PCMPEQM, PCMPGTM,
+ // v8i16 Horizontal minimum and position.
+ PHMINPOS,
+
MULTISHIFT,
/// Vector comparison generating mask bits for fp and
def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>;
+def X86phminpos: SDNode<"X86ISD::PHMINPOS",
+ SDTypeProfile<1, 1, [SDTCisVT<0, v8i16>, SDTCisVT<1, v8i16>]>>;
+
def X86vshiftuniform : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisVec<2>, SDTCisInt<0>,
SDTCisInt<1>]>;
Sched<[WriteFAddLd]>, XS;
}
-
-
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
- Intrinsic IntId128, PatFrag ld_frag,
+ SDNode OpNode, PatFrag ld_frag,
X86FoldableSchedWrite Sched> {
def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
Sched<[Sched]>;
def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (IntId128 (bitconvert (ld_frag addr:$src))))]>,
+ (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
Sched<[Sched.Folded]>;
}
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
- int_x86_sse41_phminposuw, loadv2i64,
+ X86phminpos, loadv2i64,
WriteVecIMul>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
- int_x86_sse41_phminposuw, memopv2i64,
+ X86phminpos, memopv2i64,
WriteVecIMul>;
/// SS48I_binop_rm - Simple SSE41 binary operator.
X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
}
define i16 @test_reduce_v8i16(<8 x i16> %a0) {
-; X86-SSE-LABEL: test_reduce_v8i16:
-; X86-SSE: ## BB#0:
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: psrld $16, %xmm1
-; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE-NEXT: movd %xmm1, %eax
-; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-SSE-NEXT: retl
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v8i16:
; X86-AVX: ## BB#0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX-NEXT: retl
;
-; X64-SSE-LABEL: test_reduce_v8i16:
-; X64-SSE: ## BB#0:
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE-NEXT: psrld $16, %xmm1
-; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE-NEXT: movd %xmm1, %eax
-; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-SSE-NEXT: retq
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v8i16:
; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vmovd %xmm0, %eax
; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX-NEXT: retq
}
define i16 @test_reduce_v16i16(<16 x i16> %a0) {
-; X86-SSE-LABEL: test_reduce_v16i16:
-; X86-SSE: ## BB#0:
-; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: psrld $16, %xmm1
-; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE-NEXT: movd %xmm1, %eax
-; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-SSE-NEXT: retl
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v16i16:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX2-LABEL: test_reduce_v16i16:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
-; X64-SSE-LABEL: test_reduce_v16i16:
-; X64-SSE: ## BB#0:
-; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE-NEXT: psrld $16, %xmm1
-; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE-NEXT: movd %xmm1, %eax
-; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-SSE-NEXT: retq
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v16i16:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX2-LABEL: test_reduce_v16i16:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX512-LABEL: test_reduce_v16i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
}
define i16 @test_reduce_v32i16(<32 x i16> %a0) {
-; X86-SSE-LABEL: test_reduce_v32i16:
-; X86-SSE: ## BB#0:
-; X86-SSE-NEXT: pmaxsw %xmm3, %xmm1
-; X86-SSE-NEXT: pmaxsw %xmm2, %xmm0
-; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
-; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: psrld $16, %xmm1
-; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
-; X86-SSE-NEXT: movd %xmm1, %eax
-; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-SSE-NEXT: retl
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1
+; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm0
+; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsw %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxsw %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v32i16:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
-; X64-SSE-LABEL: test_reduce_v32i16:
-; X64-SSE: ## BB#0:
-; X64-SSE-NEXT: pmaxsw %xmm3, %xmm1
-; X64-SSE-NEXT: pmaxsw %xmm2, %xmm0
-; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
-; X64-SSE-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE-NEXT: psrld $16, %xmm1
-; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
-; X64-SSE-NEXT: movd %xmm1, %eax
-; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-SSE-NEXT: retq
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pmaxsw %xmm3, %xmm1
+; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm0
+; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsw %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxsw %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v32i16:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX512-LABEL: test_reduce_v32i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
}
define i16 @test_reduce_v8i16(<8 x i16> %a0) {
-; X86-SSE-LABEL: test_reduce_v8i16:
-; X86-SSE: ## BB#0:
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE-NEXT: pminsw %xmm1, %xmm0
-; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: psrld $16, %xmm1
-; X86-SSE-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE-NEXT: movd %xmm1, %eax
-; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-SSE-NEXT: retl
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v8i16:
; X86-AVX: ## BB#0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX-NEXT: retl
;
-; X64-SSE-LABEL: test_reduce_v8i16:
-; X64-SSE: ## BB#0:
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE-NEXT: pminsw %xmm1, %xmm0
-; X64-SSE-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE-NEXT: psrld $16, %xmm1
-; X64-SSE-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE-NEXT: movd %xmm1, %eax
-; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-SSE-NEXT: retq
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v8i16:
; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vmovd %xmm0, %eax
; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX-NEXT: retq
}
define i16 @test_reduce_v16i16(<16 x i16> %a0) {
-; X86-SSE-LABEL: test_reduce_v16i16:
-; X86-SSE: ## BB#0:
-; X86-SSE-NEXT: pminsw %xmm1, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE-NEXT: pminsw %xmm1, %xmm0
-; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: psrld $16, %xmm1
-; X86-SSE-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE-NEXT: movd %xmm1, %eax
-; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-SSE-NEXT: retl
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v16i16:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX2-LABEL: test_reduce_v16i16:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
-; X64-SSE-LABEL: test_reduce_v16i16:
-; X64-SSE: ## BB#0:
-; X64-SSE-NEXT: pminsw %xmm1, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE-NEXT: pminsw %xmm1, %xmm0
-; X64-SSE-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE-NEXT: psrld $16, %xmm1
-; X64-SSE-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE-NEXT: movd %xmm1, %eax
-; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-SSE-NEXT: retq
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v16i16:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX2-LABEL: test_reduce_v16i16:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX512-LABEL: test_reduce_v16i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
}
define i16 @test_reduce_v32i16(<32 x i16> %a0) {
-; X86-SSE-LABEL: test_reduce_v32i16:
-; X86-SSE: ## BB#0:
-; X86-SSE-NEXT: pminsw %xmm3, %xmm1
-; X86-SSE-NEXT: pminsw %xmm2, %xmm0
-; X86-SSE-NEXT: pminsw %xmm1, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE-NEXT: pminsw %xmm1, %xmm0
-; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: psrld $16, %xmm1
-; X86-SSE-NEXT: pminsw %xmm0, %xmm1
-; X86-SSE-NEXT: movd %xmm1, %eax
-; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-SSE-NEXT: retl
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pminsw %xmm3, %xmm1
+; X86-SSE2-NEXT: pminsw %xmm2, %xmm0
+; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsw %xmm3, %xmm1
+; X86-SSE42-NEXT: pminsw %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
;
; X86-AVX1-LABEL: test_reduce_v32i16:
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
-; X64-SSE-LABEL: test_reduce_v32i16:
-; X64-SSE: ## BB#0:
-; X64-SSE-NEXT: pminsw %xmm3, %xmm1
-; X64-SSE-NEXT: pminsw %xmm2, %xmm0
-; X64-SSE-NEXT: pminsw %xmm1, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE-NEXT: pminsw %xmm1, %xmm0
-; X64-SSE-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE-NEXT: psrld $16, %xmm1
-; X64-SSE-NEXT: pminsw %xmm0, %xmm1
-; X64-SSE-NEXT: movd %xmm1, %eax
-; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; X64-SSE-NEXT: retq
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pminsw %xmm3, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm2, %xmm0
+; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsw %xmm3, %xmm1
+; X64-SSE42-NEXT: pminsw %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: test_reduce_v32i16:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX512-LABEL: test_reduce_v32i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
;
; X86-SSE42-LABEL: test_reduce_v8i16:
; X86-SSE42: ## BB#0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v8i16:
; X86-AVX: ## BB#0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX-NEXT: retl
;
; X64-SSE42-LABEL: test_reduce_v8i16:
; X64-SSE42: ## BB#0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v8i16:
; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vmovd %xmm0, %eax
; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX-NEXT: retq
; X86-SSE42-LABEL: test_reduce_v16i16:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX2-LABEL: test_reduce_v16i16:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X64-SSE42-LABEL: test_reduce_v16i16:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX2-LABEL: test_reduce_v16i16:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX512-LABEL: test_reduce_v16i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
; X86-SSE42-NEXT: pmaxuw %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxuw %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X64-SSE42-NEXT: pmaxuw %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxuw %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX512-LABEL: test_reduce_v32i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
;
; X86-SSE42-LABEL: test_reduce_v8i16:
; X86-SSE42: ## BB#0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v8i16:
; X86-AVX: ## BB#0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX-NEXT: retl
;
; X64-SSE42-LABEL: test_reduce_v8i16:
; X64-SSE42: ## BB#0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v8i16:
; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX-NEXT: vmovd %xmm0, %eax
; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX-NEXT: retq
; X86-SSE42-LABEL: test_reduce_v16i16:
; X86-SSE42: ## BB#0:
; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX1: ## BB#0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX2-LABEL: test_reduce_v16i16:
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X64-SSE42-LABEL: test_reduce_v16i16:
; X64-SSE42: ## BB#0:
; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX2-LABEL: test_reduce_v16i16:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX512-LABEL: test_reduce_v16i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper
; X86-SSE42-NEXT: pminuw %xmm3, %xmm1
; X86-SSE42-NEXT: pminuw %xmm2, %xmm0
; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-SSE42-NEXT: retl
;
; X86-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX1-NEXT: vzeroupper
; X86-AVX2: ## BB#0:
; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X86-AVX2-NEXT: vzeroupper
; X64-SSE42-NEXT: pminuw %xmm3, %xmm1
; X64-SSE42-NEXT: pminuw %xmm2, %xmm0
; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-SSE42-NEXT: retq
;
; X64-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX1-NEXT: vzeroupper
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX2-NEXT: vzeroupper
; X64-AVX512-LABEL: test_reduce_v32i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vmovd %xmm0, %eax
; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-AVX512-NEXT: vzeroupper