Previously we used vptestmd, but the scheduling data for SKX says vpmovq2m/vpmovd2m have lower latency. We already used vpmovb2m/vpmovw2m for byte/word truncates, so this is more consistent anyway.
llvm-svn: 325534
In = DAG.getNode(ISD::SHL, DL, InVT, In,
DAG.getConstant(ShiftInx, DL, InVT));
}
+ // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
+ if (Subtarget.hasDQI())
+ return DAG.getNode(X86ISD::CMPM, DL, VT, DAG.getConstant(0, DL, InVT),
+ In, DAG.getConstant(6, DL, MVT::i8));
return DAG.getNode(X86ISD::CMPM, DL, VT, In,
getZeroVector(InVT, Subtarget, DAG, DL),
DAG.getConstant(4, DL, MVT::i8));
}
define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
-; NOVL-LABEL: f64to4f32_mask:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vpslld $31, %xmm1, %xmm1
-; NOVL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NOVL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; NOVL-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: f64to4f32_mask:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vpslld $31, %xmm1, %xmm1
+; NOVLDQ-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NOVLDQ-NEXT: vcvtpd2ps %ymm0, %xmm0
+; NOVLDQ-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVLDQ-NEXT: vzeroupper
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: f64to4f32_mask:
-; VL: # %bb.0:
-; VL-NEXT: vpslld $31, %xmm1, %xmm1
-; VL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
-; VL-NEXT: vzeroupper
-; VL-NEXT: retq
+; VLDQ-LABEL: f64to4f32_mask:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vpslld $31, %xmm1, %xmm1
+; VLDQ-NEXT: vpmovd2m %xmm1, %k1
+; VLDQ-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
+; VLDQ-NEXT: vzeroupper
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: f64to4f32_mask:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vpslld $31, %xmm1, %xmm1
+; VLNODQ-NEXT: vptestmd %xmm1, %xmm1, %k1
+; VLNODQ-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vzeroupper
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: f64to4f32_mask:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: vpslld $31, %xmm1, %xmm1
+; DQNOVL-NEXT: vpmovd2m %zmm1, %k1
+; DQNOVL-NEXT: vcvtpd2ps %ymm0, %xmm0
+; DQNOVL-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; DQNOVL-NEXT: vzeroupper
+; DQNOVL-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
ret <4 x float> %c
}
define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) {
-; NOVL-LABEL: test_2f64toub:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
-; NOVL-NEXT: vpslld $31, %ymm0, %ymm0
-; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: test_2f64toub:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; NOVLDQ-NEXT: vcvttpd2udq %zmm0, %ymm0
+; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVLDQ-NEXT: vzeroupper
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: test_2f64toub:
-; VL: # %bb.0:
-; VL-NEXT: vcvttpd2udq %xmm0, %xmm0
-; VL-NEXT: vpslld $31, %xmm0, %xmm0
-; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VL-NEXT: retq
+; VLDQ-LABEL: test_2f64toub:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0
+; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; VLDQ-NEXT: vpmovd2m %xmm0, %k1
+; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: test_2f64toub:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vcvttpd2udq %xmm0, %xmm0
+; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
+; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_2f64toub:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; DQNOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; DQNOVL-NEXT: vzeroupper
+; DQNOVL-NEXT: retq
%mask = fptoui <2 x double> %a to <2 x i1>
%select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
ret <2 x i64> %select
}
define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) {
-; NOVL-LABEL: test_4f64toub:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVL-NEXT: vcvttpd2dq %ymm0, %xmm0
-; NOVL-NEXT: vpslld $31, %xmm0, %xmm0
-; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: test_4f64toub:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0
+; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: test_4f64toub:
-; VL: # %bb.0:
-; VL-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VL-NEXT: vpslld $31, %xmm0, %xmm0
-; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VL-NEXT: retq
+; VLDQ-LABEL: test_4f64toub:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; VLDQ-NEXT: vpmovd2m %xmm0, %k1
+; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: test_4f64toub:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
+; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_4f64toub:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; DQNOVL-NEXT: retq
%mask = fptoui <4 x double> %a to <4 x i1>
%select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
ret <4 x i64> %select
}
define <8 x i64> @test_8f64toub(<8 x double> %a, <8 x i64> %passthru) {
-; NOVL-LABEL: test_8f64toub:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT: vpslld $31, %ymm0, %ymm0
-; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: test_8f64toub:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
+; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: test_8f64toub:
-; VL: # %bb.0:
-; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT: vpslld $31, %ymm0, %ymm0
-; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
-; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VL-NEXT: retq
+; VLDQ-LABEL: test_8f64toub:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
+; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; VLDQ-NEXT: vpmovd2m %ymm0, %k1
+; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: test_8f64toub:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0
+; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0
+; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1
+; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_8f64toub:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: retq
%mask = fptoui <8 x double> %a to <8 x i1>
%select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
ret <8 x i64> %select
}
define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) {
-; NOVL-LABEL: test_2f32toub:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; NOVL-NEXT: vcvttps2dq %xmm0, %xmm0
-; NOVL-NEXT: vpslld $31, %xmm0, %xmm0
-; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: test_2f32toub:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVLDQ-NEXT: vzeroupper
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: test_2f32toub:
-; VL: # %bb.0:
-; VL-NEXT: vcvttps2dq %xmm0, %xmm0
-; VL-NEXT: vpslld $31, %xmm0, %xmm0
-; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VL-NEXT: retq
+; VLDQ-LABEL: test_2f32toub:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; VLDQ-NEXT: vpmovd2m %xmm0, %k1
+; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: test_2f32toub:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
+; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_2f32toub:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0
+; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; DQNOVL-NEXT: vzeroupper
+; DQNOVL-NEXT: retq
%mask = fptoui <2 x float> %a to <2 x i1>
%select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
ret <2 x i64> %select
}
define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) {
-; NOVL-LABEL: test_4f32toub:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVL-NEXT: vcvttps2dq %xmm0, %xmm0
-; NOVL-NEXT: vpslld $31, %xmm0, %xmm0
-; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: test_4f32toub:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: test_4f32toub:
-; VL: # %bb.0:
-; VL-NEXT: vcvttps2dq %xmm0, %xmm0
-; VL-NEXT: vpslld $31, %xmm0, %xmm0
-; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VL-NEXT: retq
+; VLDQ-LABEL: test_4f32toub:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; VLDQ-NEXT: vpmovd2m %xmm0, %k1
+; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: test_4f32toub:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
+; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_4f32toub:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0
+; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; DQNOVL-NEXT: retq
%mask = fptoui <4 x float> %a to <4 x i1>
%select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
ret <4 x i64> %select
}
define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) {
-; NOVL-LABEL: test_8f32toub:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vcvttps2dq %ymm0, %ymm0
-; NOVL-NEXT: vpslld $31, %ymm0, %ymm0
-; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: test_8f32toub:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vcvttps2dq %ymm0, %ymm0
+; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: test_8f32toub:
-; VL: # %bb.0:
-; VL-NEXT: vcvttps2dq %ymm0, %ymm0
-; VL-NEXT: vpslld $31, %ymm0, %ymm0
-; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
-; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VL-NEXT: retq
+; VLDQ-LABEL: test_8f32toub:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0
+; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; VLDQ-NEXT: vpmovd2m %ymm0, %k1
+; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: test_8f32toub:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vcvttps2dq %ymm0, %ymm0
+; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0
+; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1
+; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_8f32toub:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: vcvttps2dq %ymm0, %ymm0
+; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: retq
%mask = fptoui <8 x float> %a to <8 x i1>
%select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
ret <8 x i64> %select
}
define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) {
-; ALL-LABEL: test_16f32toub:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
-; ALL-NEXT: vpslld $31, %zmm0, %zmm0
-; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; ALL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; ALL-NEXT: retq
+; NODQ-LABEL: test_16f32toub:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vcvttps2dq %zmm0, %zmm0
+; NODQ-NEXT: vpslld $31, %zmm0, %zmm0
+; NODQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NODQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; NODQ-NEXT: retq
+;
+; VLDQ-LABEL: test_16f32toub:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttps2dq %zmm0, %zmm0
+; VLDQ-NEXT: vpslld $31, %zmm0, %zmm0
+; VLDQ-NEXT: vpmovd2m %zmm0, %k1
+; VLDQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_16f32toub:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: vcvttps2dq %zmm0, %zmm0
+; DQNOVL-NEXT: vpslld $31, %zmm0, %zmm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: retq
%mask = fptoui <16 x float> %a to <16 x i1>
%select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
ret <16 x i32> %select
}
define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) {
-; NOVL-LABEL: test_2f64tosb:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; NOVL-NEXT: vcvttpd2dq %xmm0, %xmm0
-; NOVL-NEXT: vpslld $31, %xmm0, %xmm0
-; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: test_2f64tosb:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; NOVLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0
+; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVLDQ-NEXT: vzeroupper
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: test_2f64tosb:
-; VL: # %bb.0:
-; VL-NEXT: vcvttpd2dq %xmm0, %xmm0
-; VL-NEXT: vpslld $31, %xmm0, %xmm0
-; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VL-NEXT: retq
+; VLDQ-LABEL: test_2f64tosb:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0
+; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; VLDQ-NEXT: vpmovd2m %xmm0, %k1
+; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: test_2f64tosb:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vcvttpd2dq %xmm0, %xmm0
+; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
+; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_2f64tosb:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; DQNOVL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; DQNOVL-NEXT: vzeroupper
+; DQNOVL-NEXT: retq
%mask = fptosi <2 x double> %a to <2 x i1>
%select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
ret <2 x i64> %select
}
define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) {
-; NOVL-LABEL: test_4f64tosb:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVL-NEXT: vcvttpd2dq %ymm0, %xmm0
-; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: test_4f64tosb:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0
+; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: test_4f64tosb:
-; VL: # %bb.0:
-; VL-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VL-NEXT: retq
+; VLDQ-LABEL: test_4f64tosb:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VLDQ-NEXT: vpmovd2m %xmm0, %k1
+; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: test_4f64tosb:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_4f64tosb:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; DQNOVL-NEXT: retq
%mask = fptosi <4 x double> %a to <4 x i1>
%select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
ret <4 x i64> %select
}
define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) {
-; NOVL-LABEL: test_8f64tosb:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: test_8f64tosb:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
+; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: test_8f64tosb:
-; VL: # %bb.0:
-; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
-; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VL-NEXT: retq
+; VLDQ-LABEL: test_8f64tosb:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
+; VLDQ-NEXT: vpmovd2m %ymm0, %k1
+; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: test_8f64tosb:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0
+; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1
+; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_8f64tosb:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: retq
%mask = fptosi <8 x double> %a to <8 x i1>
%select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
ret <8 x i64> %select
}
define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) {
-; NOVL-LABEL: test_2f32tosb:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; NOVL-NEXT: vcvttps2dq %xmm0, %xmm0
-; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: test_2f32tosb:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; NOVLDQ-NEXT: vzeroupper
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: test_2f32tosb:
-; VL: # %bb.0:
-; VL-NEXT: vcvttps2dq %xmm0, %xmm0
-; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VL-NEXT: retq
+; VLDQ-LABEL: test_2f32tosb:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; VLDQ-NEXT: vpmovd2m %xmm0, %k1
+; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: test_2f32tosb:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_2f32tosb:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; DQNOVL-NEXT: vzeroupper
+; DQNOVL-NEXT: retq
%mask = fptosi <2 x float> %a to <2 x i1>
%select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
ret <2 x i64> %select
}
define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) {
-; NOVL-LABEL: test_4f32tosb:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVL-NEXT: vcvttps2dq %xmm0, %xmm0
-; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: test_4f32tosb:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: test_4f32tosb:
-; VL: # %bb.0:
-; VL-NEXT: vcvttps2dq %xmm0, %xmm0
-; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VL-NEXT: retq
+; VLDQ-LABEL: test_4f32tosb:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; VLDQ-NEXT: vpmovd2m %xmm0, %k1
+; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: test_4f32tosb:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_4f32tosb:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; DQNOVL-NEXT: retq
%mask = fptosi <4 x float> %a to <4 x i1>
%select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
ret <4 x i64> %select
}
define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) {
-; NOVL-LABEL: test_8f32tosb:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vcvttps2dq %ymm0, %ymm0
-; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVL-NEXT: retq
+; NOVLDQ-LABEL: test_8f32tosb:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vcvttps2dq %ymm0, %ymm0
+; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: retq
;
-; VL-LABEL: test_8f32tosb:
-; VL: # %bb.0:
-; VL-NEXT: vcvttps2dq %ymm0, %ymm0
-; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
-; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VL-NEXT: retq
+; VLDQ-LABEL: test_8f32tosb:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0
+; VLDQ-NEXT: vpmovd2m %ymm0, %k1
+; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: test_8f32tosb:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vcvttps2dq %ymm0, %ymm0
+; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1
+; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VLNODQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_8f32tosb:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: vcvttps2dq %ymm0, %ymm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: retq
%mask = fptosi <8 x float> %a to <8 x i1>
%select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
ret <8 x i64> %select
}
define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) {
-; ALL-LABEL: test_16f32tosb:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
-; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; ALL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; ALL-NEXT: retq
+; NODQ-LABEL: test_16f32tosb:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vcvttps2dq %zmm0, %zmm0
+; NODQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NODQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; NODQ-NEXT: retq
+;
+; VLDQ-LABEL: test_16f32tosb:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvttps2dq %zmm0, %zmm0
+; VLDQ-NEXT: vpmovd2m %zmm0, %k1
+; VLDQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; VLDQ-NEXT: retq
+;
+; DQNOVL-LABEL: test_16f32tosb:
+; DQNOVL: # %bb.0:
+; DQNOVL-NEXT: vcvttps2dq %zmm0, %zmm0
+; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
+; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; DQNOVL-NEXT: retq
%mask = fptosi <16 x float> %a to <16 x i1>
%select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
ret <16 x i32> %select
; SKX-LABEL: zext_4x8mem_to_4x32:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovd2m %xmm0, %k1
; SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
; SKX-LABEL: sext_4x8mem_to_4x32:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovd2m %xmm0, %k1
; SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
; SKX-LABEL: zext_2x8mem_to_2x64:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovq2m %xmm0, %k1
; SKX-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; SKX-NEXT: retq
%a = load <2 x i8>,<2 x i8> *%i,align 1
; SKX-LABEL: sext_2x8mem_to_2x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovq2m %xmm0, %k1
; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <2 x i8>,<2 x i8> *%i,align 1
; SKX-LABEL: zext_4x8mem_to_4x64:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovd2m %xmm0, %k1
; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
; SKX-LABEL: sext_4x8mem_to_4x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovd2m %xmm0, %k1
; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
; SKX-LABEL: zext_4x16mem_to_4x32:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovd2m %xmm0, %k1
; SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
; SKX-LABEL: sext_4x16mem_to_4x32mask:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovd2m %xmm0, %k1
; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
; SKX-LABEL: zext_2x16mem_to_2x64:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovq2m %xmm0, %k1
; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; SKX-NEXT: retq
%a = load <2 x i16>,<2 x i16> *%i,align 1
; SKX-LABEL: sext_2x16mem_to_2x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovq2m %xmm0, %k1
; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <2 x i16>,<2 x i16> *%i,align 1
; SKX-LABEL: zext_4x16mem_to_4x64:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovd2m %xmm0, %k1
; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
; SKX-LABEL: sext_4x16mem_to_4x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovd2m %xmm0, %k1
; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
; SKX-LABEL: zext_2x32mem_to_2x64:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovq2m %xmm0, %k1
; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero
; SKX-NEXT: retq
%a = load <2 x i32>,<2 x i32> *%i,align 1
; SKX-LABEL: sext_2x32mem_to_2x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovq2m %xmm0, %k1
; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <2 x i32>,<2 x i32> *%i,align 1
; SKX-LABEL: zext_4x32mem_to_4x64:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovd2m %xmm0, %k1
; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SKX-NEXT: retq
%a = load <4 x i32>,<4 x i32> *%i,align 1
; SKX-LABEL: sext_4x32mem_to_4x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovd2m %xmm0, %k1
; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i32>,<4 x i32> *%i,align 1
; SKX-LABEL: zext_4x32_to_4x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SKX-NEXT: retq
%x = zext <4 x i32> %a to <4 x i64>
; SKX-LABEL: trunc_16i32_to_16i1:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %zmm0, %zmm0
-; SKX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; SKX-NEXT: vpmovd2m %zmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: # kill: def $ax killed $ax killed $eax
; SKX-NEXT: vzeroupper
; AVX512DQ: ## %bb.0: ## %allocas
; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: orb $85, %al
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: LBB18_3:
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; SKX-NEXT: LBB20_1:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: LBB20_3:
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
+; SKX-NEXT: vpmovd2m %xmm0, %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: retq
;
; AVX512DQ-NEXT: LBB20_1:
; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512DQ-NEXT: LBB20_3:
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; SKX-LABEL: test22:
; SKX: ## %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
+; SKX-NEXT: vpmovd2m %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512DQ-LABEL: test22:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; SKX-LABEL: test23:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0
+; SKX-NEXT: vpmovq2m %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512DQ-LABEL: test23:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; SKX-LABEL: store_v2i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0
+; SKX-NEXT: vpmovq2m %xmm0, %k0
; SKX-NEXT: knotw %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
; AVX512DQ-LABEL: store_v2i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: knotw %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; SKX-LABEL: store_v4i1:
; SKX: ## %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
+; SKX-NEXT: vpmovd2m %xmm0, %k0
; SKX-NEXT: knotw %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
; AVX512DQ-LABEL: store_v4i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: knotw %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: knotb %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: knotw %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
-; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k2
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k3
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT: kmovw %k3, 6(%rdi)
; AVX512DQ-NEXT: kmovw %k2, 4(%rdi)
; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
; GENERIC-LABEL: f64to4f32_mask:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} # sched: [3:1.00]
; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
; SKX-LABEL: f64to4f32_mask:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm1, %k1 # sched: [1:1.00]
; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} # sched: [7:1.00]
; SKX-NEXT: vzeroupper # sched: [4:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
; GENERIC-LABEL: zext_4x8mem_to_4x32:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: zext_4x8mem_to_4x32:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [9:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <4 x i8>,<4 x i8> *%i,align 1
; GENERIC-LABEL: sext_4x8mem_to_4x32:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: sext_4x8mem_to_4x32:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <4 x i8>,<4 x i8> *%i,align 1
; GENERIC-LABEL: zext_2x8mem_to_2x64:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: zext_2x8mem_to_2x64:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [9:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <2 x i8>,<2 x i8> *%i,align 1
; GENERIC-LABEL: sext_2x8mem_to_2x64mask:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: sext_2x8mem_to_2x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <2 x i8>,<2 x i8> *%i,align 1
; GENERIC-LABEL: zext_4x8mem_to_4x64:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: zext_4x8mem_to_4x64:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <4 x i8>,<4 x i8> *%i,align 1
; GENERIC-LABEL: sext_4x8mem_to_4x64mask:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: sext_4x8mem_to_4x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <4 x i8>,<4 x i8> *%i,align 1
; GENERIC-LABEL: zext_4x16mem_to_4x32:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: zext_4x16mem_to_4x32:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [9:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <4 x i16>,<4 x i16> *%i,align 1
; GENERIC-LABEL: sext_4x16mem_to_4x32mask:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: sext_4x16mem_to_4x32mask:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <4 x i16>,<4 x i16> *%i,align 1
; GENERIC-LABEL: zext_2x16mem_to_2x64:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: zext_2x16mem_to_2x64:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [9:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <2 x i16>,<2 x i16> *%i,align 1
; GENERIC-LABEL: sext_2x16mem_to_2x64mask:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: sext_2x16mem_to_2x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <2 x i16>,<2 x i16> *%i,align 1
; GENERIC-LABEL: zext_4x16mem_to_4x64:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: zext_4x16mem_to_4x64:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <4 x i16>,<4 x i16> *%i,align 1
; GENERIC-LABEL: sext_4x16mem_to_4x64mask:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: sext_4x16mem_to_4x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <4 x i16>,<4 x i16> *%i,align 1
; GENERIC-LABEL: zext_2x32mem_to_2x64:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: zext_2x32mem_to_2x64:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [9:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <2 x i32>,<2 x i32> *%i,align 1
; GENERIC-LABEL: sext_2x32mem_to_2x64mask:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: sext_2x32mem_to_2x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <2 x i32>,<2 x i32> *%i,align 1
; GENERIC-LABEL: zext_4x32mem_to_4x64:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: zext_4x32mem_to_4x64:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <4 x i32>,<4 x i32> *%i,align 1
; GENERIC-LABEL: sext_4x32mem_to_4x64mask:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: sext_4x32mem_to_4x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%a = load <4 x i32>,<4 x i32> *%i,align 1
; GENERIC-LABEL: zext_4x32_to_4x64mask:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: zext_4x32_to_4x64mask:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm1, %k1 # sched: [1:1.00]
; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%x = zext <4 x i32> %a to <4 x i64>
; GENERIC-LABEL: trunc_16i32_to_16i1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %zmm0, %zmm0 # sched: [3:1.00]
-; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %zmm0, %k0 # sched: [1:0.33]
; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
; GENERIC-NEXT: # kill: def $ax killed $ax killed $eax
; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
; SKX-LABEL: trunc_16i32_to_16i1:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %zmm0, %zmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %zmm0, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %zmm0, %k0 # sched: [1:1.00]
; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
; SKX-NEXT: # kill: def $ax killed $ax killed $eax
; SKX-NEXT: vzeroupper # sched: [4:1.00]
; GENERIC-NEXT: .LBB389_1:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: .LBB389_3:
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:0.33]
; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-NEXT: .LBB389_1:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
; SKX-NEXT: .LBB389_3:
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:1.00]
; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%mask = icmp sgt i32 %a1, %b1
; GENERIC-LABEL: vmov_test22:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:0.33]
; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: vmov_test22:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:1.00]
; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
store <4 x i1> %a, <4 x i1>* %addr
; GENERIC-LABEL: vmov_test23:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovq2m %xmm0, %k0 # sched: [1:0.33]
; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: vmov_test23:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovq2m %xmm0, %k0 # sched: [1:1.00]
; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
store <2 x i1> %a, <2 x i1>* %addr
; GENERIC-LABEL: store_v2i1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovq2m %xmm0, %k0 # sched: [1:0.33]
; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00]
; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
; SKX-LABEL: store_v2i1:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovq2m %xmm0, %k0 # sched: [1:1.00]
; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00]
; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
; GENERIC-LABEL: store_v4i1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:0.33]
; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00]
; GENERIC-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
; SKX-LABEL: store_v4i1:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovd2m %xmm0, %k0 # sched: [1:1.00]
; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00]
; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
; CHECK-LABEL: test:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpmovq2m %xmm0, %k0
; CHECK-NEXT: kshiftlb $2, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
; CHECK-LABEL: test1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpmovq2m %xmm0, %k0
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
; CHECK-LABEL: test2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpmovq2m %xmm0, %k0
; CHECK-NEXT: vpmovm2d %k0, %ymm0
; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; CHECK-NEXT: vpmovd2m %ymm0, %k0
; CHECK-LABEL: test3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpmovd2m %xmm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
; CHECK-LABEL: test4:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
-; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k0
+; CHECK-NEXT: vpmovd2m %xmm1, %k0
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT: vpmovd2m %xmm0, %k1
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: korb %k0, %k1, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-LABEL: test5:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $63, %xmm1, %xmm1
-; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k0
+; CHECK-NEXT: vpmovq2m %xmm1, %k0
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT: vpmovq2m %xmm0, %k1
; CHECK-NEXT: kshiftlb $2, %k0, %k0
; CHECK-NEXT: korb %k0, %k1, %k0
; CHECK-NEXT: vpmovm2d %k0, %xmm0
; CHECK-LABEL: test6:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $63, %xmm1, %xmm1
-; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k0
+; CHECK-NEXT: vpmovq2m %xmm1, %k0
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT: vpmovq2m %xmm0, %k1
; CHECK-NEXT: kshiftlb $2, %k0, %k0
; CHECK-NEXT: korb %k0, %k1, %k0
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-LABEL: test7:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
-; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k0
+; CHECK-NEXT: vpmovd2m %xmm1, %k0
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT: vpmovd2m %xmm0, %k1
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: korb %k0, %k1, %k0
; CHECK-NEXT: vpmovm2b %k0, %ymm0
; CHECK-LABEL: test10:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpmovd2m %xmm0, %k0
; CHECK-NEXT: kshiftrb $2, %k0, %k0
; CHECK-NEXT: vpmovm2q %k0, %xmm0
; CHECK-NEXT: retq
; CHECK-LABEL: test11:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpmovd2m %xmm0, %k0
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
target triple = "x86_64-unknown-unknown"
define <8 x i64> @test1(<8 x i64> %m, <8 x i64> %a, <8 x i64> %b) {
-; CHECK-LABEL: test1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0
-; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1
-; CHECK-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1}
-; CHECK-NEXT: retq
+; CHECK-SKX-LABEL: test1:
+; CHECK-SKX: # %bb.0: # %entry
+; CHECK-SKX-NEXT: vpsllq $63, %zmm0, %zmm0
+; CHECK-SKX-NEXT: vpmovq2m %zmm0, %k1
+; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-KNL-LABEL: test1:
+; CHECK-KNL: # %bb.0: # %entry
+; CHECK-KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; CHECK-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-KNL-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-KNL-NEXT: retq
entry:
%m.trunc = trunc <8 x i64> %m to <8 x i1>
%ret = select <8 x i1> %m.trunc, <8 x i64> %a, <8 x i64> %b
; SKX-LABEL: test10:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vpcompressq %ymm0, (%rdi) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-LABEL: test11:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vpcompressq %xmm0, (%rdi) {%k1}
; SKX-NEXT: retq
;
; SKX-LABEL: test12:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vcompressps %xmm0, (%rdi) {%k1}
; SKX-NEXT: retq
;
; SKX-LABEL: test15:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
; SKX_32-LABEL: test15:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpmovd2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
; SKX_32-NEXT: vmovaps %xmm1, %xmm0
; SKX-LABEL: test16:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
; SKX-NEXT: vmovapd %ymm2, %ymm0
; SKX-NEXT: retq
; SKX_32-LABEL: test16:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpmovd2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
; SKX_32-NEXT: vmovapd %ymm2, %ymm0
; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovapd %xmm2, %xmm0
; SKX-NEXT: retq
; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpmovq2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
; SKX_32-NEXT: vmovapd %xmm2, %xmm0
; SKX-LABEL: test18:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
-; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
+; SKX-NEXT: vpmovd2m %xmm2, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX_32-LABEL: test18:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
+; SKX_32-NEXT: vpmovd2m %xmm2, %k1
; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
; SKX-LABEL: test19:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX_32-LABEL: test19:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpmovd2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
; SKX_32-NEXT: vzeroupper
; SKX-LABEL: test20:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1
+; SKX-NEXT: vpmovq2m %xmm2, %k1
; SKX-NEXT: vscatterqps %xmm0, (,%xmm1) {%k1}
; SKX-NEXT: retq
;
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1
+; SKX_32-NEXT: vpmovq2m %xmm2, %k1
; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
; SKX-LABEL: test21:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1
+; SKX-NEXT: vpmovq2m %xmm2, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
; SKX-NEXT: retq
; SKX_32-LABEL: test21:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1
+; SKX_32-NEXT: vpmovq2m %xmm2, %k1
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
; SKX: # %bb.0:
; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpmovq2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; SKX-LABEL: test22a:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
; SKX_32-LABEL: test22a:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpmovq2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm2 {%k1}
; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; SKX-LABEL: test23:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX_32-LABEL: test23:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpmovq2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SKX-LABEL: test23b:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SKX_32-LABEL: test23b:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpmovq2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovdqa %xmm2, %xmm0
; SKX-NEXT: retq
; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpmovq2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
; SKX-LABEL: test30:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
-; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
+; SKX-NEXT: vpmovd2m %xmm2, %k1
; SKX-NEXT: kmovw %k1, %eax
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
; SKX_32-NEXT: subl $12, %esp
; SKX_32-NEXT: .cfi_def_cfa_offset 16
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
+; SKX_32-NEXT: vpmovd2m %xmm2, %k1
; SKX_32-NEXT: kmovw %k1, %eax
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vpmovd2m %zmm2, %k1
; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpmovd2m %zmm1, %k1
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX_32-NEXT: retl
; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vpmovd2m %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
; SKX_32-NEXT: subl $64, %esp
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpmovd2m %zmm1, %k1
; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vpmovd2m %zmm2, %k1
; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpmovd2m %zmm1, %k1
; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT: vmovaps %zmm2, %zmm0
; SKX_32-NEXT: retl
; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vpmovd2m %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
; SKX_32-NEXT: subl $64, %esp
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpmovd2m %zmm1, %k1
; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vpmovd2m %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm0
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpmovd2m %zmm1, %k1
; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vpmovd2m %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
; SKX_32-NEXT: subl $64, %esp
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpmovd2m %zmm1, %k1
; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vpmovd2m %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm0
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpmovd2m %zmm1, %k1
; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vpmovd2m %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
; SKX_32-NEXT: subl $64, %esp
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpmovd2m %zmm1, %k1
; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
; SKX-LABEL: test_pr28312:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovd2m %xmm1, %k1
; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; SKX_32-NEXT: andl $-32, %esp
; SKX_32-NEXT: subl $32, %esp
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpmovd2m %xmm1, %k1
; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; SKX-LABEL: large_index:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovq2m %xmm0, %k1
; SKX-NEXT: vmovq %rcx, %xmm0
; SKX-NEXT: vmovq %rsi, %xmm2
; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; SKX_32-LABEL: large_index:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX_32-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX_32-NEXT: vpmovq2m %xmm0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; SKX-LABEL: test_scatter_2i32_index:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1
+; SKX-NEXT: vpmovq2m %xmm2, %k1
; SKX-NEXT: vpsllq $32, %xmm1, %xmm1
; SKX-NEXT: vpsraq $32, %xmm1, %xmm1
; SKX-NEXT: vscatterqpd %xmm0, (%rdi,%xmm1,8) {%k1}
; SKX_32-LABEL: test_scatter_2i32_index:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1
+; SKX_32-NEXT: vpmovq2m %xmm2, %k1
; SKX_32-NEXT: vpsllq $32, %xmm1, %xmm1
; SKX_32-NEXT: vpsraq $32, %xmm1, %xmm1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX-LABEL: test:
; SKX: # %bb.0: # %bb
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
+; SKX-NEXT: vpmovd2m %xmm0, %k0
; SKX-NEXT: kshiftrb $2, %k0, %k1
; SKX-NEXT: kshiftrw $1, %k1, %k2
; SKX-NEXT: kmovd %k2, %eax
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1
; CHECK-NEXT: vpslld $31, %zmm1, %zmm1
-; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1
+; CHECK-NEXT: vpmovd2m %zmm1, %k1
; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%a = load <16 x float>, <16 x float>* %ptr
; CHECK-LABEL: test_16f32tosb_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1
-; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1
+; CHECK-NEXT: vpmovd2m %zmm1, %k1
; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%a = load <16 x float>, <16 x float>* %ptr
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm4
-; AVX512DQ-NEXT: vptestmd %zmm4, %zmm4, %k0
+; AVX512DQ-NEXT: vpmovd2m %zmm4, %k0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
-; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k2
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k3
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
-; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k4
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k5
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k5
; AVX512DQ-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1
-; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k6
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k7
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7
; AVX512DQ-NEXT: kmovw %k7, 14(%rdi)
; AVX512DQ-NEXT: kmovw %k6, 12(%rdi)
; AVX512DQ-NEXT: kmovw %k5, 10(%rdi)
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
+; AVX512F-NEXT: vpmovd2m %zmm3, %k1
; AVX512F-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; AVX512F-NEXT: vmovaps %zmm2, %zmm0
; AVX512F-NEXT: retq
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
+; AVX512F-NEXT: vpmovd2m %zmm3, %k1
; AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
; VL_BW_DQ-LABEL: shuf2i1_1_0:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
+; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-LABEL: shuf2i1_1_2:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
+; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: movq $-1, %rax
; VL_BW_DQ-NEXT: vmovq %rax, %xmm0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vptestmd %xmm0, %xmm0, %k0
+; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0