From: Craig Topper
Date: Sun, 18 Nov 2018 17:59:28 +0000 (+0000)
Subject: [X86] Lower v16i16->v16i8 truncate using an 'and' with 255, an extract_subvector...
X-Git-Tag: llvmorg-8.0.0-rc1~3997
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=bc8148f7b056f8cd1c6a2dfe59988b3e1ab4215d;p=platform%2Fupstream%2Fllvm.git

[X86] Lower v16i16->v16i8 truncate using an 'and' with 255, an extract_subvector, and a packuswb instruction.

Summary: This is an improvement over the two pshufbs and punpcklqdq we'd get otherwise.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D54671

llvm-svn: 347171
---
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 062c978..1c8f6aa 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18047,6 +18047,22 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getBitcast(MVT::v8i16, res);
   }
 
+  if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
+    // Use an AND to force a PACKUS.
+    In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
+
+    SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
+                               DAG.getIntPtrConstant(0, DL));
+    SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
+                               DAG.getIntPtrConstant(8, DL));
+    InLo = DAG.getBitcast(VT, InLo);
+    InHi = DAG.getBitcast(VT, InHi);
+
+    return DAG.getVectorShuffle(VT, DL, InLo, InHi,
+                                { 0, 2, 4, 6, 8, 10, 12, 14,
+                                 16, 18, 20, 22, 24, 26, 28, 30});
+  }
+
   // Handle truncation of V256 to V128 using shuffles.
   assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
 
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 02d8bd8..b1378b6 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -364,17 +364,17 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX1-NEXT: vpsrld $1, %xmm9, %xmm5
 ; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6
 ; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT: vmovups %ymm1, (%rax)
 ; AVX1-NEXT: vzeroupper
@@ -421,27 +421,27 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX2-NEXT: vpsrld $1, %ymm7, %ymm5
 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
 ; AVX2-NEXT: vpackusdw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX2-NEXT: vmovdqa 
{{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 ; AVX2-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX2-NEXT: vpand %xmm6, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm5, %xmm1 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX2-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vpand %xmm6, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper @@ -2282,15 +2282,15 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm12[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,2],xmm2[0,2] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm3[0,2] ; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: addq $24, %rsp ; AVX1-NEXT: popq %rbx @@ -2525,8 +2525,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX2-NEXT: vpshufb %ymm1, %ymm8, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0] @@ -2536,8 +2536,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: addq $16, %rsp ; 
AVX2-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/avx-trunc.ll b/llvm/test/CodeGen/X86/avx-trunc.ll index f1af384..49f57fa 100644 --- a/llvm/test/CodeGen/X86/avx-trunc.ll +++ b/llvm/test/CodeGen/X86/avx-trunc.ll @@ -29,11 +29,9 @@ define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp{ define <16 x i8> @trunc_16_8(<16 x i16> %A) nounwind uwtable readnone ssp{ ; CHECK-LABEL: trunc_16_8: ; CHECK: # %bb.0: +; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %B = trunc <16 x i16> %A to <16 x i8> diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll index c11337b..e54ca61 100644 --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -148,11 +148,9 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone { ; X32-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 ; X32-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X32-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; X32-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; X32-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; @@ -161,11 +159,9 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone { ; X64-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; X64-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq %x = mul <16 x i8> %i, %j @@ -180,18 +176,16 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { ; X32-NEXT: vextracti128 $1, %ymm0, %xmm3 ; X32-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero ; 
X32-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; X32-NEXT: vextracti128 $1, %ymm2, %xmm3 -; X32-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; X32-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; X32-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; X32-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X32-NEXT: vextracti128 $1, %ymm2, %xmm4 +; X32-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; X32-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpand %ymm3, %ymm0, %ymm0 ; X32-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X32-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; X32-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; X32-NEXT: retl ; @@ -202,18 +196,16 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { ; X64-NEXT: vextracti128 $1, %ymm0, %xmm3 ; X64-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero ; X64-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; X64-NEXT: vextracti128 $1, %ymm2, %xmm3 -; X64-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; X64-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; X64-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; X64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X64-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X64-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X64-NEXT: vextracti128 $1, %ymm2, %xmm4 +; X64-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; X64-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpand %ymm3, %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; X64-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; X64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; X64-NEXT: retq %x = mul <32 x i8> %i, %j diff --git a/llvm/test/CodeGen/X86/avx2-conversions.ll b/llvm/test/CodeGen/X86/avx2-conversions.ll index d4c2330..1060d70 100644 --- a/llvm/test/CodeGen/X86/avx2-conversions.ll +++ b/llvm/test/CodeGen/X86/avx2-conversions.ll @@ -163,21 +163,17 @@ define <16 x i16> @sext_16i8_16i16(<16 x i8> %z) { define <16 x i8> @trunc_16i16_16i8(<16 x i16> 
%z) { ; X32-LABEL: trunc_16i16_16i8: ; X32: # %bb.0: +; X32-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 ; X32-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X32-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; X32-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; X32-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: trunc_16i16_16i8: ; X64: # %bb.0: +; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; X64-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq %t = trunc <16 x i16> %z to <16 x i8> diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll index e47c598..d88b659 100644 --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -306,11 +306,9 @@ define <16 x i8> @PR35579(<16 x i8> %x) { ; AVX: # %bb.0: ; AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %r = mul <16 x i8> %x, diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index f5f7019..1526325 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -38,11 +38,9 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind { ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -173,11 +171,9 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind { ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -424,17 +420,15 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind { ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -608,18 +602,16 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero ; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: 
vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -826,31 +818,27 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-NEXT: vpackuswb %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: retq ; @@ -997,36 +985,32 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = 
xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX2-NEXT: vpmullw %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX2-NEXT: vpmullw %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; 
AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll index 829dd42..b4f8e5b 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll @@ -48,11 +48,11 @@ define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256-NEXT: vpmovdw %ymm1, %xmm1 -; AVX256-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 +; AVX256-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm0, %xmm0 -; AVX256-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX256-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX256-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll index 120e448..92f6e27 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -34,11 +34,11 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 ; AVX256VL-NEXT: kshiftrw $8, %k0, %k2 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1 -; AVX256VL-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 +; AVX256VL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX256VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX256VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX256VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX256VL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX256VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX256VL-NEXT: vzeroupper ; AVX256VL-NEXT: retq ; @@ -169,11 +169,11 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX256VL-NEXT: kshiftrw $8, %k0, %k2 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1 -; AVX256VL-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 +; AVX256VL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX256VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX256VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX256VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX256VL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX256VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX256VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX256VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/prefer-avx256-trunc.ll b/llvm/test/CodeGen/X86/prefer-avx256-trunc.ll index 6e21584..3b1ff44 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-trunc.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-trunc.ll @@ -11,11 +11,9 @@ define <16 x i8> @testv16i16_trunc_v16i8(<16 x i16> %x) { ; AVX256NOBW-LABEL: testv16i16_trunc_v16i8: ; AVX256NOBW: # %bb.0: +; AVX256NOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX256NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256NOBW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX256NOBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX256NOBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX256NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] +; AVX256NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX256NOBW-NEXT: vzeroupper ; AVX256NOBW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 6e2e979..6e3c137 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -824,12 +824,12 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpandn %xmm0, %xmm4, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll index 940f475..629d37b 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll @@ -41,16 +41,25 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { } define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX-LABEL: trunc_v16i16_to_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_v16i16_to_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_v16i16_to_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_v16i16_to_v16i8: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 7ef9db5..fc6a2cd 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -41,16 +41,25 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { } define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX-LABEL: trunc_v16i16_to_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: 
trunc_v16i16_to_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_v16i16_to_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_v16i16_to_v16i8: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll index 4e4c0a0..59f1abb 100644 --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -214,11 +214,9 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; X64-AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq %1 = mul <16 x i8> %a0, @@ -577,11 +575,9 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8> ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; X64-AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq %1 = mul <16 x i8> %a0, @@ -1158,11 +1154,9 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; X64-AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq %1 = mul <16 x i8> %a0, diff --git 
a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll index 50d4a74..d7718ba 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -1557,31 +1557,29 @@ define i8 @test_v16i8(<16 x i8> %a0) { ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm2 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1843,39 +1841,37 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm2 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -2213,55 +2209,51 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX2-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero ; AVX2-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -2737,83 +2729,75 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero ; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; AVX2-NEXT: vpackuswb %xmm6, %xmm5, %xmm5 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm6 = 
xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero ; AVX2-NEXT: vpmullw %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vpand %ymm4, %ymm6, %ymm6 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm7 -; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; AVX2-NEXT: vpackuswb %xmm7, %xmm6, %xmm6 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} 
ymm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 1c52b4b..4e8e785 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -909,11 +909,9 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll index 8f6a1fe..fb5407b 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll @@ -267,8 +267,8 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -276,8 +276,8 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, 
%ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -294,15 +294,15 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -366,12 +366,12 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -403,21 +403,19 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -715,8 +713,8 @@ define <16 x i8> 
@trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -724,8 +722,8 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -739,15 +737,15 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -801,12 +799,12 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -834,22 +832,18 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; ; AVX1-LABEL: trunc_add_const_v16i16_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v16i16_v16i8: ; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1142,8 +1136,8 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -1151,8 +1145,8 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1169,15 +1163,15 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -1241,12 +1235,12 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1278,21 +1272,19 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_sub_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1612,8 +1604,8 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -1621,8 +1613,8 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1639,15 +1631,15 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; 
AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -1709,12 +1701,12 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1745,21 +1737,19 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2315,8 +2305,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -2330,8 +2320,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb 
%ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -2348,8 +2338,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vpmulld %xmm5, %xmm1, %xmm1 @@ -2359,8 +2349,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -2474,12 +2464,12 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2511,21 +2501,19 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: 
vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2946,8 +2934,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 @@ -2957,8 +2945,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -2973,8 +2961,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 @@ -2982,8 +2970,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3072,13 +3060,13 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3109,21 +3097,19 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_const_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3399,8 +3385,8 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -3408,8 +3394,8 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3426,15 +3412,15 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3494,12 +3480,12 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3528,22 +3514,18 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-LABEL: trunc_and_v16i16_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3792,8 +3774,8 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -3801,8 +3783,8 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -3816,15 +3798,15 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; 
AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -3878,12 +3860,12 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3911,22 +3893,18 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; ; AVX1-LABEL: trunc_and_const_v16i16_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v16i16_v16i8: ; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -4203,8 +4181,8 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -4212,8 +4190,8 @@ define <16 x i8> 
@trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4230,15 +4208,15 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4298,12 +4276,12 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4332,22 +4310,18 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-LABEL: trunc_xor_v16i16_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4596,8 +4570,8 @@ define 
<16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -4605,8 +4579,8 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -4620,15 +4594,15 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -4682,12 +4656,12 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -4715,22 +4689,18 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -5007,8 +4977,8 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -5016,8 +4986,8 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -5034,15 +5004,15 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -5102,12 +5072,12 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq 
{{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5136,22 +5106,18 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind ; AVX1-LABEL: trunc_or_v16i16_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5400,8 +5366,8 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -5409,8 +5375,8 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -5424,15 +5390,15 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -5486,12 +5452,12 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -5519,22 +5485,18 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; ; AVX1-LABEL: trunc_or_const_v16i16_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v16i16_v16i8: ; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index 97c4f3d..c8ccab0 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -267,8 +267,8 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ 
-276,8 +276,8 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -294,15 +294,15 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -366,12 +366,12 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -403,21 +403,19 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -715,8 +713,8 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -724,8 +722,8 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -739,15 +737,15 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -801,12 +799,12 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -834,22 +832,18 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 
; ; AVX1-LABEL: trunc_add_const_v16i16_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v16i16_v16i8: ; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1142,8 +1136,8 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -1151,8 +1145,8 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1169,15 +1163,15 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -1241,12 +1235,12 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1278,21 +1272,19 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_sub_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1612,8 +1604,8 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -1621,8 +1613,8 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1639,15 +1631,15 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = 
<0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -1709,12 +1701,12 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1745,21 +1737,19 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2315,8 +2305,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -2330,8 +2320,8 
@@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -2348,8 +2338,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vpmulld %xmm5, %xmm1, %xmm1 @@ -2359,8 +2349,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -2474,12 +2464,12 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2511,21 +2501,19 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2946,8 +2934,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 @@ -2957,8 +2945,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -2973,8 +2961,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 @@ -2982,8 +2970,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3072,13 +3060,13 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ 
-3109,21 +3097,19 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_const_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3399,8 +3385,8 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -3408,8 +3394,8 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3426,15 +3412,15 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; 
AVX2-FAST-NEXT: retq ; @@ -3494,12 +3480,12 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3528,22 +3514,18 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-LABEL: trunc_and_v16i16_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3792,8 +3774,8 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -3801,8 +3783,8 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -3816,15 +3798,15 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb 
%ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -3878,12 +3860,12 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3911,22 +3893,18 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; ; AVX1-LABEL: trunc_and_const_v16i16_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v16i16_v16i8: ; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -4203,8 +4181,8 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: 
vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -4212,8 +4190,8 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4230,15 +4208,15 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4298,12 +4276,12 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4332,22 +4310,18 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX1-LABEL: trunc_xor_v16i16_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} 
xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4596,8 +4570,8 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -4605,8 +4579,8 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -4620,15 +4594,15 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -4682,12 +4656,12 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -4715,22 +4689,18 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> 
%a0) nounwind { ; ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -5007,8 +4977,8 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -5016,8 +4986,8 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -5034,15 +5004,15 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -5102,12 +5072,12 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5136,22 +5106,18 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind ; AVX1-LABEL: trunc_or_v16i16_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5400,8 +5366,8 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -5409,8 +5375,8 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -5424,15 +5390,15 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: 
vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -5486,12 +5452,12 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -5519,22 +5485,18 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; ; AVX1-LABEL: trunc_or_const_v16i16_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v16i16_v16i8: ; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-trunc-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-widen.ll index a15b2aa..4973937 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-widen.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-widen.ll @@ -916,12 +916,12 @@ define void @trunc16i32_16i8(<16 x i32> %a) { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; 
AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1098,22 +1098,18 @@ define void @trunc16i16_16i8(<16 x i16> %a) { ; ; AVX1-LABEL: trunc16i16_16i8: ; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc16i16_16i8: ; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1323,15 +1319,13 @@ define void @trunc32i16_32i8(<32 x i16> %a) { ; ; AVX1-LABEL: trunc32i16_32i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -1339,15 +1333,13 @@ define void @trunc32i16_32i8(<32 x i16> %a) { ; ; AVX2-LABEL: trunc32i16_32i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index fb52636..d80e9b0 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -926,12 +926,12 @@ define void 
@trunc16i32_16i8(<16 x i32> %a) { ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1108,22 +1108,18 @@ define void @trunc16i16_16i8(<16 x i16> %a) { ; ; AVX1-LABEL: trunc16i16_16i8: ; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc16i16_16i8: ; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1333,15 +1329,13 @@ define void @trunc32i16_32i8(<32 x i16> %a) { ; ; AVX1-LABEL: trunc32i16_32i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -1349,15 +1343,13 @@ define void @trunc32i16_32i8(<32 x i16> %a) { ; ; AVX2-LABEL: trunc32i16_32i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: 
vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper