;
; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm4[0,2],ymm0[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vxorps %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0