; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[0,6,12,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm4[2,3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm3[2,8,14,u,u,u,u,u],zero,zero,zero,ymm3[4,10],zero,zero,zero,ymm3[18,24,30,u,u,u,u,u],zero,zero,zero,ymm3[20,26]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm3[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,10],zero,zero,zero,ymm3[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[20,26]
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm4[0,1]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,8,14],zero,zero,ymm4[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[18,24,30],zero,zero
; AVX2-SLOW-NEXT: vpor %ymm0, %ymm8, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[3,9,15,u,u,u,u,u],zero,zero,zero,ymm3[5,11],zero,zero,zero,ymm3[19,25,31,u,u,u,u,u],zero,zero,zero,ymm3[21,27]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,11],zero,zero,zero,ymm3[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[21,27]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,9,15],zero,zero,ymm4[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,25,31],zero,zero
; AVX2-SLOW-NEXT: vpor %ymm2, %ymm7, %ymm2
; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[2,8,14],zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[u,u,u,u,u]
; AVX2-SLOW-NEXT: vpor %xmm1, %xmm7, %xmm1
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm3[4,10,u,u,u,u,u,u],zero,zero,ymm3[0,6,12],zero,zero,zero,ymm3[20,26,u,u,u,u,u,u],zero,zero,ymm3[16,22,28]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm3[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,6,12],zero,zero,zero,ymm3[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,22,28]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,10],zero,zero,zero,ymm4[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[20,26],zero,zero,zero
; AVX2-SLOW-NEXT: vpor %ymm7, %ymm10, %ymm7
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u>
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[u,u,u,u,u]
; AVX2-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[5,11,u,u,u,u,u,u],zero,zero,ymm3[1,7,13],zero,zero,zero,ymm3[21,27,u,u,u,u,u,u],zero,zero,ymm3[17,23,29]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,7,13],zero,zero,zero,ymm3[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,23,29]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[5,11],zero,zero,zero,ymm4[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[21,27],zero,zero,zero
; AVX2-SLOW-NEXT: vpor %ymm2, %ymm7, %ymm2
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[4,10,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[4,10],zero,zero,zero,xmm5[2,8,14],zero,zero,xmm5[u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[4,10],zero,zero,zero,ymm4[u,u,u,u,u,0,6,12],zero,zero,zero,ymm4[20,26],zero,zero,zero,ymm4[u,u,u,u,u,16,22,28],zero,zero,zero
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,6,12],zero,zero,zero,ymm4[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,22,28],zero,zero,zero
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,ymm3[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,8,14],zero,zero,ymm3[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,24,30]
; AVX2-SLOW-NEXT: vpor %ymm7, %ymm13, %ymm7
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm7[5,6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[5,11,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,11],zero,zero,zero,xmm5[3,9,15],zero,zero,xmm5[u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[5,11],zero,zero,zero,ymm4[u,u,u,u,u,1,7,13],zero,zero,zero,ymm4[21,27],zero,zero,zero,ymm4[u,u,u,u,u,17,23,29],zero,zero,zero
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,7,13],zero,zero,zero,ymm4[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[17,23,29],zero,zero,zero
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,9,15],zero,zero,ymm3[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,25,31]
; AVX2-SLOW-NEXT: vpor %ymm4, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[0,6,12,u,u,u,u,u]
; AVX2-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm4[2,3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm3[2,8,14,u,u,u,u,u],zero,zero,zero,ymm3[4,10],zero,zero,zero,ymm3[18,24,30,u,u,u,u,u],zero,zero,zero,ymm3[20,26]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm3[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,10],zero,zero,zero,ymm3[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[20,26]
; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm4[0,1]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,8,14],zero,zero,ymm4[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[18,24,30],zero,zero
; AVX2-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13,u,u,u,u,u]
; AVX2-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[3,9,15,u,u,u,u,u],zero,zero,zero,ymm3[5,11],zero,zero,zero,ymm3[19,25,31,u,u,u,u,u],zero,zero,zero,ymm3[21,27]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,11],zero,zero,zero,ymm3[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[21,27]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,9,15],zero,zero,ymm4[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,25,31],zero,zero
; AVX2-FAST-NEXT: vpor %ymm2, %ymm7, %ymm2
; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[2,8,14],zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[u,u,u,u,u]
; AVX2-FAST-NEXT: vpor %xmm1, %xmm7, %xmm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm3[4,10,u,u,u,u,u,u],zero,zero,ymm3[0,6,12],zero,zero,zero,ymm3[20,26,u,u,u,u,u,u],zero,zero,ymm3[16,22,28]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm3[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,6,12],zero,zero,zero,ymm3[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,22,28]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,10],zero,zero,zero,ymm4[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[20,26],zero,zero,zero
; AVX2-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u>
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[u,u,u,u,u]
; AVX2-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[5,11,u,u,u,u,u,u],zero,zero,ymm3[1,7,13],zero,zero,zero,ymm3[21,27,u,u,u,u,u,u],zero,zero,ymm3[17,23,29]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,7,13],zero,zero,zero,ymm3[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,23,29]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[5,11],zero,zero,zero,ymm4[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[21,27],zero,zero,zero
; AVX2-FAST-NEXT: vpor %ymm2, %ymm7, %ymm2
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[4,10,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[4,10],zero,zero,zero,xmm5[2,8,14],zero,zero,xmm5[u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[4,10],zero,zero,zero,ymm4[u,u,u,u,u,0,6,12],zero,zero,zero,ymm4[20,26],zero,zero,zero,ymm4[u,u,u,u,u,16,22,28],zero,zero,zero
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,6,12],zero,zero,zero,ymm4[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,22,28],zero,zero,zero
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,ymm3[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,8,14],zero,zero,ymm3[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,24,30]
; AVX2-FAST-NEXT: vpor %ymm7, %ymm13, %ymm7
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm7[5,6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[5,11,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,11],zero,zero,zero,xmm5[3,9,15],zero,zero,xmm5[u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[5,11],zero,zero,zero,ymm4[u,u,u,u,u,1,7,13],zero,zero,zero,ymm4[21,27],zero,zero,zero,ymm4[u,u,u,u,u,17,23,29],zero,zero,zero
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,7,13],zero,zero,zero,ymm4[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[17,23,29],zero,zero,zero
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,9,15],zero,zero,ymm3[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,25,31]
; AVX2-FAST-NEXT: vpor %ymm4, %ymm3, %ymm3
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7]
; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX512-NEXT: vmovdqa 128(%rdi), %ymm9
; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm11
-; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[0,6,12],zero,zero,zero,ymm11[u,u,u,u,u,2,8,14],zero,zero,ymm11[16,22,28],zero,zero,zero,ymm11[u,u,u,u,u,18,24,30],zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,8,14],zero,zero,ymm11[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[18,24,30],zero,zero
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm2[2,3],mem[2,3]
; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm12[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[4,10],zero,zero,zero,ymm12[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,26]
; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[1,7,13],zero,zero,zero,ymm11[u,u,u,u,u,3,9,15],zero,zero,ymm11[17,23,29],zero,zero,zero,ymm11[u,u,u,u,u,19,25,31],zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[3,9,15],zero,zero,ymm11[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[19,25,31],zero,zero
; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm12[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[5,11],zero,zero,zero,ymm12[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[21,27]
; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[2,8,14],zero,zero,ymm11[u,u,u,u,u,u,4,10],zero,zero,zero,ymm11[18,24,30],zero,zero,ymm11[u,u,u,u,u,u,20,26],zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,10],zero,zero,zero,ymm11[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,26],zero,zero,zero
; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm12[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,6,12],zero,zero,zero,ymm12[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,22,28]
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: movw $9362, %ax # imm = 0x2492
; AVX512-NEXT: movl $-2097152, %eax # imm = 0xFFE00000
; AVX512-NEXT: kmovd %eax, %k2
; AVX512-NEXT: vmovdqu8 %ymm3, %ymm8 {%k2}
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[3,9,15],zero,zero,ymm11[u,u,u,u,u,u,5,11],zero,zero,zero,ymm11[19,25,31],zero,zero,ymm11[u,u,u,u,u,u,21,27],zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[5,11],zero,zero,zero,ymm11[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[21,27],zero,zero,zero
; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm12[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[1,7,13],zero,zero,zero,ymm12[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[17,23,29]
; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[4,10],zero,zero,zero,xmm13[2,8,14],zero,zero,xmm13[u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm12[0,6,12,u,u,u,u,u],zero,zero,zero,ymm12[2,8,14],zero,zero,ymm12[16,22,28,u,u,u,u,u],zero,zero,zero,ymm12[18,24,30]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm12[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,8,14],zero,zero,ymm12[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[18,24,30]
; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[0,6,12],zero,zero,zero,ymm11[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,22,28],zero,zero,zero
; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,7,13],zero,zero,zero,xmm0[5,11,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[5,11],zero,zero,zero,xmm13[3,9,15],zero,zero,xmm13[u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm0, %xmm5, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm12[1,7,13,u,u,u,u,u],zero,zero,zero,ymm12[3,9,15],zero,zero,ymm12[17,23,29,u,u,u,u,u],zero,zero,zero,ymm12[19,25,31]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm12[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[3,9,15],zero,zero,ymm12[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[19,25,31]
; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm11[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[1,7,13],zero,zero,zero,ymm11[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[17,23,29],zero,zero,zero
; AVX512-NEXT: vpor %ymm5, %ymm6, %ymm5
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,ymm0[u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vmovq %xmm1, 16(%rax)
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,ymm0[u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vmovq %xmm1, 16(%rax)
; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm2
; AVX2-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,ymm3[20,28],zero,zero,zero,zero
; AVX2-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero,zero,zero,ymm4[21,29]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[5,13,u,u],zero,zero,ymm4[6,14,u,u],zero,zero,ymm4[7,15,u,u],zero,zero,ymm4[16,24,u,u],zero,zero,ymm4[17,25,u,u],zero,zero
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[5,13],zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,ymm4[7,15],zero,zero,zero,zero,ymm4[16,24],zero,zero,zero,zero,ymm4[17,25],zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,1,3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[5,13],zero,zero,zero,zero,ymm6[6,14],zero,zero,zero,zero,ymm6[7,15],zero,zero,zero,zero,ymm6[16,24],zero,zero,zero,zero,ymm6[17,25],zero,zero,zero,zero,ymm6[18,26]
; AVX2-SLOW-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u,18,26,u,u]
; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11,u,u],zero,zero,ymm0[4,12,u,u],zero,zero,ymm0[5,13,u,u],zero,zero,ymm0[22,30,u,u],zero,zero,ymm0[23,31,u,u],zero,zero
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11],zero,zero,zero,zero,ymm0[4,12],zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,10],zero,zero,zero,zero,ymm2[3,11],zero,zero,zero,zero,ymm2[4,12],zero,zero,zero,zero,ymm2[21,29],zero,zero,zero,zero,ymm2[22,30],zero,zero,zero,zero,ymm2[23,31]
; AVX2-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2
; AVX2-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,ymm3[20,28],zero,zero,zero,zero
; AVX2-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero,zero,zero,ymm4[21,29]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[5,13,u,u],zero,zero,ymm4[6,14,u,u],zero,zero,ymm4[7,15,u,u],zero,zero,ymm4[16,24,u,u],zero,zero,ymm4[17,25,u,u],zero,zero
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[5,13],zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,ymm4[7,15],zero,zero,zero,zero,ymm4[16,24],zero,zero,zero,zero,ymm4[17,25],zero,zero,zero,zero
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,1,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[5,13],zero,zero,zero,zero,ymm6[6,14],zero,zero,zero,zero,ymm6[7,15],zero,zero,zero,zero,ymm6[16,24],zero,zero,zero,zero,ymm6[17,25],zero,zero,zero,zero,ymm6[18,26]
; AVX2-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u,18,26,u,u]
; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11,u,u],zero,zero,ymm0[4,12,u,u],zero,zero,ymm0[5,13,u,u],zero,zero,ymm0[22,30,u,u],zero,zero,ymm0[23,31,u,u],zero,zero
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11],zero,zero,zero,zero,ymm0[4,12],zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,10],zero,zero,zero,zero,ymm2[3,11],zero,zero,zero,zero,ymm2[4,12],zero,zero,zero,zero,ymm2[21,29],zero,zero,zero,zero,ymm2[22,30],zero,zero,zero,zero,ymm2[23,31]
; AVX2-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23]
; AVX2-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23]
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23]
; AVX2-FAST-ALL-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <3,4,5,7,5,4,1,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23]
; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
-; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13,u,u,3,3],zero,ymm1[8,u,u,u,12,1,u],zero,zero,ymm1[u,u,20,u,17,22],zero,zero,ymm1[16],zero,ymm1[27,u],zero,zero,zero,zero
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero
; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23]
; AVX512VLBW-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512VLBW-FAST-ALL: # %bb.0:
; AVX512VLBW-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <3,4,5,7,5,4,1,u>
; AVX512VLBW-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm2
-; AVX512VLBW-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13,u,u,3,3],zero,ymm1[8,u,u,u,12,1,u],zero,zero,ymm1[u,u,20,u,17,22],zero,zero,ymm1[16],zero,ymm1[27,u],zero,zero,zero,zero
+; AVX512VLBW-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero
; AVX512VLBW-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512VLBW-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23]
; AVX512VLBW-FAST-ALL-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512VLBW-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VLBW-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
; AVX512VLBW-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
-; AVX512VLBW-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13,u,u,3,3],zero,ymm1[8,u,u,u,12,1,u],zero,zero,ymm1[u,u,20,u,17,22],zero,zero,ymm1[16],zero,ymm1[27,u],zero,zero,zero,zero
+; AVX512VLBW-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero
; AVX512VLBW-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512VLBW-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23]
; AVX512VLBW-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero
; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u],zero,zero,ymm1[12],zero,ymm1[u,u,u],zero,zero,ymm1[u,0,3,u,u],zero,ymm1[u],zero,zero,ymm1[21,16],zero,ymm1[26],zero,ymm1[u,20,18,20,23]
+; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm1[12],zero,zero,zero,zero,zero,zero,zero,ymm1[0,3],zero,zero,zero,zero,zero,zero,ymm1[21,16],zero,ymm1[26],zero,zero,ymm1[20,18,20,23]
; XOPAVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]