if (Subtarget.hasVBMI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
+ // With XOP, one VPPERM replaces the PSHUFB+PSHUFB+POR sequence.
+ if (Subtarget.hasXOP()) {
+ SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
+ }
+
// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
// PALIGNR will be cheaper than the second PSHUFB+OR.
if (SDValue V = lowerShuffleAsByteRotateAndPermute(
;
; XOP-LABEL: interleave_24i8_out:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqu (%rdi), %xmm0
-; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; XOP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpor %xmm3, %xmm4, %xmm3
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOP-NEXT: vmovdqu (%rdi), %xmm1
+; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,3,6,9,12,15],xmm0[2,5],xmm1[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u]
; XOP-NEXT: vmovq %xmm2, (%rsi)
; XOP-NEXT: vmovq %xmm3, (%rdx)
; XOP-NEXT: vmovq %xmm0, (%rcx)
; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; XOP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
-; XOP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
-; XOP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,8],xmm1[0],xmm0[1,9],xmm1[1],xmm0[2,10],xmm1[2],xmm0[3,11],xmm1[3],xmm0[4,12],xmm1[4],xmm0[5]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[13],xmm1[5],xmm0[6,14],xmm1[6],xmm0[7,15],xmm1[7],xmm0[u,u,u,u,u,u,u,u]
; XOP-NEXT: vmovq %xmm0, 16(%rdi)
; XOP-NEXT: vmovdqu %xmm2, (%rdi)
; XOP-NEXT: retq
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
-; AVX1OR2: # %bb.0: # %entry
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
-; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
; AVX512VLBW: # %bb.0: # %entry
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = <u,10,2,7,22,14,7,2,18,3,1,14,18,9,11,0>
; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
+;
+; XOP-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; XOP: # %bb.0: # %entry
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[u,10,2,7],xmm1[6],xmm0[14,7,2],xmm1[2],xmm0[3,1,14],xmm1[2],xmm0[9,11,0]
+; XOP-NEXT: retq
entry:
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
;
; XOPAVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm3 = xmm2[4,3,u,3,u,u,u,u,u,u,u],xmm0[7],xmm2[u,u,u,u]
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,4,u,1,6],zero,zero,xmm4[0],zero,xmm4[11,u],zero,zero,zero,zero
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
-; XOPAVX1-NEXT: vpor %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm3 = xmm1[u,u],xmm2[4],xmm1[u],xmm2[1,6],xmm1[5,0],xmm2[0],xmm1[10],xmm2[11],xmm1[u,4,2,4,7]
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm5 = xmm4[4,3,u,3,u,u,u,u,u,u,u],xmm0[7],xmm4[u,u,u,u]
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
-; XOPAVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[u,u],zero,zero,xmm4[12],zero,xmm4[u,u,u],zero,zero,xmm4[u,0,3]
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
-; XOPAVX1-NEXT: vpor %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[u,u,u,u,1,6,13,u,u],zero,xmm2[u,u]
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
-; XOPAVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],xmm2[12],xmm1[8,u,u,u,12,1,u],xmm2[0,3]
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],xmm4[1,6,13],xmm0[u,u,12,u,u]
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255]
; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; XOP-NEXT: vmovdqu (%rdi), %xmm0
; XOP-NEXT: vmovdqu 16(%rdi), %xmm1
; XOP-NEXT: vmovdqu 32(%rdi), %xmm2
-; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[8,9,11,12,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[1,2,4,5,7,8,10,11,13,14]
-; XOP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,2,3,5,6]
-; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1,3,4,6,7,9,10,12,13,15],xmm1[0,2,3,5,6]
+; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[8,9,11,12,14,15],xmm2[1,2,4,5,7,8,10,11,13,14]
+; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX2-LABEL: foo: