; AVX1-NEXT: vmovdqu (%rcx), %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,u,u,4,5,6,7,u,u,8,9,10,11]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,0,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX1-NEXT: vmovdqu %xmm0, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm2, (%rdi)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,u,u,4,5,6,7,u,u,8,9,10,11]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,0,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: vmovdqu %xmm4, 32(%rdi)
+; AVX1-NEXT: vmovdqu %xmm2, 16(%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave_24i16_in:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm3
+; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm3
; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vmovaps %ymm2, 64(%rdi)
-; AVX1-NEXT: vmovaps %ymm1, 32(%rdi)
-; AVX1-NEXT: vmovaps %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa %xmm4, 80(%rdi)
+; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi)
+; AVX1-NEXT: vmovdqa %xmm5, 32(%rdi)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat3_128:
;
; XOP-LABEL: splat3_128:
; XOP: # %bb.0:
-; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; XOP-NEXT: vpalignr {{.*#+}} xmm8 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
-; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
-; XOP-NEXT: vpperm %xmm7, %xmm5, %xmm3, %xmm3
-; XOP-NEXT: vpperm %xmm7, %xmm0, %xmm6, %xmm0
-; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; XOP-NEXT: vpperm %xmm7, %xmm6, %xmm5, %xmm3
-; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; XOP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; XOP-NEXT: vpshufb %xmm5, %xmm4, %xmm3
-; XOP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOP-NEXT: vmovaps %ymm1, 32(%rdi)
-; XOP-NEXT: vmovaps %ymm0, (%rdi)
-; XOP-NEXT: vmovaps %ymm2, 64(%rdi)
-; XOP-NEXT: vzeroupper
+; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
+; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
+; XOP-NEXT: vpperm %xmm3, %xmm4, %xmm2, %xmm2
+; XOP-NEXT: vpperm %xmm3, %xmm0, %xmm7, %xmm0
+; XOP-NEXT: vpperm %xmm3, %xmm7, %xmm4, %xmm4
+; XOP-NEXT: vpperm %xmm3, %xmm1, %xmm6, %xmm1
+; XOP-NEXT: vpperm %xmm3, %xmm5, %xmm8, %xmm7
+; XOP-NEXT: vpperm %xmm3, %xmm6, %xmm5, %xmm3
+; XOP-NEXT: vmovdqa %xmm3, 80(%rdi)
+; XOP-NEXT: vmovdqa %xmm7, 64(%rdi)
+; XOP-NEXT: vmovdqa %xmm1, 48(%rdi)
+; XOP-NEXT: vmovdqa %xmm4, 32(%rdi)
+; XOP-NEXT: vmovdqa %xmm2, 16(%rdi)
+; XOP-NEXT: vmovdqa %xmm0, (%rdi)
; XOP-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm3
+; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm3
; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vmovaps %ymm2, 64(%rdi)
-; AVX1-NEXT: vmovaps %ymm1, 32(%rdi)
-; AVX1-NEXT: vmovaps %ymm0, (%rdi)
+; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa %xmm4, 80(%rdi)
+; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi)
+; AVX1-NEXT: vmovdqa %xmm5, 32(%rdi)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
;
; XOP-LABEL: splat3_256:
; XOP: # %bb.0:
-; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
-; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
-; XOP-NEXT: vpperm %xmm7, %xmm5, %xmm3, %xmm3
-; XOP-NEXT: vpperm %xmm7, %xmm0, %xmm6, %xmm0
-; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; XOP-NEXT: vpperm %xmm7, %xmm6, %xmm5, %xmm3
-; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; XOP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; XOP-NEXT: vpshufb %xmm5, %xmm4, %xmm3
-; XOP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOP-NEXT: vmovaps %ymm1, 32(%rdi)
-; XOP-NEXT: vmovaps %ymm0, (%rdi)
-; XOP-NEXT: vmovaps %ymm2, 64(%rdi)
+; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; XOP-NEXT: vpalignr {{.*#+}} xmm8 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
+; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
+; XOP-NEXT: vpperm %xmm3, %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpperm %xmm3, %xmm0, %xmm7, %xmm0
+; XOP-NEXT: vpperm %xmm3, %xmm7, %xmm4, %xmm4
+; XOP-NEXT: vpperm %xmm3, %xmm2, %xmm6, %xmm2
+; XOP-NEXT: vpperm %xmm3, %xmm5, %xmm8, %xmm7
+; XOP-NEXT: vpperm %xmm3, %xmm6, %xmm5, %xmm3
+; XOP-NEXT: vmovdqa %xmm3, 80(%rdi)
+; XOP-NEXT: vmovdqa %xmm7, 64(%rdi)
+; XOP-NEXT: vmovdqa %xmm2, 48(%rdi)
+; XOP-NEXT: vmovdqa %xmm4, 32(%rdi)
+; XOP-NEXT: vmovdqa %xmm1, 16(%rdi)
+; XOP-NEXT: vmovdqa %xmm0, (%rdi)
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
%1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
-; AVX1-NEXT: vmovups %ymm2, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm1, (%rdi)
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqu %xmm3, 80(%rdi)
+; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi)
+; AVX1-NEXT: vmovdqu %xmm5, 48(%rdi)
+; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi)
+; AVX1-NEXT: vmovdqu %xmm6, 16(%rdi)
+; AVX1-NEXT: vmovdqu %xmm1, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
-; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
; AVX1-NEXT: # xmm6 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm1
-; AVX1-NEXT: vpshufb %xmm7, %xmm11, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm4
-; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vpshufb %xmm7, %xmm9, %xmm4
-; AVX1-NEXT: vpshufb %xmm7, %xmm10, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm5
-; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm6
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; AVX1-NEXT: vmovups %ymm5, 160(%rdi)
-; AVX1-NEXT: vmovups %ymm4, 128(%rdi)
-; AVX1-NEXT: vmovups %ymm3, 96(%rdi)
-; AVX1-NEXT: vmovups %ymm1, 64(%rdi)
-; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm2, (%rdi)
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm14
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm12
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm11
+; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm9
+; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm3
+; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm4
+; AVX1-NEXT: vmovdqu %xmm1, 80(%rdi)
+; AVX1-NEXT: vmovdqu %xmm7, 64(%rdi)
+; AVX1-NEXT: vmovdqu %xmm6, 16(%rdi)
+; AVX1-NEXT: vmovdqu %xmm14, (%rdi)
+; AVX1-NEXT: vmovdqu %xmm0, 48(%rdi)
+; AVX1-NEXT: vmovdqu %xmm12, 32(%rdi)
+; AVX1-NEXT: vmovdqu %xmm4, 176(%rdi)
+; AVX1-NEXT: vmovdqu %xmm3, 160(%rdi)
+; AVX1-NEXT: vmovdqu %xmm5, 112(%rdi)
+; AVX1-NEXT: vmovdqu %xmm11, 96(%rdi)
+; AVX1-NEXT: vmovdqu %xmm2, 144(%rdi)
+; AVX1-NEXT: vmovdqu %xmm9, 128(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;