}
define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source2:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSSE3-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd_single_source2:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
;
-; AVX-LABEL: phaddd_single_source2:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-NEXT: retq
+; SSSE3-FAST-LABEL: phaddd_single_source2:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source2:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd_single_source2:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX-FAST-NEXT: retq
+;
+; AVX2-SHUF-LABEL: phaddd_single_source2:
+; AVX2-SHUF: # %bb.0:
+; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; AVX2-SHUF-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-SHUF-NEXT: retq
%l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
%r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
%add = add <4 x i32> %l, %r
ret <4 x i32> %add
}
define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source2:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
-; SSSE3-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddw_single_source2:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddw_single_source2:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
+; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddw_single_source2:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
+; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddw_single_source2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; AVX-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddw_single_source2:
; AVX2-SHUF: # %bb.0:
-; AVX2-SHUF-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX2-SHUF-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1]
+; AVX2-SHUF-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
+; AVX2-SHUF-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX2-SHUF-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-SHUF-NEXT: retq
%l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
%r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
%add = add <8 x i16> %l, %r
ret <8 x i16> %add
}
define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
; AVX1-LABEL: interleaved_load_vf8_i8_stride4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[1,0,3,2,4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,u,1,u,7,u,5,u,1,u,5,u,0,u,1,u>
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_load_vf8_i8_stride4:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[1,0,3,2,4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,u,1,u,7,u,5,u,1,u,5,u,0,u,1,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_load_vf8_i8_stride4:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm4
-; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm4
-; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,33,32,35,34]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm3
+; AVX512-NEXT: vpermi2w %zmm4, %zmm3, %zmm2
+; AVX512-NEXT: vprold $8, %zmm4, %zmm4
+; AVX512-NEXT: vprold $8, %zmm3, %zmm3
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX512-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vmovdqu (%rdi), %ymm1
+; AVX512-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq