From: Sanjay Patel
Date: Wed, 10 Oct 2018 13:39:59 +0000 (+0000)
Subject: [x86] allow single source horizontal op matching (PR39195)
X-Git-Tag: llvmorg-8.0.0-rc1~6885
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=6cca8af2270be8bc5494b44bb8856af591d0385b;p=platform%2Fupstream%2Fllvm.git

[x86] allow single source horizontal op matching (PR39195)

This is intended to restore horizontal codegen to what it looked like
before the IR demanded-elements analysis was improved in rL343727.

As noted in PR39195 (https://bugs.llvm.org/show_bug.cgi?id=39195),
horizontal ops can be worse for performance than a shuffle + regular
binop, so I've added a TODO. Ideally, we'd solve that in a machine
instruction pass, but a quicker solution would be to add a
'HasFastHorizontalOp' feature bit to deal with it here in the DAG.

Differential Revision: https://reviews.llvm.org/D52997

llvm-svn: 344141
---
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4c18c5a..67f98d8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37026,9 +37026,13 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
         continue;
 
       // The low half of the 128-bit result must choose from A.
-      // The high half of the 128-bit result must choose from B.
+      // The high half of the 128-bit result must choose from B,
+      // unless B is undef. In that case, we are always choosing from A.
+      // TODO: Using a horizontal op on a single input is likely worse for
+      // performance on many CPUs, so this should be limited here or reversed
+      // in a later pass.
       unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
-      unsigned Src = i >= NumEltsPer64BitChunk;
+      unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
 
       // Check that successive elements are being operated on. If not, this is
       // not a horizontal operation.
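The key line is the new 'Src' computation: when B is undef, every element
of the candidate mask must select from A. Modeled in isolation, the check
behaves roughly like the sketch below. This is a simplified illustration,
not the actual LLVM helper: the name 'modelHorizontalOpMask' and the
raw-array interface are invented, the IsCommutative operand swap is
omitted, and mask entries index into the concatenation A++B (-1 = undef).

// Hypothetical standalone model of the mask check in isHorizontalBinOp.
// Each 128-bit chunk of the result takes its low half from A and its
// high half from B; when HasB is false (B is undef), every element is
// taken from A -- the single-source case this patch starts to accept.
static bool modelHorizontalOpMask(const int *LMask, const int *RMask,
                                  unsigned NumElts,
                                  unsigned NumEltsPer128BitChunk, bool HasB) {
  unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
  for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
    for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
      int LIdx = LMask[i + j], RIdx = RMask[i + j];
      if (LIdx < 0 || RIdx < 0) // Ignore undefined components.
        continue;
      // Pick the source (0 = A, 1 = B) for this result element.
      unsigned Src = HasB ? i >= NumEltsPer64BitChunk : 0;
      // Successive elements must be combined pairwise: result element i
      // of a chunk consumes source elements 2i and 2i+1.
      int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
      if (LIdx != Index || RIdx != Index + 1)
        return false;
    }
  }
  return true;
}

For v4f32, the single-source masks LMask = {0,2,0,2} and RMask = {1,3,1,3}
now pass with HasB == false; that is the shape the haddps/phaddd tests
below check for.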
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index f889bb9..20c5097 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -7210,8 +7210,7 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
 ; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -7226,8 +7225,7 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
 ; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 entry:
@@ -7407,8 +7405,7 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -7425,8 +7422,7 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
index 84decab..d7c0936 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -453,14 +453,12 @@ define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
 define <2 x double> @add_pd_003(<2 x double> %x) {
 ; SSE-LABEL: add_pd_003:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    haddpd %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_pd_003:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32>
   %add = fadd <2 x double> %l, %x
@@ -472,16 +470,12 @@ define <2 x double> @add_pd_003(<2 x double> %x) {
 define <2 x double> @add_pd_003_2(<2 x double> %x) {
 ; SSE-LABEL: add_pd_003_2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movapd %xmm0, %xmm1
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
-; SSE-NEXT:    addpd %xmm0, %xmm1
-; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    haddpd %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_pd_003_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32>
   %add = fadd <2 x double> %l, %x
@@ -491,16 +485,12 @@ define <2 x double> @add_pd_003_2(<2 x double> %x) {
 define <2 x double> @add_pd_010(<2 x double> %x) {
 ; SSE-LABEL: add_pd_010:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-NEXT:    addpd %xmm0, %xmm1
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    haddpd %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_pd_010:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX-NEXT:    retq
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32>
@@ -512,17 +502,12 @@ define <2 x double> @add_pd_010(<2 x double> %x) {
 define <4 x float> @add_ps_007(<4 x float> %x) {
 ; SSE-LABEL: add_ps_007:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    haddps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_007:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
@@ -533,18 +518,13 @@ define <4 x float> @add_ps_007(<4 x float> %x) {
 define <4 x float> @add_ps_030(<4 x float> %x) {
 ; SSE-LABEL: add_ps_030:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    haddps %xmm0, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_030:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
@@ -557,16 +537,12 @@ define <4 x float> @add_ps_030(<4 x float> %x) {
 define <4 x float> @add_ps_007_2(<4 x float> %x) {
 ; SSE-LABEL: add_ps_007_2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    haddps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_007_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
@@ -577,14 +553,12 @@ define <4 x float> @add_ps_007_2(<4 x float> %x) {
 define <4 x float> @add_ps_008(<4 x float> %x) {
 ; SSE-LABEL: add_ps_008:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    haddps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_008:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %add = fadd <4 x float> %l, %x
@@ -594,16 +568,13 @@ define <4 x float> @add_ps_008(<4 x float> %x) {
 define <4 x float> @add_ps_017(<4 x float> %x) {
 ; SSE-LABEL: add_ps_017:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE-NEXT:    addps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    haddps %xmm0, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_017:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
@@ -615,17 +586,13 @@ define <4 x float> @add_ps_017(<4 x float> %x) {
 define <4 x float> @add_ps_018(<4 x float> %x) {
 ; SSE-LABEL: add_ps_018:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    haddps %xmm0, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_018:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
diff --git a/llvm/test/CodeGen/X86/phaddsub.ll b/llvm/test/CodeGen/X86/phaddsub.ll
index 5d7c77b..7b3f8db 100644
--- a/llvm/test/CodeGen/X86/phaddsub.ll
+++ b/llvm/test/CodeGen/X86/phaddsub.ll
@@ -286,16 +286,12 @@ define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
 define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source1:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    phaddd %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source1:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
@@ -306,17 +302,13 @@ define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
 define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source2:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    phaddd %xmm0, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
@@ -329,16 +321,12 @@ define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
 define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source3:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    phaddd %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
@@ -349,14 +337,12 @@ define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
 define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source4:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    phaddd %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source4:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %add = add <4 x i32> %l, %x
@@ -366,15 +352,13 @@ define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
 define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source5:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
-; SSSE3-NEXT:    paddd %xmm0, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSSE3-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source5:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
@@ -386,17 +370,13 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
 define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source6:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    phaddd %xmm0, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source6:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
@@ -409,17 +389,12 @@ define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
 define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
 ; SSSE3-LABEL: phaddw_single_source1:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    phaddw %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddw_single_source1:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32>
@@ -430,22 +405,14 @@ define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
 define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
 ; SSSE3-LABEL: phaddw_single_source2:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    phaddw %xmm0, %xmm0
 ; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddw_single_source2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; AVX-NEXT:    retq
@@ -459,20 +426,12 @@ define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
 define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
 ; SSSE3-LABEL: phaddw_single_source3:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    phaddw %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddw_single_source3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32>
@@ -483,16 +442,12 @@ define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
 define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
 ; SSSE3-LABEL: phaddw_single_source4:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pslld $16, %xmm1
-; SSSE3-NEXT:    paddw %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    phaddw %xmm0, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddw_single_source4:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32>
   %add = add <8 x i16> %l, %x
@@ -502,18 +457,13 @@ define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
 define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
 ; SSSE3-LABEL: phaddw_single_source6:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    phaddw %xmm0, %xmm0
 ; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddw_single_source6:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32>
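For reference, the single-source pairwise add that the phaddd tests above
now expect maps directly onto one SSSE3 intrinsic. A small sketch (not part
of the commit; the function name is invented):

#include <tmmintrin.h> // SSSE3: _mm_hadd_epi32 lowers to phaddd

// Pairwise sums of one source: { x0+x1, x2+x3, x0+x1, x2+x3 }.
// After this patch, the equivalent shuffle+paddd IR patterns lower to
// a single phaddd of the register with itself, as checked above.
static __m128i pairwise_add_single_source(__m128i x) {
  return _mm_hadd_epi32(x, x);
}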
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 2eb9362..5c0a223 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2700,21 +2700,36 @@ define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
 }
 
 define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: PR22377:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; SSE-NEXT:    addps %xmm0, %xmm1
-; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    retq
+; SSE2-LABEL: PR22377:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; SSE2-NEXT:    addps %xmm0, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: PR22377:
+; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movaps %xmm0, %xmm1
+; SSSE3-NEXT:    haddps %xmm0, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR22377:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movaps %xmm0, %xmm1
+; SSE41-NEXT:    haddps %xmm0, %xmm1
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: PR22377:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX-NEXT:    retq
 entry:
   %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
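As background for the TODO about performance, the two lowerings these
diffs toggle between can be written with SSE intrinsics as follows. This
is a sketch for illustration only (function names are invented; compile
with SSE3 enabled):

#include <pmmintrin.h> // SSE3: _mm_hadd_pd lowers to haddpd

// Shuffle + regular add: often faster on CPUs with slow horizontal ops.
static double sum2_shuffle_add(__m128d x) {
  __m128d hi = _mm_unpackhi_pd(x, x);      // { x1, x1 }
  return _mm_cvtsd_f64(_mm_add_sd(x, hi)); // x0 + x1
}

// Horizontal add of a register with itself: the form this patch creates.
static double sum2_hadd(__m128d x) {
  return _mm_cvtsd_f64(_mm_hadd_pd(x, x)); // x0 + x1
}

A future 'HasFastHorizontalOp' feature bit (per the commit message) would
let the DAG combine choose between these two forms per-CPU.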