From c7132031a2a10b2d6fa2333f7e1d397b16014563 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 20 Jul 2018 16:20:45 +0000
Subject: [PATCH] [X86][SSE] Use SplitOpsAndApply to improve HADD/HSUB lowering

Improve AVX1 256-bit vector HADD/HSUB matching by using SplitOpsAndApply
to split into 128-bit instructions.

llvm-svn: 337568
---
 llvm/lib/Target/X86/X86ISelLowering.cpp    | 28 ++++++++++++-----
 llvm/test/CodeGen/X86/haddsub-shuf.ll      | 50 ++++++++++--------------------
 llvm/test/CodeGen/X86/madd.ll              | 18 ++++-------
 llvm/test/CodeGen/X86/sad.ll               |  9 ++----
 llvm/test/CodeGen/X86/vector-reduce-add.ll | 18 ++++-------
 5 files changed, 51 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3e57c3c..23687b6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38871,10 +38871,16 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
     return MAdd;
 
   // Try to synthesize horizontal adds from adds of shuffles.
-  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
-       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
-      isHorizontalBinOp(Op0, Op1, true))
-    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
+  if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
+       VT == MVT::v8i32) &&
+      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) {
+    auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+                          ArrayRef<SDValue> Ops) {
+      return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
+    };
+    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
+                            HADDBuilder);
+  }
 
   if (SDValue V = combineIncDecVector(N, DAG))
     return V;
@@ -38996,10 +39002,16 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
 
   // Try to synthesize horizontal subs from subs of shuffles.
   EVT VT = N->getValueType(0);
-  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
-       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
-      isHorizontalBinOp(Op0, Op1, false))
-    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
+  if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
+       VT == MVT::v8i32) &&
+      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) {
+    auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+                          ArrayRef<SDValue> Ops) {
+      return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
+    };
+    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
+                            HSUBBuilder);
+  }
 
   if (SDValue V = combineIncDecVector(N, DAG))
     return V;
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll
index 1a602d1..ac5d5a7 100644
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -262,13 +262,10 @@ define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
 ;
 ; AVX1-LABEL: hadd_v8i32b:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT: retq
 ;
@@ -339,13 +336,10 @@ define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
 ;
 ; AVX1-LABEL: hsub_v8i32b:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT: retq
 ;
@@ -416,16 +410,10 @@ define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
 ;
 ; AVX1-LABEL: hadd_v16i16b:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm2
-; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT: retq
 ;
@@ -496,16 +484,10 @@ define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
 ;
 ; AVX1-LABEL: hsub_v16i16b:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm2
-; AVX1-NEXT: vpsubw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index ab59f02..d5aace3 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -51,8 +51,7 @@ define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -188,8 +187,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -338,8 +336,7 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -486,8 +483,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -666,8 +662,7 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -894,8 +889,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 9a3f6c7..4917cfb 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -56,8 +56,7 @@ define i32 @sad_16i8() nounwind {
 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -318,8 +317,7 @@ define i32 @sad_32i8() nounwind {
 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -860,8 +858,7 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX1-NEXT: vpaddd %xmm0, %xmm14, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: addq $24, %rsp
 ; AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll
index 21af6fc..7a5e5f3 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll
@@ -257,8 +257,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -321,8 +320,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -403,8 +401,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -530,8 +527,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT: vzeroupper
@@ -610,8 +606,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT: vzeroupper
@@ -708,8 +703,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT: vzeroupper
-- 
2.7.4
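For readers who want to see what the split computes, below is a small standalone C++ sketch (not part of the patch) modelling the v8i32 case: a 256-bit horizontal add is emulated on AVX1 by applying a PHADDD-style pairwise add to each 128-bit half and recombining, which is the vphaddd / vextractf128 / vinsertf128 sequence the updated test checks expect. The helper names phaddd128 and splitAndApply are invented for this illustration; the real code uses the X86 backend's SplitOpsAndApply helper and the HADDBuilder/HSUBBuilder lambdas shown in the diff above.

// Standalone illustration of the AVX1 256-bit HADD lowering for v8i32.
// Helper names are hypothetical; this only models the vector semantics.
#include <array>
#include <cstdint>
#include <cstdio>

using V4 = std::array<uint32_t, 4>;  // one 128-bit lane of i32 elements
using V8 = std::array<uint32_t, 8>;  // a 256-bit vector of i32 elements

// Pairwise horizontal add of two 128-bit inputs, matching PHADDD semantics:
// result = { a0+a1, a2+a3, b0+b1, b2+b3 }.
static V4 phaddd128(const V4 &a, const V4 &b) {
  return {a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]};
}

// Split a 256-bit op into two 128-bit ops and reassemble, mirroring what
// SplitOpsAndApply does with the HADD builder in the patch.
static V8 splitAndApply(const V8 &op0, const V8 &op1) {
  V4 lo0{op0[0], op0[1], op0[2], op0[3]}, hi0{op0[4], op0[5], op0[6], op0[7]};
  V4 lo1{op1[0], op1[1], op1[2], op1[3]}, hi1{op1[4], op1[5], op1[6], op1[7]};
  V4 lo = phaddd128(lo0, lo1);  // vphaddd on the low 128-bit halves
  V4 hi = phaddd128(hi0, hi1);  // vphaddd on the high 128-bit halves
  return {lo[0], lo[1], lo[2], lo[3], hi[0], hi[1], hi[2], hi[3]};  // vinsertf128
}

int main() {
  V8 a{1, 2, 3, 4, 5, 6, 7, 8};
  V8 r = splitAndApply(a, a);  // hadd_v8i32b-style: both operands are %a
  for (uint32_t v : r)
    std::printf("%u ", v);     // prints: 3 7 3 7 11 15 11 15
  std::printf("\n");
  return 0;
}

Compiled with a standard C++ compiler, this prints 3 7 3 7 11 15 11 15, matching the per-128-bit-lane behaviour of X86ISD::HADD for v8i32.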