From 56a8a5c9fe24d6d8809313633a4a5786d4fd29c6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 4 Jul 2020 15:27:55 +0100 Subject: [PATCH] [DAG] matchBinOpReduction - match subvector reduction patterns beyond a matched shufflevector reduction Currently matchBinOpReduction only handles shufflevector reduction patterns, but in many cases these only occur in the final stages of a reduction, once we're down to legal vector widths. Before this it's likely that we are performing reductions using subvector extractions to repeatedly split the source vector in half and perform the binop on the halves. Assuming we've found a non-partial reduction, this patch continues looking for subvector reductions as far as it can beyond the last shufflevector. Fixes PR37890 --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 22 +++++++++ llvm/test/CodeGen/X86/horizontal-reduce-add.ll | 59 ++++++++++--------------- llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll | 31 +++++-------- 3 files changed, 56 insertions(+), 56 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 202cacd..732aea8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -9381,6 +9381,28 @@ SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, PrevOp = Op; } + // Handle subvector reductions, which tend to appear after the shuffle + // reduction stages. 
+ while (Op.getOpcode() == CandidateBinOp) { + unsigned NumElts = Op.getValueType().getVectorNumElements(); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + if (Op0.getOpcode() != ISD::EXTRACT_SUBVECTOR || + Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR || + Op0.getOperand(0) != Op1.getOperand(0)) + break; + SDValue Src = Op0.getOperand(0); + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + if (NumSrcElts != (2 * NumElts)) + break; + if (!(Op0.getConstantOperandAPInt(1) == 0 && + Op1.getConstantOperandAPInt(1) == NumElts) && + !(Op1.getConstantOperandAPInt(1) == 0 && + Op0.getConstantOperandAPInt(1) == NumElts)) + break; + Op = Src; + } + BinOp = (ISD::NodeType)CandidateBinOp; return Op; } diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll index b73dc92..64d8de9 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll @@ -29,10 +29,9 @@ define i32 @PR37890_v4i32(<4 x i32> %a) { ; ; SSSE3-FAST-LABEL: PR37890_v4i32: ; SSSE3-FAST: # %bb.0: -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1 -; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm1 -; SSSE3-FAST-NEXT: movd %xmm1, %eax +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: movd %xmm0, %eax ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: PR37890_v4i32: @@ -46,8 +45,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) { ; ; AVX1-FAST-LABEL: PR37890_v4i32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -98,10 +96,8 @@ define i16 @PR37890_v8i16(<8 x i16> %a) { ; ; SSSE3-FAST-LABEL: PR37890_v8i16: ; SSSE3-FAST: # %bb.0: -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] -; SSSE3-FAST-NEXT: paddw %xmm0, %xmm1 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSSE3-FAST-NEXT: paddw %xmm1, %xmm0 +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 ; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 ; SSSE3-FAST-NEXT: movd %xmm0, %eax ; SSSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -121,10 +117,8 @@ define i16 @PR37890_v8i16(<8 x i16> %a) { ; ; AVX1-FAST-LABEL: PR37890_v8i16: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -177,10 +171,9 @@ define i32 @PR37890_v8i32(<8 x i32> %a) { ; SSSE3-FAST-LABEL: PR37890_v8i32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1 -; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm1 -; SSSE3-FAST-NEXT: movd %xmm1, %eax +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: movd %xmm0, %eax ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: PR37890_v8i32: @@ -198,9 +191,8 @@ define i32 @PR37890_v8i32(<8 x i32> %a) { ; AVX1-FAST-LABEL: PR37890_v8i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper @@ -261,10 +253,8 @@ define i16 
@PR37890_v16i16(<16 x i16> %a) { ; SSSE3-FAST-LABEL: PR37890_v16i16: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: paddw %xmm1, %xmm0 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-FAST-NEXT: paddw %xmm0, %xmm1 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSSE3-FAST-NEXT: paddw %xmm1, %xmm0 +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 ; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 ; SSSE3-FAST-NEXT: movd %xmm0, %eax ; SSSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -288,11 +278,9 @@ define i16 @PR37890_v16i16(<16 x i16> %a) { ; AVX1-FAST-LABEL: PR37890_v16i16: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -381,13 +369,12 @@ define i32 @PR37890_v16i32(<16 x i32> %a) { ; ; AVX1-FAST-LABEL: PR37890_v16i32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; 
AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll index 13cf83b..ff635b9 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll @@ -31,11 +31,8 @@ define float @PR37890_v4f32(<4 x float> %a) { ; ; SSSE3-FAST-LABEL: PR37890_v4f32: ; SSSE3-FAST: # %bb.0: -; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSSE3-FAST-NEXT: addps %xmm0, %xmm1 -; SSSE3-FAST-NEXT: haddps %xmm1, %xmm1 -; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: PR37890_v4f32: @@ -48,8 +45,7 @@ define float @PR37890_v4f32(<4 x float> %a) { ; ; AVX1-FAST-LABEL: PR37890_v4f32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: retq ; @@ -106,7 +102,7 @@ define double @PR37890_v4f64(<4 x double> %a) { ; AVX1-FAST-LABEL: PR37890_v4f64: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0 ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -154,11 +150,8 @@ define float @PR37890_v8f32(<8 x float> %a) { ; SSSE3-FAST-LABEL: PR37890_v8f32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 -; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSSE3-FAST-NEXT: addps %xmm0, %xmm1 -; SSSE3-FAST-NEXT: haddps %xmm1, %xmm1 -; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSSE3-FAST-NEXT: retq ; ; 
AVX1-SLOW-LABEL: PR37890_v8f32: @@ -175,9 +168,8 @@ define float @PR37890_v8f32(<8 x float> %a) { ; AVX1-FAST-LABEL: PR37890_v8f32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -248,7 +240,7 @@ define double @PR37890_v8f64(<8 x double> %a) { ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0 ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -327,9 +319,8 @@ define float @PR37890_v16f32(<16 x float> %a) { ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq -- 2.7.4