From db0ed7d724f5b1d6ac7f8617fe05a4778d1062c8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 26 Feb 2018 18:17:25 +0000
Subject: [PATCH] [X86][AVX] createPSADBW - support 256-bit cases on AVX1 via
 SplitBinaryOpsAndApply

llvm-svn: 326104
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  26 +++---
 llvm/test/CodeGen/X86/sad.ll            | 135 ++++++++------------------
 2 files changed, 47 insertions(+), 114 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4738214..8fd39156 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31029,8 +31029,8 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
 // to these zexts.
 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
-                            const SDValue &Zext1, const SDLoc &DL) {
-
+                            const SDValue &Zext1, const SDLoc &DL,
+                            const X86Subtarget &Subtarget) {
   // Find the appropriate width for the PSADBW.
   EVT InVT = Zext0.getOperand(0).getValueType();
   unsigned RegSize = std::max(128u, InVT.getSizeInBits());
@@ -31045,9 +31045,15 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
   Ops[0] = Zext1.getOperand(0);
   SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
 
-  // Actually build the SAD
+  // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
+  auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
+                          SDValue Op1) {
+    MVT VT = MVT::getVectorVT(MVT::i64, Op0.getValueSizeInBits() / 64);
+    return DAG.getNode(X86ISD::PSADBW, DL, VT, Op0, Op1);
+  };
   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
-  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
+  return SplitBinaryOpsAndApply(DAG, Subtarget, DL, SadVT, SadOp0, SadOp1,
+                                PSADBWBuilder);
 }
 
 // Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
@@ -31216,10 +31222,10 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
   unsigned RegSize = 128;
   if (Subtarget.useBWIRegs())
     RegSize = 512;
-  else if (Subtarget.hasAVX2())
+  else if (Subtarget.hasAVX())
     RegSize = 256;
 
-  // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
+  // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
   // TODO: We should be able to handle larger vectors by splitting them before
   // feeding them into several SADs, and then reducing over those.
   if (RegSize / VT.getVectorNumElements() < 8)
@@ -31254,7 +31260,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
 
   // Create the SAD instruction.
   SDLoc DL(Extract);
-  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
+  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
 
   // If the original vector was wider than 8 elements, sum over the results
   // in the SAD vector.
@@ -37404,10 +37410,10 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
   unsigned RegSize = 128;
   if (Subtarget.useBWIRegs())
     RegSize = 512;
-  else if (Subtarget.hasAVX2())
+  else if (Subtarget.hasAVX())
     RegSize = 256;
 
-  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+  // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
   // TODO: We should be able to handle larger vectors by splitting them before
   // feeding them into several SADs, and then reducing over those.
   if (VT.getSizeInBits() / 4 > RegSize)
@@ -37433,7 +37439,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
   // reduction. Note that the number of elements of the result of SAD is less
   // than the number of elements of its input. Therefore, we could only update
   // part of elements in the reduction vector.
-  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
+  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
 
   // The output of PSADBW is a vector of i64.
   // We need to turn the vector of i64 into a vector of i32.
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index f091cf93..e705f3d 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -286,76 +286,36 @@ define i32 @sad_32i8() nounwind {
 ;
 ; AVX1-LABEL: sad_32i8:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    movq $-1024, %rax # imm = 0xFC00
-; AVX1-NEXT:    vpxor %xmm10, %xmm10, %xmm10
-; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
-; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB1_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm11, %xmm7
-; AVX1-NEXT:    vpabsd %xmm3, %xmm11
-; AVX1-NEXT:    vpabsd %xmm4, %xmm4
-; AVX1-NEXT:    vpabsd %xmm5, %xmm5
-; AVX1-NEXT:    vpabsd %xmm6, %xmm6
-; AVX1-NEXT:    vpabsd %xmm0, %xmm0
-; AVX1-NEXT:    vpabsd %xmm1, %xmm1
-; AVX1-NEXT:    vpabsd %xmm2, %xmm2
-; AVX1-NEXT:    vpabsd %xmm7, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm7, %xmm3
-; AVX1-NEXT:    vpaddd %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm10, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm10
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm0
-; AVX1-NEXT:    vpaddd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpaddd %xmm8, %xmm5, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm0
-; AVX1-NEXT:    vpaddd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpaddd %xmm12, %xmm11, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm12
+; AVX1-NEXT:    vmovdqa a+1024(%rax), %ymm2
+; AVX1-NEXT:    vmovdqa b+1024(%rax), %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
 ; AVX1-NEXT:    addq $4, %rax
 ; AVX1-NEXT:    jne .LBB1_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
-; AVX1-NEXT:    vpaddd %xmm12, %xmm10, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm4
+; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm0, %xmm9, %xmm0
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm0, %xmm8, %xmm0
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
@@ -1381,50 +1341,17 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
 ;
 ; AVX1-LABEL: sad_nonloop_32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vpabsd %xmm0, %xmm0
-; AVX1-NEXT:    vpabsd %xmm1, %xmm1
-; AVX1-NEXT:    vpabsd %xmm2, %xmm2
-; AVX1-NEXT:    vpabsd %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpabsd %xmm4, %xmm1
-; AVX1-NEXT:    vpabsd %xmm5, %xmm2
-; AVX1-NEXT:    vpabsd %xmm6, %xmm3
-; AVX1-NEXT:    vpabsd %xmm7, %xmm4
-; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqu (%rdx), %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsadbw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: sad_nonloop_32i8:
-- 
2.7.4
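
For reference (not part of the patch): a minimal standalone C++ sketch of the code pattern the AVX1 path now emits for a 32-byte SAD, written with x86 intrinsics. AVX1 has no 256-bit integer PSADBW, so the 256-bit op is split into two 128-bit PSADBWs whose partial sums are then added, mirroring the vpsadbw/vpaddq sequence in the sad_nonloop_32i8 checks above. The function name sad32_avx1_sketch is illustrative, not from the LLVM tree; assumes x86-64 and compilation with -mavx.

// Hypothetical sketch, not LLVM code: the split 128-bit PSADBW pattern.
#include <immintrin.h>
#include <stdint.h>

// Sum of absolute differences of two 32-byte buffers.
static inline uint64_t sad32_avx1_sketch(const uint8_t *a, const uint8_t *b) {
  __m256i va = _mm256_loadu_si256((const __m256i *)a); // vmovdqu
  __m256i vb = _mm256_loadu_si256((const __m256i *)b); // vmovdqu
  // No 256-bit integer PSADBW on AVX1: apply it per 128-bit half.
  __m128i lo = _mm_sad_epu8(_mm256_castsi256_si128(va),
                            _mm256_castsi256_si128(vb));    // vpsadbw (low)
  __m128i hi = _mm_sad_epu8(_mm256_extractf128_si256(va, 1),
                            _mm256_extractf128_si256(vb, 1)); // vpsadbw (high)
  __m128i sum = _mm_add_epi64(lo, hi); // vpaddq
  // Fold the two remaining i64 partial sums: swap qwords and add.
  sum = _mm_add_epi64(sum, _mm_shuffle_epi32(sum, 0x4E)); // vpshufd + vpaddq
  return (uint64_t)_mm_cvtsi128_si64(sum); // vmovq
}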