From: Simon Pilgrim Date: Wed, 3 Apr 2019 17:17:13 +0000 (+0000) Subject: [X86][AVX] combineHorizontalPredicateResult - support v16i16/v32i8 reduction on AVX1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=9e28dddf555c686f23a0de5336e2602b58aab7d5;p=platform%2Fupstream%2Fllvm.git [X86][AVX] combineHorizontalPredicateResult - support v16i16/v32i8 reduction on AVX1 Use getPMOVMSKB helper which splits v32i8 MOVMSK calls on pre-AVX2 targets. llvm-svn: 357608 --- diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 601c7be..0ae7b5b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34301,11 +34301,8 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, if (Match.getScalarValueSizeInBits() != BitWidth) return SDValue(); - // We require AVX2 for PMOVMSKB for v16i16/v32i8; unsigned MatchSizeInBits = Match.getValueSizeInBits(); - if (!(MatchSizeInBits == 128 || - (MatchSizeInBits == 256 && - ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) + if (!(MatchSizeInBits == 128 || (MatchSizeInBits == 256 && Subtarget.hasAVX()))) return SDValue(); // Make sure this isn't a vector of 1 element. The perf win from using MOVMSK @@ -34344,9 +34341,9 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, // The setcc produces an i8 of 0/1, so extend that to the result width and // negate to get the final 0/-1 mask value. - SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match); - SDValue Movmsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BitcastLogicOp); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match); + SDValue Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget); EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode); diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll index 510c515..8c45094 100644 --- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll @@ -673,14 +673,14 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm2, %ecx +; AVX1-NEXT: shll $16, %ecx +; AVX1-NEXT: orl %eax, %ecx +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: cmpl $-1, %ecx +; AVX1-NEXT: sete %al +; AVX1-NEXT: negl %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -867,17 +867,13 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm2, %ecx +; AVX1-NEXT: shll $16, %ecx +; AVX1-NEXT: orl %eax, %ecx +; AVX1-NEXT: cmpl $-1, %ecx +; AVX1-NEXT: sete %al +; AVX1-NEXT: negb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1555,21 +1551,17 @@ define i1 @bool_reduction_v32i8(<32 x i8> %x, <32 x i8> %y) { ; ; AVX1-LABEL: bool_reduction_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm2, %ecx +; AVX1-NEXT: shll $16, %ecx +; AVX1-NEXT: orl %eax, %ecx +; AVX1-NEXT: cmpl $-1, %ecx +; AVX1-NEXT: sete %al +; AVX1-NEXT: negb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll index a8bfc42..2236e95 100644 --- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll @@ -611,14 +611,13 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %ecx +; AVX1-NEXT: vpmovmskb %xmm2, %edx +; AVX1-NEXT: shll $16, %edx +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: orl %ecx, %edx +; AVX1-NEXT: setne %al +; AVX1-NEXT: negl %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -791,17 +790,12 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm2, %ecx +; AVX1-NEXT: shll $16, %ecx +; AVX1-NEXT: orl %eax, %ecx +; AVX1-NEXT: setne %al +; AVX1-NEXT: negb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1490,21 +1484,16 @@ define i1 @bool_reduction_v32i8(<32 x i8> %x, <32 x i8> %y) { ; ; AVX1-LABEL: bool_reduction_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: vpmovmskb %xmm2, %ecx +; AVX1-NEXT: shll $16, %ecx +; AVX1-NEXT: orl %eax, %ecx +; AVX1-NEXT: setne %al +; AVX1-NEXT: negb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ;