From cad2038700b204ba437b269dcba5b995f0c385ab Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 19 Jun 2020 10:27:09 +0100 Subject: [PATCH] [X86][SSE] combineSetCCMOVMSK - fold MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) If we're permuting ALL the elements of a single vector, then for allof/anyof MOVMSK tests we can avoid the shuffle entirely. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 33 +++++++++++++++++++++++++ llvm/test/CodeGen/X86/movmsk-cmp.ll | 12 --------- llvm/test/CodeGen/X86/vector-reduce-and-bool.ll | 2 -- llvm/test/CodeGen/X86/vector-reduce-or-bool.ll | 2 -- 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ed936af..cd58423 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5441,6 +5441,14 @@ static bool isAnyZero(ArrayRef<int> Mask) { return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); } +/// Return true if the value of any element in Mask is the zero or undef +/// sentinel values. +static bool isAnyZeroOrUndef(ArrayRef<int> Mask) { + return llvm::any_of(Mask, [](int M) { + return M == SM_SentinelZero || M == SM_SentinelUndef; + }); +} + /// Return true if Val is undef or if its value falls within the /// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { @@ -40590,6 +40598,31 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, } } + // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced. 
+ SmallVector ShuffleMask; + SmallVector ShuffleInputs; + if (NumElts == CmpBits && + getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs, + ShuffleMask, DAG) && + ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) && + ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) { + unsigned NumShuffleElts = ShuffleMask.size(); + APInt DemandedElts = APInt::getNullValue(NumShuffleElts); + for (int M : ShuffleMask) { + assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index"); + DemandedElts.setBit(M); + } + if (DemandedElts.isAllOnesValue()) { + SDLoc DL(EFLAGS); + SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]); + Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); + Result = + DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType()); + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, + EFLAGS.getOperand(1)); + } + } + return SDValue(); } diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll index e0e8ac0..204bcc97 100644 --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -530,7 +530,6 @@ define i1 @allones_v32i16_sign(<32 x i16> %arg) { ; AVX2-LABEL: allones_v32i16_sign: ; AVX2: # %bb.0: ; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: cmpl $-1, %eax ; AVX2-NEXT: sete %al @@ -598,7 +597,6 @@ define i1 @allzeros_v32i16_sign(<32 x i16> %arg) { ; AVX2-LABEL: allzeros_v32i16_sign: ; AVX2: # %bb.0: ; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al @@ -1008,7 +1006,6 @@ define i1 @allones_v8i64_sign(<8 x i64> %arg) { ; AVX2-LABEL: allones_v8i64_sign: ; AVX2: # %bb.0: ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovmskps %ymm0, %eax ; AVX2-NEXT: cmpb 
$-1, %al ; AVX2-NEXT: sete %al @@ -1068,7 +1065,6 @@ define i1 @allzeros_v8i64_sign(<8 x i64> %arg) { ; AVX2-LABEL: allzeros_v8i64_sign: ; AVX2: # %bb.0: ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovmskps %ymm0, %eax ; AVX2-NEXT: testb %al, %al ; AVX2-NEXT: sete %al @@ -1642,7 +1638,6 @@ define i1 @allones_v32i16_and1(<32 x i16> %arg) { ; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0 ; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: cmpl $-1, %eax ; AVX2-NEXT: sete %al @@ -1722,7 +1717,6 @@ define i1 @allzeros_v32i16_and1(<32 x i16> %arg) { ; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0 ; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al @@ -2391,7 +2385,6 @@ define i1 @allones_v8i64_and1(<8 x i64> %arg) { ; AVX2-NEXT: vpsllq $63, %ymm1, %ymm1 ; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovmskps %ymm0, %eax ; AVX2-NEXT: cmpb $-1, %al ; AVX2-NEXT: sete %al @@ -2461,7 +2454,6 @@ define i1 @allzeros_v8i64_and1(<8 x i64> %arg) { ; AVX2-NEXT: vpsllq $63, %ymm1, %ymm1 ; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovmskps %ymm0, %eax ; AVX2-NEXT: testb %al, %al ; AVX2-NEXT: sete %al @@ -3035,7 +3027,6 @@ define i1 @allones_v32i16_and4(<32 x i16> %arg) { ; AVX2-NEXT: vpsllw $13, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $13, %ymm0, %ymm0 ; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: cmpl $-1, %eax ; AVX2-NEXT: sete %al @@ -3115,7 +3106,6 @@ define i1 
@allzeros_v32i16_and4(<32 x i16> %arg) { ; AVX2-NEXT: vpsllw $13, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $13, %ymm0, %ymm0 ; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al @@ -3784,7 +3774,6 @@ define i1 @allones_v8i64_and4(<8 x i64> %arg) { ; AVX2-NEXT: vpsllq $61, %ymm1, %ymm1 ; AVX2-NEXT: vpsllq $61, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovmskps %ymm0, %eax ; AVX2-NEXT: cmpb $-1, %al ; AVX2-NEXT: sete %al @@ -3854,7 +3843,6 @@ define i1 @allzeros_v8i64_and4(<8 x i64> %arg) { ; AVX2-NEXT: vpsllq $61, %ymm1, %ymm1 ; AVX2-NEXT: vpsllq $61, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovmskps %ymm0, %eax ; AVX2-NEXT: testb %al, %al ; AVX2-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll index f3a1b44..ce6a424 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -1443,7 +1443,6 @@ define i1 @icmp_v8i64_v8i1(<8 x i64>) { ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovmskps %ymm0, %eax ; AVX2-NEXT: cmpb $-1, %al ; AVX2-NEXT: sete %al @@ -1582,7 +1581,6 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>) { ; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: cmpl $-1, %eax ; AVX2-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll index e08ec78..3162d25 100644 --- 
a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -1420,7 +1420,6 @@ define i1 @icmp_v8i64_v8i1(<8 x i64>) { ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovmskps %ymm0, %eax ; AVX2-NEXT: testb %al, %al ; AVX2-NEXT: setne %al @@ -1557,7 +1556,6 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>) { ; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al -- 2.7.4