An initial patch adding combineSetCCMOVMSK to simplify MOVMSK and its vector input based on the comparison of the MOVMSK result.
This first stage just adds support for some simple MOVMSK(PACKSSWB()) cases where we remove the PACKSS if we're comparing ne/eq zero (any_of patterns), allowing us to compare directly against the v8i16 source vector(s) bitcasted to v16i8, with suitable masking to account for which sign bits are valid.
Future combines could peek through further PACKSS, target shuffles, handle all_of patterns (ne/eq -1), optimize to a PTEST op, etc.
Differential Revision: https://reviews.llvm.org/D81171
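
For illustration, a minimal scalar model of the any_of fold (not part of the
patch; the helper names are invented, and x86's little-endian lane order is
assumed):

  #include <cstdint>

  // Sign-bit mask of 16 bytes, as PMOVMSKB computes it.
  static unsigned movmskb16(const int8_t Bytes[16]) {
    unsigned Mask = 0;
    for (int I = 0; I != 16; ++I)
      Mask |= unsigned(Bytes[I] < 0) << I;
    return Mask;
  }

  // Before: PMOVMSKB(PACKSSWB(X, undef)) != 0. Signed saturation preserves
  // the sign, so byte I of the pack has the sign bit of word I.
  static bool anyOfViaPack(const int16_t Words[8]) {
    int8_t Packed[16] = {};
    for (int I = 0; I != 8; ++I) {
      int16_t W = Words[I];
      Packed[I] = int8_t(W < -128 ? -128 : W > 127 ? 127 : W);
    }
    return (movmskb16(Packed) & 0xFF) != 0;
  }

  // After: (PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA) != 0. Word I's sign bit is
  // bit 7 of byte 2*I+1, so only the odd mask bits are valid - hence 0xAAAA.
  static bool anyOfViaBitcast(const int16_t Words[8]) {
    const int8_t *Bytes = reinterpret_cast<const int8_t *>(Words);
    return (movmskb16(Bytes) & 0xAAAA) != 0;
  }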
  return SDValue();
}
+// Attempt to simplify the MOVMSK input based on the comparison type.
+static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
+                                  SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
+  // Only handle eq/ne against zero (any_of).
+  // TODO: Handle eq/ne against -1 (all_of) as well.
+  if (!(CC == X86::COND_E || CC == X86::COND_NE))
+    return SDValue();
+  if (EFLAGS.getValueType() != MVT::i32)
+    return SDValue();
+  unsigned CmpOpcode = EFLAGS.getOpcode();
+  if (CmpOpcode != X86ISD::CMP || !isNullConstant(EFLAGS.getOperand(1)))
+    return SDValue();
+
+  SDValue CmpOp = EFLAGS.getOperand(0);
+  unsigned CmpBits = CmpOp.getValueSizeInBits();
+
+  // Peek through any truncate.
+  if (CmpOp.getOpcode() == ISD::TRUNCATE)
+    CmpOp = CmpOp.getOperand(0);
+
+  // Bail if we don't find a MOVMSK.
+  if (CmpOp.getOpcode() != X86ISD::MOVMSK)
+    return SDValue();
+
+  SDValue Vec = CmpOp.getOperand(0);
+  MVT VecVT = Vec.getSimpleValueType();
+  assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
+         "Unexpected MOVMSK operand");
+
+  // See if we can avoid a PACKSS by calling MOVMSK on the sources.
+  // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
+  // sign bits prior to the comparison with zero unless we know that
+  // the vXi16 splats the sign bit down to the lower i8 half.
+  if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
+    SDValue VecOp0 = Vec.getOperand(0);
+    SDValue VecOp1 = Vec.getOperand(1);
+    bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
+    bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
+    // PMOVMSKB(PACKSSWB(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
+    if (CmpBits == 8 && VecOp1.isUndef()) {
+      SDLoc DL(EFLAGS);
+      SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
+      Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+      Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
+      if (!SignExt0) {
+        Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
+                             DAG.getConstant(0xAAAA, DL, MVT::i16));
+      }
+      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+                         DAG.getConstant(0, DL, MVT::i16));
+    }
+    // PMOVMSKB(PACKSSWB(LO(X), HI(X)))
+    // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
+    if (CmpBits == 16 && Subtarget.hasInt256() &&
+        VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
+        VecOp0.getConstantOperandAPInt(1) == 0 &&
+        VecOp1.getConstantOperandAPInt(1) == 8) {
+      SDLoc DL(EFLAGS);
+      SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
+      Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+      if (!SignExt0 || !SignExt1) {
+        Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+                             DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+      }
+      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+                         DAG.getConstant(0, DL, MVT::i32));
+    }
+  }
+
+  return SDValue();
+}
+
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
  if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
    return R;
+  if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
+    return R;
+
  return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
}
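
The ComputeNumSignBits checks above are what allow the 0xAAAA/0xAAAAAAAA
masking to be skipped: if each i16 lane already splats its sign bit down
through its low byte (e.g. an all-zeros/all-ones compare result), both bytes
of a lane carry the same sign bit, which is why several of the test diffs
below use a plain testw/testl rather than a masked test. A scalar sketch of
why this is sound (illustrative only, not part of the patch):

  #include <cstdint>

  // If each 16-bit lane splats its sign bit through bit 7 (e.g. a compare
  // result of 0 or -1), the PMOVMSKB bits arrive in equal pairs, so testing
  // all 16 bits is equivalent to testing only the odd (word sign) bits.
  static bool anyOfSignSplat(const int16_t CmpResult[8]) {
    const int8_t *Bytes = reinterpret_cast<const int8_t *>(CmpResult);
    unsigned Mask = 0;
    for (int I = 0; I != 16; ++I)
      Mask |= unsigned(Bytes[I] < 0) << I;
    return Mask != 0; // same result as (Mask & 0xAAAA) != 0 here
  }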
define i1 @allzeros_v8i16_sign(<8 x i16> %arg) {
; SSE2-LABEL: allzeros_v8i16_sign:
; SSE2: # %bb.0:
-; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; AVX-LABEL: allzeros_v8i16_sign:
; AVX: # %bb.0:
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: testb %al, %al
+; AVX-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX-NEXT: sete %al
; AVX-NEXT: retq
;
; AVX2-LABEL: allzeros_v16i16_sign:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; SSE2-LABEL: allzeros_v8i32_sign:
; SSE2: # %bb.0:
; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: packssdw %xmm2, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; SSE2-LABEL: allzeros_v8i16_and1:
; SSE2: # %bb.0:
; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; AVX-LABEL: allzeros_v8i16_and1:
; AVX: # %bb.0:
; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: testb %al, %al
+; AVX-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX-NEXT: sete %al
; AVX-NEXT: retq
;
; AVX2-LABEL: allzeros_v16i16_and1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; SSE2-NEXT: psllq $63, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: packssdw %xmm2, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; SSE2-LABEL: allzeros_v8i16_and4:
; SSE2: # %bb.0:
; SSE2-NEXT: psllw $13, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; AVX-LABEL: allzeros_v8i16_and4:
; AVX: # %bb.0:
; AVX-NEXT: vpsllw $13, %xmm0, %xmm0
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: testb %al, %al
+; AVX-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX-NEXT: sete %al
; AVX-NEXT: retq
;
; AVX2-LABEL: allzeros_v16i16_and4:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $13, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; SSE2-NEXT: pslld $29, %xmm1
; SSE2-NEXT: pslld $29, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; SSE2-NEXT: psllq $61, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: packssdw %xmm2, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
; SSE-NEXT: cmpneqps %xmm3, %xmm1
; SSE-NEXT: cmpneqps %xmm2, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: testb %al, %al
+; SSE-NEXT: testw %ax, %ax
; SSE-NEXT: setne %al
; SSE-NEXT: retq
;
; SSE-LABEL: bool_reduction_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm0, %xmm1
-; SSE-NEXT: packsswb %xmm1, %xmm1
; SSE-NEXT: pmovmskb %xmm1, %eax
-; SSE-NEXT: testb %al, %al
+; SSE-NEXT: testw %ax, %ax
; SSE-NEXT: setne %al
; SSE-NEXT: retq
;
; AVX-LABEL: bool_reduction_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: testb %al, %al
+; AVX-NEXT: testw %ax, %ax
; AVX-NEXT: setne %al
; AVX-NEXT: retq
;
; SSE-NEXT: pminud %xmm0, %xmm2
; SSE-NEXT: pcmpeqd %xmm0, %xmm2
; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: packsswb %xmm2, %xmm2
; SSE-NEXT: pmovmskb %xmm2, %eax
-; SSE-NEXT: testb %al, %al
+; SSE-NEXT: testw %ax, %ax
; SSE-NEXT: setne %al
; SSE-NEXT: retq
;
; AVX2-LABEL: bool_reduction_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl %eax, %eax
; AVX2-NEXT: setne %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: psllw $15, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testb %al, %al
+; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: testb %al, %al
+; AVX-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX-NEXT: setne %al
; AVX-NEXT: retq
;
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: psllw $15, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testb %al, %al
+; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: testb %al, %al
+; AVX1-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX1-NEXT: setne %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX2-NEXT: setne %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: psllw $15, %xmm2
-; SSE2-NEXT: packsswb %xmm2, %xmm2
; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: psllw $15, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testb %al, %al
+; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: testb %al, %al
+; AVX1-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX1-NEXT: setne %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX2-NEXT: setne %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pcmpeqb %xmm0, %xmm1
; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testb %al, %al
+; SSE41-NEXT: testw %ax, %ax
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: testb %al, %al
+; AVX-NEXT: testw %ax, %ax
; AVX-NEXT: setne %al
; AVX-NEXT: retq
;
; SSE-NEXT: pcmpeqd %xmm2, %xmm1
; SSE-NEXT: pcmpeqd %xmm2, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: testb %al, %al
+; SSE-NEXT: testw %ax, %ax
; SSE-NEXT: setne %al
; SSE-NEXT: retq
;
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl %eax, %eax
; AVX2-NEXT: setne %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: packssdw %xmm3, %xmm1
-; SSE2-NEXT: packsswb %xmm1, %xmm1
; SSE2-NEXT: pmovmskb %xmm1, %eax
-; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
; SSE41-NEXT: pcmpeqq %xmm4, %xmm0
; SSE41-NEXT: packssdw %xmm1, %xmm0
; SSE41-NEXT: packssdw %xmm2, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testb %al, %al
+; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: testw %ax, %ax
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX2-NEXT: setne %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq