// Without PTEST, a masked v2i64 or-reduction is not faster than
// scalarization.
+ bool UseKORTEST = Subtarget.useAVX512Regs();
bool UsePTEST = Subtarget.hasSSE41();
if (!UsePTEST && !Mask.isAllOnes() && VT.getScalarSizeInBits() > 32)
return SDValue();
- // Split down to 128/256-bit vector.
- unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
+ // Split (OR-ing the halves together) down to a 128/256/512-bit vector.
+ unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
while (VT.getSizeInBits() > TestSize) {
auto Split = DAG.SplitVector(V, DL);
VT = Split.first.getValueType();
V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
}
+ if (UseKORTEST && VT.is512BitVector()) {
+ V = DAG.getBitcast(MVT::v16i32, MaskBits(V));
+ V = DAG.getSetCC(DL, MVT::v16i1, V,
+ getZeroVector(MVT::v16i32, Subtarget, DAG, DL),
+ ISD::SETNE);
+ return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
+ }
+
if (UsePTEST) {
MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
V = DAG.getBitcast(TestVT, MaskBits(V));
;
; AVX512-LABEL: veccond512:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: je .LBB2_2
; AVX512-NEXT: # %bb.1: # %if-true-block
; AVX512-NEXT: xorl %eax, %eax
;
; AVX512-LABEL: vectest512:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: setne %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX512-LABEL: vecsel512:
; AVX512: # %bb.0:
; AVX512-NEXT: movl %edi, %eax
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: cmovel %esi, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-LABEL: test_v8i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: sete %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX512-LABEL: test_v16i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: setne %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-LABEL: test_v16i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: setne %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX512-LABEL: test_v32i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: sete %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-LABEL: test_v32i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: sete %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX512-LABEL: test_v64i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: setne %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-LABEL: test_v64i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: setne %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX512-LABEL: test_v128i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vptest %ymm0, %ymm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: sete %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX512-LABEL: mask_v128i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673]
-; AVX512-NEXT: vptest %ymm1, %ymm0
+; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: sete %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq