From f64e654d62e64ac4a74a49ac78b5609a8bb548a1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 13:05:19 +0000 Subject: [PATCH] [X86][SSE] Improve CTTZ lowering when CTLZ is legal If we have better CTLZ support than CTPOP, then use cttz(x) = width - ctlz(~x & (x - 1)) - and remove the CTTZ_ZERO_UNDEF handling as it no longer gives better codegen. Similar to rL344447, this is also closer to LegalizeDAG's approach llvm-svn: 344448 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 24 +++--- llvm/test/CodeGen/X86/vector-tzcnt-128.ll | 137 +++++++++++++++++------------- llvm/test/CodeGen/X86/vector-tzcnt-256.ll | 92 ++++++-------------- llvm/test/CodeGen/X86/vector-tzcnt-512.ll | 114 ++++++------------------- 4 files changed, 146 insertions(+), 221 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5fb3ece..5f1e9ef 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -22982,20 +22982,22 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); - // cttz_undef(x) = (width - 1) - ctlz(x & -x) - if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) { - SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); - SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, - DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); - return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, - DAG.getNode(ISD::CTLZ, dl, VT, LSB)); + // Tmp = ~x & (x - 1) + SDValue One = DAG.getConstant(1, dl, VT); + SDValue Tmp = DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT), + DAG.getNode(ISD::SUB, dl, VT, N0, One)); + + // cttz(x) = width - ctlz(~x & (x - 1)) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isOperationLegal(ISD::CTLZ, VT) && + !TLI.isOperationLegal(ISD::CTPOP, VT)) { + SDValue Width = DAG.getConstant(NumBits, dl, VT); + return DAG.getNode(ISD::SUB, dl, VT, Width, + DAG.getNode(ISD::CTLZ, dl, VT, Tmp)); } // cttz(x) = ctpop(~x & (x - 1)) - SDValue One = DAG.getConstant(1, dl, VT); - return DAG.getNode(ISD::CTPOP, dl, VT, - DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT), - DAG.getNode(ISD::SUB, dl, VT, N0, One))); + return DAG.getNode(ISD::CTPOP, dl, VT, Tmp); } assert(Op.getOpcode() == ISD::CTTZ && diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll index 1430ca7..0d392bb 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -102,22 +102,60 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} 
xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv2i64: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] +; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv2i64: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] +; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 +; AVX512CD-NEXT: vzeroupper +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv2i64: ; AVX512VPOPCNTDQ: # %bb.0: @@ -303,21 +341,21 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; AVX512CDVL-LABEL: testv2i64u: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] ; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv2i64u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq @@ -537,20 +575,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32] +; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv4i32: @@ -558,20 +585,10 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512CD-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512CD-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX512CD-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512CD-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32] +; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv4i32: @@ -798,21 +815,21 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; AVX512CDVL-LABEL: testv4i32u: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32] ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv4i32u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32] ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll index 46c34fb..59911e5 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -62,16 +62,9 @@ define <4 x i64> @testv4i64(<4 x i64> %in) 
nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64] +; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv4i64: @@ -79,16 +72,9 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64] +; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv4i64: @@ -195,21 +181,21 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; ; AVX512CDVL-LABEL: testv4i64u: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1 -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512CDVL-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] +; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64] ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv4i64u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64] ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; @@ -332,20 +318,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32] +; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv8i32: @@ -353,20 +328,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX512CD-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32] +; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i32: @@ -498,21 +462,21 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; AVX512CDVL-LABEL: testv8i32u: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1 -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512CDVL-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31] +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32] ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv8i32u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[31,31,31,31,31,31,31,31] +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32] ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll index 300d7b4..1de0346 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll @@ -11,25 +11,9 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CD-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512CD-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512CD-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512CD-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64] +; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv8i64: @@ -37,16 +21,9 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64] +; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv8i64: @@ -90,21 +67,21 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CD-LABEL: testv8i64u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1 -; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512CD-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: 
vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63] +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64] ; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv8i64u: ; AVX512CDBW: # %bb.0: -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm1 -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512CDBW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63] +; AVX512CDBW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64] ; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; @@ -152,33 +129,9 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CD-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512CD-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512CD-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512CD-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512CD-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512CD-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv16i32: @@ -186,20 +139,9 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 -; AVX512CDBW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv16i32: @@ -251,21 +193,21 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CD-LABEL: testv16i32u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1 -; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512CD-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv16i32u: ; AVX512CDBW: # %bb.0: -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1 -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512CDBW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; -- 2.7.4
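For reference, the identity behind the new lowering is easy to check in isolation. The mask ~x & (x - 1) sets exactly the bits below the lowest set bit of x, so its population count equals cttz(x), and equivalently width minus its leading-zero count does too; for x == 0 the mask is all-ones and both forms return the full bit width, matching CTTZ semantics. Below is a minimal standalone C++ sketch of both forms, not LLVM code: the ctlz32/ctpop32 helpers are hypothetical stand-ins for the CTLZ/CTPOP nodes the lowering emits.

    // Scalar illustration of the identities used by LowerCTTZ:
    //   cttz(x) = width - ctlz(~x & (x - 1))   // when CTLZ is legal
    //   cttz(x) = ctpop(~x & (x - 1))          // fallback
    #include <cassert>
    #include <cstdint>

    // Hypothetical helpers standing in for the CTLZ/CTPOP nodes; any
    // correct scalar implementation works here.
    static unsigned ctlz32(uint32_t X) {   // leading zeros; ctlz32(0) == 32
      unsigned N = 0;
      for (uint32_t Bit = 0x80000000u; Bit && !(X & Bit); Bit >>= 1)
        ++N;
      return N;
    }

    static unsigned ctpop32(uint32_t X) {  // number of set bits
      unsigned N = 0;
      for (; X; X &= X - 1)                // clear lowest set bit per step
        ++N;
      return N;
    }

    static unsigned cttzViaCtlz(uint32_t X) {
      uint32_t Tmp = ~X & (X - 1);         // bits below the lowest set bit
      return 32 - ctlz32(Tmp);
    }

    static unsigned cttzViaCtpop(uint32_t X) {
      uint32_t Tmp = ~X & (X - 1);
      return ctpop32(Tmp);
    }

    int main() {
      // Both forms agree on every input, including x == 0, where each
      // returns the full bit width (32) as CTTZ requires.
      uint32_t Tests[] = {0u, 1u, 2u, 8u, 40u, 0x80000000u, ~0u};
      for (uint32_t X : Tests)
        assert(cttzViaCtlz(X) == cttzViaCtpop(X));
      return 0;
    }

This also shows why the separate CTTZ_ZERO_UNDEF path could be dropped: the shared mask ~x & (x - 1) already handles the zero input, so the CTLZ-based form needs no (width - 1) / (x & -x) special case and a single Tmp feeds either node, mirroring what the X86ISelLowering.cpp hunk above does.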