From 3f46022e33bd33b3d8f816be3c3adbe7de806119 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Fri, 22 Nov 2019 15:16:03 +0300 Subject: [PATCH] [Codegen] TargetLowering::prepareUREMEqFold(): `x u% C1 ==/!= C2` with tautological C1 u<= C2 (PR35479) Summary: This is a preparatory cleanup before I add more of this fold to deal with comparisons with non-zero. In essence, the current lowering is: ``` Name: (X % C1) == 0 -> X * C3 <= C4 Pre: (C1 u>> countTrailingZeros(C1)) * C3 == 1 %zz = and i8 C3, 0 ; trick alive into making C3 available in precondition %o0 = urem i8 %x, C1 %r = icmp eq i8 %o0, 0 => %zz = and i8 C3, 0 ; and silence it from complaining about said reg %C4 = -1 /u C1 %n0 = mul i8 %x, C3 %n1 = lshr i8 %n0, countTrailingZeros(C1) ; rotate right %n2 = shl i8 %n0, ((8-countTrailingZeros(C1)) %u 8) ; rotate right %n3 = or i8 %n1, %n2 ; rotate right %r = icmp ule i8 %n3, %C4 ``` https://rise4fun.com/Alive/oqd It kinda just works, really no weird edge-cases. But it isn't all that great when comparing with non-zero. In particular, given `(X % C1) == C2`, there will be problems in the always-false tautological case where `C2 u>= C1`: https://rise4fun.com/Alive/pH3 That case is tautological, always-false: ``` Name: (X % Y) u>= Y %o0 = urem i8 %x, %y %r = icmp uge i8 %o0, %y => %r = false ``` https://rise4fun.com/Alive/ofu While we can't/shouldn't get such tautological case normally, we do deal with non-splat vectors, so unless we want to give up in this case, we need to fixup/short-circuit such lanes. There are three lowering variants: 1. 
We can blend between whatever computed result and the correct tautological result ``` Name: (X % C1) == C2 -> X * C3 <= C4 || false Pre: (C2 == 0 || C1 u<= C2) && (C1 u>> countTrailingZeros(C1)) * C3 == 1 %zz = and i8 C3, 0 ; trick alive into making C3 available in precondition %o0 = urem i8 %x, C1 %r = icmp eq i8 %o0, C2 => %zz = and i8 C3, 0 ; and silence it from complaining about said reg %C4 = -1 /u C1 %n0 = mul i8 %x, C3 %n1 = lshr i8 %n0, countTrailingZeros(C1) ; rotate right %n2 = shl i8 %n0, ((8-countTrailingZeros(C1)) %u 8) ; rotate right %n3 = or i8 %n1, %n2 ; rotate right %is_tautologically_false = icmp ule i8 C1, C2 %res = icmp ule i8 %n3, %C4 %r = select i1 %is_tautologically_false, i1 0, i1 %res ``` https://rise4fun.com/Alive/PjT5 https://rise4fun.com/Alive/1KV 2. We can invert the comparison result ``` Name: (X % C1) == C2 -> X * C3 <= C4 || false Pre: (C2 == 0 || C1 u<= C2) && (C1 u>> countTrailingZeros(C1)) * C3 == 1 %zz = and i8 C3, 0 ; trick alive into making C3 available in precondition %o0 = urem i8 %x, C1 %r = icmp eq i8 %o0, C2 => %zz = and i8 C3, 0 ; and silence it from complaining about said reg %C4 = -1 /u C1 %n0 = mul i8 %x, C3 %n1 = lshr i8 %n0, countTrailingZeros(C1) ; rotate right %n2 = shl i8 %n0, ((8-countTrailingZeros(C1)) %u 8) ; rotate right %n3 = or i8 %n1, %n2 ; rotate right %is_tautologically_false = icmp ule i8 C1, C2 %C4_fixed = select i1 %is_tautologically_false, i8 -1, i8 %C4 %res = icmp ule i8 %n3, %C4_fixed %r = xor i1 %res, %is_tautologically_false ``` https://rise4fun.com/Alive/2xC https://rise4fun.com/Alive/jpb5 3. We can expand into `and`/`or`: https://rise4fun.com/Alive/WGn https://rise4fun.com/Alive/lcb5 Blend-one is likely better since we avoid having to load the replacement from the constant pool. `xor` is second best since it's still pretty general. I'm not adding `and`/`or` variants. 
Reviewers: RKSimon, craig.topper, spatel Reviewed By: RKSimon Subscribers: nick, hiraditya, xbolva00, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D70051 --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 98 ++++-- .../CodeGen/AArch64/urem-seteq-vec-tautological.ll | 92 ++--- .../CodeGen/X86/urem-seteq-vec-tautological.ll | 390 +++++++++------------ 3 files changed, 263 insertions(+), 317 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 047cab5..9a9ac69 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4943,7 +4943,7 @@ SDValue TargetLowering::buildUREMEqFold(EVT SETCCVT, SDValue REMNode, ISD::CondCode Cond, DAGCombinerInfo &DCI, const SDLoc &DL) const { - SmallVector Built; + SmallVector Built; if (SDValue Folded = prepareUREMEqFold(SETCCVT, REMNode, CompTargetNode, Cond, DCI, DL, Built)) { for (SDNode *N : Built) @@ -4978,26 +4978,40 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, if (!isOperationLegalOrCustom(ISD::MUL, VT)) return SDValue(); - // TODO: Could support comparing with non-zero too. - ConstantSDNode *CompTarget = isConstOrConstSplat(CompTargetNode); - if (!CompTarget || !CompTarget->isNullValue()) - return SDValue(); - - bool HadOneDivisor = false; - bool AllDivisorsAreOnes = true; + bool HadTautologicalLanes = false; + bool AllLanesAreTautological = true; bool HadEvenDivisor = false; bool AllDivisorsArePowerOfTwo = true; - SmallVector PAmts, KAmts, QAmts; + bool HadTautologicalInvertedLanes = false; + SmallVector PAmts, KAmts, QAmts, IAmts; - auto BuildUREMPattern = [&](ConstantSDNode *C) { + auto BuildUREMPattern = [&](ConstantSDNode *CDiv, ConstantSDNode *CCmp) { // Division by 0 is UB. Leave it to be constant-folded elsewhere. 
- if (C->isNullValue()) + if (CDiv->isNullValue()) return false; - const APInt &D = C->getAPIntValue(); - // If all divisors are ones, we will prefer to avoid the fold. - HadOneDivisor |= D.isOneValue(); - AllDivisorsAreOnes &= D.isOneValue(); + const APInt &D = CDiv->getAPIntValue(); + const APInt &Cmp = CCmp->getAPIntValue(); + + // `x u% C1` is *always* less than C1. So given `x u% C1 == C2`, + // if C2 is not less than C1, the comparison is always false. + // But we will only be able to produce the comparison that will give the + // opposite tautological answer. So this lane would need to be fixed up. + bool TautologicalInvertedLane = D.ule(Cmp); + HadTautologicalInvertedLanes |= TautologicalInvertedLane; + + // If we are checking that remainder is something smaller than the divisor, + // then this comparison isn't tautological. For now this is not handled, + // other than the comparison that remainder is zero. + if (!Cmp.isNullValue() && !TautologicalInvertedLane) + return false; + + // If all lanes are tautological (either all divisors are ones, or divisor + // is not greater than the constant we are comparing with), + // we will prefer to avoid the fold. + bool TautologicalLane = D.isOneValue() || TautologicalInvertedLane; + HadTautologicalLanes |= TautologicalLane; + AllLanesAreTautological &= TautologicalLane; // Decompose D into D0 * 2^K unsigned K = D.countTrailingZeros(); @@ -5025,13 +5039,14 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, assert(APInt::getAllOnesValue(ShSVT.getSizeInBits()).ugt(K) && "We are expecting that K is always less than all-ones for ShSVT"); - // If the divisor is 1 the result can be constant-folded. - if (D.isOneValue()) { + // If the lane is tautological the result can be constant-folded. + if (TautologicalLane) { // Set P and K amount to a bogus values so we can try to splat them. 
P = 0; K = -1; - assert(Q.isAllOnesValue() && - "Expecting all-ones comparison for one divisor"); + // And ensure that comparison constant is tautological, + // it will always compare true/false. + Q = -1; } PAmts.push_back(DAG.getConstant(P, DL, SVT)); @@ -5045,11 +5060,11 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue D = REMNode.getOperand(1); // Collect the values from each element. - if (!ISD::matchUnaryPredicate(D, BuildUREMPattern)) + if (!ISD::matchBinaryPredicate(D, CompTargetNode, BuildUREMPattern)) return SDValue(); - // If this is a urem by a one, avoid the fold since it can be constant-folded. - if (AllDivisorsAreOnes) + // If all lanes are tautological, the result can be constant-folded. + if (AllLanesAreTautological) return SDValue(); // If this is a urem by a powers-of-two, avoid the fold since it can be @@ -5059,7 +5074,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue PVal, KVal, QVal; if (VT.isVector()) { - if (HadOneDivisor) { + if (HadTautologicalLanes) { // Try to turn PAmts into a splat, since we don't care about the values // that are currently '0'. If we can't, just keep '0'`s. turnVectorIntoSplatVector(PAmts, isNullConstant); @@ -5096,8 +5111,41 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, } // UREM: (setule/setugt (rotr (mul N, P), K), Q) - return DAG.getSetCC(DL, SETCCVT, Op0, QVal, - ((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT)); + SDValue NewCC = + DAG.getSetCC(DL, SETCCVT, Op0, QVal, + ((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT)); + if (!HadTautologicalInvertedLanes) + return NewCC; + + // If any lanes previously compared always-false, the NewCC will give + // always-true result for them, so we need to fixup those lanes. + // Or the other way around for inequality predicate. + assert(VT.isVector() && "Can/should only get here for vectors."); + Created.push_back(NewCC.getNode()); + + // x u% C1` is *always* less than C1. 
So given `x u% C1 == C2`, + // if C2 is not less than C1, the comparison is always false. + // But we have produced the comparison that will give the + // opposite tautological answer. So these lanes would need to be fixed up. + SDValue TautologicalInvertedChannels = + DAG.getSetCC(DL, SETCCVT, D, CompTargetNode, ISD::SETULE); + Created.push_back(TautologicalInvertedChannels.getNode()); + + if (isOperationLegalOrCustom(ISD::VSELECT, SETCCVT)) { + // If we have a vector select, let's replace the comparison results in the + // affected lanes with the correct tautological result. + SDValue Replacement = DAG.getBoolConstant(Cond == ISD::SETEQ ? false : true, + DL, SETCCVT, SETCCVT); + return DAG.getNode(ISD::VSELECT, DL, SETCCVT, TautologicalInvertedChannels, + Replacement, NewCC); + } + + // Else, we can just invert the comparison result in the appropriate lanes. + if (isOperationLegalOrCustom(ISD::XOR, SETCCVT)) + return DAG.getNode(ISD::XOR, DL, SETCCVT, NewCC, + TautologicalInvertedChannels); + + return SDValue(); // Don't know how to lower. 
} /// Given an ISD::SREM used only by an ISD::SETEQ or ISD::SETNE diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll index 9233ad3..cb66f0e 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll @@ -22,23 +22,14 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI1_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: adrp x8, .LCPI1_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_1] -; CHECK-NEXT: adrp x8, .LCPI1_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_2] -; CHECK-NEXT: adrp x8, .LCPI1_3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI1_3] -; CHECK-NEXT: adrp x8, .LCPI1_4 -; CHECK-NEXT: umull2 v5.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v5.4s -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI1_4] -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s +; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: movk w8, #43690, lsl #16 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -50,24 +41,14 @@ define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI2_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: adrp x8, .LCPI2_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] -; CHECK-NEXT: adrp x8, .LCPI2_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_2] -; CHECK-NEXT: adrp x8, .LCPI2_3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI2_3] -; CHECK-NEXT: adrp x8, .LCPI2_4 -; CHECK-NEXT: umull2 v5.2d, v0.4s, v1.4s -; 
CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v5.4s -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI2_4] -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s -; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: movk w8, #43690, lsl #16 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -79,25 +60,13 @@ define <8 x i1> @t2_narrow(<8 x i16> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_1] -; CHECK-NEXT: adrp x8, .LCPI3_2 -; CHECK-NEXT: umull2 v4.4s, v0.8h, v1.8h -; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h -; CHECK-NEXT: uzp2 v1.8h, v1.8h, v4.8h -; CHECK-NEXT: neg v3.8h, v3.8h -; CHECK-NEXT: movi v2.2d, #0xffff00000000ffff -; CHECK-NEXT: ushl v1.8h, v1.8h, v3.8h -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2] -; CHECK-NEXT: adrp x8, .LCPI3_3 -; CHECK-NEXT: movi v4.2d, #0x00ffffffff0000 -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3] -; CHECK-NEXT: and v4.16b, v0.16b, v4.16b -; CHECK-NEXT: orr v1.16b, v4.16b, v1.16b -; CHECK-NEXT: mls v0.8h, v1.8h, v3.8h -; CHECK-NEXT: cmeq v0.8h, v0.8h, v2.8h +; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: dup v2.8h, w8 +; CHECK-NEXT: mul v0.8h, v0.8h, v2.8h +; CHECK-NEXT: cmhs v0.8h, v1.8h, v0.8h ; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <8 x i16> %X, %cmp = icmp eq <8 x i16> %urem, @@ -108,18 +77,19 @@ define <2 x i1> @t3_wide(<2 x i64> %X) 
nounwind { ; CHECK-LABEL: t3_wide: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x9, #-6148914691236517206 -; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: adrp x11, .LCPI4_0 +; CHECK-NEXT: mov x8, v0.d[1] ; CHECK-NEXT: movk x9, #43691 -; CHECK-NEXT: adrp x10, .LCPI4_0 -; CHECK-NEXT: umulh x9, x8, x9 -; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI4_0] -; CHECK-NEXT: lsr x9, x9, #1 -; CHECK-NEXT: add x9, x9, x9, lsl #1 -; CHECK-NEXT: sub x8, x8, x9 -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: mov v1.d[0], x8 -; CHECK-NEXT: cmeq v0.2d, v1.2d, v0.2d +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: ldr q0, [x11, :lo12:.LCPI4_0] +; CHECK-NEXT: mul x10, x10, x9 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: fmov d1, x10 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: cmhs v0.2d, v0.2d, v1.2d ; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: movi d1, #0xffffffff00000000 +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <2 x i64> %X, %cmp = icmp eq <2 x i64> %urem, diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll index ea9ed07..f361a39 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll @@ -25,94 +25,54 @@ define <4 x i1> @t0_all_tautological(<4 x i32> %X) nounwind { define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t1_all_odd_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,0,0,954437177] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $1, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [3,1,1,9] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm2[0,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t1_all_odd_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,0,0,954437177] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: 
pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,4294967295,4294967295,4294967295] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t1_all_odd_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,0,0,954437177] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t1_all_odd_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,0,0,954437177] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 
= xmm1[0],xmm0[1,2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t1_all_odd_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,0,0,954437177] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -122,103 +82,60 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind { define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t1_all_odd_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: 
movdqa {{.*#+}} xmm1 = [2863311531,0,0,954437177] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $1, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [3,1,1,9] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm2[0,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t1_all_odd_ne: ; 
CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,0,0,954437177] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,4294967295,4294967295,4294967295] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t1_all_odd_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,0,0,954437177] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: 
vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t1_all_odd_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,0,0,954437177] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t1_all_odd_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,0,0,954437177] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 -; 
CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -228,72 +145,48 @@ define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind { define <8 x i1> @t2_narrow(<8 x i16> %X) nounwind { ; CHECK-SSE2-LABEL: t2_narrow: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pand %xmm1, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [43691,0,0,58255,43691,0,0,58255] -; CHECK-SSE2-NEXT: pmulhuw %xmm0, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psrlw $3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: psrlw $1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; CHECK-SSE2-NEXT: pandn %xmm3, %xmm1 -; CHECK-SSE2-NEXT: por %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: psubw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqw {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psubusw {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqw %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t2_narrow: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa 
{{.*#+}} xmm1 = [43691,0,0,58255,43691,0,0,58255] -; CHECK-SSE41-NEXT: pmulhuw %xmm0, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrlw $3, %xmm2 -; CHECK-SSE41-NEXT: psrlw $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] -; CHECK-SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubw %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqw {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [21845,65535,65535,65535,21845,65535,65535,65535] +; CHECK-SSE41-NEXT: pminuw %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqw %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t2_narrow: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] -; CHECK-AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqw {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t2_narrow: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsrlw $3, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] -; CHECK-AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqw {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t2_narrow: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpsrlw $3, %xmm1, %xmm2 -; CHECK-AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] -; CHECK-AVX512VL-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpcmpeqw {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX512VL-NEXT: retq %urem = urem <8 x i16> %X, %cmp = icmp eq <8 x i16> %urem, @@ -301,46 +194,81 @@ define <8 x i1> @t2_narrow(<8 x i16> %X) nounwind { } define <2 x i1> @t3_wide(<2 x i64> %X) nounwind { -; CHECK-SSE2-LABEL: t3_wide: -; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movq %xmm0, %rcx -; CHECK-SSE2-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB -; CHECK-SSE2-NEXT: movq %rcx, %rax -; CHECK-SSE2-NEXT: mulq %rdx -; CHECK-SSE2-NEXT: shrq %rdx -; CHECK-SSE2-NEXT: leaq (%rdx,%rdx,2), %rax -; 
CHECK-SSE2-NEXT: subq %rax, %rcx -; CHECK-SSE2-NEXT: movq %rcx, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] -; CHECK-SSE2-NEXT: pand %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; CHECK-SSE-LABEL: t3_wide: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411] +; CHECK-SSE-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE-NEXT: psrlq $32, %xmm3 +; CHECK-SSE-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE-NEXT: paddq %xmm3, %xmm0 +; CHECK-SSE-NEXT: psllq $32, %xmm0 +; CHECK-SSE-NEXT: paddq %xmm2, %xmm0 +; CHECK-SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15372286730238776661,9223372034707292159] +; CHECK-SSE-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-SSE-NEXT: pand %xmm3, %xmm0 +; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE-NEXT: por %xmm0, %xmm1 +; CHECK-SSE-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-SSE-NEXT: pxor %xmm1, %xmm0 +; CHECK-SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-SSE-NEXT: retq ; -; CHECK-SSE41-LABEL: t3_wide: -; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movq %xmm0, %rcx -; CHECK-SSE41-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB -; CHECK-SSE41-NEXT: movq %rcx, %rax -; CHECK-SSE41-NEXT: mulq %rdx -; CHECK-SSE41-NEXT: shrq %rdx -; CHECK-SSE41-NEXT: leaq (%rdx,%rdx,2), %rax -; CHECK-SSE41-NEXT: subq %rax, %rcx -; CHECK-SSE41-NEXT: movq %rcx, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqq {{.*}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: retq +; CHECK-AVX1-LABEL: t3_wide: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411] +; 
CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; CHECK-AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX-LABEL: t3_wide: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vmovq %xmm0, %rcx -; CHECK-AVX-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB -; CHECK-AVX-NEXT: movq %rcx, %rax -; CHECK-AVX-NEXT: mulq %rdx -; CHECK-AVX-NEXT: shrq %rdx -; CHECK-AVX-NEXT: leaq (%rdx,%rdx,2), %rax -; CHECK-AVX-NEXT: subq %rax, %rcx -; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 -; CHECK-AVX-NEXT: vpcmpeqq {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX2-LABEL: t3_wide: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411] +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; CHECK-AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: t3_wide: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = 
[12297829382473034411,12297829382473034411] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; CHECK-AVX512VL-NEXT: vpminuq {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-AVX512VL-NEXT: retq %urem = urem <2 x i64> %X, %cmp = icmp eq <2 x i64> %urem, ret <2 x i1> %cmp -- 2.7.4