From: Simon Pilgrim
Date: Tue, 21 Jun 2022 14:08:39 +0000 (+0100)
Subject: [X86] fold (and (mul x, c1), c2) -> (mul x, (and c1, c2)) iff c2 is all/no bits mask
X-Git-Tag: upstream/15.0.7~4044
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ac4cb1775b8f40be2809245b818993a576639c1f;p=platform%2Fupstream%2Fllvm.git

[X86] fold (and (mul x, c1), c2) -> (mul x, (and c1, c2)) iff c2 is all/no bits mask

Noticed on D128216 - if we're zeroing out vector elements of a mul/mulh result
then see if we can merge the and-mask into the mul by just multiplying by zero.

Ideally we'd make this generic (similar to the existing
foldSelectWithIdentityConstant?), but these cases are appearing very late,
after the constants have been lowered to constant-pool loads.
---

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 70cd94e..92ee38e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47651,6 +47651,20 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
     return R;
 
+  // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
+  // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
+  // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
+  if (VT.isVector() && getTargetConstantFromNode(N1)) {
+    unsigned Opc0 = N0.getOpcode();
+    if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
+        getTargetConstantFromNode(N0.getOperand(1)) &&
+        DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
+        N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
+      SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
+      return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
+    }
+  }
+
   // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
   // avoids slow variable shift (moving shift amount to ECX etc.)
   if (isOneConstant(N1) && N0->hasOneUse()) {
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index 6c04016..c609d22 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -510,11 +510,9 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
 ; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    paddw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
 ; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_vec_udiv_nonuniform:
@@ -697,12 +695,10 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
 ; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE2-NEXT:    paddw %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm2, %xmm1
 ; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index a549d3f..2858a2a 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1759,14 +1759,12 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2-LABEL: constant_funnnel_v8i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
 ; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    psllw $1, %xmm0
 ; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_funnnel_v8i16:
@@ -1853,14 +1851,12 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; X86-SSE2-LABEL: constant_funnnel_v8i16:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
 ; X86-SSE2-NEXT:    psllw $1, %xmm0
 ; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    por %xmm3, %xmm0
 ; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
   %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> )
   ret <8 x i16> %res
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index c25ccb6..24c963f 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -812,11 +812,9 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
 ; SSE2-LABEL: test_v4i16_v4i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
 ; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE2-NEXT:    paddw %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 545089b..2b4e58b 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -1166,11 +1166,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v8i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
 ; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v8i16:
@@ -1220,11 +1218,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; X86-SSE-LABEL: constant_shift_v8i16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE-NEXT:    pandn %xmm0, %xmm1
 ; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    pand %xmm1, %xmm0
-; X86-SSE-NEXT:    por %xmm2, %xmm0
+; X86-SSE-NEXT:    por %xmm1, %xmm0
 ; X86-SSE-NEXT:    retl
   %shift = lshr <8 x i16> %a,
   ret <8 x i16> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index 4cc16c5..be95500 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1483,11 +1483,9 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v4i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm1
 ; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v4i16:
@@ -1537,11 +1535,9 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ; X86-SSE-LABEL: constant_shift_v4i16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE-NEXT:    pandn %xmm0, %xmm1
 ; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    pand %xmm1, %xmm0
-; X86-SSE-NEXT:    por %xmm2, %xmm0
+; X86-SSE-NEXT:    por %xmm1, %xmm0
 ; X86-SSE-NEXT:    retl
   %shift = lshr <4 x i16> %a,
   ret <4 x i16> %shift
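
Editor's note (not part of the commit): a minimal standalone C++ sketch of why the fold is legal per lane. When an element of the AND mask c2 is all-zeros or all-ones - which is what the DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() check guarantees - masking the low or high half of the multiply result is the same as multiplying by the masked constant, which is why the pand/por blend sequences in the tests above collapse. The lane values and helper names (mullo/mulhi) are illustrative assumptions, not taken from the patch.

// Illustrative only - mirrors the per-lane identity behind the combine
// for an i16 lane (ISD::MUL low half and ISD::MULHU high half).
#include <cassert>
#include <cstdint>

// Low 16 bits of the product (ISD::MUL on an i16 lane).
static uint16_t mullo(uint16_t x, uint16_t c) { return uint16_t(x * c); }
// High 16 bits of the unsigned product (ISD::MULHU on an i16 lane).
static uint16_t mulhi(uint16_t x, uint16_t c) {
  return uint16_t((uint32_t(x) * uint32_t(c)) >> 16);
}

int main() {
  const uint16_t x = 0xBEEF, c1 = 0x1234;     // arbitrary lane values
  const uint16_t masks[] = {0x0000, 0xFFFF};  // the only c2 lanes the combine accepts
  for (uint16_t c2 : masks) {
    // (and (mul x, c1), c2) == (mul x, (and c1, c2))
    assert(uint16_t(mullo(x, c1) & c2) == mullo(x, uint16_t(c1 & c2)));
    // ...and likewise for the high-half multiply.
    assert(uint16_t(mulhi(x, c1) & c2) == mulhi(x, uint16_t(c1 & c2)));
  }
  return 0;
}

Neither assertion can fire for a 0x0000 or 0xFFFF mask lane; for any other mask value the identity does not hold in general, which is why the combine insists that every element of c2 be all-ones or all-zeros.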