Noticed on D128216 - if we're zeroing out vector elements of a mul/mulh result, see if we can merge the and-mask into the mul by just multiplying the masked-out elements by zero.
Ideally we'd make this generic (similar to the existing foldSelectWithIdentityConstant?), but these cases appear very late, after the constants have been lowered to constant-pool loads.
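A minimal IR-level sketch of the fold (hypothetical function and constants, for illustration only; the combine itself fires on X86 DAG nodes once the constants have become constant-pool loads):

define <4 x i32> @mul_mask_example(<4 x i32> %x) {
  ; c2 = <-1, 0, -1, 0> is an all/no-bits (select-with-zero) mask, so the
  ; and can be folded into the multiplier constant: c1' = and(c1, c2).
  %mul = mul <4 x i32> %x, <i32 3, i32 5, i32 7, i32 9>
  %and = and <4 x i32> %mul, <i32 -1, i32 0, i32 -1, i32 0>
  ; becomes: mul <4 x i32> %x, <i32 3, i32 0, i32 7, i32 0>
  ret <4 x i32> %and
}

The same reasoning holds for mulhu/mulhs: a lane multiplied by zero produces a zero high half as well.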
if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
return R;
+ // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
+ // iff c2 is an all/no-bits mask - i.e. a select-with-zero mask.
+ // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
+ if (VT.isVector() && getTargetConstantFromNode(N1)) {
+ unsigned Opc0 = N0.getOpcode();
+ if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
+ getTargetConstantFromNode(N0.getOperand(1)) &&
+ DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
+ N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
+ SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
+ return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
+ }
+ }
+
// Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
// avoids slow variable shift (moving shift amount to ECX etc.)
if (isOneConstant(N1) && N0->hasOneUse()) {
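The DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() guard is what restricts N1 to an all/no-bits mask: a lane in which every bit equals the sign bit must be either all-zeros or all-ones. A worked illustration with hypothetical <8 x i16> masks:

; <i16 0, i16 -1, i16 -1, i16 0, ...> - every lane is 0x0000 or 0xFFFF, so
;   each has 16 sign bits, ComputeNumSignBits returns 16, and the fold fires
;   (this is the [65535,...,0] blend mask in the test diffs below, where the
;   pand half of the manual blend folds away, leaving just pandn+por).
; <i16 0, i16 255, i16 -1, ...> - the 0x00FF lane has only 8 sign bits, so
;   ComputeNumSignBits returns 8 and the fold is skipped.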
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_udiv_nonuniform:
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: paddw %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE2-LABEL: constant_funnnel_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psllw $1, %xmm0
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v8i16:
; X86-SSE2-LABEL: constant_funnnel_v8i16:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: psllw $1, %xmm0
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT: por %xmm3, %xmm0
; X86-SSE2-NEXT: por %xmm2, %xmm0
+; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
%res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
ret <8 x i16> %res
; SSE2-LABEL: test_v4i16_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-LABEL: constant_shift_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i16:
; X86-SSE-LABEL: constant_shift_v8i16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: pandn %xmm0, %xmm2
+; X86-SSE-NEXT: pandn %xmm0, %xmm1
; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT: pand %xmm1, %xmm0
-; X86-SSE-NEXT: por %xmm2, %xmm0
+; X86-SSE-NEXT: por %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <8 x i16> %shift
; SSE2-LABEL: constant_shift_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i16:
; X86-SSE-LABEL: constant_shift_v4i16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: pandn %xmm0, %xmm2
+; X86-SSE-NEXT: pandn %xmm0, %xmm1
; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT: pand %xmm1, %xmm0
-; X86-SSE-NEXT: por %xmm2, %xmm0
+; X86-SSE-NEXT: por %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
ret <4 x i16> %shift