PMULDQ returns the 64-bit result of the signed multiplication of the lower 32 bits of vXi64 vector inputs; we can lower to this if the sign bits stretch that far.
Differential Revision: https://reviews.llvm.org/D27657
llvm-svn: 289426
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
+ // 32-bit vector types used for MULDQ/MULUDQ.
+ MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
+
+ // MULDQ returns the 64-bit result of the signed multiplication of the lower
+ // 32-bits. We can lower with this if the sign bits stretch that far.
+ if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
+ DAG.ComputeNumSignBits(B) > 32) {
+ return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
+ DAG.getBitcast(MulVT, B));
+ }
+
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
//
bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
- // Bit cast to 32-bit vectors for MULUDQ
- MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
- (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
+ // Bit cast to 32-bit vectors for MULUDQ.
SDValue Alo = DAG.getBitcast(MulVT, A);
SDValue Blo = DAG.getBitcast(MulVT, B);
}
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
- SDValue Op, const SelectionDAG &, unsigned Depth) const {
+ SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
if (Op.getOpcode() == X86ISD::SETCC_CARRY)
return Op.getScalarValueSizeInBits();
+ if (Op.getOpcode() == X86ISD::VSEXT) {
+ EVT VT = Op.getValueType();
+ EVT SrcVT = Op.getOperand(0).getValueType();
+ unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
+ return Tmp;
+ }
+
// Fallback case.
return 1;
}
; KNL_64-LABEL: test9:
; KNL_64: # BB#0: # %entry
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
-; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
-; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
-; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
-; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
-; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
-; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-LABEL: test10:
; KNL_64: # BB#0: # %entry
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
-; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
-; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
-; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
-; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
-; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
-; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
-; SKX-LABEL: test_pr28312:
-; SKX: # BB#0:
-; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
-; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
-; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
-; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; SKX-NEXT: retq
-;
; KNL_64-LABEL: test_pr28312:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL_64-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; KNL_64-NEXT: retq
+;
+; SKX-LABEL: test_pr28312:
+; SKX: # BB#0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
+; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
+; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; SKX-NEXT: retq
%g1 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g2 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g3 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
;
; SSE41-LABEL: mul_v8i64_sext:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwq %xmm2, %xmm2
-; SSE41-NEXT: pmovsxwq %xmm0, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
-; SSE41-NEXT: pmovsxdq %xmm6, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,0,1]
-; SSE41-NEXT: pmovsxdq %xmm7, %xmm9
-; SSE41-NEXT: pmovsxdq %xmm4, %xmm4
-; SSE41-NEXT: pmovsxdq %xmm5, %xmm7
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pmuludq %xmm7, %xmm6
-; SSE41-NEXT: movdqa %xmm7, %xmm5
-; SSE41-NEXT: psrlq $32, %xmm5
-; SSE41-NEXT: pmuludq %xmm0, %xmm5
-; SSE41-NEXT: psllq $32, %xmm5
-; SSE41-NEXT: psrlq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm7, %xmm0
-; SSE41-NEXT: psllq $32, %xmm0
-; SSE41-NEXT: paddq %xmm5, %xmm0
-; SSE41-NEXT: paddq %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: pmuludq %xmm4, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: psrlq $32, %xmm6
-; SSE41-NEXT: pmuludq %xmm2, %xmm6
-; SSE41-NEXT: psllq $32, %xmm6
-; SSE41-NEXT: psrlq $32, %xmm2
-; SSE41-NEXT: pmuludq %xmm4, %xmm2
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: paddq %xmm6, %xmm2
-; SSE41-NEXT: paddq %xmm5, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pmuludq %xmm9, %xmm4
-; SSE41-NEXT: movdqa %xmm9, %xmm5
-; SSE41-NEXT: psrlq $32, %xmm5
-; SSE41-NEXT: pmuludq %xmm1, %xmm5
-; SSE41-NEXT: psllq $32, %xmm5
-; SSE41-NEXT: psrlq $32, %xmm1
-; SSE41-NEXT: pmuludq %xmm9, %xmm1
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: paddq %xmm5, %xmm1
-; SSE41-NEXT: paddq %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pmuludq %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm8, %xmm5
-; SSE41-NEXT: psrlq $32, %xmm5
-; SSE41-NEXT: pmuludq %xmm3, %xmm5
-; SSE41-NEXT: psllq $32, %xmm5
-; SSE41-NEXT: psrlq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm8, %xmm3
-; SSE41-NEXT: psllq $32, %xmm3
-; SSE41-NEXT: paddq %xmm5, %xmm3
-; SSE41-NEXT: paddq %xmm4, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm7
+; SSE41-NEXT: pmovsxwq %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm3
+; SSE41-NEXT: pmovsxdq %xmm2, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm4
+; SSE41-NEXT: pmovsxdq %xmm1, %xmm0
+; SSE41-NEXT: pmuldq %xmm5, %xmm0
+; SSE41-NEXT: pmuldq %xmm7, %xmm4
+; SSE41-NEXT: pmuldq %xmm6, %xmm2
+; SSE41-NEXT: pmuldq %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v8i64_sext:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm4
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm5
-; AVX2-NEXT: vpmuludq %ymm5, %ymm0, %ymm5
-; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
-; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
-; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v8i64_sext:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512-NEXT: vpmovsxdq %ymm1, %zmm1
-; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
-; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3
-; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
-; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
-; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
-; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%1 = sext <8 x i16> %val1 to <8 x i64>
%2 = sext <8 x i32> %val2 to <8 x i64>
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
-; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
-; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm1
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3
-; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>