From: Craig Topper
Date: Mon, 9 Apr 2018 19:17:38 +0000 (+0000)
Subject: [X86] Remove GCCBuiltin name from pmuldq/pmuludq intrinsics so clang can custom lower...
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3a0cab73ebb331240fde36370da7f30e9b42dc5f;p=platform%2Fupstream%2Fllvm.git

[X86] Remove GCCBuiltin name from pmuldq/pmuludq intrinsics so clang can custom lower to native IR. Update fast-isel intrinsic tests for clang's new codegen.

In some cases fast-isel fails to remove the and/shifts and uses blends or conditional moves. But once masking gets involved, fast-isel aborts on the mask portion and we DAG combine more thoroughly.

llvm-svn: 329604
---
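For context, here is a minimal stand-alone sketch of the generic IR clang is expected to emit for the unmasked 128-bit cases after this change, distilled from the updated fast-isel tests below; the function names are illustrative and not part of the patch:

define <2 x i64> @mul_epu32_sketch(<2 x i64> %a, <2 x i64> %b) {
  ; _mm_mul_epu32 style: keep only the low 32 bits of each 64-bit lane, then multiply.
  %a.lo = and <2 x i64> %a, <i64 4294967295, i64 4294967295>
  %b.lo = and <2 x i64> %b, <i64 4294967295, i64 4294967295>
  %res = mul nuw <2 x i64> %a.lo, %b.lo
  ret <2 x i64> %res
}

define <2 x i64> @mul_epi32_sketch(<2 x i64> %a, <2 x i64> %b) {
  ; _mm_mul_epi32 style: sign-extend the low 32 bits of each lane in place, then multiply.
  %a.hi = shl <2 x i64> %a, <i64 32, i64 32>
  %a.sext = ashr exact <2 x i64> %a.hi, <i64 32, i64 32>
  %b.hi = shl <2 x i64> %b, <i64 32, i64 32>
  %b.sext = ashr exact <2 x i64> %b.hi, <i64 32, i64 32>
  %res = mul nsw <2 x i64> %a.sext, %b.sext
  ret <2 x i64> %res
}

The CHECK lines in the updated tests show SelectionDAG matching these patterns back to pmuludq/pmuldq, so instruction selection is unchanged while the IR stays target-independent.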
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index b0a9dc1..97ea51d 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -408,7 +408,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw128">,
               Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                          llvm_v8i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_sse2_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq128">,
+  def int_x86_sse2_pmulu_dq : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty,
                          llvm_v4i32_ty], [IntrNoMem, Commutative]>;
   def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">,
@@ -805,7 +805,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Vector multiply
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_pmuldq : GCCBuiltin<"__builtin_ia32_pmuldq128">,
+  def int_x86_sse41_pmuldq : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty,
                          llvm_v4i32_ty], [IntrNoMem, Commutative]>;
 }
@@ -1667,10 +1667,10 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_avx2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw256">,
               Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
                          llvm_v16i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq256">,
+  def int_x86_avx2_pmulu_dq : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty,
                          llvm_v8i32_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_pmul_dq : GCCBuiltin<"__builtin_ia32_pmuldq256">,
+  def int_x86_avx2_pmul_dq : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty,
                          llvm_v8i32_ty], [IntrNoMem, Commutative]>;
   def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">,
@@ -4783,9 +4783,9 @@ let TargetPrefix = "x86" in {
   def int_x86_avx512_mask_psubus_w_512 : GCCBuiltin<"__builtin_ia32_psubusw512_mask">,
           Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
                      llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_pmulu_dq_512 : GCCBuiltin<"__builtin_ia32_pmuludq512">,
+  def int_x86_avx512_pmulu_dq_512 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_pmul_dq_512 : GCCBuiltin<"__builtin_ia32_pmuldq512">,
+  def int_x86_avx512_pmul_dq_512 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
   def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">,
           Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
index 979d4ec..0a61f21 100644
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -1823,11 +1823,21 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea
 define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_mul_epi32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsllq $32, %ymm0, %ymm0
+; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm2
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; CHECK-NEXT:    vpsllq $32, %ymm1, %ymm1
+; CHECK-NEXT:    vpsrad $31, %ymm1, %ymm2
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
-  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
-  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
-  %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %arg0, <8 x i32> %arg1)
+  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
+  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
+  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
+  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
+  %res = mul nsw <4 x i64> %A1, %B1
   ret <4 x i64> %res
 }
 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
@@ -1835,11 +1845,14 @@ declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
 define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_mul_epu32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
-  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
-  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
-  %res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %arg0, <8 x i32> %arg1)
+  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %res = mul nuw <4 x i64> %A, %B
   ret <4 x i64> %res
 }
 declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index e0715b5..1a000fb 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1816,5 +1816,149 @@ define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
   ret <8 x i64> %res
 }
+define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
+; X32-LABEL: test_mm512_mul_epi32:
+; X32:       # %bb.0:
+; X32-NEXT:    vpsllq $32, %zmm0, %zmm0
+; X32-NEXT:    vpsraq $32, %zmm0, %zmm0
+; X32-NEXT:    vpsllq $32, %zmm1, %zmm1
+; X32-NEXT:    vpsraq $32, %zmm1, %zmm1
+; X32-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mul_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsllq $32, %zmm0, %zmm0
+; X64-NEXT:    vpsraq $32, %zmm0, %zmm0
+; X64-NEXT:    vpsllq $32, %zmm1, %zmm1
+; X64-NEXT:    vpsraq $32, %zmm1, %zmm1
+; X64-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0
+; X64-NEXT:    retq
+  %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
+  ret <8 x i64> %tmp4
+}
+
+define <8 x i64> @test_mm512_maskz_mul_epi32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
+; X32-LABEL: test_mm512_maskz_mul_epi32:
+; X32:       # %bb.0:
+; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_mul_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+  %conv = trunc i16 %__k to i8
+  %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
+  %tmp5 = bitcast i8 %conv to <8 x i1>
+  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> zeroinitializer
+  ret <8 x i64> %tmp6
+}
+
+define <8 x i64> @test_mm512_mask_mul_epi32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
+; X32-LABEL: test_mm512_mask_mul_epi32:
+; X32:       # %bb.0:
+; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT:    vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
+; X32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_mul_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
+; X64-NEXT:    retq
+  %conv = trunc i16 %__k to i8
+  %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+  %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
+  %tmp5 = bitcast i8 %conv to <8 x i1>
+  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> %__src
+  ret <8 x i64> %tmp6
+}
+
+define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
+; X32-LABEL: test_mm512_mul_epu32:
+; X32:       # %bb.0:
+; X32-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; X32-NEXT:    kmovw %eax, %k0
+; X32-NEXT:    knotw %k0, %k1
+; X32-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; X32-NEXT:    vmovdqa32 %zmm1, %zmm1 {%k1} {z}
+; X32-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mul_epu32:
+; X64:       # %bb.0:
+; X64-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; X64-NEXT:    kmovw %eax, %k0
+; X64-NEXT:    knotw %k0, %k1
+; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; X64-NEXT:    vmovdqa32 %zmm1, %zmm1 {%k1} {z}
+; X64-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
+; X64-NEXT:    retq
+  %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
+  ret <8 x i64> %tmp2
+}
+
+define <8 x i64> @test_mm512_maskz_mul_epu32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
+; X32-LABEL: test_mm512_maskz_mul_epu32:
+; X32:       # %bb.0:
+; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_mul_epu32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+  %conv = trunc i16 %__k to i8
+  %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
+  %tmp3 = bitcast i8 %conv to <8 x i1>
+  %tmp4 = select <8 x i1> %tmp3, <8 x i64> %tmp2, <8 x i64> zeroinitializer
+  ret <8 x i64> %tmp4
+}
+
+define <8 x i64> @test_mm512_mask_mul_epu32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
+; X32-LABEL: test_mm512_mask_mul_epu32:
+; X32:       # %bb.0:
+; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
+; X32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_mul_epu32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
+; X64-NEXT:    retq
+  %conv = trunc i16 %__k to i8
+  %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
+  %tmp3 = bitcast i8 %conv to <8 x i1>
+  %tmp4 = select <8 x i1> %tmp3, <8 x i64> %tmp2, <8 x i64> %__src
+  ret <8 x i64> %tmp4
+}
+
 !0 = !{i32 1}
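The 128-bit and 256-bit VL tests that follow all share one masking pattern: the i8 mask is bitcast to <8 x i1>, narrowed with a shufflevector when fewer than eight lanes are needed, and applied with a select over the generic multiply. A minimal stand-alone sketch of that pattern (illustrative names, not part of the patch):

define <4 x i64> @mask_mul_epu32_256_sketch(<4 x i64> %src, i8 %k, <4 x i64> %a, <4 x i64> %b) {
  ; Unsigned 32x32->64 multiply expressed in generic IR, as in the tests above.
  %a.lo = and <4 x i64> %a, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %b.lo = and <4 x i64> %b, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %mul = mul nuw <4 x i64> %a.lo, %b.lo
  ; Masking: bitcast the i8 mask to <8 x i1>, keep the low four bits, then select.
  %k.vec = bitcast i8 %k to <8 x i1>
  %k.lo = shufflevector <8 x i1> %k.vec, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = select <4 x i1> %k.lo, <4 x i64> %mul, <4 x i64> %src
  ret <4 x i64> %res
}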
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
index 4980704..2914504 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -3215,6 +3215,195 @@ define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x fl
   ret <8 x float> %res1
 }
+define <4 x i64> @test_mm256_mask_mul_epi32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm256_mask_mul_epi32:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_mask_mul_epi32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
+  %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
+  %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
+  %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
+  %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
+  %tmp5 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> %__W
+  ret <4 x i64> %tmp6
+}
+
+define <4 x i64> @test_mm256_maskz_mul_epi32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm256_maskz_mul_epi32:
+; X32:       # %bb.0:
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_maskz_mul_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT:    retq
+  %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
+  %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
+  %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
+  %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
+  %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
+  %tmp5 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> zeroinitializer
+  ret <4 x i64> %tmp6
+}
+
+define <2 x i64> @test_mm_mask_mul_epi32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm_mask_mul_epi32:
+; X32:       # %bb.0:
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mask_mul_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
+; X64-NEXT:    retq
+  %tmp = shl <2 x i64> %__X, <i64 32, i64 32>
+  %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
+  %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
+  %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
+  %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
+  %tmp5 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> %__W
+  ret <2 x i64> %tmp6
+}
+
+define <2 x i64> @test_mm_maskz_mul_epi32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm_maskz_mul_epi32:
+; X32:       # %bb.0:
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_maskz_mul_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT:    retq
+  %tmp = shl <2 x i64> %__X, <i64 32, i64 32>
+  %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
+  %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
+  %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
+  %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
+  %tmp5 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> zeroinitializer
+  ret <2 x i64> %tmp6
+}
+
+define <4 x i64> @test_mm256_mask_mul_epu32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm256_mask_mul_epu32:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_mask_mul_epu32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %tmp2 = mul nuw <4 x i64> %tmp1, %tmp
+  %tmp3 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> %__W
+  ret <4 x i64> %tmp4
+}
+
+define <4 x i64> @test_mm256_maskz_mul_epu32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm256_maskz_mul_epu32:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_maskz_mul_epu32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %tmp2 = mul nuw <4 x i64> %tmp1, %tmp
+  %tmp3 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> zeroinitializer
+  ret <4 x i64> %tmp4
+}
+
+define <2 x i64> @test_mm_mask_mul_epu32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm_mask_mul_epu32:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mask_mul_epu32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
+  %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
+  %tmp2 = mul nuw <2 x i64> %tmp1, %tmp
+  %tmp3 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> %__W
+  ret <2 x i64> %tmp4
+}
+
+define <2 x i64> @test_mm_maskz_mul_epu32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm_maskz_mul_epu32:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_maskz_mul_epu32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
+  %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
+  %tmp2 = mul nuw <2 x i64> %tmp1, %tmp
+  %tmp3 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> zeroinitializer
+  ret <2 x i64> %tmp4
+}
+
 declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
 declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>)
 declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 1acf1ad..df92c47 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -1853,22 +1853,27 @@ define i32 @test_mm_movemask_pd(<2 x double> %a0) nounwind {
 }
 declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
-define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm_mul_epu32:
 ; X32:       # %bb.0:
+; X32-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; X32-NEXT:    pand %xmm2, %xmm0
+; X32-NEXT:    pand %xmm2, %xmm1
 ; X32-NEXT:    pmuludq %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mul_epu32:
 ; X64:       # %bb.0:
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; X64-NEXT:    pand %xmm2, %xmm0
+; X64-NEXT:    pand %xmm2, %xmm1
 ; X64-NEXT:    pmuludq %xmm1, %xmm0
 ; X64-NEXT:    retq
-  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
-  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
-  %res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %arg0, <4 x i32> %arg1)
+  %A = and <2 x i64> %a0, <i64 4294967295, i64 4294967295>
+  %B = and <2 x i64> %a1, <i64 4294967295, i64 4294967295>
+  %res = mul nuw <2 x i64> %A, %B
   ret <2 x i64> %res
 }
-declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
 define <2 x double> @test_mm_mul_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; X32-LABEL: test_mm_mul_pd:
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
index f5b3838..d632c7e 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -787,16 +787,34 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
 define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm_mul_epi32:
 ; X32:       # %bb.0:
+; X32-NEXT:    psllq $32, %xmm0
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-NEXT:    psrad $31, %xmm0
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; X32-NEXT:    psllq $32, %xmm1
+; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X32-NEXT:    psrad $31, %xmm1
+; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; X32-NEXT:    pmuldq %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mul_epi32:
 ; X64:       # %bb.0:
+; X64-NEXT:    psllq $32, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-NEXT:    psrad $31, %xmm0
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; X64-NEXT:    psllq $32, %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X64-NEXT:    psrad $31, %xmm1
+; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; X64-NEXT:    pmuldq %xmm1, %xmm0
 ; X64-NEXT:    retq
-  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
-  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
-  %res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %arg0, <4 x i32> %arg1)
+  %A = shl <2 x i64> %a0, <i64 32, i64 32>
+  %A1 = ashr exact <2 x i64> %A, <i64 32, i64 32>
+  %B = shl <2 x i64> %a1, <i64 32, i64 32>
+  %B1 = ashr exact <2 x i64> %B, <i64 32, i64 32>
+  %res = mul nsw <2 x i64> %A1, %B1
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone