From 3a0cab73ebb331240fde36370da7f30e9b42dc5f Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 9 Apr 2018 19:17:38 +0000
Subject: [PATCH] [X86] Remove GCCBuiltin name from pmuldq/pmuludq intrinsics
 so clang can custom lower to native IR. Update fast-isel intrinsic tests for
 clang's new codegen.

In some cases fast-isel fails to remove the and/shifts and uses blends or
conditional moves. But once masking gets involved, fast-isel aborts on the
mask portion and we DAG combine more thoroughly.

llvm-svn: 329604
---
 llvm/include/llvm/IR/IntrinsicsX86.td              |  12 +-
 llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll |  25 ++-
 .../CodeGen/X86/avx512-intrinsics-fast-isel.ll     | 144 ++++++++++++++++
 .../CodeGen/X86/avx512vl-intrinsics-fast-isel.ll   | 189 +++++++++++++++++++++
 llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll |  15 +-
 .../test/CodeGen/X86/sse41-intrinsics-fast-isel.ll |  24 ++-
 6 files changed, 389 insertions(+), 20 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index b0a9dc1..97ea51d 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -408,7 +408,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw128">,
               Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                          llvm_v8i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_sse2_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq128">,
+  def int_x86_sse2_pmulu_dq : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty,
                          llvm_v4i32_ty], [IntrNoMem, Commutative]>;
   def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">,
@@ -805,7 +805,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".

 // Vector multiply
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_pmuldq : GCCBuiltin<"__builtin_ia32_pmuldq128">,
+  def int_x86_sse41_pmuldq : // FIXME: remove this intrinsic
               Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty,
                          llvm_v4i32_ty], [IntrNoMem, Commutative]>;
 }
@@ -1667,10 +1667,10 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq256">, + def int_x86_avx2_pmulu_dq : // FIXME: remove this intrinsic Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_pmul_dq : GCCBuiltin<"__builtin_ia32_pmuldq256">, + def int_x86_avx2_pmul_dq : // FIXME: remove this intrinsic Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">, @@ -4783,9 +4783,9 @@ let TargetPrefix = "x86" in { def int_x86_avx512_mask_psubus_w_512 : GCCBuiltin<"__builtin_ia32_psubusw512_mask">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_pmulu_dq_512 : GCCBuiltin<"__builtin_ia32_pmuludq512">, + def int_x86_avx512_pmulu_dq_512 : // FIXME: remove this intrinsic Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; - def int_x86_avx512_pmul_dq_512 : GCCBuiltin<"__builtin_ia32_pmuldq512">, + def int_x86_avx512_pmul_dq_512 : // FIXME: remove this intrinsic Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll index 979d4ec..0a61f21 100644 --- a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -1823,11 +1823,21 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_mul_epi32: ; CHECK: # %bb.0: +; CHECK-NEXT: vpsllq $32, %ymm0, %ymm0 +; CHECK-NEXT: vpsrad $31, %ymm0, %ymm2 +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; CHECK-NEXT: vpsllq $32, %ymm1, %ymm1 +; CHECK-NEXT: vpsrad $31, %ymm1, %ymm2 +; CHECK-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} - %arg0 = bitcast <4 x i64> %a0 to <8 x i32> - %arg1 = bitcast <4 x i64> %a1 to <8 x i32> - %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %arg0, <8 x i32> %arg1) + %A = shl <4 x i64> %a0, + %A1 = ashr exact <4 x i64> %A, + %B = shl <4 x i64> %a1, + %B1 = ashr exact <4 x i64> %B, + %res = mul nsw <4 x i64> %A1, %B1 ret <4 x i64> %res } declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone @@ -1835,11 +1845,14 @@ declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_mul_epu32: ; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} - %arg0 = bitcast <4 x i64> %a0 to <8 x i32> - %arg1 = bitcast <4 x 
i64> %a1 to <8 x i32> - %res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %arg0, <8 x i32> %arg1) + %A = and <4 x i64> %a0, + %B = and <4 x i64> %a1, + %res = mul nuw <4 x i64> %A, %B ret <4 x i64> %res } declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index e0715b5..1a000fb 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -1816,5 +1816,149 @@ define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind { ret <8 x i64> %res } +define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind { +; X32-LABEL: test_mm512_mul_epi32: +; X32: # %bb.0: +; X32-NEXT: vpsllq $32, %zmm0, %zmm0 +; X32-NEXT: vpsraq $32, %zmm0, %zmm0 +; X32-NEXT: vpsllq $32, %zmm1, %zmm1 +; X32-NEXT: vpsraq $32, %zmm1, %zmm1 +; X32-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mul_epi32: +; X64: # %bb.0: +; X64-NEXT: vpsllq $32, %zmm0, %zmm0 +; X64-NEXT: vpsraq $32, %zmm0, %zmm0 +; X64-NEXT: vpsllq $32, %zmm1, %zmm1 +; X64-NEXT: vpsraq $32, %zmm1, %zmm1 +; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 +; X64-NEXT: retq + %tmp = shl <8 x i64> %__A, + %tmp1 = ashr exact <8 x i64> %tmp, + %tmp2 = shl <8 x i64> %__B, + %tmp3 = ashr exact <8 x i64> %tmp2, + %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1 + ret <8 x i64> %tmp4 +} + +define <8 x i64> @test_mm512_maskz_mul_epi32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind { +; X32-LABEL: test_mm512_maskz_mul_epi32: +; X32: # %bb.0: +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z} +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_mul_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: retq + %conv = trunc i16 %__k to i8 + %tmp = shl <8 x i64> %__A, + %tmp1 = ashr exact <8 x i64> %tmp, + %tmp2 = shl <8 x i64> %__B, + %tmp3 = ashr exact <8 x i64> %tmp2, + %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1 + %tmp5 = bitcast i8 %conv to <8 x i1> + %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> zeroinitializer + ret <8 x i64> %tmp6 +} + +define <8 x i64> @test_mm512_mask_mul_epi32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind { +; X32-LABEL: test_mm512_mask_mul_epi32: +; X32: # %bb.0: +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1} +; X32-NEXT: vmovdqa64 %zmm2, %zmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mask_mul_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1} +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 +; X64-NEXT: retq + %conv = trunc i16 %__k to i8 + %tmp = shl <8 x i64> %__A, + %tmp1 = ashr exact <8 x i64> %tmp, + %tmp2 = shl <8 x i64> %__B, + %tmp3 = ashr exact <8 x i64> %tmp2, + %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1 + %tmp5 = bitcast i8 %conv to <8 x i1> + %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> %__src + ret <8 x i64> %tmp6 +} + +define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind { +; X32-LABEL: test_mm512_mul_epu32: +; X32: # %bb.0: +; X32-NEXT: movw $-21846, %ax # imm = 0xAAAA +; X32-NEXT: kmovw %eax, %k0 +; X32-NEXT: knotw %k0, %k1 +; X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; X32-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} +; X32-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 +; X32-NEXT: retl +; +; X64-LABEL: 
test_mm512_mul_epu32: +; X64: # %bb.0: +; X64-NEXT: movw $-21846, %ax # imm = 0xAAAA +; X64-NEXT: kmovw %eax, %k0 +; X64-NEXT: knotw %k0, %k1 +; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; X64-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} +; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 +; X64-NEXT: retq + %tmp = and <8 x i64> %__A, + %tmp1 = and <8 x i64> %__B, + %tmp2 = mul nuw <8 x i64> %tmp1, %tmp + ret <8 x i64> %tmp2 +} + +define <8 x i64> @test_mm512_maskz_mul_epu32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind { +; X32-LABEL: test_mm512_maskz_mul_epu32: +; X32: # %bb.0: +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z} +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_mul_epu32: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: retq + %conv = trunc i16 %__k to i8 + %tmp = and <8 x i64> %__A, + %tmp1 = and <8 x i64> %__B, + %tmp2 = mul nuw <8 x i64> %tmp1, %tmp + %tmp3 = bitcast i8 %conv to <8 x i1> + %tmp4 = select <8 x i1> %tmp3, <8 x i64> %tmp2, <8 x i64> zeroinitializer + ret <8 x i64> %tmp4 +} + +define <8 x i64> @test_mm512_mask_mul_epu32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind { +; X32-LABEL: test_mm512_mask_mul_epu32: +; X32: # %bb.0: +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X32-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1} +; X32-NEXT: vmovdqa64 %zmm2, %zmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mask_mul_epu32: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1} +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 +; X64-NEXT: retq + %conv = trunc i16 %__k to i8 + %tmp = and <8 x i64> %__A, + %tmp1 = and <8 x i64> %__B, + %tmp2 = mul nuw <8 x i64> %tmp1, %tmp + %tmp3 = bitcast i8 %conv to <8 x i1> + %tmp4 = select <8 x i1> %tmp3, <8 x i64> %tmp2, <8 x i64> %__src + ret <8 x i64> %tmp4 +} + !0 = !{i32 1} diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll index 4980704..2914504 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -3215,6 +3215,195 @@ define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x fl ret <8 x float> %res1 } +define <4 x i64> @test_mm256_mask_mul_epi32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind { +; X32-LABEL: test_mm256_mask_mul_epi32: +; X32: # %bb.0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1} +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_mask_mul_epi32: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1} +; X64-NEXT: retq +entry: + %tmp = shl <4 x i64> %__X, + %tmp1 = ashr exact <4 x i64> %tmp, + %tmp2 = shl <4 x i64> %__Y, + %tmp3 = ashr exact <4 x i64> %tmp2, + %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1 + %tmp5 = bitcast i8 %__M to <8 x i1> + %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> + %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> %__W + ret <4 x i64> %tmp6 +} + +define <4 x i64> @test_mm256_maskz_mul_epi32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind { +; X32-LABEL: test_mm256_maskz_mul_epi32: +; X32: # %bb.0: +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z} +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_maskz_mul_epi32: 
+; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z} +; X64-NEXT: retq + %tmp = shl <4 x i64> %__X, + %tmp1 = ashr exact <4 x i64> %tmp, + %tmp2 = shl <4 x i64> %__Y, + %tmp3 = ashr exact <4 x i64> %tmp2, + %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1 + %tmp5 = bitcast i8 %__M to <8 x i1> + %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> + %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> zeroinitializer + ret <4 x i64> %tmp6 +} + +define <2 x i64> @test_mm_mask_mul_epi32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind { +; X32-LABEL: test_mm_mask_mul_epi32: +; X32: # %bb.0: +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1} +; X32-NEXT: retl +; +; X64-LABEL: test_mm_mask_mul_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1} +; X64-NEXT: retq + %tmp = shl <2 x i64> %__X, + %tmp1 = ashr exact <2 x i64> %tmp, + %tmp2 = shl <2 x i64> %__Y, + %tmp3 = ashr exact <2 x i64> %tmp2, + %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1 + %tmp5 = bitcast i8 %__M to <8 x i1> + %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> + %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> %__W + ret <2 x i64> %tmp6 +} + +define <2 x i64> @test_mm_maskz_mul_epi32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind { +; X32-LABEL: test_mm_maskz_mul_epi32: +; X32: # %bb.0: +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z} +; X32-NEXT: retl +; +; X64-LABEL: test_mm_maskz_mul_epi32: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z} +; X64-NEXT: retq + %tmp = shl <2 x i64> %__X, + %tmp1 = ashr exact <2 x i64> %tmp, + %tmp2 = shl <2 x i64> %__Y, + %tmp3 = ashr exact <2 x i64> %tmp2, + %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1 + %tmp5 = bitcast i8 %__M to <8 x i1> + %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> + %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> zeroinitializer + ret <2 x i64> %tmp6 +} + +define <4 x i64> @test_mm256_mask_mul_epu32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind { +; X32-LABEL: test_mm256_mask_mul_epu32: +; X32: # %bb.0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1} +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_mask_mul_epu32: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1} +; X64-NEXT: retq +entry: + %tmp = and <4 x i64> %__X, + %tmp1 = and <4 x i64> %__Y, + %tmp2 = mul nuw <4 x i64> %tmp1, %tmp + %tmp3 = bitcast i8 %__M to <8 x i1> + %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> + %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> %__W + ret <4 x i64> %tmp4 +} + +define <4 x i64> @test_mm256_maskz_mul_epu32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind { +; X32-LABEL: test_mm256_maskz_mul_epu32: +; X32: # %bb.0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z} +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_maskz_mul_epu32: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z} +; X64-NEXT: retq +entry: + %tmp = and <4 x i64> %__X, + %tmp1 = and <4 x i64> %__Y, + %tmp2 = 
mul nuw <4 x i64> %tmp1, %tmp + %tmp3 = bitcast i8 %__M to <8 x i1> + %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> + %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> zeroinitializer + ret <4 x i64> %tmp4 +} + +define <2 x i64> @test_mm_mask_mul_epu32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind { +; X32-LABEL: test_mm_mask_mul_epu32: +; X32: # %bb.0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1} +; X32-NEXT: retl +; +; X64-LABEL: test_mm_mask_mul_epu32: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1} +; X64-NEXT: retq +entry: + %tmp = and <2 x i64> %__X, + %tmp1 = and <2 x i64> %__Y, + %tmp2 = mul nuw <2 x i64> %tmp1, %tmp + %tmp3 = bitcast i8 %__M to <8 x i1> + %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> + %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> %__W + ret <2 x i64> %tmp4 +} + +define <2 x i64> @test_mm_maskz_mul_epu32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind { +; X32-LABEL: test_mm_maskz_mul_epu32: +; X32: # %bb.0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z} +; X32-NEXT: retl +; +; X64-LABEL: test_mm_maskz_mul_epu32: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %tmp = and <2 x i64> %__X, + %tmp1 = and <2 x i64> %__Y, + %tmp2 = mul nuw <2 x i64> %tmp1, %tmp + %tmp3 = bitcast i8 %__M to <8 x i1> + %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> + %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> zeroinitializer + ret <2 x i64> %tmp4 +} + declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8) diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index 1acf1ad..df92c47 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -1853,22 +1853,27 @@ define i32 @test_mm_movemask_pd(<2 x double> %a0) nounwind { } declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone -define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) { +define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_mul_epu32: ; X32: # %bb.0: +; X32-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; X32-NEXT: pand %xmm2, %xmm0 +; X32-NEXT: pand %xmm2, %xmm1 ; X32-NEXT: pmuludq %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mul_epu32: ; X64: # %bb.0: +; X64-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; X64-NEXT: pand %xmm2, %xmm0 +; X64-NEXT: pand %xmm2, %xmm1 ; X64-NEXT: pmuludq %xmm1, %xmm0 ; X64-NEXT: retq - %arg0 = bitcast <2 x i64> %a0 to <4 x i32> - %arg1 = bitcast <2 x i64> %a1 to <4 x i32> - %res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %arg0, <4 x i32> %arg1) + %A = and <2 x i64> %a0, + %B = and <2 x i64> %a1, + %res = mul nuw <2 x i64> %A, %B ret <2 x i64> %res } -declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone define <2 x double> @test_mm_mul_pd(<2 x double> %a0, <2 x double> %a1) nounwind { ; X32-LABEL: test_mm_mul_pd: diff --git 
a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll index f5b3838..d632c7e 100644 --- a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll @@ -787,16 +787,34 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm_mul_epi32: ; X32: # %bb.0: +; X32-NEXT: psllq $32, %xmm0 +; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X32-NEXT: psrad $31, %xmm0 +; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; X32-NEXT: psllq $32, %xmm1 +; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X32-NEXT: psrad $31, %xmm1 +; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; X32-NEXT: pmuldq %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mul_epi32: ; X64: # %bb.0: +; X64-NEXT: psllq $32, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X64-NEXT: psrad $31, %xmm0 +; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; X64-NEXT: psllq $32, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X64-NEXT: psrad $31, %xmm1 +; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; X64-NEXT: pmuldq %xmm1, %xmm0 ; X64-NEXT: retq - %arg0 = bitcast <2 x i64> %a0 to <4 x i32> - %arg1 = bitcast <2 x i64> %a1 to <4 x i32> - %res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %arg0, <4 x i32> %arg1) + %A = shl <2 x i64> %a0, + %A1 = ashr exact <2 x i64> %A, + %B = shl <2 x i64> %a1, + %B1 = ashr exact <2 x i64> %B, + %res = mul nsw <2 x i64> %A1, %B1 ret <2 x i64> %res } declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone -- 2.7.4
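Note: the native IR that clang now emits for these builtins, and that the updated
fast-isel tests above check, follows the pattern sketched below. This is a minimal
sketch of the 128-bit case only, with hypothetical function names; the masked
256-bit and 512-bit variants in the tests follow the same shape with a mask
bitcast and a select appended.

    ; _mm_mul_epu32-style lowering: keep only the low 32 bits of each 64-bit
    ; lane, then do a full 64-bit multiply (matches vpmuludq/pmuludq semantics).
    define <2 x i64> @mul_epu32_sketch(<2 x i64> %a, <2 x i64> %b) {
      %A = and <2 x i64> %a, <i64 4294967295, i64 4294967295>
      %B = and <2 x i64> %b, <i64 4294967295, i64 4294967295>
      %res = mul nuw <2 x i64> %A, %B
      ret <2 x i64> %res
    }

    ; _mm_mul_epi32-style lowering: sign-extend the low 32 bits of each 64-bit
    ; lane via shl/ashr, then multiply (matches vpmuldq/pmuldq semantics).
    define <2 x i64> @mul_epi32_sketch(<2 x i64> %a, <2 x i64> %b) {
      %A = shl <2 x i64> %a, <i64 32, i64 32>
      %A1 = ashr exact <2 x i64> %A, <i64 32, i64 32>
      %B = shl <2 x i64> %b, <i64 32, i64 32>
      %B1 = ashr exact <2 x i64> %B, <i64 32, i64 32>
      %res = mul nsw <2 x i64> %A1, %B1
      ret <2 x i64> %res
    }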