From: Roman Lebedev Date: Fri, 28 Jun 2019 17:26:28 +0000 (+0000) Subject: [NFC][Codegen] Revisit test coverage for X % C == 0 fold once more (add tests with... X-Git-Tag: llvmorg-10-init~1722 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0b8b419537107f872a2f22cfc68dd57ad8b13b85;p=platform%2Fupstream%2Fllvm.git [NFC][Codegen] Revisit test coverage for X % C == 0 fold once more (add tests with '1' divisor) llvm-svn: 364661 --- diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll index 759ccf3..57f6941 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -33,9 +33,11 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ret <4 x i32> %ret } +;==============================================================================; + ; One all-ones divisor in odd divisor -define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_odd_allones: +define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI1_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] @@ -58,10 +60,8 @@ define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind { %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } - -; One all-ones divisor in even divisor -define <4 x i32> @test_urem_even_allones(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_even_allones: +define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI2_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] @@ -69,29 +69,26 @@ define <4 x i32> @test_urem_even_allones(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] ; CHECK-NEXT: adrp x8, .LCPI2_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_2] -; CHECK-NEXT: neg v1.4s, v1.4s -; CHECK-NEXT: 
adrp x8, .LCPI2_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_3] +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, + %urem = urem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor in odd+even divisor -define <4 x i32> @test_urem_odd_even_allones(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_odd_even_allones: +; One all-ones divisor in even divisor +define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] @@ -113,15 +110,13 @@ define <4 x i32> @test_urem_odd_even_allones(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } - -; One power-of-two divisor in odd divisor -define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_odd_poweroftwo: +define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI4_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] @@ -129,25 +124,30 @@ 
define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] ; CHECK-NEXT: adrp x8, .LCPI4_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI4_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_3] ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, + %urem = urem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One power-of-two divisor in even divisor -define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_even_poweroftwo: +; One all-ones divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI5_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] @@ -169,15 +169,13 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } - -; One power-of-two divisor in odd+even divisor -define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-LABEL: 
test_urem_odd_even_poweroftwo: +define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI6_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] @@ -196,18 +194,21 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s ; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, - %cmp = icmp eq <4 x i32> %urem, + %urem = urem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and one power-of-two divisor in odd divisor -define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo: +;------------------------------------------------------------------------------; + +; One power-of-two divisor in odd divisor +define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI7_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] @@ -225,15 +226,15 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and one power-of-two divisor in even divisor -define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_even_allones_and_poweroftwo: +; One power-of-two divisor in even divisor +define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_poweroftwo: 
; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI8_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] @@ -255,15 +256,15 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and one power-of-two divisor in odd+even divisor -define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_odd_even_allones_and_poweroftwo: +; One power-of-two divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI9_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] @@ -271,27 +272,29 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1] ; CHECK-NEXT: adrp x8, .LCPI9_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI9_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3] ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } 
;------------------------------------------------------------------------------; -; Negative tests - the fold is invalid if any divisor is 1. -;------------------------------------------------------------------------------; -; One divisor in odd divisor +; One one divisor in odd divisor define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_one: ; CHECK: // %bb.0: @@ -314,13 +317,13 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One divisor in even divisors +; One one divisor in even divisor define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_even_one: ; CHECK: // %bb.0: @@ -330,26 +333,30 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_1] ; CHECK-NEXT: adrp x8, .LCPI11_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2] +; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI11_3 -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_3] +; CHECK-NEXT: adrp x8, .LCPI11_4 ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, 
v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One divisor in odd-even divisors +; One one divisor in odd+even divisor define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_even_one: ; CHECK: // %bb.0: @@ -359,11 +366,130 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1] ; CHECK-NEXT: adrp x8, .LCPI12_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2] +; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI12_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_3] +; CHECK-NEXT: adrp x8, .LCPI12_4 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;==============================================================================; + +; One all-ones divisor and power-of-two divisor divisor in odd divisor +define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: adrp x8, .LCPI13_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1] +; CHECK-NEXT: adrp x8, .LCPI13_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_2] +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 
v1.4s, v1.4s, v4.4s +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and power-of-two divisor divisor in even divisor +define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_allones_and_poweroftwo: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: adrp x8, .LCPI14_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_1] +; CHECK-NEXT: adrp x8, .LCPI14_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_2] +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI14_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_3] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and power-of-two divisor divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_allones_and_poweroftwo: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI15_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] +; CHECK-NEXT: adrp x8, .LCPI15_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] +; CHECK-NEXT: adrp x8, .LCPI15_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2] +; CHECK-NEXT: umull2 
v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +; One all-ones divisor and one one divisor in odd divisor +define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_allones_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI16_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] +; CHECK-NEXT: adrp x8, .LCPI16_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] +; CHECK-NEXT: adrp x8, .LCPI16_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2] +; CHECK-NEXT: adrp x8, .LCPI16_3 ; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s ; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_3] +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s ; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b @@ -372,7 +498,224 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and one one divisor in even divisor +define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_allones_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] +; CHECK-NEXT: adrp x8, .LCPI17_1 +; 
CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1] +; CHECK-NEXT: adrp x8, .LCPI17_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2] +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI17_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_3] +; CHECK-NEXT: adrp x8, .LCPI17_4 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI17_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and one one divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_allones_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: adrp x8, .LCPI18_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] +; CHECK-NEXT: adrp x8, .LCPI18_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2] +; CHECK-NEXT: adrp x8, .LCPI18_3 +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI18_3] +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + 
+;------------------------------------------------------------------------------; + +; One power-of-two divisor divisor and one divisor in odd divisor +define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: adrp x8, .LCPI19_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_1] +; CHECK-NEXT: adrp x8, .LCPI19_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI19_2] +; CHECK-NEXT: adrp x8, .LCPI19_3 +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI19_3] +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor divisor and one divisor in even divisor +define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] +; CHECK-NEXT: adrp x8, .LCPI20_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1] +; CHECK-NEXT: adrp x8, .LCPI20_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2] +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI20_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_3] +; CHECK-NEXT: adrp x8, .LCPI20_4 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; 
CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor divisor and one divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_even_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI21_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] +; CHECK-NEXT: adrp x8, .LCPI21_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_1] +; CHECK-NEXT: adrp x8, .LCPI21_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_2] +; CHECK-NEXT: adrp x8, .LCPI21_3 +; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3] +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI22_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] +; CHECK-NEXT: adrp x8, .LCPI22_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1] +; CHECK-NEXT: adrp x8, .LCPI22_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2] +; CHECK-NEXT: adrp x8, .LCPI22_3 +; CHECK-NEXT: 
umull2 v4.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_even_allones_and_poweroftwo_and_one: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI23_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] +; CHECK-NEXT: adrp x8, .LCPI23_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1] +; CHECK-NEXT: adrp x8, .LCPI23_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2] +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI23_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_3] +; CHECK-NEXT: adrp x8, .LCPI23_4 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll index 8b08f53..d544e5f 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll @@ -97,19 
+97,23 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind { ; Negative tests ;------------------------------------------------------------------------------; -; The fold is invalid if divisor is 1. -define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_one: +; We can lower remainder of division by powers of two much better elsewhere. +define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_pow2: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: movi v1.4s, #15 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; We can lower remainder of division by all-ones much better elsewhere. +; We could lower remainder of division by all-ones much better elsewhere. define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_allones: ; CHECK: // %bb.0: @@ -124,18 +128,24 @@ define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { ret <4 x i32> %ret } -; We can lower remainder of division by powers of two much better elsewhere. -define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind { -; CHECK-LABEL: test_urem_pow2: +; If all divisors are ones, this is constant-folded. 
+define <4 x i32> @test_urem_one_eq(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_one_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #15 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #1 ; CHECK-NEXT: ret - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } +define <4 x i32> @test_urem_one_ne(<4 x i32> %X) nounwind { +; CHECK-LABEL: test_urem_one_ne: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ret + %urem = urem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll index 39a65c4..905f45b 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -136,9 +136,11 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ret <4 x i32> %ret } +;==============================================================================; + ; One all-ones divisor in odd divisor -define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind { -; CHECK-SSE2-LABEL: test_urem_odd_allones: +define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_allones_eq: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2147483649,u> ; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 @@ -164,7 +166,7 @@ define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind { ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_odd_allones: +; CHECK-SSE41-LABEL: test_urem_odd_allones_eq: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq 
{{.*}}(%rip), %xmm1 @@ -183,7 +185,7 @@ define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_odd_allones: +; CHECK-AVX1-LABEL: test_urem_odd_allones_eq: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 @@ -200,7 +202,7 @@ define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_odd_allones: +; CHECK-AVX2-LABEL: test_urem_odd_allones_eq: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] @@ -216,7 +218,7 @@ define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_odd_allones: +; CHECK-AVX512VL-LABEL: test_urem_odd_allones_eq: ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] @@ -236,10 +238,110 @@ define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind { %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } +define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_allones_ne: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2147483649,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; 
CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_odd_allones_ne: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_odd_allones_ne: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; 
CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_allones_ne: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_allones_ne: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} ; One all-ones divisor in even divisor -define <4 x i32> @test_urem_even_allones(<4 x i32> %X) nounwind { -; CHECK-SSE2-LABEL: test_urem_even_allones: +define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_allones_eq: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 @@ -269,7 +371,7 @@ define <4 x i32> @test_urem_even_allones(<4 x i32> %X) nounwind { ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_even_allones: +; CHECK-SSE41-LABEL: test_urem_even_allones_eq: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE41-NEXT: psrld $1, %xmm1 @@ -290,7 +392,7 @@ define <4 x i32> @test_urem_even_allones(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_even_allones: +; CHECK-AVX1-LABEL: test_urem_even_allones_eq: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] @@ -309,7 +411,7 @@ define <4 x i32> @test_urem_even_allones(<4 x i32> %X) nounwind { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_even_allones: +; CHECK-AVX2-LABEL: test_urem_even_allones_eq: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -326,7 +428,7 @@ define <4 x i32> @test_urem_even_allones(<4 x i32> %X) nounwind { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_even_allones: +; CHECK-AVX512VL-LABEL: 
test_urem_even_allones_eq: ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -347,10 +449,120 @@ define <4 x i32> @test_urem_even_allones(<4 x i32> %X) nounwind { %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } +define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_allones_ne: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_even_allones_ne: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: 
pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_even_allones_ne: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_even_allones_ne: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = 
[2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_even_allones_ne: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} ; One all-ones divisor in odd+even divisor -define <4 x i32> @test_urem_odd_even_allones(<4 x i32> %X) nounwind { -; CHECK-SSE2-LABEL: test_urem_odd_even_allones: +define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: 
test_urem_odd_even_allones_eq: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 @@ -385,7 +597,7 @@ define <4 x i32> @test_urem_odd_even_allones(<4 x i32> %X) nounwind { ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_odd_even_allones: +; CHECK-SSE41-LABEL: test_urem_odd_even_allones_eq: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE41-NEXT: psrld $1, %xmm1 @@ -412,7 +624,7 @@ define <4 x i32> @test_urem_odd_even_allones(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_odd_even_allones: +; CHECK-AVX1-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] @@ -436,7 +648,7 @@ define <4 x i32> @test_urem_odd_even_allones(<4 x i32> %X) nounwind { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_odd_even_allones: +; CHECK-AVX2-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -454,7 +666,7 @@ define <4 x i32> @test_urem_odd_even_allones(<4 x i32> %X) nounwind { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones: +; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -476,49 +688,179 @@ define <4 x i32> @test_urem_odd_even_allones(<4 x i32> %X) nounwind { %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } - -; One power-of-two divisor in odd divisor -define <4 x i32> 
@test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo: +define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; 
CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,14,4294967295,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo: +; CHECK-SSE41-LABEL: test_urem_odd_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,268435456,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; 
CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 ; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_odd_even_allones_ne: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_even_allones_ne: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[3435973837,2454267027,2147483649,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_ne: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + 
+;------------------------------------------------------------------------------; + +; One power-of-two divisor in odd divisor +define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,268435456,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), 
%xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo: @@ -807,75 +1149,65 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ret <4 x i32> %ret } -; One all-ones divisor and one power-of-two divisor in odd divisor -define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo: +;------------------------------------------------------------------------------; + +; One one divisor in odd divisor +define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-SSE2-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,16,4294967295,5] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo: +; CHECK-SSE41-LABEL: test_urem_odd_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-SSE41-NEXT: movd %eax, %xmm1 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pblendw 
{{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo: +; CHECK-AVX1-LABEL: test_urem_odd_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, 
%xmm1 @@ -883,16 +1215,18 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo: +; CHECK-AVX2-LABEL: test_urem_odd_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -900,83 +1234,204 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo: +; CHECK-AVX512VL-LABEL: test_urem_odd_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and one power-of-two divisor in even divisor -define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo: +; One one divisor in even divisor +define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,2147483649,2454267027] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-SSE2-NEXT: movd %eax, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [14,16,4294967295,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; 
CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo: +; CHECK-SSE41-LABEL: test_urem_even_one: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,2147483649,2454267027] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-SSE41-NEXT: movd %eax, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_even_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX1-NEXT: vmovd %eax, %xmm3 +; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_even_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX2-NEXT: vmovd %eax, %xmm3 +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_even_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} 
xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm3 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One one divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_even_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,0,1374389535] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; 
CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,1,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0,2] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_odd_even_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; 
CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 ; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 @@ -984,22 +1439,21 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo: +; CHECK-AVX1-LABEL: test_urem_odd_even_one: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,268435456,2147483649,2454267027] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3435973837,2454267027,0,1374389535] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ 
-1007,9 +1461,9 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo: +; CHECK-AVX2-LABEL: test_urem_odd_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,2147483649,2454267027] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535] ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] @@ -1018,6 +1472,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1025,9 +1480,9 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo: +; CHECK-AVX512VL-LABEL: test_urem_odd_even_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,2147483649,2454267027] +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535] ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] @@ -1036,23 +1491,26 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; 
CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One all-ones divisor and one power-of-two divisor in odd+even divisor -define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { -; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo: +;==============================================================================; + +; One all-ones divisor and power-of-two divisor divisor in odd divisor +define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] @@ -1062,19 +1520,17 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $5, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $31, %xmm3 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; 
CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,16,4294967295,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrld $31, %xmm4 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,16,5] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 @@ -1082,9 +1538,9 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo: +; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 @@ -1092,35 +1548,33 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm2 -; 
CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 ; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo: +; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1128,9 +1582,9 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo: +; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 @@ -1145,9 +1599,9 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_poweroftwo: +; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535] +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 @@ -1161,55 +1615,67 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq 
- %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -;------------------------------------------------------------------------------; -; Negative tests - the fold is invalid if any divisor is 1. -;------------------------------------------------------------------------------; - -; One divisor in odd divisor -define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { -; CHECK-SSE2-LABEL: test_urem_odd_one: +; One all-ones divisor and power-of-two divisor divisor in even divisor +define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-SSE2-NEXT: movd %eax, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $3, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: 
pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [14,4294967295,16,14] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_odd_one: +; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-SSE41-NEXT: movd %eax, %xmm1 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = 
xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $3, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrld $31, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 ; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 @@ -1217,36 +1683,40 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_odd_one: +; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $3, 
%xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_odd_one: +; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,2454267027] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; 
CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1254,87 +1724,101 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_odd_one: +; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpsrld $3, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,2454267027] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - 
%urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One divisor in even divisors -define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { -; CHECK-SSE2-LABEL: test_urem_even_one: +; One all-ones divisor and power-of-two divisor divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-SSE2-NEXT: movd %eax, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $5, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: 
shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[2,3] +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_even_one: +; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $5, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa 
%xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $5, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_even_one: +; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, 
%xmm1 @@ -1342,18 +1826,16 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_even_one: +; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1361,57 +1843,58 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_even_one: +; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; 
CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; One divisor in odd-even divisors -define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { -; CHECK-SSE2-LABEL: test_urem_odd_even_one: +;------------------------------------------------------------------------------; + +; One all-ones divisor and one one divisor in odd divisor +define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-SSE2-NEXT: movd %eax, %xmm1 -; CHECK-SSE2-NEXT: 
pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $3, %xmm2 -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [25,100,1,25] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,1,5] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[0,2] +; 
CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 @@ -1419,38 +1902,38 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; -; CHECK-SSE41-LABEL: test_urem_odd_even_one: +; CHECK-SSE41-LABEL: test_urem_odd_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrld $5, %xmm1 -; CHECK-SSE41-NEXT: psrld $3, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; 
CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_odd_even_one: +; CHECK-AVX1-LABEL: test_urem_odd_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 @@ -1460,16 +1943,15 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_odd_even_one: +; CHECK-AVX2-LABEL: test_urem_odd_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 @@ -1479,16 +1961,15 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_odd_even_one: +; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: movl $1374389535, %eax # imm = 0x51EB851F -; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpblendd 
{{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 @@ -1497,7 +1978,862 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and one one divisor in even divisor +define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_allones_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [14,4294967295,1,14] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[3,0] 
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_even_allones_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_even_allones_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; 
CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_even_allones_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, 
%xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One all-ones divisor and one one divisor in odd+even divisor +define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $5, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,1,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[0,2] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrld $31, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = 
[3435973837,2147483649,0,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +; One power-of-two divisor divisor and one divisor in odd divisor +define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 
= [5,16,1,5] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[0,2] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq 
%xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; 
CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor divisor and one divisor in even divisor +define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_poweroftwo_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [14,16,1,14] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} 
xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_even_poweroftwo_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_even_poweroftwo_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} 
xmm3 = [2454267027,268435456,0,2454267027] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_even_poweroftwo_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,0,2454267027] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,0,2454267027] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +; One power-of-two divisor and one divisor equal to one in odd+even divisor +define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $5, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,16,1,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT:
pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[0,2] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 
= xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +;------------------------------------------------------------------------------; + +define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,16,1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; 
CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm2[3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[2,0] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; 
CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; 
CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp eq <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +} + +define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { +; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: psrld $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] +; 
CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[2,0] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7] +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: retq +; +; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0] +; 
CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: movl $1, %eax +; CHECK-AVX2-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,0] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd 
%xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: movl $1, %eax +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,0] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll index e7475ee..face44a 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -348,34 +348,47 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind { ; Negative tests ;------------------------------------------------------------------------------; -; The fold is invalid if divisor is 1. -define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind { -; CHECK-SSE-LABEL: test_urem_one: +; We can lower remainder of division by powers of two much better elsewhere. 
+define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_urem_pow2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE-NEXT: psrld $31, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_one: +; CHECK-AVX1-LABEL: test_urem_pow2: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_one: +; CHECK-AVX2-LABEL: test_urem_pow2: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] +; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_one: +; CHECK-AVX512VL-LABEL: test_urem_pow2: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } -; We can lower remainder of division by all-ones much better elsewhere. +; We could lower remainder of division by all-ones much better elsewhere. 
define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_allones: ; CHECK-SSE2: # %bb.0: @@ -430,42 +443,44 @@ define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { ret <4 x i32> %ret } -; We can lower remainder of division by powers of two much better elsewhere. -define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind { -; CHECK-SSE-LABEL: test_urem_pow2: +; If all divisors are ones, this is constant-folded. +define <4 x i32> @test_urem_one_eq(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_urem_one_eq: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE-NEXT: psrld $31, %xmm0 +; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_pow2: +; CHECK-AVX1-LABEL: test_urem_one_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX1-NEXT: retq ; -; CHECK-AVX2-LABEL: test_urem_pow2: +; CHECK-AVX2-LABEL: test_urem_one_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] -; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512VL-LABEL: test_urem_pow2: +; CHECK-AVX512VL-LABEL: test_urem_one_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = 
[1,1,1,1] ; CHECK-AVX512VL-NEXT: retq - %urem = urem <4 x i32> %X, + %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } +define <4 x i32> @test_urem_one_ne(<4 x i32> %X) nounwind { +; CHECK-SSE-LABEL: test_urem_one_ne: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: test_urem_one_ne: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq + %urem = urem <4 x i32> %X, + %cmp = icmp ne <4 x i32> %urem, + %ret = zext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %ret +}