From 846d0ac43eb95c7f2f7e98b076b42c8c3208f506 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 11 Feb 2020 13:14:03 -0800 Subject: [PATCH] [X86] Don't disable code in combineHorizontalPredicateResult just because we have avx512 We aren't doing a good job of optimizing AVX512 outside of this code. So remove the bail out for AVX512 and replace with a FIXME. This at least gets us the AVX2 codegen. Differential Revision: https://reviews.llvm.org/D74431 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 5 +- llvm/test/CodeGen/X86/vector-compare-all_of.ll | 197 ++++++++++--------------- llvm/test/CodeGen/X86/vector-compare-any_of.ll | 165 +++++++-------------- 3 files changed, 134 insertions(+), 233 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f5a1011..4321bb7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37359,10 +37359,7 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, return SDValue(); Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32); } else { - // Bail with AVX512VL (which uses predicate registers). - if (Subtarget.hasVLX()) - return SDValue(); - + // FIXME: Better handling of k-registers or 512-bit vectors? unsigned MatchSizeInBits = Match.getValueSizeInBits(); if (!(MatchSizeInBits == 128 || (MatchSizeInBits == 256 && Subtarget.hasAVX()))) diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll index 02c9f4f..4348edf 100644 --- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll @@ -28,9 +28,11 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) { ; AVX512-LABEL: test_v2f64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vandpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $3, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negq %rax ; AVX512-NEXT: retq %c = fcmp ogt <2 x double> %a0, %a1 %s = sext <2 x i1> %c to <2 x i64> @@ -67,11 +69,11 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) { ; AVX512-LABEL: test_v4f64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vandpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vandpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %ymm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $15, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negq %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 @@ -115,12 +117,11 @@ define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) { ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cltq +; AVX512-NEXT: vmovmskps %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $15, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negq %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 @@ -158,11 +159,11 @@ define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) { ; AVX512-LABEL: test_v4f32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $15, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: retq %c = fcmp ogt <4 x float> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -201,13 +202,11 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) { ; AVX512-LABEL: test_v8f32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %ymm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $255, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 @@ -252,14 +251,11 @@ define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k0 ; AVX512-NEXT: vpmovm2w %k0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cwtl +; AVX512-NEXT: vpmovmskb %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 @@ -299,9 +295,11 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) { ; AVX512-LABEL: test_v2i64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $3, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negq %rax ; AVX512-NEXT: retq %c = icmp sgt <2 x i64> %a0, %a1 %s = sext <2 x i1> %c to <2 x i64> @@ -353,11 +351,11 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) { ; AVX512-LABEL: test_v4i64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %ymm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $15, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negq %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -416,12 +414,11 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) { ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cltq +; AVX512-NEXT: vmovmskps %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $15, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negq %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -459,11 +456,11 @@ define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) { ; AVX512-LABEL: test_v4i32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $15, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: retq %c = icmp sgt <4 x i32> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -517,13 +514,11 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) { ; AVX512-LABEL: test_v8i32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %ymm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $255, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -583,14 +578,11 @@ define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ; AVX512-NEXT: vpmovm2w %k0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cwtl +; AVX512-NEXT: vpmovmskb %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -632,13 +624,11 @@ define i16 @test_v8i16_sext(<8 x i16> %a0, <8 x i16> %a1) { ; AVX512-LABEL: test_v8i16_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpmovmskb %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq %c = icmp sgt <8 x i16> %a0, %a1 @@ -698,15 +688,11 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) { ; AVX512-LABEL: test_v16i16_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpmovmskb %ymm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $-1, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -772,15 +758,10 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ; AVX512-NEXT: vpmovm2b %k0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX512-NEXT: sete %al +; AVX512-NEXT: negb %al ; AVX512-NEXT: movsbl %al, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -822,16 +803,10 @@ define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) { ; AVX512-LABEL: test_v16i8_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX512-NEXT: sete %al +; AVX512-NEXT: negb %al ; AVX512-NEXT: retq %c = icmp sgt <16 x i8> %a0, %a1 %s = sext <16 x i1> %c to <16 x i8> @@ -886,18 +861,10 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) { ; AVX512-LABEL: test_v32i8_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: cmpl $-1, %eax +; AVX512-NEXT: sete %al +; AVX512-NEXT: negb %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <32 x i8> %a0, %a1 diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll index 084de61..b3443b9 100644 --- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll @@ -24,9 +24,9 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) { ; AVX512-LABEL: test_v2f64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbq %rax, %rax ; AVX512-NEXT: retq %c = fcmp ogt <2 x double> %a0, %a1 %s = sext <2 x i1> %c to <2 x i64> @@ -59,11 +59,9 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) { ; AVX512-LABEL: test_v4f64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %ymm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbq %rax, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 @@ -103,12 +101,9 @@ define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) { ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cltq +; AVX512-NEXT: vmovmskps %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbq %rax, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 @@ -142,11 +137,9 @@ define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) { ; AVX512-LABEL: test_v4f32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: retq %c = fcmp ogt <4 x float> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -181,13 +174,9 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) { ; AVX512-LABEL: test_v8f32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %ymm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 @@ -228,14 +217,9 @@ define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k0 ; AVX512-NEXT: vpmovm2w %k0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cwtl +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 @@ -271,9 +255,9 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) { ; AVX512-LABEL: test_v2i64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbq %rax, %rax ; AVX512-NEXT: retq %c = icmp sgt <2 x i64> %a0, %a1 %s = sext <2 x i1> %c to <2 x i64> @@ -319,11 +303,9 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) { ; AVX512-LABEL: test_v4i64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %ymm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbq %rax, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -376,12 +358,9 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) { ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cltq +; AVX512-NEXT: vmovmskps %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbq %rax, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -415,11 +394,9 @@ define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) { ; AVX512-LABEL: test_v4i32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: retq %c = icmp sgt <4 x i32> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -467,13 +444,9 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) { ; AVX512-LABEL: test_v8i32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %ymm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -527,14 +500,9 @@ define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ; AVX512-NEXT: vpmovm2w %k0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cwtl +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -572,13 +540,9 @@ define i16 @test_v8i16_sext(<8 x i16> %a0, <8 x i16> %a1) { ; AVX512-LABEL: test_v8i16_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq %c = icmp sgt <8 x i16> %a0, %a1 @@ -632,15 +596,9 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) { ; AVX512-LABEL: test_v16i16_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -700,16 +658,9 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ; AVX512-NEXT: vpmovm2b %k0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: movsbl %al, %eax +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -750,15 +701,9 @@ define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) { ; AVX512-LABEL: test_v16i8_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq %c = icmp sgt <16 x i8> %a0, %a1 @@ -814,17 +759,9 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) { ; AVX512-LABEL: test_v32i8_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq -- 2.7.4