From e1bc360fc6474074834b93b0ae1bcd6ad3f66123 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Sun, 31 Mar 2019 15:11:34 +0000
Subject: [PATCH] [x86] allow movmsk with 2-element reductions

One motivation for making this change is that not using movmsk is likely
a main source of the performance difference between clang and gcc on the
C-Ray benchmark as shown here:
https://www.phoronix.com/scan.php?page=article&item=gcc-clang-2019&num=5
...but this change alone isn't enough to solve that problem.

The 'all-of' examples show what is likely the worst-case trade-off: we
end up with an extra instruction (or 2 if we count the 'xor' register
clearing). The 'any-of' examples look clearly better using movmsk because
we've traded 2 vector instructions for 2 scalar instructions, and movmsk
may have better timing than the generic 'movq'.

If we examine the llvm-mca output for these cases, it appears that even
though the 'all-of' movmsk variant looks worse on paper, it would perform
better on both Haswell and Jaguar.

  $ llvm-mca -mcpu=haswell no_movmsk.s -timeline
  Iterations:        100
  Instructions:      400
  Total Cycles:      504
  Total uOps:        400
  Dispatch Width:    4
  uOps Per Cycle:    0.79
  IPC:               0.79
  Block RThroughput: 1.0

  $ llvm-mca -mcpu=haswell movmsk.s -timeline
  Iterations:        100
  Instructions:      600
  Total Cycles:      358
  Total uOps:        600
  Dispatch Width:    4
  uOps Per Cycle:    1.68
  IPC:               1.68
  Block RThroughput: 1.5

  $ llvm-mca -mcpu=btver2 no_movmsk.s -timeline
  Iterations:        100
  Instructions:      400
  Total Cycles:      407
  Total uOps:        400
  Dispatch Width:    2
  uOps Per Cycle:    0.98
  IPC:               0.98
  Block RThroughput: 2.0

  $ llvm-mca -mcpu=btver2 movmsk.s -timeline
  Iterations:        100
  Instructions:      600
  Total Cycles:      311
  Total uOps:        600
  Dispatch Width:    2
  uOps Per Cycle:    1.93
  IPC:               1.93
  Block RThroughput: 3.0

Finally, there may be CPUs where movmsk is horribly slow (old AMD small
cores?), but if that's true, then we're also almost certainly making the
wrong transform already for reductions with >2 elements, so that should
be fixed independently.

Differential Revision: https://reviews.llvm.org/D59997

llvm-svn: 357367
---
 llvm/lib/Target/X86/X86ISelLowering.cpp        |  7 ++--
 llvm/test/CodeGen/X86/vector-compare-all_of.ll | 48 ++++++++++++++++----------
 llvm/test/CodeGen/X86/vector-compare-any_of.ll | 36 +++++++++----------
 3 files changed, 53 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c085bc5..f516312 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34306,8 +34306,11 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
        ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
     return SDValue();
 
-  // Don't bother performing this for 2-element vectors.
-  if (Match.getValueType().getVectorNumElements() <= 2)
+  // Make sure this isn't a vector of 1 element. The perf win from using MOVMSK
+  // diminishes with fewer elements in the reduction, but it is generally
+  // better to get the comparison over to the GPRs as soon as possible to
+  // reduce the number of vector ops.
+  if (Match.getValueType().getVectorNumElements() < 2)
     return SDValue();
 
   // Check that we are extracting a reduction of all sign bits.
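For reference, the 2-element 'all-of' reduction exercised by the tests below
has roughly this shape in LLVM IR (a hedged sketch; the function name and
exact IR are illustrative, not copied from the test file):

  define i64 @all_of_v2f64(<2 x double> %a0, <2 x double> %a1) {
    ; per-lane compare, sign-extended to all-ones/zero lanes
    %c  = fcmp ogt <2 x double> %a0, %a1
    %s  = sext <2 x i1> %c to <2 x i64>
    ; horizontal 'all-of' reduction: AND the high lane into the low lane
    %hi = shufflevector <2 x i64> %s, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
    %r  = and <2 x i64> %s, %hi
    %e  = extractelement <2 x i64> %r, i32 0
    ret i64 %e
  }

In the new codegen, movmskpd collects the two lane sign bits into a GPR,
'cmpl $3' checks that both are set, and sete/negq materialize the 0/-1
result that the old pshufd/pand/movq sequence produced.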
diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
index ef705b1..510c515 100644
--- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
@@ -8,17 +8,21 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-LABEL: test_v2f64_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movmskpd %xmm1, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl $3, %ecx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    negq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovmskpd %xmm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl $3, %ecx
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    negq %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f64_sext:
@@ -42,9 +46,11 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
 ; SSE-NEXT:    cmpltpd %xmm1, %xmm3
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm2
 ; SSE-NEXT:    andpd %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movmskpd %xmm2, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl $3, %ecx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    negq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64_sext:
@@ -273,17 +279,21 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
 ; SSE-LABEL: test_v2i64_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    movmskpd %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl $3, %ecx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    negq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2i64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovmskpd %xmm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl $3, %ecx
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    negq %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2i64_sext:
@@ -307,9 +317,11 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; SSE-NEXT:    pcmpgtq %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    movmskpd %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl $3, %ecx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    negq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v4i64_sext:
diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
index d24df0a..a8bfc42 100644
--- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -8,17 +8,17 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-LABEL: test_v2f64_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movmskpd %xmm1, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbq %rax, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vorpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovmskpd %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbq %rax, %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f64_sext:
@@ -42,9 +42,9 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
 ; SSE-NEXT:    cmpltpd %xmm1, %xmm3
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm2
 ; SSE-NEXT:    orpd %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movmskpd %xmm2, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbq %rax, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64_sext:
@@ -255,17 +255,17 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
 ; SSE-LABEL: test_v2i64_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    movmskpd %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbq %rax, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2i64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovmskpd %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbq %rax, %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2i64_sext:
@@ -289,9 +289,9 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; SSE-NEXT:    pcmpgtq %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    movmskpd %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbq %rax, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v4i64_sext:
-- 
2.7.4
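For readers of the any-of diffs above: the scalar idiom works because
movmskpd leaves a nonzero mask exactly when some lane matched; negl then
sets the carry flag in that case, and 'sbbq %rax, %rax' (rax - rax - CF)
broadcasts the carry into all 64 bits, yielding the same 0/-1 value the
old por/movq sequence produced. The matching 'any-of' pattern looks
roughly like this in LLVM IR (a hedged sketch with illustrative names,
not copied verbatim from the test file):

  define i64 @any_of_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
    ; per-lane compare, sign-extended to all-ones/zero lanes
    %c  = icmp sgt <2 x i64> %a0, %a1
    %s  = sext <2 x i1> %c to <2 x i64>
    ; horizontal 'any-of' reduction: OR the high lane into the low lane
    %hi = shufflevector <2 x i64> %s, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
    %r  = or <2 x i64> %s, %hi
    %e  = extractelement <2 x i64> %r, i32 0
    ret i64 %e
  }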