From e1bc360fc6474074834b93b0ae1bcd6ad3f66123 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Sun, 31 Mar 2019 15:11:34 +0000
Subject: [PATCH] [x86] allow movmsk with 2-element reductions

One motivation for making this change is that not using movmsk is likely
a main source of the performance difference between clang and gcc on the
C-Ray benchmark as shown here:
https://www.phoronix.com/scan.php?page=article&item=gcc-clang-2019&num=5
...but this change alone isn't enough to solve that problem.

The 'all-of' examples show what is likely the worst-case trade-off: we
end up with an extra instruction (or 2 if we count the 'xor' register
clearing). The 'any-of' examples look clearly better using movmsk because
we've traded 2 vector instructions for 2 scalar instructions, and movmsk
may have better timing than the generic 'movq'.

If we examine the llvm-mca output for these cases, it appears that even
though the 'all-of' movmsk variant looks worse on paper, it would perform
better on both Haswell and Jaguar.

  $ llvm-mca -mcpu=haswell no_movmsk.s -timeline
  Iterations:        100
  Instructions:      400
  Total Cycles:      504
  Total uOps:        400
  Dispatch Width:    4
  uOps Per Cycle:    0.79
  IPC:               0.79
  Block RThroughput: 1.0

  $ llvm-mca -mcpu=haswell movmsk.s -timeline
  Iterations:        100
  Instructions:      600
  Total Cycles:      358
  Total uOps:        600
  Dispatch Width:    4
  uOps Per Cycle:    1.68
  IPC:               1.68
  Block RThroughput: 1.5

  $ llvm-mca -mcpu=btver2 no_movmsk.s -timeline
  Iterations:        100
  Instructions:      400
  Total Cycles:      407
  Total uOps:        400
  Dispatch Width:    2
  uOps Per Cycle:    0.98
  IPC:               0.98
  Block RThroughput: 2.0

  $ llvm-mca -mcpu=btver2 movmsk.s -timeline
  Iterations:        100
  Instructions:      600
  Total Cycles:      311
  Total uOps:        600
  Dispatch Width:    2
  uOps Per Cycle:    1.93
  IPC:               1.93
  Block RThroughput: 3.0

Finally, there may be CPUs where movmsk is horribly slow (old AMD small
cores?), but if that's true, then we're also almost certainly making the
wrong transform already for reductions with >2 elements, so that should
be fixed independently.

Differential Revision: https://reviews.llvm.org/D59997

llvm-svn: 357367
---
 llvm/lib/Target/X86/X86ISelLowering.cpp        |  7 ++--
 llvm/test/CodeGen/X86/vector-compare-all_of.ll | 48 ++++++++++++++++----------
 llvm/test/CodeGen/X86/vector-compare-any_of.ll | 36 +++++++++----------
 3 files changed, 53 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c085bc5..f516312 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34306,8 +34306,11 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
        ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
     return SDValue();
 
-  // Don't bother performing this for 2-element vectors.
-  if (Match.getValueType().getVectorNumElements() <= 2)
+  // Make sure this isn't a vector of 1 element. The perf win from using MOVMSK
+  // diminishes with fewer elements in the reduction, but it is generally
+  // better to get the comparison over to the GPRs as soon as possible to
+  // reduce the number of vector ops.
+  if (Match.getValueType().getVectorNumElements() < 2)
     return SDValue();
 
   // Check that we are extracting a reduction of all sign bits.
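For reference, the 2-element 'all-of' reduction exercised by the tests below
has roughly this shape in LLVM IR (a hedged sketch; the function name and
exact IR are illustrative, not copied from the test file):

  define i64 @all_of_v2f64(<2 x double> %a0, <2 x double> %a1) {
    ; per-lane compare, sign-extended to all-ones/zero lanes
    %c  = fcmp ogt <2 x double> %a0, %a1
    %s  = sext <2 x i1> %c to <2 x i64>
    ; horizontal 'all-of' reduction: AND the high lane into the low lane
    %hi = shufflevector <2 x i64> %s, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
    %r  = and <2 x i64> %s, %hi
    %e  = extractelement <2 x i64> %r, i32 0
    ret i64 %e
  }

In the new codegen, movmskpd collects the two lane sign bits into a GPR,
'cmpl $3' checks that both are set, and sete/negq materialize the 0/-1
result that the old pshufd/pand/movq sequence produced.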
diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
index ef705b1..510c515 100644
--- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
@@ -8,17 +8,21 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-LABEL: test_v2f64_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movmskpd %xmm1, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl $3, %ecx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    negq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovmskpd %xmm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl $3, %ecx
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    negq %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f64_sext:
@@ -42,9 +46,11 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
 ; SSE-NEXT:    cmpltpd %xmm1, %xmm3
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm2
 ; SSE-NEXT:    andpd %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movmskpd %xmm2, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl $3, %ecx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    negq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64_sext:
@@ -273,17 +279,21 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
 ; SSE-LABEL: test_v2i64_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    movmskpd %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl $3, %ecx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    negq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2i64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovmskpd %xmm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl $3, %ecx
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    negq %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2i64_sext:
@@ -307,9 +317,11 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; SSE-NEXT:    pcmpgtq %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    movmskpd %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl $3, %ecx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    negq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v4i64_sext:
diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
index d24df0a..a8bfc42 100644
--- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -8,17 +8,17 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-LABEL: test_v2f64_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movmskpd %xmm1, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbq %rax, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vorpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovmskpd %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbq %rax, %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f64_sext:
@@ -42,9 +42,9 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
 ; SSE-NEXT:    cmpltpd %xmm1, %xmm3
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm2
 ; SSE-NEXT:    orpd %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movmskpd %xmm2, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbq %rax, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64_sext:
@@ -255,17 +255,17 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
 ; SSE-LABEL: test_v2i64_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    movmskpd %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbq %rax, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2i64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovmskpd %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbq %rax, %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2i64_sext:
@@ -289,9 +289,9 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; SSE-NEXT:    pcmpgtq %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    movmskpd %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbq %rax, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v4i64_sext:
-- 
2.7.4
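For readers of the any-of diffs above: the scalar idiom works because
movmskpd leaves a nonzero mask exactly when some lane matched; negl then
sets the carry flag in that case, and 'sbbq %rax, %rax' (rax - rax - CF)
broadcasts the carry into all 64 bits, yielding the same 0/-1 value the
old por/movq sequence produced. The matching 'any-of' pattern looks
roughly like this in LLVM IR (a hedged sketch with illustrative names,
not copied verbatim from the test file):

  define i64 @any_of_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
    ; per-lane compare, sign-extended to all-ones/zero lanes
    %c  = icmp sgt <2 x i64> %a0, %a1
    %s  = sext <2 x i1> %c to <2 x i64>
    ; horizontal 'any-of' reduction: OR the high lane into the low lane
    %hi = shufflevector <2 x i64> %s, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
    %r  = or <2 x i64> %s, %hi
    %e  = extractelement <2 x i64> %r, i32 0
    ret i64 %e
  }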