From: Luo, Yuanke Date: Tue, 6 Jun 2023 05:27:15 +0000 (+0800) Subject: [X86] Prefer vmovmsk instead of vtest for alderlake. X-Git-Tag: upstream/17.0.6~5757 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b0bea80ab479e9bb016fcdb62d7d0eceec2b28e3;p=platform%2Fupstream%2Fllvm.git [X86] Prefer vmovmsk instead of vtest for alderlake. On alderlake E-core, the latency of VMOVMSKPS is 5 for YMM/XMM. The latency of VPTESTPS is 7 for YMM and is 5 for XMM. Since alderlake uses the P-core schedule model, we can't determine which one is better based on the latency information of the schedule model. Alternatively, we add a tuning feature for alderlake and select VMOVMSKPS when that tuning feature is set. In the case of "vmovmskps + test + jcc", the test and jcc can be fused, while vtest and jcc can't. Differential Revision: https://reviews.llvm.org/D152227 --- diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index d664b24..e9f9f1b 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -422,6 +422,9 @@ def FeatureHardenSlsIJmp //===----------------------------------------------------------------------===// // X86 Subtarget Tuning features //===----------------------------------------------------------------------===// +def TuningPreferMovmskOverVTest : SubtargetFeature<"prefer-movmsk-over-vtest", + "PreferMovmskOverVTest", "true", + "Prefer movmsk over vtest instruction">; def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; @@ -1166,7 +1169,8 @@ def ProcessorFeatures { FeatureMOVDIRI, FeatureMOVDIR64B, FeatureWAITPKG]; - list ADLAdditionalTuning = [TuningPERMFalseDeps]; + list ADLAdditionalTuning = [TuningPERMFalseDeps, + TuningPreferMovmskOverVTest]; list ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning); list ADLFeatures = !listconcat(TRMFeatures, ADLAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp 
index 4fc96d8..170396d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48024,7 +48024,8 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V) // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V) // iff every element is referenced. - if (NumElts <= CmpBits && Subtarget.hasAVX() && IsOneUse && + if (NumElts <= CmpBits && Subtarget.hasAVX() && + !Subtarget.preferMovmskOverVTest() && IsOneUse && (NumEltBits == 32 || NumEltBits == 64)) { SDLoc DL(EFLAGS); MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits); diff --git a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll index 871703d..b3f4878 100644 --- a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll +++ b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=ADL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+prefer-movmsk-over-vtest | FileCheck %s --check-prefixes=ADL declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) @@ -22,7 +22,8 @@ define i1 @movmskps_noneof_bitcast_v4f64(<4 x double> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 -; ADL-NEXT: vtestpd %ymm0, %ymm0 +; ADL-NEXT: vmovmskpd %ymm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: vzeroupper ; ADL-NEXT: retq @@ -59,9 +60,9 @@ define i1 @movmskps_allof_bitcast_v4f64(<4 x double> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 -; ADL-NEXT: 
vpcmpeqd %ymm1, %ymm1, %ymm1 -; ADL-NEXT: vtestpd %ymm1, %ymm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskpd %ymm0, %eax +; ADL-NEXT: cmpl $15, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: vzeroupper ; ADL-NEXT: retq %1 = fcmp oeq <4 x double> %a0, zeroinitializer @@ -203,10 +204,10 @@ define i32 @movmskps_concat_v4f32(<4 x float> %a0, <4 x float> %a1) { ; ADL-LABEL: movmskps_concat_v4f32: ; ADL: # %bb.0: ; ADL-NEXT: vorps %xmm1, %xmm0, %xmm0 +; ADL-NEXT: vmovmskps %xmm0, %ecx ; ADL-NEXT: xorl %eax, %eax -; ADL-NEXT: vtestps %xmm0, %xmm0 -; ADL-NEXT: setne %al -; ADL-NEXT: negl %eax +; ADL-NEXT: negl %ecx +; ADL-NEXT: sbbl %eax, %eax ; ADL-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> %2 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %1) diff --git a/llvm/test/CodeGen/X86/combine-movmsk.ll b/llvm/test/CodeGen/X86/combine-movmsk.ll index b365a5f..baa0553 100644 --- a/llvm/test/CodeGen/X86/combine-movmsk.ll +++ b/llvm/test/CodeGen/X86/combine-movmsk.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=ADL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+prefer-movmsk-over-vtest | FileCheck %s --check-prefixes=ADL declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) @@ -33,7 +33,8 @@ define i1 @movmskps_noneof_bitcast_v2f64(<2 x double> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %xmm0, %xmm1, %xmm0 -; ADL-NEXT: vtestpd %xmm0, %xmm0 +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <2 x double> zeroinitializer, %a0 @@ -67,9 
+68,9 @@ define i1 @movmskps_allof_bitcast_v2f64(<2 x double> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %xmm0, %xmm1, %xmm0 -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; ADL-NEXT: vtestpd %xmm1, %xmm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: cmpl $3, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <2 x double> zeroinitializer, %a0 %2 = sext <2 x i1> %1 to <2 x i64> @@ -103,7 +104,8 @@ define i1 @pmovmskb_noneof_bitcast_v2i64(<2 x i64> %a0) { ; ; ADL-LABEL: pmovmskb_noneof_bitcast_v2i64: ; ADL: # %bb.0: -; ADL-NEXT: vtestpd %xmm0, %xmm0 +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = icmp sgt <2 x i64> zeroinitializer, %a0 @@ -139,9 +141,9 @@ define i1 @pmovmskb_allof_bitcast_v2i64(<2 x i64> %a0) { ; ; ADL-LABEL: pmovmskb_allof_bitcast_v2i64: ; ADL: # %bb.0: -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; ADL-NEXT: vtestpd %xmm1, %xmm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: cmpl $3, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = icmp sgt <2 x i64> zeroinitializer, %a0 %2 = sext <2 x i1> %1 to <2 x i64> @@ -173,7 +175,8 @@ define i1 @pmovmskb_noneof_bitcast_v4f32(<4 x float> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 -; ADL-NEXT: vtestps %xmm0, %xmm0 +; ADL-NEXT: vmovmskps %xmm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <4 x float> %a0, zeroinitializer @@ -207,9 +210,9 @@ define i1 @pmovmskb_allof_bitcast_v4f32(<4 x float> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; ADL-NEXT: vtestps %xmm1, %xmm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskps %xmm0, %eax +; ADL-NEXT: cmpl $15, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <4 x float> %a0, zeroinitializer %2 = sext <4 x i1> %1 to <4 x 
i32> @@ -513,10 +516,11 @@ define i32 @movmskps_ptest_numelts_mismatch(<16 x i8> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; ADL-NEXT: vmovmskps %xmm0, %ecx ; ADL-NEXT: xorl %eax, %eax -; ADL-NEXT: vtestps %xmm1, %xmm0 -; ADL-NEXT: sbbl %eax, %eax +; ADL-NEXT: cmpl $15, %ecx +; ADL-NEXT: sete %al +; ADL-NEXT: negl %eax ; ADL-NEXT: retq %1 = icmp eq <16 x i8> %a0, zeroinitializer %2 = sext <16 x i1> %1 to <16 x i8>