[X86] Prefer vmovmsk over vtest for alderlake.

author Luo, Yuanke <yuanke.luo@intel.com>

Tue, 6 Jun 2023 05:27:15 +0000 (13:27 +0800)

committer Luo, Yuanke <yuanke.luo@intel.com>

Thu, 8 Jun 2023 09:38:47 +0000 (17:38 +0800)
author Luo, Yuanke <yuanke.luo@intel.com>
Tue, 6 Jun 2023 05:27:15 +0000 (13:27 +0800)
committer Luo, Yuanke <yuanke.luo@intel.com>
Thu, 8 Jun 2023 09:38:47 +0000 (17:38 +0800)
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td

index d664b24..e9f9f1b 100644 (file)
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -422,6 +422,9 @@ def FeatureHardenSlsIJmp
  //===----------------------------------------------------------------------===//
  // X86 Subtarget Tuning features
  //===----------------------------------------------------------------------===//
+def TuningPreferMovmskOverVTest : SubtargetFeature<"prefer-movmsk-over-vtest",
+                                       "PreferMovmskOverVTest", "true",
+                                       "Prefer movmsk over vtest instruction">;
  
  def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
                                         "SHLD instruction is slow">;
@@ -1166,7 +1169,8 @@ def ProcessorFeatures {
                                                    FeatureMOVDIRI,
                                                    FeatureMOVDIR64B,
                                                    FeatureWAITPKG];
-  list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps];
+  list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps,
+                                                TuningPreferMovmskOverVTest];
    list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning);
    list<SubtargetFeature> ADLFeatures =
      !listconcat(TRMFeatures, ADLAdditionalFeatures);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp

index 4fc96d8..170396d 100644 (file)
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48024,7 +48024,8 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
    // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
    // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
    // iff every element is referenced.
-  if (NumElts <= CmpBits && Subtarget.hasAVX() && IsOneUse &&
+  if (NumElts <= CmpBits && Subtarget.hasAVX() &&
+      !Subtarget.preferMovmskOverVTest() && IsOneUse &&
        (NumEltBits == 32 || NumEltBits == 64)) {
      SDLoc DL(EFLAGS);
      MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
diff --git a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll

index 871703d..b3f4878 100644 (file)
--- a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
+++ b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
@@ -1,7 +1,7 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1
  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=ADL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+prefer-movmsk-over-vtest | FileCheck %s --check-prefixes=ADL
  
  declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>)
  declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
@@ -22,7 +22,8 @@ define i1 @movmskps_noneof_bitcast_v4f64(<4 x double> %a0) {
  ; ADL:       # %bb.0:
  ; ADL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
  ; ADL-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
-; ADL-NEXT:    vtestpd %ymm0, %ymm0
+; ADL-NEXT:    vmovmskpd %ymm0, %eax
+; ADL-NEXT:    testl %eax, %eax
  ; ADL-NEXT:    sete %al
  ; ADL-NEXT:    vzeroupper
  ; ADL-NEXT:    retq
@@ -59,9 +60,9 @@ define i1 @movmskps_allof_bitcast_v4f64(<4 x double> %a0) {
  ; ADL:       # %bb.0:
  ; ADL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
  ; ADL-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
-; ADL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; ADL-NEXT:    vtestpd %ymm1, %ymm0
-; ADL-NEXT:    setb %al
+; ADL-NEXT:    vmovmskpd %ymm0, %eax
+; ADL-NEXT:    cmpl $15, %eax
+; ADL-NEXT:    sete %al
  ; ADL-NEXT:    vzeroupper
  ; ADL-NEXT:    retq
    %1 = fcmp oeq <4 x double> %a0, zeroinitializer
@@ -203,10 +204,10 @@ define i32 @movmskps_concat_v4f32(<4 x float> %a0, <4 x float> %a1)  {
  ; ADL-LABEL: movmskps_concat_v4f32:
  ; ADL:       # %bb.0:
  ; ADL-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; ADL-NEXT:    vmovmskps %xmm0, %ecx
  ; ADL-NEXT:    xorl %eax, %eax
-; ADL-NEXT:    vtestps %xmm0, %xmm0
-; ADL-NEXT:    setne %al
-; ADL-NEXT:    negl %eax
+; ADL-NEXT:    negl %ecx
+; ADL-NEXT:    sbbl %eax, %eax
  ; ADL-NEXT:    retq
    %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %2 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %1)
diff --git a/llvm/test/CodeGen/X86/combine-movmsk.ll b/llvm/test/CodeGen/X86/combine-movmsk.ll

index b365a5f..baa0553 100644 (file)
--- a/llvm/test/CodeGen/X86/combine-movmsk.ll
+++ b/llvm/test/CodeGen/X86/combine-movmsk.ll
@@ -3,7 +3,7 @@
  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=ADL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+prefer-movmsk-over-vtest | FileCheck %s --check-prefixes=ADL
  
  declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)
  declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>)
@@ -33,7 +33,8 @@ define i1 @movmskps_noneof_bitcast_v2f64(<2 x double> %a0) {
  ; ADL:       # %bb.0:
  ; ADL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
  ; ADL-NEXT:    vcmpeqpd %xmm0, %xmm1, %xmm0
-; ADL-NEXT:    vtestpd %xmm0, %xmm0
+; ADL-NEXT:    vmovmskpd %xmm0, %eax
+; ADL-NEXT:    testl %eax, %eax
  ; ADL-NEXT:    sete %al
  ; ADL-NEXT:    retq
    %1 = fcmp oeq <2 x double> zeroinitializer, %a0
@@ -67,9 +68,9 @@ define i1 @movmskps_allof_bitcast_v2f64(<2 x double> %a0) {
  ; ADL:       # %bb.0:
  ; ADL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
  ; ADL-NEXT:    vcmpeqpd %xmm0, %xmm1, %xmm0
-; ADL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; ADL-NEXT:    vtestpd %xmm1, %xmm0
-; ADL-NEXT:    setb %al
+; ADL-NEXT:    vmovmskpd %xmm0, %eax
+; ADL-NEXT:    cmpl $3, %eax
+; ADL-NEXT:    sete %al
  ; ADL-NEXT:    retq
    %1 = fcmp oeq <2 x double> zeroinitializer, %a0
    %2 = sext <2 x i1> %1 to <2 x i64>
@@ -103,7 +104,8 @@ define i1 @pmovmskb_noneof_bitcast_v2i64(<2 x i64> %a0) {
  ;
  ; ADL-LABEL: pmovmskb_noneof_bitcast_v2i64:
  ; ADL:       # %bb.0:
-; ADL-NEXT:    vtestpd %xmm0, %xmm0
+; ADL-NEXT:    vmovmskpd %xmm0, %eax
+; ADL-NEXT:    testl %eax, %eax
  ; ADL-NEXT:    sete %al
  ; ADL-NEXT:    retq
    %1 = icmp sgt <2 x i64> zeroinitializer, %a0
@@ -139,9 +141,9 @@ define i1 @pmovmskb_allof_bitcast_v2i64(<2 x i64> %a0) {
  ;
  ; ADL-LABEL: pmovmskb_allof_bitcast_v2i64:
  ; ADL:       # %bb.0:
-; ADL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; ADL-NEXT:    vtestpd %xmm1, %xmm0
-; ADL-NEXT:    setb %al
+; ADL-NEXT:    vmovmskpd %xmm0, %eax
+; ADL-NEXT:    cmpl $3, %eax
+; ADL-NEXT:    sete %al
  ; ADL-NEXT:    retq
    %1 = icmp sgt <2 x i64> zeroinitializer, %a0
    %2 = sext <2 x i1> %1 to <2 x i64>
@@ -173,7 +175,8 @@ define i1 @pmovmskb_noneof_bitcast_v4f32(<4 x float> %a0) {
  ; ADL:       # %bb.0:
  ; ADL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
  ; ADL-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
-; ADL-NEXT:    vtestps %xmm0, %xmm0
+; ADL-NEXT:    vmovmskps %xmm0, %eax
+; ADL-NEXT:    testl %eax, %eax
  ; ADL-NEXT:    sete %al
  ; ADL-NEXT:    retq
    %1 = fcmp oeq <4 x float> %a0, zeroinitializer
@@ -207,9 +210,9 @@ define i1 @pmovmskb_allof_bitcast_v4f32(<4 x float> %a0) {
  ; ADL:       # %bb.0:
  ; ADL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
  ; ADL-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
-; ADL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; ADL-NEXT:    vtestps %xmm1, %xmm0
-; ADL-NEXT:    setb %al
+; ADL-NEXT:    vmovmskps %xmm0, %eax
+; ADL-NEXT:    cmpl $15, %eax
+; ADL-NEXT:    sete %al
  ; ADL-NEXT:    retq
    %1 = fcmp oeq <4 x float> %a0, zeroinitializer
    %2 = sext <4 x i1> %1 to <4 x i32>
@@ -513,10 +516,11 @@ define i32 @movmskps_ptest_numelts_mismatch(<16 x i8> %a0) {
  ; ADL:       # %bb.0:
  ; ADL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
  ; ADL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; ADL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; ADL-NEXT:    vmovmskps %xmm0, %ecx
  ; ADL-NEXT:    xorl %eax, %eax
-; ADL-NEXT:    vtestps %xmm1, %xmm0
-; ADL-NEXT:    sbbl %eax, %eax
+; ADL-NEXT:    cmpl $15, %ecx
+; ADL-NEXT:    sete %al
+; ADL-NEXT:    negl %eax
  ; ADL-NEXT:    retq
    %1 = icmp eq <16 x i8> %a0, zeroinitializer
    %2 = sext <16 x i1> %1 to <16 x i8>
author	Luo, Yuanke <yuanke.luo@intel.com>
	Tue, 6 Jun 2023 05:27:15 +0000 (13:27 +0800)
committer	Luo, Yuanke <yuanke.luo@intel.com>
	Thu, 8 Jun 2023 09:38:47 +0000 (17:38 +0800)
llvm/lib/Target/X86/X86.td		patch \| blob \| history
llvm/lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
llvm/test/CodeGen/X86/combine-movmsk-avx.ll		patch \| blob \| history
llvm/test/CodeGen/X86/combine-movmsk.ll		patch \| blob \| history