From: Luo, Yuanke Date: Tue, 6 Jun 2023 05:27:15 +0000 (+0800) Subject: [X86] Prefer vmovmsk instead of vtest for alderlake. X-Git-Tag: upstream/17.0.6~5757 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b0bea80ab479e9bb016fcdb62d7d0eceec2b28e3;p=platform%2Fupstream%2Fllvm.git [X86] Prefer vmovmsk instead of vtest for alderlake. On alderlake E-core, the latency of VMOVMSKPS is 5 for YMM/XMM. The latency of VPTESTPS is 7 for YMM and is 5 for XMM. Since alderlake uses the P-core schedule model, we can't determine which one is better based on the latency information of the schedule model. Alternatively, we add a tuning feature for alderlake and select VMOVMSKPS when that tuning feature is set. In the case of "vmovmskps + test + jcc", the test and jcc can be fused, while vtest and jcc can't. Differential Revision: https://reviews.llvm.org/D152227 --- diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index d664b24..e9f9f1b 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -422,6 +422,9 @@ def FeatureHardenSlsIJmp //===----------------------------------------------------------------------===// // X86 Subtarget Tuning features //===----------------------------------------------------------------------===// +def TuningPreferMovmskOverVTest : SubtargetFeature<"prefer-movmsk-over-vtest", + "PreferMovmskOverVTest", "true", + "Prefer movmsk over vtest instruction">; def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; @@ -1166,7 +1169,8 @@ def ProcessorFeatures { FeatureMOVDIRI, FeatureMOVDIR64B, FeatureWAITPKG]; - list ADLAdditionalTuning = [TuningPERMFalseDeps]; + list ADLAdditionalTuning = [TuningPERMFalseDeps, + TuningPreferMovmskOverVTest]; list ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning); list ADLFeatures = !listconcat(TRMFeatures, ADLAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp 
index 4fc96d8..170396d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48024,7 +48024,8 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V) // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V) // iff every element is referenced. - if (NumElts <= CmpBits && Subtarget.hasAVX() && IsOneUse && + if (NumElts <= CmpBits && Subtarget.hasAVX() && + !Subtarget.preferMovmskOverVTest() && IsOneUse && (NumEltBits == 32 || NumEltBits == 64)) { SDLoc DL(EFLAGS); MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits); diff --git a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll index 871703d..b3f4878 100644 --- a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll +++ b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=ADL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+prefer-movmsk-over-vtest | FileCheck %s --check-prefixes=ADL declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) @@ -22,7 +22,8 @@ define i1 @movmskps_noneof_bitcast_v4f64(<4 x double> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 -; ADL-NEXT: vtestpd %ymm0, %ymm0 +; ADL-NEXT: vmovmskpd %ymm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: vzeroupper ; ADL-NEXT: retq @@ -59,9 +60,9 @@ define i1 @movmskps_allof_bitcast_v4f64(<4 x double> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 -; ADL-NEXT: 
vpcmpeqd %ymm1, %ymm1, %ymm1 -; ADL-NEXT: vtestpd %ymm1, %ymm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskpd %ymm0, %eax +; ADL-NEXT: cmpl $15, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: vzeroupper ; ADL-NEXT: retq %1 = fcmp oeq <4 x double> %a0, zeroinitializer @@ -203,10 +204,10 @@ define i32 @movmskps_concat_v4f32(<4 x float> %a0, <4 x float> %a1) { ; ADL-LABEL: movmskps_concat_v4f32: ; ADL: # %bb.0: ; ADL-NEXT: vorps %xmm1, %xmm0, %xmm0 +; ADL-NEXT: vmovmskps %xmm0, %ecx ; ADL-NEXT: xorl %eax, %eax -; ADL-NEXT: vtestps %xmm0, %xmm0 -; ADL-NEXT: setne %al -; ADL-NEXT: negl %eax +; ADL-NEXT: negl %ecx +; ADL-NEXT: sbbl %eax, %eax ; ADL-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> %2 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %1) diff --git a/llvm/test/CodeGen/X86/combine-movmsk.ll b/llvm/test/CodeGen/X86/combine-movmsk.ll index b365a5f..baa0553 100644 --- a/llvm/test/CodeGen/X86/combine-movmsk.ll +++ b/llvm/test/CodeGen/X86/combine-movmsk.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=ADL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+prefer-movmsk-over-vtest | FileCheck %s --check-prefixes=ADL declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) @@ -33,7 +33,8 @@ define i1 @movmskps_noneof_bitcast_v2f64(<2 x double> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %xmm0, %xmm1, %xmm0 -; ADL-NEXT: vtestpd %xmm0, %xmm0 +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <2 x double> zeroinitializer, %a0 @@ -67,9 
+68,9 @@ define i1 @movmskps_allof_bitcast_v2f64(<2 x double> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %xmm0, %xmm1, %xmm0 -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; ADL-NEXT: vtestpd %xmm1, %xmm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: cmpl $3, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <2 x double> zeroinitializer, %a0 %2 = sext <2 x i1> %1 to <2 x i64> @@ -103,7 +104,8 @@ define i1 @pmovmskb_noneof_bitcast_v2i64(<2 x i64> %a0) { ; ; ADL-LABEL: pmovmskb_noneof_bitcast_v2i64: ; ADL: # %bb.0: -; ADL-NEXT: vtestpd %xmm0, %xmm0 +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = icmp sgt <2 x i64> zeroinitializer, %a0 @@ -139,9 +141,9 @@ define i1 @pmovmskb_allof_bitcast_v2i64(<2 x i64> %a0) { ; ; ADL-LABEL: pmovmskb_allof_bitcast_v2i64: ; ADL: # %bb.0: -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; ADL-NEXT: vtestpd %xmm1, %xmm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: cmpl $3, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = icmp sgt <2 x i64> zeroinitializer, %a0 %2 = sext <2 x i1> %1 to <2 x i64> @@ -173,7 +175,8 @@ define i1 @pmovmskb_noneof_bitcast_v4f32(<4 x float> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 -; ADL-NEXT: vtestps %xmm0, %xmm0 +; ADL-NEXT: vmovmskps %xmm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <4 x float> %a0, zeroinitializer @@ -207,9 +210,9 @@ define i1 @pmovmskb_allof_bitcast_v4f32(<4 x float> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; ADL-NEXT: vtestps %xmm1, %xmm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskps %xmm0, %eax +; ADL-NEXT: cmpl $15, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <4 x float> %a0, zeroinitializer %2 = sext <4 x i1> %1 to <4 x 
i32> @@ -513,10 +516,11 @@ define i32 @movmskps_ptest_numelts_mismatch(<16 x i8> %a0) { ; ADL: # %bb.0: ; ADL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; ADL-NEXT: vmovmskps %xmm0, %ecx ; ADL-NEXT: xorl %eax, %eax -; ADL-NEXT: vtestps %xmm1, %xmm0 -; ADL-NEXT: sbbl %eax, %eax +; ADL-NEXT: cmpl $15, %ecx +; ADL-NEXT: sete %al +; ADL-NEXT: negl %eax ; ADL-NEXT: retq %1 = icmp eq <16 x i8> %a0, zeroinitializer %2 = sext <16 x i1> %1 to <16 x i8>