From 7de210bd5246cb45a601d6c85645838f7585d70d Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Mon, 17 Nov 2014 16:17:51 +0000 Subject: [PATCH] [X86] Use ADD/SUB instead of INC/DEC for Haswell and Broadwell CPUs Differential Revision: http://reviews.llvm.org/D5934 llvm-svn: 222141 --- llvm/lib/Target/X86/X86.td | 5 +- llvm/test/CodeGen/X86/fma-phi-213-to-231.ll | 16 +++--- llvm/test/CodeGen/X86/slow-incdec.ll | 80 +++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 10 deletions(-) create mode 100644 llvm/test/CodeGen/X86/slow-incdec.ll diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 094959f..83f55d3 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -280,7 +280,7 @@ def : ProcessorModel<"core-avx2", HaswellModel, FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, - FeatureHLE]>; + FeatureHLE, FeatureSlowIncDec]>; // Broadwell def : ProcessorModel<"broadwell", HaswellModel, @@ -288,7 +288,8 @@ def : ProcessorModel<"broadwell", HaswellModel, FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, - FeatureHLE, FeatureADX, FeatureRDSEED, FeatureSMAP]>; + FeatureHLE, FeatureADX, FeatureRDSEED, FeatureSMAP, + FeatureSlowIncDec]>; // KNL // FIXME: define KNL model def : ProcessorModel<"knl", HaswellModel, diff --git a/llvm/test/CodeGen/X86/fma-phi-213-to-231.ll b/llvm/test/CodeGen/X86/fma-phi-213-to-231.ll index 29bfb92..9715bc7 100644 --- a/llvm/test/CodeGen/X86/fma-phi-213-to-231.ll +++ b/llvm/test/CodeGen/X86/fma-phi-213-to-231.ll @@ -6,7 +6,7 @@ target triple = "x86_64-apple-macosx10.10.0" ; CHECK: [[BODYLBL:LBB.+]]: ; CHECK: vfmaddsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} ; CHECK: [[INCLBL:LBB.+]]: -; CHECK: incl [[INDREG:%[a-z0-9]+]] +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] ; CHECK: cmpl {{%.+}}, [[INDREG]] ; CHECK: jl [[BODYLBL]] define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { @@ -35,7 +35,7 @@ for.end: ; CHECK: [[BODYLBL:LBB.+]]: ; CHECK: vfmsubadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} ; CHECK: [[INCLBL:LBB.+]]: -; CHECK: incl [[INDREG:%[a-z0-9]+]] +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] ; CHECK: cmpl {{%.+}}, [[INDREG]] ; CHECK: jl [[BODYLBL]] define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { @@ -64,7 +64,7 @@ for.end: ; CHECK: [[BODYLBL:LBB.+]]: ; CHECK: vfmadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} ; CHECK: [[INCLBL:LBB.+]]: -; CHECK: incl [[INDREG:%[a-z0-9]+]] +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] ; CHECK: cmpl {{%.+}}, [[INDREG]] ; CHECK: jl [[BODYLBL]] define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { @@ -93,7 +93,7 @@ for.end: ; CHECK: [[BODYLBL:LBB.+]]: ; CHECK: vfmsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} ; CHECK: [[INCLBL:LBB.+]]: -; CHECK: incl [[INDREG:%[a-z0-9]+]] +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] ; CHECK: cmpl {{%.+}}, [[INDREG]] ; CHECK: jl [[BODYLBL]] define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { @@ -128,7 +128,7 @@ declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 ; CHECK: [[BODYLBL:LBB.+]]: ; CHECK: vfmaddsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} ; CHECK: [[INCLBL:LBB.+]]: -; CHECK: incl [[INDREG:%[a-z0-9]+]] +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] ; CHECK: cmpl {{%.+}}, [[INDREG]] ; CHECK: jl [[BODYLBL]] define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { @@ -157,7 +157,7 @@ for.end: ; CHECK: [[BODYLBL:LBB.+]]: ; CHECK: vfmsubadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} ; CHECK: [[INCLBL:LBB.+]]: -; CHECK: incl [[INDREG:%[a-z0-9]+]] +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] ; CHECK: cmpl {{%.+}}, [[INDREG]] ; CHECK: jl [[BODYLBL]] define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { @@ -186,7 +186,7 @@ for.end: ; CHECK: [[BODYLBL:LBB.+]]: ; CHECK: vfmadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} ; CHECK: [[INCLBL:LBB.+]]: -; CHECK: incl [[INDREG:%[a-z0-9]+]] +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] ; CHECK: cmpl {{%.+}}, [[INDREG]] ; CHECK: jl [[BODYLBL]] define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { @@ -215,7 +215,7 @@ for.end: ; CHECK: [[BODYLBL:LBB.+]]: ; CHECK: vfmsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} ; CHECK: [[INCLBL:LBB.+]]: -; CHECK: incl [[INDREG:%[a-z0-9]+]] +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] ; CHECK: cmpl {{%.+}}, [[INDREG]] ; CHECK: jl [[BODYLBL]] define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { diff --git a/llvm/test/CodeGen/X86/slow-incdec.ll b/llvm/test/CodeGen/X86/slow-incdec.ll new file mode 100644 index 0000000..541d992 --- /dev/null +++ b/llvm/test/CodeGen/X86/slow-incdec.ll @@ -0,0 +1,80 @@ +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-slow-incdec < %s | FileCheck -check-prefix=INCDEC %s +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+slow-incdec < %s | FileCheck -check-prefix=ADD %s + +; check -mattr=-slow-incdec +; INCDEC-NOT: addl $-1 +; INCDEC: dec +; INCDEC-NOT: addl $1 +; INCDEC: inc + +; check -mattr=+slow-incdec +; ADD: addl $-1 +; ADD-NOT: dec +; ADD: addl $1 +; ADD-NOT: inc + +; Function Attrs: nounwind readonly +define i32 @slow_1(i32* nocapture readonly %a, i32 %s) #0 { +entry: + %cmp5 = icmp eq i32 %s, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond: ; preds = %for.body + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %for.end.loopexit, label %for.body + +for.body: ; preds = %for.body.preheader, %for.cond + %i.06 = phi i32 [ %dec, %for.cond ], [ %s, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %a, i32 %i.06 + %0 = load i32* %arrayidx, align 4, !tbaa !1 + %cmp1 = icmp eq i32 %0, 0 +; + %dec = add nsw i32 %i.06, -1 + br i1 %cmp1, label %for.end.loopexit, label %for.cond + +for.end.loopexit: ; preds = %for.cond, %for.body + %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ] + ret i32 %i.0.lcssa +} + +; Function Attrs: nounwind readonly +define i32 @slow_2(i32* nocapture readonly %a, i32 %s) #0 { +entry: + %cmp5 = icmp eq i32 %s, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond: ; preds = %for.body + %cmp = icmp eq i32 %inc, 0 + br i1 %cmp, label %for.end.loopexit, label %for.body + +for.body: ; preds = %for.body.preheader, %for.cond + %i.06 = phi i32 [ %inc, %for.cond ], [ %s, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %a, i32 %i.06 + %0 = load i32* %arrayidx, align 4, !tbaa !1 + %cmp1 = icmp eq i32 %0, 0 + %inc = add nsw i32 %i.06, 1 + br i1 %cmp1, label %for.end.loopexit, label %for.cond + +for.end.loopexit: ; preds = %for.cond, %for.body + %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ] + ret i32 %i.0.lcssa +} + +!1 = metadata !{metadata !2, metadata !2, i64 0} +!2 = metadata !{metadata !"int", metadata !3, i64 0} +!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0} +!4 = metadata !{metadata !"Simple C/C++ TBAA"} -- 2.7.4