From 91ddaa1b48f720a66b070148aba830e21bb0056f Mon Sep 17 00:00:00 2001 From: Silviu Baranga Date: Mon, 29 Jul 2013 09:25:50 +0000 Subject: [PATCH] Allow generation of vmla.f32 instructions when targeting Cortex-A15. The patch also adds the VFP4 feature to Cortex-A15 and fixes the DontUseFusedMAC predicate so that we can still generate vmla.f32 instructions on non-darwin targets with VFP4. llvm-svn: 187349 --- llvm/lib/Target/ARM/ARM.td | 2 +- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 2 +- llvm/lib/Target/ARM/ARMInstrInfo.td | 4 +++- llvm/lib/Target/ARM/ARMTargetMachine.cpp | 2 +- llvm/test/CodeGen/ARM/a15-mla.ll | 26 +++++++++++++++++++++++++- 5 files changed, 31 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index 46928dc..e5da3a5 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -179,7 +179,7 @@ def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift", // FIXME: It has not been determined if A15 has these features. 
def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", "Cortex-A15 ARM processors", - [FeatureT2XtPk, FeatureFP16, + [FeatureT2XtPk, FeatureFP16, FeatureVFP4, FeatureAvoidPartialCPSR, FeatureTrustZone]>; def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5", diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 31ce38e..4ca3af6 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -422,7 +422,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const { if (!CheckVMLxHazard) return true; - if (!Subtarget->isCortexA8() && !Subtarget->isLikeA9() && + if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9() && !Subtarget->isSwift()) return true; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index f543e5d..c243402 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -262,7 +262,9 @@ def UseMulOps : Predicate<"Subtarget->useMulOps()">; def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" " FPOpFusion::Fast) && " "!Subtarget->isTargetDarwin()">; -def DontUseFusedMAC : Predicate<"!Subtarget->hasVFP4() || " +def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion ==" + " FPOpFusion::Fast &&" + " Subtarget->hasVFP4()) || " "Subtarget->isTargetDarwin()">; // VGETLNi32 is microcoded on Swift - prefer VMOV. diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 354a779..1ba78e4 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -169,7 +169,7 @@ bool ARMPassConfig::addPreRegAlloc() { // FIXME: temporarily disabling load / store optimization pass for Thumb1. 
if (getOptLevel() != CodeGenOpt::None && !getARMSubtarget().isThumb1Only()) addPass(createARMLoadStoreOptimizationPass(true)); - if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isLikeA9()) + if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9()) addPass(createMLxExpansionPass()); // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be // enabled when NEON is available. diff --git a/llvm/test/CodeGen/ARM/a15-mla.ll b/llvm/test/CodeGen/ARM/a15-mla.ll index 25f6de4..b233cc2 100644 --- a/llvm/test/CodeGen/ARM/a15-mla.ll +++ b/llvm/test/CodeGen/ARM/a15-mla.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=arm -float-abi=hard -mcpu=cortex-a15 -mattr=+neon,+neonfp | FileCheck %s ; This test checks that the VMLxForwarting feature is disabled for A15. -; CHECK: fun_a +; CHECK: fun_a: define <4 x i32> @fun_a(<4 x i32> %x, <4 x i32> %y) nounwind{ %1 = add <4 x i32> %x, %y ; CHECK-NOT: vmul @@ -10,3 +10,27 @@ define <4 x i32> @fun_a(<4 x i32> %x, <4 x i32> %y) nounwind{ %3 = add <4 x i32> %y, %2 ret <4 x i32> %3 } + +; This test checks that VMLA FP patterns can be matched in instruction selection when targeting +; Cortex-A15. +; CHECK: fun_b: +define <4 x float> @fun_b(<4 x float> %x, <4 x float> %y, <4 x float> %z) nounwind{ +; CHECK: vmla.f32 + %t = fmul <4 x float> %x, %y + %r = fadd <4 x float> %t, %z + ret <4 x float> %r +} + +; This test checks that FP VMLA instructions are not expanded into separate multiply/addition +; operations when targeting Cortex-A15. +; CHECK: fun_c: +define <4 x float> @fun_c(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %u, <4 x float> %v) nounwind{ +; CHECK: vmla.f32 + %t1 = fmul <4 x float> %x, %y + %r1 = fadd <4 x float> %t1, %z +; CHECK: vmla.f32 + %t2 = fmul <4 x float> %u, %v + %r2 = fadd <4 x float> %t2, %r1 + ret <4 x float> %r2 +} + -- 2.7.4