From: Renato Golin
Date: Thu, 21 Mar 2013 18:47:47 +0000 (+0000)
Subject: Avoid NEON SP-FP unless unsafe-math or Darwin
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b4dd6c5945db4f5e7b107a3c7f8c481b460dec25;p=platform%2Fupstream%2Fllvm.git

Avoid NEON SP-FP unless unsafe-math or Darwin

NEON is not IEEE 754 compliant, so we should avoid lowering
single-precision floating-point operations with NEON unless unsafe-math
is turned on. The equivalent VFP instructions are IEEE 754 compliant,
but on some cores they are much slower, so some architectures/OSs may
still request NEON to be used by default, such as Swift and Darwin.

llvm-svn: 177651
---

diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 46915ee..6838084 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -143,14 +143,12 @@ include "ARMSchedule.td"
 // ARM processor families.
 def ProcA5      : SubtargetFeature<"a5", "ARMProcFamily", "CortexA5",
                                    "Cortex-A5 ARM processors",
-                                   [FeatureSlowFPBrcc, FeatureNEONForFP,
-                                    FeatureHasSlowFPVMLx, FeatureVMLxForwarding,
-                                    FeatureT2XtPk]>;
+                                   [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
+                                    FeatureVMLxForwarding, FeatureT2XtPk]>;
 def ProcA8      : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
                                    "Cortex-A8 ARM processors",
-                                   [FeatureSlowFPBrcc, FeatureNEONForFP,
-                                    FeatureHasSlowFPVMLx, FeatureVMLxForwarding,
-                                    FeatureT2XtPk]>;
+                                   [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
+                                    FeatureVMLxForwarding, FeatureT2XtPk]>;
 def ProcA9      : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
                                    "Cortex-A9 ARM processors",
                                    [FeatureVMLxForwarding,
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index e11314d..739300e 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -19,6 +19,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
@@ -42,12 +43,13 @@ StrictAlign("arm-strict-align", cl::Hidden,
             cl::desc("Disallow all unaligned memory accesses"));
 
 ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
-                           const std::string &FS)
+                           const std::string &FS, const TargetOptions &Options)
   : ARMGenSubtargetInfo(TT, CPU, FS)
   , ARMProcFamily(Others)
   , stackAlignment(4)
   , CPUString(CPU)
   , TargetTriple(TT)
+  , Options(Options)
   , TargetABI(ARM_ABI_APCS) {
   initializeEnvironment();
   resetSubtargetFeatures(CPU, FS);
@@ -92,6 +94,7 @@ void ARMSubtarget::initializeEnvironment() {
   AllowsUnalignedMem = false;
   Thumb2DSP = false;
   UseNaClTrap = false;
+  UnsafeFPMath = false;
 }
 
 void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) {
@@ -162,6 +165,12 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
   // configuration.
   if (!StrictAlign && hasV6Ops() && isTargetDarwin())
     AllowsUnalignedMem = true;
+
+  // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
+  uint64_t Bits = getFeatureBits();
+  if ((Bits & ARM::ProcA5 || Bits & ARM::ProcA8) && // Where this matters
+      (Options.UnsafeFPMath || isTargetDarwin()))
+    UseNEONForSinglePrecisionFP = true;
 }
 
 /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol.
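The ARMSubtarget.cpp hunk above is the heart of the change: UseNEONForSinglePrecisionFP is now only set for Cortex-A5/A8 when the TargetOptions carry UnsafeFPMath or the target is Darwin. As a rough sketch of the observable effect (it mirrors the fmuls.ll update below; the SAFE/UNSAFE prefixes and the function name are illustrative only, not part of this patch), a single-precision multiply should now select VFP S registers by default and NEON D registers only under unsafe-math:

; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=SAFE
; RUN: llc < %s -march=arm -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFE
define float @fmul_sp(float %a, float %b) {
entry:
  ; Single-precision multiply: VFP (S registers) keeps IEEE 754 semantics,
  ; NEON (D registers) is only acceptable under unsafe-math.
  %0 = fmul float %a, %b
  ret float %0
}
; SAFE: vmul.f32 s
; UNSAFE: vmul.f32 d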
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 8ce22e1..5b5ee6a 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -26,6 +26,7 @@ namespace llvm {
 class GlobalValue;
 class StringRef;
+class TargetOptions;
 
 class ARMSubtarget : public ARMGenSubtargetInfo {
 protected:
@@ -159,6 +160,9 @@ protected:
   /// NaCl TRAP instruction is generated instead of the regular TRAP.
   bool UseNaClTrap;
 
+  /// Target machine allowed unsafe FP math (such as use of NEON fp)
+  bool UnsafeFPMath;
+
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned stackAlignment;
@@ -175,6 +179,9 @@ protected:
   /// Selected instruction itineraries (one entry per itinerary class.)
   InstrItineraryData InstrItins;
 
+  /// Options passed via command line that could influence the target
+  const TargetOptions &Options;
+
  public:
   enum {
     isELF, isDarwin
@@ -189,7 +196,7 @@ protected:
   /// of the specified triple.
   ///
   ARMSubtarget(const std::string &TT, const std::string &CPU,
-               const std::string &FS);
+               const std::string &FS, const TargetOptions &Options);
 
   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 3003760..b0f9e56 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -48,7 +48,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT,
                                            Reloc::Model RM, CodeModel::Model CM,
                                            CodeGenOpt::Level OL)
   : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-    Subtarget(TT, CPU, FS),
+    Subtarget(TT, CPU, FS, Options),
     JITInfo(),
     InstrItins(Subtarget.getInstrItineraryData()) {
   // Default to soft float ABI
diff --git a/llvm/test/CodeGen/ARM/fadds.ll b/llvm/test/CodeGen/ARM/fadds.ll
index 48ef5ed..5aabc66 100644
--- a/llvm/test/CodeGen/ARM/fadds.ll
+++ b/llvm/test/CodeGen/ARM/fadds.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0
 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CORTEXA8U
 ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9
 
 define float @test(float %a, float %b) {
@@ -18,6 +19,8 @@ entry:
 ; NFP0: vadd.f32 s
 
 ; CORTEXA8: test:
-; CORTEXA8: vadd.f32 d
+; CORTEXA8: vadd.f32 s
+; CORTEXA8U: test:
+; CORTEXA8U: vadd.f32 d
 ; CORTEXA9: test:
-; CORTEXA9: vadd.f32 s{{.}}, s{{.}}, s{{.}}
+; CORTEXA9: vadd.f32 s
diff --git a/llvm/test/CodeGen/ARM/fmuls.ll b/llvm/test/CodeGen/ARM/fmuls.ll
index 1566a92..9569880 100644
--- a/llvm/test/CodeGen/ARM/fmuls.ll
+++ b/llvm/test/CodeGen/ARM/fmuls.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0
 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CORTEXA8U
 ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9
 
 define float @test(float %a, float %b) {
@@ -18,9 +19,11 @@ entry:
 ; NFP0: vmul.f32 s
 
 ; CORTEXA8: test:
-; CORTEXA8: vmul.f32 d
+; CORTEXA8: vmul.f32 s
+; CORTEXA8U: test:
+; CORTEXA8U: vmul.f32 d
 ; CORTEXA9: test:
-; CORTEXA9: vmul.f32 s{{.}}, s{{.}}, s{{.}}
+; CORTEXA9: vmul.f32 s
 
 ; VFP2: test2
 define float @test2(float %a) nounwind {
diff --git a/llvm/test/CodeGen/ARM/fnegs.ll b/llvm/test/CodeGen/ARM/fnegs.ll
index 418b598..30549b0 100644
--- a/llvm/test/CodeGen/ARM/fnegs.ll
+++ b/llvm/test/CodeGen/ARM/fnegs.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0
 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CORTEXA8U
 ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9
 
 define float @test1(float* %a) {
@@ -22,7 +23,10 @@ entry:
 ; NFP0: vneg.f32 s{{.*}}, s{{.*}}
 
 ; CORTEXA8: test1:
-; CORTEXA8: vneg.f32 d{{.*}}, d{{.*}}
+; CORTEXA8: vneg.f32 s{{.*}}, s{{.*}}
+
+; CORTEXA8U: test1:
+; CORTEXA8U: vneg.f32 d{{.*}}, d{{.*}}
 
 ; CORTEXA9: test1:
 ; CORTEXA9: vneg.f32 s{{.*}}, s{{.*}}
@@ -46,7 +50,10 @@ entry:
 ; NFP0: vneg.f32 s{{.*}}, s{{.*}}
 
 ; CORTEXA8: test2:
-; CORTEXA8: vneg.f32 d{{.*}}, d{{.*}}
+; CORTEXA8: vneg.f32 s{{.*}}, s{{.*}}
+
+; CORTEXA8U: test2:
+; CORTEXA8U: vneg.f32 d{{.*}}, d{{.*}}
 
 ; CORTEXA9: test2:
 ; CORTEXA9: vneg.f32 s{{.*}}, s{{.*}}
diff --git a/llvm/test/CodeGen/ARM/fnmscs.ll b/llvm/test/CodeGen/ARM/fnmscs.ll
index 9ce9b7a..b5b4211 100644
--- a/llvm/test/CodeGen/ARM/fnmscs.ll
+++ b/llvm/test/CodeGen/ARM/fnmscs.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NEON
 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=A8U
 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=A8
 
 define float @t1(float %acc, float %a, float %b) nounwind {
@@ -11,9 +12,13 @@ entry:
 ; NEON: t1:
 ; NEON: vnmla.f32
 
+; A8U: t1:
+; A8U: vnmul.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}}
+; A8U: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
+
 ; A8: t1:
 ; A8: vnmul.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}}
-; A8: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
+; A8: vsub.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}}
   %0 = fmul float %a, %b
   %1 = fsub float -0.0, %0
   %2 = fsub float %1, %acc
@@ -28,9 +33,13 @@ entry:
 ; NEON: t2:
 ; NEON: vnmla.f32
 
+; A8U: t2:
+; A8U: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}}
+; A8U: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
+
 ; A8: t2:
 ; A8: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}}
-; A8: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
+; A8: vsub.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}}
   %0 = fmul float %a, %b
   %1 = fmul float -1.0, %0
   %2 = fsub float %1, %acc
@@ -45,6 +54,10 @@ entry:
 ; NEON: t3:
 ; NEON: vnmla.f64
 
+; A8U: t3:
+; A8U: vnmul.f64 d
+; A8U: vsub.f64 d
+
 ; A8: t3:
 ; A8: vnmul.f64 d
 ; A8: vsub.f64 d
@@ -62,6 +75,10 @@ entry:
 ; NEON: t4:
 ; NEON: vnmla.f64
 
+; A8U: t4:
+; A8U: vnmul.f64 d
+; A8U: vsub.f64 d
+
 ; A8: t4:
 ; A8: vnmul.f64 d
 ; A8: vsub.f64 d
diff --git a/llvm/test/CodeGen/ARM/fp_convert.ll b/llvm/test/CodeGen/ARM/fp_convert.ll
index 44298b9..51f3890 100644
--- a/llvm/test/CodeGen/ARM/fp_convert.ll
+++ b/llvm/test/CodeGen/ARM/fp_convert.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=NEON
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=NEON
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=VFP2
 ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=VFP2
 
 define i32 @test1(float %a, float %b) {
diff --git a/llvm/test/CodeGen/ARM/fsubs.ll b/llvm/test/CodeGen/ARM/fsubs.ll
index f039e74..3013e56 100644
--- a/llvm/test/CodeGen/ARM/fsubs.ll
+++ b/llvm/test/CodeGen/ARM/fsubs.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=NFP1U
 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=NFP1
 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0
 
@@ -9,5 +10,6 @@ entry:
 }
 
 ; VFP2: vsub.f32 s
-; NFP1: vsub.f32 d
+; NFP1U: vsub.f32 d
+; NFP1: vsub.f32 s
 ; NFP0: vsub.f32 s
diff --git a/llvm/test/CodeGen/ARM/neon-spfp.ll b/llvm/test/CodeGen/ARM/neon-spfp.ll
new file mode 100644
index 0000000..c00f0d1
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/neon-spfp.ll
@@ -0,0 +1,76 @@
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a5 | FileCheck %s -check-prefix=LINUXA5
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a8 | FileCheck %s -check-prefix=LINUXA8
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a9 | FileCheck %s -check-prefix=LINUXA9
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 | FileCheck %s -check-prefix=LINUXA15
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift | FileCheck %s -check-prefix=LINUXSWIFT
+
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a5 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFEA5
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFEA8
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a9 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFEA9
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFEA15
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFESWIFT
+
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a5 | FileCheck %s -check-prefix=DARWINA5
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=DARWINA8
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a9 | FileCheck %s -check-prefix=DARWINA9
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a15 | FileCheck %s -check-prefix=DARWINA15
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=swift | FileCheck %s -check-prefix=DARWINSWIFT
+
+; This test makes sure we're not lowering VMUL.f32 D* (aka. NEON) for single-prec. FP ops, since
+; NEON is not fully IEEE 754 compliant, unless unsafe-math is selected.
+
+@.str = private unnamed_addr constant [12 x i8] c"S317\09%.5g \0A\00", align 1
+
+; LINUXA5: main:
+; LINUXA8: main:
+; LINUXA9: main:
+; LINUXA15: main:
+; LINUXSWIFT: main:
+; UNSAFEA5: main:
+; UNSAFEA8: main:
+; UNSAFEA9: main:
+; UNSAFEA15: main:
+; UNSAFESWIFT: main:
+; DARWINA5: main:
+; DARWINA8: main:
+; DARWINA9: main:
+; DARWINA15: main:
+; DARWINSWIFT: main:
+define i32 @main() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.04 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %q.03 = phi float [ 1.000000e+00, %entry ], [ %mul, %for.body ]
+  %mul = fmul float %q.03, 0x3FEFAE1480000000
+; LINUXA5: vmul.f32 s{{[0-9]*}}
+; LINUXA8: vmul.f32 s{{[0-9]*}}
+; LINUXA9: vmul.f32 s{{[0-9]*}}
+; LINUXA15: vmul.f32 s{{[0-9]*}}
+; Swift is *always* unsafe
+; LINUXSWIFT: vmul.f32 d{{[0-9]*}}
+
+; UNSAFEA5: vmul.f32 d{{[0-9]*}}
+; UNSAFEA8: vmul.f32 d{{[0-9]*}}
+; A9 and A15 don't need this
+; UNSAFEA9: vmul.f32 s{{[0-9]*}}
+; UNSAFEA15: vmul.f32 s{{[0-9]*}}
+; UNSAFESWIFT: vmul.f32 d{{[0-9]*}}
+
+; DARWINA5: vmul.f32 d{{[0-9]*}}
+; DARWINA8: vmul.f32 d{{[0-9]*}}
+; DARWINA9: vmul.f32 s{{[0-9]*}}
+; DARWINA15: vmul.f32 s{{[0-9]*}}
+; DARWINSWIFT: vmul.f32 d{{[0-9]*}}
+  %conv = fpext float %mul to double
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), double %conv) #1
+  %inc = add nsw i32 %i.04, 1
+  %exitcond = icmp eq i32 %inc, 16000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...)
diff --git a/llvm/test/CodeGen/ARM/neon_minmax.ll b/llvm/test/CodeGen/ARM/neon_minmax.ll
index d301c6a..0a7c8b2 100644
--- a/llvm/test/CodeGen/ARM/neon_minmax.ll
+++ b/llvm/test/CodeGen/ARM/neon_minmax.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s
 
 define float @fmin_ole(float %x) nounwind {
 ;CHECK: fmin_ole: