From: Sjoerd Meijer Date: Wed, 17 Oct 2018 07:26:35 +0000 (+0000) Subject: [ARM][NFCI] Do not fuse VADD and VMUL, continued (1/2) X-Git-Tag: llvmorg-8.0.0-rc1~6375 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ff3ab33ec89042ed93bc8488b6c6971a1b4a9cf1;p=platform%2Fupstream%2Fllvm.git
[ARM][NFCI] Do not fuse VADD and VMUL, continued (1/2)
This is a follow-up to rL342874, which stopped fusing muls and adds into VMLAs for performance reasons on the Cortex-M4 and Cortex-M33. This is a series of 2 patches that tries to achieve the same for VFMA. The second column in the table below shows what we were generating before rL342874, the third column what changed with rL342874, and the last column what we want to achieve with these 2 patches:
--------------------------------------------------------
| Opt    | < rL342874 | >= rL342874 |                  |
|------------------------------------------------------|
|-O3     | vmla       | vmul        | vmul             |
|        |            | vadd        | vadd             |
|------------------------------------------------------|
|-Ofast  | vfma       | vfma        | vmul             |
|        |            |             | vadd             |
|------------------------------------------------------|
|-Oz     | vmla       | vmla        | vmla             |
--------------------------------------------------------
This patch, 1/2, is a cleanup of the spaghetti predicate logic on the different VMLA and VFMA codegen rules, so that we can make the final functional change in patch 2/2. This also fixes a typo in the regression test added in rL342874.
Differential revision: https://reviews.llvm.org/D53314 llvm-svn: 344671 --- diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index e1a077e..8aa05fa 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -357,7 +357,10 @@ let RecomputePerFunction = 1 in { def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">; def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">; def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">; - def UseFPVMLx : Predicate<"Subtarget->useFPVMLx() || MF->getFunction().optForMinSize()">; + + def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&" + " !TM.Options.AllowFPOpFusion == FPOpFusion::Fast) ||" + "MF->getFunction().optForMinSize())">; } def UseMulOps : Predicate<"Subtarget->useMulOps()">; @@ -368,10 +371,6 @@ def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" " FPOpFusion::Fast && " " Subtarget->hasVFP4()) && " "!Subtarget->isTargetDarwin()">; -def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion ==" - " FPOpFusion::Fast &&" - " Subtarget->hasVFP4()) || " - "Subtarget->isTargetDarwin()">; def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">; def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">; diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index a7bb32d..2085507 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -4402,16 +4402,16 @@ defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", v2f32, fmul_su, fadd_mlx>, - Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseFPVMLx]>; def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", v4f32, fmul_su, fadd_mlx>, - 
Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseFPVMLx]>; def VMLAhd : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16", v4f16, fmul_su, fadd_mlx>, - Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; def VMLAhq : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16", v8f16, fmul_su, fadd_mlx>, - Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", @@ -4632,16 +4632,16 @@ defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", v2f32, fmul_su, fsub_mlx>, - Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseFPVMLx]>; def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", v4f32, fmul_su, fsub_mlx>, - Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseFPVMLx]>; def VMLShd : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16", v4f16, fmul, fsub>, - Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; def VMLShq : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16", v8f16, fmul, fsub>, - Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", @@ -7084,9 +7084,9 @@ def : N3VSPat; def : N3VSPat; def : N3VSPat; def : N3VSMulOpPat, - Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; def : N3VSMulOpPat, - 
Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; def : N3VSMulOpPat, Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>; def : N3VSMulOpPat, diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index b4e28b9..b58730c 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -1814,7 +1814,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0, [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VMLAS : ASbIn<0b11100, 0b00, 0, 0, @@ -1823,7 +1823,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -1836,17 +1836,17 @@ def VMLAH : AHbI<0b11100, 0b00, 0, 0, [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,UseFPVMLx]>; def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>; def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), (VMLAH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>; def VMLSD : ADbI<0b11100, 0b00, 1, 0, @@ -1855,7 +1855,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0, [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VMLSS : ASbIn<0b11100, 0b00, 1, 0, @@ -1864,7 +1864,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0, [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -1877,17 +1877,17 @@ def VMLSH : AHbI<0b11100, 0b00, 1, 0, [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,UseFPVMLx]>; def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), (VMLSH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; def VNMLAD : ADbI<0b11100, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1895,7 +1895,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0, [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VNMLAS : ASbI<0b11100, 0b01, 1, 0, @@ -1904,7 +1904,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0, [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -1917,29 +1917,29 @@ def VNMLAH : AHbI<0b11100, 0b01, 1, 0, [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,UseFPVMLx]>; // (-(a * b) - dst) -> -(dst + (a * b)) def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin), (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; // (-dst - (a * b)) -> -(dst + (a * b)) def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))), (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)), (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; def VNMLSD : ADbI<0b11100, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1947,7 +1947,7 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0, [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>, + 
Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VNMLSS : ASbI<0b11100, 0b01, 0, 0, @@ -1955,7 +1955,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. @@ -1967,17 +1967,17 @@ def VNMLSH : AHbI<0b11100, 0b01, 0, 0, IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm", [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,UseFPVMLx]>; def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin), (VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; //===----------------------------------------------------------------------===// // Fused FP Multiply-Accumulate Operations. 
diff --git a/llvm/test/CodeGen/ARM/fmacs.ll b/llvm/test/CodeGen/ARM/fmacs.ll index 027991e..140ab93 100644 --- a/llvm/test/CodeGen/ARM/fmacs.ll +++ b/llvm/test/CodeGen/ARM/fmacs.ll @@ -27,10 +27,11 @@ entry: ret float %1 } -define float @vlma_minsize(float %acc, float %a, float %b) #0 { +define float @vmla_minsize(float %acc, float %a, float %b) #0 { entry: -; VMLA-LABEL: vlma_minsize: -; VLMA: vmla.f32 s0, s1, s2 +; VMLA-LABEL: vmla_minsize: +; VMLA: vmla.f32 s0, s1, s2 +; VMLA-NEXT: bx lr %0 = fmul float %a, %b %1 = fadd float %acc, %0