From 89ea2648bbdea80193e9da5657db90d411620100 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Tue, 6 Feb 2018 08:43:56 +0000 Subject: [PATCH] [ARM] Armv8.2-A FP16 code generation (part 3/3) This adds most of the FP16 codegen support, but these areas need further work: - FP16 literals and immediates are not properly supported yet (e.g. literal pool needs work), - Instructions that are generated from intrinsics (e.g. vabs) haven't been added. This will be addressed in follow-up patches. Differential Revision: https://reviews.llvm.org/D42849 llvm-svn: 324321 --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 21 ++ llvm/lib/Target/ARM/ARMISelLowering.h | 1 + llvm/lib/Target/ARM/ARMInstrVFP.td | 119 ++++-- llvm/test/CodeGen/ARM/fp16-instructions.ll | 574 ++++++++++++++++++++++++++++- 4 files changed, 673 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 8e58bd3..72fede7 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1042,6 +1042,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); setOperationAction(ISD::SETCC, MVT::i32, Expand); + setOperationAction(ISD::SETCC, MVT::f16, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::SETCC, MVT::f64, Expand); setOperationAction(ISD::SELECT, MVT::i32, Custom); @@ -12746,6 +12747,24 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } +bool ARMTargetLowering::isFNegFree(EVT VT) const { + if (!VT.isSimple()) + return false; + + // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that + // negate values directly (fneg is free). So, we don't want to let the DAG + // combiner rewrite fneg into xors and some other instructions. For f16 and + // FullFP16 argument passing, some bitcast nodes may be introduced, + // triggering this DAG combine rewrite, so we are avoiding that with this. 
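+  // For example, if fneg were not reported as free here, a (fneg (fmul x, y))
+  // of f16 would be rewritten by the combiner into an integer xor of the sign
+  // bit around bitcasts, and the VNMUL/VNMLA/VNMLS patterns would no longer
+  // match it.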
+ switch (VT.getSimpleVT().SimpleTy) { + default: break; + case MVT::f16: + return Subtarget->hasFullFP16(); + } + + return false; +} + bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT VT = ExtVal.getValueType(); @@ -13842,6 +13861,8 @@ bool ARM::isBitFieldInvertedMask(unsigned v) { bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { if (!Subtarget->hasVFP3()) return false; + if (VT == MVT::f16 && Subtarget->hasFullFP16()) + return ARM_AM::getFP16Imm(Imm) != -1; if (VT == MVT::f32) return ARM_AM::getFP32Imm(Imm) != -1; if (VT == MVT::f64 && !Subtarget->isFPOnlySP()) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index b096331..b196e23 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -331,6 +331,7 @@ class VectorType; bool isTruncateFree(Type *SrcTy, Type *DstTy) const override; bool isTruncateFree(EVT SrcVT, EVT DstVT) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; + bool isFNegFree(EVT VT) const override; bool isVectorLoadExtDesirable(SDValue ExtVal) const override; diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index 48c1a38..29c68f7 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -395,9 +395,9 @@ def VDIVS : ASbI<0b11101, 0b00, 0, 0, let TwoOperandAliasConstraint = "$Sn = $Sd" in def VDIVH : AHbI<0b11101, 0b00, 0, 0, - (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fdiv HPR:$Sn, HPR:$Sm))]>, Sched<[WriteFPDIV32]>; let TwoOperandAliasConstraint = "$Dn = $Dd" in @@ -420,9 +420,9 @@ def VMULS : ASbIn<0b11100, 0b10, 0, 0, let TwoOperandAliasConstraint = "$Sn = $Sd" in def VMULH : AHbI<0b11100, 0b10, 0, 0, - (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fmul HPR:$Sn, HPR:$Sm))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; def VNMULD : ADbI<0b11100, 0b10, 1, 0, @@ -442,9 +442,9 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0, } def VNMULH : AHbI<0b11100, 0b10, 1, 0, - (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fneg (fmul HPR:$Sn, HPR:$Sm)))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; multiclass vsel_inst opc, int CC> { @@ -525,9 +525,9 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, } def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0, - (outs), (ins SPR:$Sd, SPR:$Sm), + (outs), (ins HPR:$Sd, HPR:$Sm), IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm", - []>; + [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 1))]>; def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins DPR:$Dd, DPR:$Dm), @@ -544,9 +544,9 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, } def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0, - (outs), (ins SPR:$Sd, SPR:$Sm), + (outs), (ins HPR:$Sd, HPR:$Sm), IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm", - []>; + [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 0))]>; } // Defs = [FPSCR_NZCV] //===----------------------------------------------------------------------===// @@ -771,7 +771,7 @@ multiclass vcvt_inst rm, SDPatternOperator node = null_frag> { let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in { def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0, - (outs SPR:$Sd), (ins SPR:$Sm), + (outs SPR:$Sd), (ins HPR:$Sm), NoItinerary, !strconcat("vcvt", opc, 
".s32.f16\t$Sd, $Sm"), []>, Requires<[HasFullFP16]> { @@ -779,7 +779,7 @@ multiclass vcvt_inst rm, } def UH : AHuInp<0b11101, 0b11, 0b1100, 0b01, 0, - (outs SPR:$Sd), (ins SPR:$Sm), + (outs SPR:$Sd), (ins HPR:$Sm), NoItinerary, !strconcat("vcvt", opc, ".u32.f16\t$Sd, $Sm"), []>, Requires<[HasFullFP16]> { @@ -834,6 +834,17 @@ multiclass vcvt_inst rm, } let Predicates = [HasFPARMv8] in { + let Predicates = [HasFullFP16] in { + def : Pat<(i32 (fp_to_sint (node HPR:$a))), + (COPY_TO_REGCLASS + (!cast(NAME#"SH") HPR:$a), + GPR)>; + + def : Pat<(i32 (fp_to_uint (node HPR:$a))), + (COPY_TO_REGCLASS + (!cast(NAME#"UH") HPR:$a), + GPR)>; + } def : Pat<(i32 (fp_to_sint (node SPR:$a))), (COPY_TO_REGCLASS (!cast(NAME#"SS") SPR:$a), @@ -875,9 +886,9 @@ def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0, } def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0, - (outs SPR:$Sd), (ins SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sm), IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm", - []>; + [(set HPR:$Sd, (fneg HPR:$Sm))]>; multiclass vrint_inst_zrx { def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0, @@ -1313,13 +1324,16 @@ def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOS (VLDRS addrmode5:$a))>; def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, - (outs SPR:$Sd), (ins SPR:$Sm), + (outs HPR:$Sd), (ins SPR:$Sm), IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm", []>, Sched<[WriteFPCVT]> { let Inst{7} = 1; // s32 } +def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)), + (VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>; + def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, (outs DPR:$Dd), (ins SPR:$Sm), IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm", @@ -1355,13 +1369,16 @@ def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOS (VLDRS addrmode5:$a))>; def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001, - (outs SPR:$Sd), (ins SPR:$Sm), + (outs HPR:$Sd), (ins SPR:$Sm), IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm", []>, Sched<[WriteFPCVT]> { let Inst{7} = 0; // u32 } +def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)), + (VUITOH (COPY_TO_REGCLASS GPR:$a, SPR))>; + // FP -> Int: class AVConv1IsD_Encode opcod1, bits<2> opcod2, bits<4> opcod3, @@ -1456,13 +1473,16 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))), (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>; def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001, - (outs SPR:$Sd), (ins SPR:$Sm), + (outs SPR:$Sd), (ins HPR:$Sm), IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm", []>, Sched<[WriteFPCVT]> { let Inst{7} = 1; // Z bit } +def : VFPNoNEONPat<(i32 (fp_to_sint HPR:$a)), + (COPY_TO_REGCLASS (VTOSIZH HPR:$a), GPR)>; + def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, (outs SPR:$Sd), (ins DPR:$Dm), IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm", @@ -1499,13 +1519,16 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))), (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>; def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001, - (outs SPR:$Sd), (ins SPR:$Sm), + (outs SPR:$Sd), (ins HPR:$Sm), IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm", []>, Sched<[WriteFPCVT]> { let Inst{7} = 1; // Z bit } +def : VFPNoNEONPat<(i32 (fp_to_uint HPR:$a)), + (COPY_TO_REGCLASS (VTOUIZH HPR:$a), GPR)>; + // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. 
let Uses = [FPSCR] in { def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, @@ -1789,9 +1812,10 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, } def VMLAH : AHbI<0b11100, 0b00, 0, 0, - (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm), + HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; @@ -1801,6 +1825,10 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>; +def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), + (VMLAH HPR:$dstin, HPR:$a, HPR:$b)>, + Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>; + def VMLSD : ADbI<0b11100, 0b00, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1825,9 +1853,10 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0, } def VMLSH : AHbI<0b11100, 0b00, 1, 0, - (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), + HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; @@ -1837,6 +1866,9 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; +def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), + (VMLSH HPR:$dstin, HPR:$a, HPR:$b)>, + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; def VNMLAD : ADbI<0b11100, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1861,9 +1893,10 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0, } def VNMLAH : AHbI<0b11100, 0b01, 1, 0, - (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), + HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; @@ -1874,6 +1907,9 @@ def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; +def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin), + (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>, + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; // (-dst - (a * b)) -> -(dst + (a * b)) def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))), @@ -1882,6 +1918,9 @@ def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))), def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; +def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)), + (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>, + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; def VNMLSD : ADbI<0b11100, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1905,9 +1944,9 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, } def VNMLSH : AHbI<0b11100, 0b01, 0, 0, - (outs SPR:$Sd), 
(ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; @@ -1917,6 +1956,9 @@ def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; +def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin), + (VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>, + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; //===----------------------------------------------------------------------===// // Fused FP Multiply-Accumulate Operations. @@ -1943,9 +1985,10 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0, } def VFMAH : AHbI<0b11101, 0b10, 0, 0, - (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm), + HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFusedMAC]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; @@ -1956,6 +1999,9 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VFMAS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; +def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), + (VFMAH HPR:$dstin, HPR:$a, HPR:$b)>, + Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics // (fma x, y, z) -> (vfms z, x, y) @@ -1988,9 +2034,10 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0, } def VFMSH : AHbI<0b11101, 0b10, 1, 0, - (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), + HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFusedMAC]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; @@ -2001,6 +2048,9 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VFMSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; +def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), + (VFMSH HPR:$dstin, HPR:$a, HPR:$b)>, + Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics // (fma (fneg x), y, z) -> (vfms z, x, y) @@ -2040,9 +2090,10 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0, } def VFNMAH : AHbI<0b11101, 0b01, 1, 0, - (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), + HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFusedMAC]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; @@ -2091,9 +2142,9 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0, } def VFNMSH : AHbI<0b11101, 0b01, 0, 0, - (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm", - []>, + [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFusedMAC]>, Sched<[WriteFPMAC32, 
ReadFPMAC, ReadFPMUL, ReadFPMUL]>; diff --git a/llvm/test/CodeGen/ARM/fp16-instructions.ll b/llvm/test/CodeGen/ARM/fp16-instructions.ll index b8ba9a6..702dafa 100644 --- a/llvm/test/CodeGen/ARM/fp16-instructions.ll +++ b/llvm/test/CodeGen/ARM/fp16-instructions.ll @@ -1,29 +1,44 @@ ; SOFT: ; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT +; RUN: llc < %s -mtriple=thumb-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT ; SOFTFP: ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-VFP3 ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-FP16 ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-FULLFP16 +; RUN: llc < %s -mtriple=thumbv7-none-eabi -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-VFP3 +; RUN: llc < %s -mtriple=thumbv7-none-eabi -mattr=+vfp4 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-FP16 +; RUN: llc < %s -mtriple=thumbv7-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-FULLFP16 + ; HARD: ; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-VFP3 ; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp4 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FP16 ; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FULLFP16 +; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-VFP3 +; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mattr=+vfp4 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FP16 +; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FULLFP16 + +; FP-CONTRACT=FAST +; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+fullfp16 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FULLFP16-FAST +; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mattr=+fullfp16 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FULLFP16-FAST + -define float @RetValBug(float %A.coerce) local_unnamed_addr { +define float @RetValBug(float %A.coerce) { entry: ret float undef -; This expression is optimised away due to the undef value. Check that -; LowerReturn can handle undef nodes (i.e. nodes which do not have any -; operands) when FullFP16 is enabled. +; Check thatLowerReturn can handle undef nodes (i.e. nodes which do not have +; any operands) when FullFP16 is enabled. ; ; CHECK-LABEL: RetValBug: -; CHECK-HARDFP-FULLFP16: mov pc, lr +; CHECK-HARDFP-FULLFP16: {{.*}} lr } -define float @Add(float %a.coerce, float %b.coerce) local_unnamed_addr { +; 1. VABS: TODO + +; 2. VADD +define float @Add(float %a.coerce, float %b.coerce) { entry: %0 = bitcast float %a.coerce to i32 %tmp.0.extract.trunc = trunc i32 %0 to i16 @@ -61,7 +76,6 @@ entry: ; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0 ; CHECK-SOFTFP-FULLFP16: vadd.f16 [[S0]], [[S2]], [[S0]] ; CHECK-SOFTFP-FULLFP16-NEXT: vmov.f16 r0, s0 -; CHECK-SOFTFP-FULLFP16-NEXT: mov pc, lr ; CHECK-HARDFP-VFP3: vmov r{{.}}, s0 ; CHECK-HARDFP-VFP3: vmov{{.*}}, s1 @@ -77,5 +91,549 @@ entry: ; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]] ; CHECK-HARDFP-FULLFP16: vadd.f16 s0, s0, s1 -; CHECK-HARDFP-FULLFP16-NEXT: mov pc, lr +} + +; 3. 
VCMP +define zeroext i1 @VCMP(float %F.coerce, float %G.coerce) { +entry: + %0 = bitcast float %F.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %G.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %cmp = fcmp ogt half %1, %3 + ret i1 %cmp + +; CHECK-LABEL: VCMP: + +; CHECK-SOFT: bl __aeabi_fcmpgt + +; CHECK-SOFTFP-VFP3: bl __aeabi_h2f +; CHECK-SOFTFP-VFP3: bl __aeabi_h2f +; CHECK-SOFTFP-VFP3: vcmpe.f32 s{{.}}, s{{.}} + +; CHECK-SOFTFP-FP16: vcvtb.f32.f16 s{{.}}, s{{.}} +; CHECK-SOFTFP-FP16: vcvtb.f32.f16 s{{.}}, s{{.}} +; CHECK-SOFTFP-FP16: vcmpe.f32 s{{.}}, s{{.}} + +; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0 +; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S0:s[0-9]]], r1 +; CHECK-SOFTFP-FULLFP16: vcmpe.f16 [[S2]], [[S0]] + +; CHECK-SOFTFP-FULLFP16-NOT: vmov.f16 s{{.}}, r0 +; CHECK-SOFTFP-FULLFP16-NOT: vmov.f16 s{{.}}, r1 +; CHECK-HARDFP-FULLFP16: vcmpe.f16 s0, s1 +} + +; 4. VCMPE + +; FIXME: enable when constant pool is fixed +; +;define i32 @VCMPE_IMM(float %F.coerce) { +;entry: +; %0 = bitcast float %F.coerce to i32 +; %tmp.0.extract.trunc = trunc i32 %0 to i16 +; %1 = bitcast i16 %tmp.0.extract.trunc to half +; %tmp = fcmp olt half %1, 1.000000e+00 +; %tmp1 = zext i1 %tmp to i32 +; ret i32 %tmp1 +;} + +define i32 @VCMPE(float %F.coerce, float %G.coerce) { +entry: + %0 = bitcast float %F.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %G.coerce to i32 + %tmp.1.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp.1.extract.trunc to half + %tmp = fcmp olt half %1, %3 + %tmp1 = zext i1 %tmp to i32 + ret i32 %tmp1 + +; CHECK-LABEL: VCMPE: +} + +; 5. VCVT (between floating-point and fixed-point) +; Only assembly/disassembly support + +; 6. VCVT (between floating-point and integer, both directions) +define i32 @fptosi(i32 %A.coerce) { +entry: + %tmp.0.extract.trunc = trunc i32 %A.coerce to i16 + %0 = bitcast i16 %tmp.0.extract.trunc to half + %conv = fptosi half %0 to i32 + ret i32 %conv + +; CHECK-LABEL: fptosi: + +; CHECK-HARDFP-FULLFP16: vmov.f16 s0, r0 +; CHECK-HARDFP-FULLFP16-NEXT: vcvt.s32.f16 s0, s0 +; CHECK-HARDFP-FULLFP16-NEXT: vmov r0, s0 +} + +define i32 @fptoui(i32 %A.coerce) { +entry: + %tmp.0.extract.trunc = trunc i32 %A.coerce to i16 + %0 = bitcast i16 %tmp.0.extract.trunc to half + %conv = fptoui half %0 to i32 + ret i32 %conv + +; CHECK-HARDFP-FULLFP16: vcvt.u32.f16 s0, s0 +; CHECK-HARDFP-FULLFP16-NEXT: vmov r0, s0 +} + +define float @UintToH(i32 %a, i32 %b) { +entry: + %0 = uitofp i32 %a to half + %1 = bitcast half %0 to i16 + %tmp0.insert.ext = zext i16 %1 to i32 + %2 = bitcast i32 %tmp0.insert.ext to float + ret float %2 + +; CHECK-LABEL: UintToH: + +; CHECK-HARDFP-FULLFP16: vmov s0, r0 +; CHECK-HARDFP-FULLFP16-NEXT: vcvt.f16.u32 s0, s0 +} + +define float @SintToH(i32 %a, i32 %b) { +entry: + %0 = sitofp i32 %a to half + %1 = bitcast half %0 to i16 + %tmp0.insert.ext = zext i16 %1 to i32 + %2 = bitcast i32 %tmp0.insert.ext to float + ret float %2 + +; CHECK-LABEL: SintToH: + +; CHECK-HARDFP-FULLFP16: vmov s0, r0 +; CHECK-HARDFP-FULLFP16-NEXT: vcvt.f16.s32 s0, s0 +} + +; TODO: +; 7. VCVTA +; 8. VCVTM +; 9. VCVTN +; 10. VCVTP +; 11. VCVTR + +; 12. 
VDIV +define float @Div(float %a.coerce, float %b.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %add = fdiv half %1, %3 + %4 = bitcast half %add to i16 + %tmp4.0.insert.ext = zext i16 %4 to i32 + %5 = bitcast i32 %tmp4.0.insert.ext to float + ret float %5 + +; CHECK-LABEL: Div: + +; CHECK-SOFT: bl __aeabi_h2f +; CHECK-SOFT: bl __aeabi_h2f +; CHECK-SOFT: bl __aeabi_fdiv +; CHECK-SOFT: bl __aeabi_f2h + +; CHECK-SOFTFP-VFP3: bl __aeabi_h2f +; CHECK-SOFTFP-VFP3: bl __aeabi_h2f +; CHECK-SOFTFP-VFP3: vdiv.f32 +; CHECK-SOFTFP-VFP3: bl __aeabi_f2h + +; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1 +; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0 +; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]] +; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]] +; CHECK-SOFTFP-FP16: vdiv.f32 [[S0]], [[S0]], [[S2]] +; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]] +; CHECK-SOFTFP-FP16: vmov r0, s0 + +; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S0:s[0-9]]], r1 +; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0 +; CHECK-SOFTFP-FULLFP16: vdiv.f16 [[S0]], [[S2]], [[S0]] +; CHECK-SOFTFP-FULLFP16-NEXT: vmov.f16 r0, s0 + +; CHECK-HARDFP-VFP3: vmov r{{.}}, s0 +; CHECK-HARDFP-VFP3: vmov{{.*}}, s1 +; CHECK-HARDFP-VFP3: bl __aeabi_h2f +; CHECK-HARDFP-VFP3: bl __aeabi_h2f +; CHECK-HARDFP-VFP3: vdiv.f32 +; CHECK-HARDFP-VFP3: bl __aeabi_f2h +; CHECK-HARDFP-VFP3: vmov s0, r0 + +; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1 +; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0 +; CHECK-HARDFP-FP16: vdiv.f32 [[S0]], [[S0]], [[S2]] +; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]] + +; CHECK-HARDFP-FULLFP16: vdiv.f16 s0, s0, s1 +} + +; 13. VFMA +define float @VFMA(float %a.coerce, float %b.coerce, float %c.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %4 = bitcast float %c.coerce to i32 + %tmp2.0.extract.trunc = trunc i32 %4 to i16 + %5 = bitcast i16 %tmp2.0.extract.trunc to half + %mul = fmul half %1, %3 + %add = fadd half %mul, %5 + %6 = bitcast half %add to i16 + %tmp4.0.insert.ext = zext i16 %6 to i32 + %7 = bitcast i32 %tmp4.0.insert.ext to float + ret float %7 + +; CHECK-LABEL: VFMA: +; CHECK-HARDFP-FULLFP16-FAST: vfma.f16 s2, s0, s1 +; CHECK-HARDFP-FULLFP16-FAST-NEXT: vmov.f32 s0, s2 +} + +; 14. VFMS +define float @VFMS(float %a.coerce, float %b.coerce, float %c.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %4 = bitcast float %c.coerce to i32 + %tmp2.0.extract.trunc = trunc i32 %4 to i16 + %5 = bitcast i16 %tmp2.0.extract.trunc to half + %mul = fmul half %1, %3 + %sub = fsub half %5, %mul + %6 = bitcast half %sub to i16 + %tmp4.0.insert.ext = zext i16 %6 to i32 + %7 = bitcast i32 %tmp4.0.insert.ext to float + ret float %7 + +; CHECK-LABEL: VFMS: +; CHECK-HARDFP-FULLFP16-FAST: vfms.f16 s2, s0, s1 +; CHECK-HARDFP-FULLFP16-FAST-NEXT: vmov.f32 s0, s2 +} + +; 15. 
VFNMA +define float @VFNMA(float %a.coerce, float %b.coerce, float %c.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %4 = bitcast float %c.coerce to i32 + %tmp2.0.extract.trunc = trunc i32 %4 to i16 + %5 = bitcast i16 %tmp2.0.extract.trunc to half + %mul = fmul half %1, %3 + %sub = fsub half -0.0, %mul + %sub2 = fsub half %sub, %5 + %6 = bitcast half %sub2 to i16 + %tmp4.0.insert.ext = zext i16 %6 to i32 + %7 = bitcast i32 %tmp4.0.insert.ext to float + ret float %7 + +; CHECK-LABEL: VFNMA: +; CHECK-HARDFP-FULLFP16-FAST: vfnma.f16 s2, s0, s1 +; CHECK-HARDFP-FULLFP16-FAST-NEXT: vmov.f32 s0, s2 +} + +; 16. VFNMS +define float @VFNMS(float %a.coerce, float %b.coerce, float %c.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %4 = bitcast float %c.coerce to i32 + %tmp2.0.extract.trunc = trunc i32 %4 to i16 + %5 = bitcast i16 %tmp2.0.extract.trunc to half + %mul = fmul half %1, %3 + %sub2 = fsub half %mul, %5 + %6 = bitcast half %sub2 to i16 + %tmp4.0.insert.ext = zext i16 %6 to i32 + %7 = bitcast i32 %tmp4.0.insert.ext to float + ret float %7 + +; CHECK-LABEL: VFNMS: +; CHECK-HARDFP-FULLFP16-FAST: vfnms.f16 s2, s0, s1 +; CHECK-HARDFP-FULLFP16-FAST-NEXT: vmov.f32 s0, s2 +} + +; TODO: +; 17. VMAXNM +; 18. VMINNM + +; 19. VMLA +define float @VMLA(float %a.coerce, float %b.coerce, float %c.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %4 = bitcast float %c.coerce to i32 + %tmp2.0.extract.trunc = trunc i32 %4 to i16 + %5 = bitcast i16 %tmp2.0.extract.trunc to half + %mul = fmul half %1, %3 + %add = fadd half %5, %mul + %6 = bitcast half %add to i16 + %tmp4.0.insert.ext = zext i16 %6 to i32 + %7 = bitcast i32 %tmp4.0.insert.ext to float + ret float %7 + +; CHECK-LABEL: VMLA: +; CHECK-HARDFP-FULLFP16: vmla.f16 s2, s0, s1 +; CHECK-HARDFP-FULLFP16-NEXT: vmov.f32 s0, s2 +} + +; 20. VMLS +define float @VMLS(float %a.coerce, float %b.coerce, float %c.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %4 = bitcast float %c.coerce to i32 + %tmp2.0.extract.trunc = trunc i32 %4 to i16 + %5 = bitcast i16 %tmp2.0.extract.trunc to half + %mul = fmul half %1, %3 + %add = fsub half %5, %mul + %6 = bitcast half %add to i16 + %tmp4.0.insert.ext = zext i16 %6 to i32 + %7 = bitcast i32 %tmp4.0.insert.ext to float + ret float %7 + +; CHECK-LABEL: VMLS: +; CHECK-HARDFP-FULLFP16: vmls.f16 s2, s0, s1 +; CHECK-HARDFP-FULLFP16-NEXT: vmov.f32 s0, s2 +} + +; TODO: fix immediates. +; 21. VMOV (between general-purpose register and half-precision register) +; 22. VMOV (immediate) + +; 23. 
VMUL +define float @Mul(float %a.coerce, float %b.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %add = fmul half %1, %3 + %4 = bitcast half %add to i16 + %tmp4.0.insert.ext = zext i16 %4 to i32 + %5 = bitcast i32 %tmp4.0.insert.ext to float + ret float %5 + +; CHECK-LABEL: Mul: + +; CHECK-SOFT: bl __aeabi_h2f +; CHECK-SOFT: bl __aeabi_h2f +; CHECK-SOFT: bl __aeabi_fmul +; CHECK-SOFT: bl __aeabi_f2h + +; CHECK-SOFTFP-VFP3: bl __aeabi_h2f +; CHECK-SOFTFP-VFP3: bl __aeabi_h2f +; CHECK-SOFTFP-VFP3: vmul.f32 +; CHECK-SOFTFP-VFP3: bl __aeabi_f2h + +; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1 +; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0 +; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]] +; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]] +; CHECK-SOFTFP-FP16: vmul.f32 [[S0]], [[S0]], [[S2]] +; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]] +; CHECK-SOFTFP-FP16: vmov r0, s0 + +; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S0:s[0-9]]], r1 +; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0 +; CHECK-SOFTFP-FULLFP16: vmul.f16 [[S0]], [[S2]], [[S0]] +; CHECK-SOFTFP-FULLFP16-NEXT: vmov.f16 r0, s0 + +; CHECK-HARDFP-VFP3: vmov r{{.}}, s0 +; CHECK-HARDFP-VFP3: vmov{{.*}}, s1 +; CHECK-HARDFP-VFP3: bl __aeabi_h2f +; CHECK-HARDFP-VFP3: bl __aeabi_h2f +; CHECK-HARDFP-VFP3: vmul.f32 +; CHECK-HARDFP-VFP3: bl __aeabi_f2h +; CHECK-HARDFP-VFP3: vmov s0, r0 + +; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1 +; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0 +; CHECK-HARDFP-FP16: vmul.f32 [[S0]], [[S0]], [[S2]] +; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]] + +; CHECK-HARDFP-FULLFP16: vmul.f16 s0, s0, s1 +} + +; 24. VNEG +define float @Neg(float %a.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = fsub half -0.000000e+00, %1 + %3 = bitcast half %2 to i16 + %tmp4.0.insert.ext = zext i16 %3 to i32 + %4 = bitcast i32 %tmp4.0.insert.ext to float + ret float %4 + +; CHECK-LABEL: Neg: +; CHECK-HARDFP-FULLFP16: vneg.f16 s0, s0 +} + +; 25. VNMLA +define float @VNMLA(float %a.coerce, float %b.coerce, float %c.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %4 = bitcast float %c.coerce to i32 + %tmp2.0.extract.trunc = trunc i32 %4 to i16 + %5 = bitcast i16 %tmp2.0.extract.trunc to half + %add = fmul half %1, %3 + %add2 = fsub half -0.000000e+00, %add + %add3 = fsub half %add2, %5 + %6 = bitcast half %add3 to i16 + %tmp4.0.insert.ext = zext i16 %6 to i32 + %7 = bitcast i32 %tmp4.0.insert.ext to float + ret float %7 + +; CHECK-LABEL: VNMLA: +; CHECK-HARDFP-FULLFP16: vnmla.f16 s2, s0, s1 +; CHECK-HARDFP-FULLFP16: vmov.f32 s0, s2 +} + +; 26. 
VNMLS +define float @VNMLS(float %a.coerce, float %b.coerce, float %c.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %4 = bitcast float %c.coerce to i32 + %tmp2.0.extract.trunc = trunc i32 %4 to i16 + %5 = bitcast i16 %tmp2.0.extract.trunc to half + %add = fmul half %1, %3 + %add2 = fsub half %add, %5 + %6 = bitcast half %add2 to i16 + %tmp4.0.insert.ext = zext i16 %6 to i32 + %7 = bitcast i32 %tmp4.0.insert.ext to float + ret float %7 + +; CHECK-LABEL: VNMLS: +; CHECK-HARDFP-FULLFP16: vnmls.f16 s2, s0, s1 +; CHECK-HARDFP-FULLFP16: vmov.f32 s0, s2 +} + +; 27. VNMUL +define float @NMul(float %a.coerce, float %b.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %add = fmul half %1, %3 + %add2 = fsub half -0.0, %add + %4 = bitcast half %add2 to i16 + %tmp4.0.insert.ext = zext i16 %4 to i32 + %5 = bitcast i32 %tmp4.0.insert.ext to float + ret float %5 + +; CHECK-LABEL: NMul: +; CHECK-HARDFP-FULLFP16: vnmul.f16 s0, s0, s1 +} + +; 28. VRINTA +; 29. VRINTM +; 30. VRINTN +; 31. VRINTP +; 32. VRINTR +; 33. VRINTX +; 34. VRINTZ +; 35. VSELEQ +; 36. VSELGE +; 37. VSELGT +; 38. VSELVS +; 39. VSQRT + +; 40. VSUB +define float @Sub(float %a.coerce, float %b.coerce) { +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = bitcast float %b.coerce to i32 + %tmp1.0.extract.trunc = trunc i32 %2 to i16 + %3 = bitcast i16 %tmp1.0.extract.trunc to half + %add = fsub half %1, %3 + %4 = bitcast half %add to i16 + %tmp4.0.insert.ext = zext i16 %4 to i32 + %5 = bitcast i32 %tmp4.0.insert.ext to float + ret float %5 + +; CHECK-LABEL: Sub: + +; CHECK-SOFT: bl __aeabi_h2f +; CHECK-SOFT: bl __aeabi_h2f +; CHECK-SOFT: bl __aeabi_fsub +; CHECK-SOFT: bl __aeabi_f2h + +; CHECK-SOFTFP-VFP3: bl __aeabi_h2f +; CHECK-SOFTFP-VFP3: bl __aeabi_h2f +; CHECK-SOFTFP-VFP3: vsub.f32 +; CHECK-SOFTFP-VFP3: bl __aeabi_f2h + +; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1 +; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0 +; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]] +; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]] +; CHECK-SOFTFP-FP16: vsub.f32 [[S0]], [[S0]], [[S2]] +; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]] +; CHECK-SOFTFP-FP16: vmov r0, s0 + +; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S0:s[0-9]]], r1 +; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0 +; CHECK-SOFTFP-FULLFP16: vsub.f16 [[S0]], [[S2]], [[S0]] +; CHECK-SOFTFP-FULLFP16-NEXT: vmov.f16 r0, s0 + +; CHECK-HARDFP-VFP3: vmov r{{.}}, s0 +; CHECK-HARDFP-VFP3: vmov{{.*}}, s1 +; CHECK-HARDFP-VFP3: bl __aeabi_h2f +; CHECK-HARDFP-VFP3: bl __aeabi_h2f +; CHECK-HARDFP-VFP3: vsub.f32 +; CHECK-HARDFP-VFP3: bl __aeabi_f2h +; CHECK-HARDFP-VFP3: vmov s0, r0 + +; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1 +; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0 +; CHECK-HARDFP-FP16: vsub.f32 [[S0]], [[S0]], [[S2]] +; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]] + +; CHECK-HARDFP-FULLFP16: vsub.f16 s0, s0, s1 } -- 2.7.4
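For reference, the ARM_AM::getFP16Imm() check added to isFPImmLegal() accepts exactly the
constants expressible as an 8-bit "modified immediate" abcdefgh, which the architecture
expands per the VFPExpandImm rule for half precision: sign = a, exponent = NOT(b):b:b:c:d,
fraction = efgh followed by six zero bits, i.e. values of the form +/-(16..31)/16 * 2^n
with n in [-3, 4]. A minimal sketch of that expansion direction (illustrative only:
expandFP16Imm is a hypothetical helper written for this note, not the in-tree
implementation):

// Illustrative only: expand an ARM FP "modified immediate" abcdefgh into the
// IEEE-754 binary16 bit pattern it encodes (VFPExpandImm, N=16):
//   sign     = a
//   exponent = NOT(b) : b : b : c : d        (5 bits)
//   fraction = e f g h 0 0 0 0 0 0           (10 bits)
#include <cstdint>

uint16_t expandFP16Imm(uint8_t imm8) {
  uint16_t a    = (imm8 >> 7) & 1;   // sign
  uint16_t b    = (imm8 >> 6) & 1;   // top exponent control bit
  uint16_t cd   = (imm8 >> 4) & 3;   // low two exponent bits
  uint16_t efgh = imm8 & 0xF;        // high four fraction bits

  uint16_t exp = static_cast<uint16_t>(((b ? 0 : 1) << 4) | (b << 3) | (b << 2) | cd);
  return static_cast<uint16_t>((a << 15) | (exp << 10) | (efgh << 6));
}

So, for example, imm8 0x70 expands to 0x3C00 (1.0) and can be materialized directly by a
vmov.f16; constants outside this set still need the f16 constant-pool path that the commit
message lists as remaining work.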