From 951a10e330f25b86ad7c6aba496f724c37c05e11 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Wed, 6 Nov 2019 23:27:35 +0300 Subject: [PATCH] Improve Math(F).FusedMultiplyAdd codegen (dotnet/coreclr#27060) * improve Math(F).FusedMultiplyAdd * fix wrong order and clang formatting * fix ordering again * code formatting * fix typos * Move to Lower * clean up * Fix assert * code formatting * code formatting * Address feedback * Clean up * Add an assert to make sure we only have 3 args * Address feedback and add tests * Address feedback * update tests * Fix CI failure * Address feedback * formatting * Fix failures Commit migrated from https://github.com/dotnet/coreclr/commit/d9272784d10f38991e3004c9b074d6aa444a39cb --- src/coreclr/src/jit/instrsxarch.h | 48 ++-- src/coreclr/src/jit/lower.h | 1 + src/coreclr/src/jit/lowerxarch.cpp | 76 +++++ .../src/JIT/Intrinsics/MathFusedMultiplyAdd.cs | 320 +++++++++++++++++++++ .../JIT/Intrinsics/MathFusedMultiplyAdd_r.csproj | 13 + .../JIT/Intrinsics/MathFusedMultiplyAdd_ro.csproj | 13 + 6 files changed, 447 insertions(+), 24 deletions(-) create mode 100644 src/coreclr/tests/src/JIT/Intrinsics/MathFusedMultiplyAdd.cs create mode 100644 src/coreclr/tests/src/JIT/Intrinsics/MathFusedMultiplyAdd_r.csproj create mode 100644 src/coreclr/tests/src/JIT/Intrinsics/MathFusedMultiplyAdd_ro.csproj diff --git a/src/coreclr/src/jit/instrsxarch.h b/src/coreclr/src/jit/instrsxarch.h index 0b98f49..33ece44 100644 --- a/src/coreclr/src/jit/instrsxarch.h +++ b/src/coreclr/src/jit/instrsxarch.h @@ -557,30 +557,30 @@ INST3(vfmsub231sd, "fmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, INST3(vfmsub132ss, "fmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Single-Precision Floating-Point Values INST3(vfmsub213ss, "fmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_Flags_IsDstDstSrcAVXInstruction) // INST3(vfmsub231ss, "fmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132pd, "fmnadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values -INST3(vfnmadd213pd, "fmnadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231pd, "fmnadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132ps, "fmnadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values -INST3(vfnmadd213ps, "fmnadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231ps, "fmnadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132sd, "fmnadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values -INST3(vfnmadd213sd, "fmnadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231sd, "fmnadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132ss, "fmnadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values -INST3(vfnmadd213ss, "fmnadd213ss", 
IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231ss, "fmnadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132pd, "fmnsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values -INST3(vfnmsub213pd, "fmnsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231pd, "fmnsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132ps, "fmnsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values -INST3(vfnmsub213ps, "fmnsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231ps, "fmnsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132sd, "fmnsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values -INST3(vfnmsub213sd, "fmnsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231sd, "fmnsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132ss, "fmnsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values -INST3(vfnmsub213ss, "fmnsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231ss, "fmnsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132pd, "fnmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values +INST3(vfnmadd213pd, "fnmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231pd, "fnmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132ps, "fnmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values +INST3(vfnmadd213ps, "fnmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231ps, "fnmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132sd, "fnmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values +INST3(vfnmadd213sd, "fnmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231sd, "fnmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132ss, "fnmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values +INST3(vfnmadd213ss, "fnmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231ss, 
"fnmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132pd, "fnmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values +INST3(vfnmsub213pd, "fnmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231pd, "fnmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132ps, "fnmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values +INST3(vfnmsub213ps, "fnmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231ps, "fnmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132sd, "fnmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values +INST3(vfnmsub213sd, "fnmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231sd, "fnmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132ss, "fnmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values +INST3(vfnmsub213ss, "fnmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231ss, "fnmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) // INST3(LAST_FMA_INSTRUCTION, "LAST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) // BMI1 diff --git a/src/coreclr/src/jit/lower.h b/src/coreclr/src/jit/lower.h index 2f03b1d..e4817ee 100644 --- a/src/coreclr/src/jit/lower.h +++ b/src/coreclr/src/jit/lower.h @@ -311,6 +311,7 @@ private: #ifdef FEATURE_HW_INTRINSICS void LowerHWIntrinsic(GenTreeHWIntrinsic* node); void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition); + void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node); #endif // FEATURE_HW_INTRINSICS // Utility functions diff --git a/src/coreclr/src/jit/lowerxarch.cpp b/src/coreclr/src/jit/lowerxarch.cpp index 575b7af..fc3ff74 100644 --- a/src/coreclr/src/jit/lowerxarch.cpp +++ b/src/coreclr/src/jit/lowerxarch.cpp @@ -841,6 +841,78 @@ void Lowering::LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIn } //---------------------------------------------------------------------------------------------- +// LowerFusedMultiplyAdd: Changes NI_FMA_MultiplyAddScalar produced by Math(F).FusedMultiplyAdd +// to a better FMA intrinsics if there are GT_NEG around in order to eliminate them. 
+// +// Arguments: +// node - The hardware intrinsic node +// +// Notes: +// Math(F).FusedMultiplyAdd is expanded into NI_FMA_MultiplyAddScalar and +// depending on additional GT_NEG nodes around it can be: +// +// x * y + z -> NI_FMA_MultiplyAddScalar +// x * -y + z -> NI_FMA_MultiplyAddNegatedScalar +// -x * y + z -> NI_FMA_MultiplyAddNegatedScalar +// -x * -y + z -> NI_FMA_MultiplyAddScalar +// x * y - z -> NI_FMA_MultiplySubtractScalar +// x * -y - z -> NI_FMA_MultiplySubtractNegatedScalar +// -x * y - z -> NI_FMA_MultiplySubtractNegatedScalar +// -x * -y - z -> NI_FMA_MultiplySubtractScalar +// +void Lowering::LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node) +{ + assert(node->gtHWIntrinsicId == NI_FMA_MultiplyAddScalar); + GenTreeArgList* argList = node->gtGetOp1()->AsArgList(); + GenTreeHWIntrinsic* createScalarOps[3]; + + for (GenTreeHWIntrinsic*& createScalarOp : createScalarOps) + { + GenTree*& current = argList->Current(); + assert(current != nullptr); + if (!current->OperIsHWIntrinsic()) + { + return; // Math(F).FusedMultiplyAdd is expected to emit three NI_Vector128_CreateScalarUnsafe + // but it's also possible to use NI_FMA_MultiplyAddScalar directly with any operands + } + GenTreeHWIntrinsic* hwArg = current->AsHWIntrinsic(); + if (hwArg->gtHWIntrinsicId != NI_Vector128_CreateScalarUnsafe) + { + return; + } + createScalarOp = hwArg; + argList = argList->Rest(); + } + assert(argList == nullptr); + + GenTree* argX = createScalarOps[0]->gtGetOp1(); + GenTree* argY = createScalarOps[1]->gtGetOp1(); + GenTree* argZ = createScalarOps[2]->gtGetOp1(); + + const bool negMul = argX->OperIs(GT_NEG) != argY->OperIs(GT_NEG); + if (argX->OperIs(GT_NEG)) + { + createScalarOps[0]->gtOp1 = argX->gtGetOp1(); + BlockRange().Remove(argX); + } + if (argY->OperIs(GT_NEG)) + { + createScalarOps[1]->gtOp1 = argY->gtGetOp1(); + BlockRange().Remove(argY); + } + if (argZ->OperIs(GT_NEG)) + { + createScalarOps[2]->gtOp1 = argZ->gtGetOp1(); + BlockRange().Remove(argZ); + node->gtHWIntrinsicId = negMul ? NI_FMA_MultiplySubtractNegatedScalar : NI_FMA_MultiplySubtractScalar; + } + else + { + node->gtHWIntrinsicId = negMul ? NI_FMA_MultiplyAddNegatedScalar : NI_FMA_MultiplyAddScalar; + } +} + +//---------------------------------------------------------------------------------------------- // Lowering::LowerHWIntrinsic: Perform containment analysis for a hardware intrinsic node. // // Arguments: @@ -946,6 +1018,10 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) LowerHWIntrinsicCC(node, NI_AVX_PTEST, GenCondition::UGT); break; + case NI_FMA_MultiplyAddScalar: + LowerFusedMultiplyAdd(node); + break; + default: break; } diff --git a/src/coreclr/tests/src/JIT/Intrinsics/MathFusedMultiplyAdd.cs b/src/coreclr/tests/src/JIT/Intrinsics/MathFusedMultiplyAdd.cs new file mode 100644 index 0000000..3f11704 --- /dev/null +++ b/src/coreclr/tests/src/JIT/Intrinsics/MathFusedMultiplyAdd.cs @@ -0,0 +1,320 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+// + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace MathFusedMultiplyAddTest +{ + class Program + { + private static int _returnCode = 100; + + static int Main() + { + TestFloats(); + TestDoubles(); + return _returnCode; + } + +#region MathF.FusedMultiplyAdd + static void TestFloats() + { + float[] testValues = + { + MathF.PI, MathF.E, 0.0f, -0.0f, float.MinValue, float.MaxValue, 42, -42, 1000, -1000, + int.MaxValue, int.MinValue, float.NaN, float.PositiveInfinity, float.NegativeInfinity + }; + + foreach (float a in testValues) + { + foreach (float b in testValues) + { + foreach (float c in testValues) + { + Check1(a, b, c); + Check2(a, b, c); + Check3(a, b, c); + Check4(a, b, c); + Check5(a, b, c); + Check6(a, b, c); + Check7(a, b, c); + Check8(a, b, c); + + if (Fma.IsSupported) + { + Vector128 vecA = Vector128.Create(42f); + TestExplicitFmaUsage1(ref vecA, 9f); + TestExplicitFmaUsage2(ref vecA, 9f); + TestExplicitFmaUsage3(ref vecA, 9f); + TestExplicitFmaUsage4(ref vecA, 9f); + TestExplicitFmaUsage5(ref vecA, 9f); + } + } + } + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check1(float a, float b, float c) => + CompareFloats(ReferenceMultiplyAdd( a, b, c), + MathF.FusedMultiplyAdd( a, b, c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check2(float a, float b, float c) => + CompareFloats(ReferenceMultiplyAdd(-a, b, c), + MathF.FusedMultiplyAdd(-a, b, c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check3(float a, float b, float c) => + CompareFloats(ReferenceMultiplyAdd(-a, -b, c), + MathF.FusedMultiplyAdd(-a, -b, c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check4(float a, float b, float c) => + CompareFloats(ReferenceMultiplyAdd(-a, -b, -c), + MathF.FusedMultiplyAdd(-a, -b, -c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check5(float a, float b, float c) => + CompareFloats(ReferenceMultiplyAdd( a, -b, c), + MathF.FusedMultiplyAdd( a, -b, c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check6(float a, float b, float c) => + CompareFloats(ReferenceMultiplyAdd( a, -b, -c), + MathF.FusedMultiplyAdd( a, -b, -c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check7(float a, float b, float c) => + CompareFloats(ReferenceMultiplyAdd(-a, b, -c), + MathF.FusedMultiplyAdd(-a, b, -c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check8(float a, float b, float c) => + CompareFloats(ReferenceMultiplyAdd( a, b, -c), + MathF.FusedMultiplyAdd( a, b, -c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static float ReferenceMultiplyAdd(float a, float b, float c) => a * b + c; + + [MethodImpl(MethodImplOptions.NoInlining)] + static void CompareFloats(float a, float b) + { + if (Math.Abs(a - b) > 0.001f) + { + Console.WriteLine($"{a} != {b}"); + _returnCode--; + } + } + + // FMA intrinsics can be used explicitly, make sure nothing asserts + // with various types of arguments (fields, local variables, constants and refs) + + static Vector128 _c32 = Vector128.CreateScalarUnsafe(MathF.PI); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void TestExplicitFmaUsage1(ref Vector128 a, float b) + { + CompareFloats(ReferenceMultiplyAdd(a.ToScalar(), b, _c32.ToScalar()), + Fma.MultiplyAdd(a, Vector128.CreateScalarUnsafe(b), _c32).ToScalar()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void TestExplicitFmaUsage2(ref Vector128 a, float b) + { + 
CompareFloats(ReferenceMultiplyAdd(a.ToScalar(), a.ToScalar(), a.ToScalar()), + Fma.MultiplyAdd(a, a, a).ToScalar()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void TestExplicitFmaUsage3(ref Vector128 a, float b) + { + CompareFloats(ReferenceMultiplyAdd(_c32.ToScalar(), _c32.ToScalar(), _c32.ToScalar()), + Fma.MultiplyAdd(_c32, _c32, _c32).ToScalar()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void TestExplicitFmaUsage4(ref Vector128 a, float b) + { + CompareFloats(ReferenceMultiplyAdd(b, b, 333f), + Fma.MultiplyAdd( + Vector128.CreateScalarUnsafe(b), + Vector128.CreateScalarUnsafe(b), + Vector128.CreateScalarUnsafe(333f)).ToScalar()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void TestExplicitFmaUsage5(ref Vector128 a, float b) + { + CompareDoubles(ReferenceMultiplyAdd(-b, -b, -333f), + Fma.MultiplyAdd( + Vector128.CreateScalarUnsafe(-b), + Vector128.CreateScalarUnsafe(-b), + Vector128.CreateScalarUnsafe(-333f)).ToScalar()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void TestExplicitFmaUsage6(ref Vector128 a, float b) + { + CompareFloats(ReferenceMultiplyAdd(b, b, b), + Fma.MultiplyAdd( + Vector128.CreateScalarUnsafe(b), + Vector128.CreateScalar(b), + Vector128.Create(b)).ToScalar()); + } + #endregion + + #region Math.FusedMultiplyAdd + static void TestDoubles() + { + double[] testValues = + { + Math.PI, Math.E, 0.0, -0.0, double.MinValue, double.MaxValue, 42, -42, 100000, -100000, + long.MaxValue, long.MinValue, double.NaN, double.PositiveInfinity, double.NegativeInfinity + }; + + foreach (double a in testValues) + { + foreach (double b in testValues) + { + foreach (double c in testValues) + { + Check1(a, b, c); + Check2(a, b, c); + Check3(a, b, c); + Check4(a, b, c); + Check5(a, b, c); + Check6(a, b, c); + Check7(a, b, c); + Check8(a, b, c); + + if (Fma.IsSupported) + { + Vector128 vecA = Vector128.Create(42.0); + TestExplicitFmaUsage1(ref vecA, 9f); + TestExplicitFmaUsage2(ref vecA, 9f); + TestExplicitFmaUsage3(ref vecA, 9f); + TestExplicitFmaUsage4(ref vecA, 9f); + TestExplicitFmaUsage5(ref vecA, 9f); + } + } + } + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check1(double a, double b, double c) => + CompareDoubles(ReferenceMultiplyAdd( a, b, c), + Math.FusedMultiplyAdd( a, b, c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check2(double a, double b, double c) => + CompareDoubles(ReferenceMultiplyAdd(-a, b, c), + Math.FusedMultiplyAdd(-a, b, c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check3(double a, double b, double c) => + CompareDoubles(ReferenceMultiplyAdd(-a, -b, c), + Math.FusedMultiplyAdd(-a, -b, c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check4(double a, double b, double c) => + CompareDoubles(ReferenceMultiplyAdd(-a, -b, -c), + Math.FusedMultiplyAdd(-a, -b, -c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check5(double a, double b, double c) => + CompareDoubles(ReferenceMultiplyAdd( a, -b, c), + Math.FusedMultiplyAdd( a, -b, c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check6(double a, double b, double c) => + CompareDoubles(ReferenceMultiplyAdd( a, -b, -c), + Math.FusedMultiplyAdd( a, -b, -c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check7(double a, double b, double c) => + CompareDoubles(ReferenceMultiplyAdd(-a, b, -c), + Math.FusedMultiplyAdd(-a, b, -c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Check8(double a, double b, double c) => + 
CompareDoubles(ReferenceMultiplyAdd( a, b, -c), + Math.FusedMultiplyAdd( a, b, -c)); + + [MethodImpl(MethodImplOptions.NoInlining)] + static double ReferenceMultiplyAdd(double a, double b, double c) => a * b + c; + + [MethodImpl(MethodImplOptions.NoInlining)] + static void CompareDoubles(double a, double b) + { + if (Math.Abs(a - b) > 0.00001) + { + Console.WriteLine($"{a} != {b}"); + _returnCode--; + } + } + + // FMA intrinsics can be used explicitly, make sure nothing asserts + // with various types of arguments (fields, local variables, constants and refs) + + static Vector128 _c64 = Vector128.CreateScalarUnsafe(Math.PI); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void TestExplicitFmaUsage1(ref Vector128 a, double b) + { + CompareDoubles(ReferenceMultiplyAdd(a.ToScalar(), b, _c64.ToScalar()), + Fma.MultiplyAdd(a, Vector128.CreateScalarUnsafe(b), _c64).ToScalar()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void TestExplicitFmaUsage2(ref Vector128 a, double b) + { + CompareDoubles(ReferenceMultiplyAdd(a.ToScalar(), a.ToScalar(), a.ToScalar()), + Fma.MultiplyAdd(a, a, a).ToScalar()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void TestExplicitFmaUsage3(ref Vector128 a, double b) + { + CompareDoubles(ReferenceMultiplyAdd(_c64.ToScalar(), _c64.ToScalar(), _c64.ToScalar()), + Fma.MultiplyAdd(_c64, _c64, _c64).ToScalar()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void TestExplicitFmaUsage4(ref Vector128 a, double b) + { + CompareDoubles(ReferenceMultiplyAdd(b, b, b), + Fma.MultiplyAdd( + Vector128.CreateScalarUnsafe(b), + Vector128.CreateScalarUnsafe(b), + Vector128.CreateScalarUnsafe(b)).ToScalar()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void TestExplicitFmaUsage5(ref Vector128 a, double b) + { + CompareDoubles(ReferenceMultiplyAdd(-b, -b, -333.0), + Fma.MultiplyAdd( + Vector128.CreateScalarUnsafe(-b), + Vector128.CreateScalarUnsafe(-b), + Vector128.CreateScalarUnsafe(-333.0)).ToScalar()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static void TestExplicitFmaUsage6(ref Vector128 a, double b) + { + CompareDoubles(ReferenceMultiplyAdd(b, b, b), + Fma.MultiplyAdd( + Vector128.CreateScalarUnsafe(b), + Vector128.CreateScalar(b), + Vector128.Create(b)).ToScalar()); + } +#endregion + } +} \ No newline at end of file diff --git a/src/coreclr/tests/src/JIT/Intrinsics/MathFusedMultiplyAdd_r.csproj b/src/coreclr/tests/src/JIT/Intrinsics/MathFusedMultiplyAdd_r.csproj new file mode 100644 index 0000000..879a577 --- /dev/null +++ b/src/coreclr/tests/src/JIT/Intrinsics/MathFusedMultiplyAdd_r.csproj @@ -0,0 +1,13 @@ + + + Exe + + + true + None + + + + + + diff --git a/src/coreclr/tests/src/JIT/Intrinsics/MathFusedMultiplyAdd_ro.csproj b/src/coreclr/tests/src/JIT/Intrinsics/MathFusedMultiplyAdd_ro.csproj new file mode 100644 index 0000000..c7a857e --- /dev/null +++ b/src/coreclr/tests/src/JIT/Intrinsics/MathFusedMultiplyAdd_ro.csproj @@ -0,0 +1,13 @@ + + + Exe + + + true + None + True + + + + + -- 2.7.4
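
For reference, the negation-to-intrinsic mapping documented in the LowerFusedMultiplyAdd notes above can be sketched from the managed side. The C# snippet below is illustrative only and is not part of the patch: ExpectedIntrinsic is a hypothetical helper that simply mirrors the selection rule from the Notes table (the Negated forms are chosen when exactly one of the first two operands is negated, the Subtract forms when the addend is negated), while MathF.FusedMultiplyAdd is the API the patch targets.

// Illustrative sketch, not part of the patch: mirrors the intrinsic-selection
// rule documented in LowerFusedMultiplyAdd; intrinsic names are taken from the
// Notes table above.
using System;

class FmaMappingDemo
{
    // negX/negY/negZ indicate whether the corresponding operand is wrapped in GT_NEG.
    static string ExpectedIntrinsic(bool negX, bool negY, bool negZ)
    {
        // The multiply is effectively negated when exactly one of x, y is negated.
        bool negMul = negX ^ negY;
        if (negZ)
        {
            // x * y + (-z) is a multiply-subtract.
            return negMul ? "NI_FMA_MultiplySubtractNegatedScalar" : "NI_FMA_MultiplySubtractScalar";
        }
        return negMul ? "NI_FMA_MultiplyAddNegatedScalar" : "NI_FMA_MultiplyAddScalar";
    }

    static void Main()
    {
        float x = 2f, y = 3f, z = 4f;

        // x * -y - z: one operand of the multiply and the addend are negated.
        Console.WriteLine(ExpectedIntrinsic(negX: false, negY: true, negZ: true)); // NI_FMA_MultiplySubtractNegatedScalar
        Console.WriteLine(MathF.FusedMultiplyAdd(x, -y, -z));                      // -10
    }
}

Under the patch, a call such as MathF.FusedMultiplyAdd(x, -y, -z) is therefore expected to lower to a single scalar vfnmsub form rather than explicit negations followed by a plain vfmadd.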