From 7345109fc3f4726611cd02ec3e522f4bc42f3a16 Mon Sep 17 00:00:00 2001
From: Egor Bogatov <egorbo@gmail.com>
Date: Sun, 20 Feb 2022 02:51:48 +0300
Subject: [PATCH] Faster Math.Max/Min on arm64 (#65584)

Co-authored-by: Adeel Mujahid <3840695+am11@users.noreply.github.com>
Co-authored-by: Tanner Gooding <tagoo@outlook.com>
---
 src/coreclr/jit/codegenarmarch.cpp                 |  14 +-
 src/coreclr/jit/fgdiagnostic.cpp                   |   2 +-
 src/coreclr/jit/gentree.cpp                        |  43 ++++++
 src/coreclr/jit/gentree.h                          |  17 +--
 src/coreclr/jit/importer.cpp                       |  19 +++
 src/coreclr/jit/lsraarm64.cpp                      |  47 ++++--
 src/coreclr/jit/namedintrinsiclist.h               |   2 +
 src/coreclr/jit/utils.cpp                          | 162 +++++++++++++++++++++
 src/coreclr/jit/utils.h                            |  16 ++
 src/coreclr/jit/valuenum.cpp                       |  40 +++++
 src/coreclr/jit/valuenumfuncs.h                    |   2 +
 .../System.Private.CoreLib/src/System/Math.cs      |   4 +
 12 files changed, 334 insertions(+), 34 deletions(-)

diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp
index b6d744a..6c0ec3c 100644
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@@ -669,7 +669,7 @@ void CodeGen::genIntrinsic(GenTree* treeNode)
     assert(varTypeIsFloating(srcNode));
     assert(srcNode->TypeGet() == treeNode->TypeGet());
 
-    // Right now only Abs/Ceiling/Floor/Truncate/Round/Sqrt are treated as math intrinsics.
+    // Only a subset of functions are treated as math intrinsics.
     //
     switch (treeNode->AsIntrinsic()->gtIntrinsicName)
     {
@@ -698,6 +698,18 @@ void CodeGen::genIntrinsic(GenTree* treeNode)
             genConsumeOperands(treeNode->AsOp());
             GetEmitter()->emitInsBinary(INS_frintn, emitActualTypeSize(treeNode), treeNode, srcNode);
             break;
+
+        case NI_System_Math_Max:
+            genConsumeOperands(treeNode->AsOp());
+            GetEmitter()->emitIns_R_R_R(INS_fmax, emitActualTypeSize(treeNode), treeNode->GetRegNum(),
+                                        treeNode->gtGetOp1()->GetRegNum(), treeNode->gtGetOp2()->GetRegNum());
+            break;
+
+        case NI_System_Math_Min:
+            genConsumeOperands(treeNode->AsOp());
+            GetEmitter()->emitIns_R_R_R(INS_fmin, emitActualTypeSize(treeNode), treeNode->GetRegNum(),
+                                        treeNode->gtGetOp1()->GetRegNum(), treeNode->gtGetOp2()->GetRegNum());
+            break;
 #endif // TARGET_ARM64
 
         case NI_System_Math_Sqrt:
diff --git a/src/coreclr/jit/fgdiagnostic.cpp b/src/coreclr/jit/fgdiagnostic.cpp
index 2df6d00..13fd138 100644
--- a/src/coreclr/jit/fgdiagnostic.cpp
+++ b/src/coreclr/jit/fgdiagnostic.cpp
@@ -2954,7 +2954,7 @@ void Compiler::fgDebugCheckFlags(GenTree* tree)
     // We reuse GTF_REVERSE_OPS as GTF_VAR_ARR_INDEX for LCL_VAR nodes.
     if (((tree->gtFlags & GTF_REVERSE_OPS) != 0) && !tree->OperIs(GT_LCL_VAR))
     {
-        assert(tree->OperSupportsReverseOps());
+        assert(tree->OperSupportsReverseOpEvalOrder(this));
     }
 
     GenTree* op1 = tree->OperIsSimple() ? tree->gtGetOp1() : nullptr;
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index a47a56a..afa6c78 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -3855,6 +3855,8 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
                         case NI_System_Math_Log:
                         case NI_System_Math_Log2:
                         case NI_System_Math_Log10:
+                        case NI_System_Math_Max:
+                        case NI_System_Math_Min:
                         case NI_System_Math_Pow:
                         case NI_System_Math_Round:
                         case NI_System_Math_Sin:
@@ -4372,6 +4374,12 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
                         // register requirement.
                         level += 2;
                         break;
+
+                    case NI_System_Math_Max:
+                    case NI_System_Math_Min:
+                        level++;
+                        break;
+
                     default:
                         assert(!"Unknown binary GT_INTRINSIC operator");
                         break;
@@ -4805,6 +4813,35 @@ DONE:
 #pragma warning(pop)
 #endif
 
+#ifdef DEBUG
+bool GenTree::OperSupportsReverseOpEvalOrder(Compiler* comp) const
+{
+    if (OperIsBinary())
+    {
+        if ((AsOp()->gtGetOp1() == nullptr) || (AsOp()->gtGetOp2() == nullptr))
+        {
+            return false;
+        }
+        if (OperIs(GT_COMMA, GT_BOUNDS_CHECK))
+        {
+            return false;
+        }
+        if (OperIs(GT_INTRINSIC))
+        {
+            return !comp->IsIntrinsicImplementedByUserCall(AsIntrinsic()->gtIntrinsicName);
+        }
+        return true;
+    }
+#if defined(FEATURE_SIMD) || defined(FEATURE_HW_INTRINSICS)
+    if (OperIsMultiOp())
+    {
+        return AsMultiOp()->GetOperandCount() == 2;
+    }
+#endif // FEATURE_SIMD || FEATURE_HW_INTRINSICS
+    return false;
+}
+#endif // DEBUG
+
 /*****************************************************************************
  *
  *  If the given tree is an integer constant that can be used
@@ -10893,6 +10930,12 @@ void Compiler::gtDispTree(GenTree*     tree,
                 case NI_System_Math_Log10:
                     printf(" log10");
                     break;
+                case NI_System_Math_Max:
+                    printf(" max");
+                    break;
+                case NI_System_Math_Min:
+                    printf(" min");
+                    break;
                 case NI_System_Math_Pow:
                     printf(" pow");
                     break;
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index fb340cd..9ec1958 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -1704,7 +1704,7 @@ public:
         return (DebugOperKind() & DBK_NOTLIR) == 0;
     }
 
-    bool        OperSupportsReverseOps() const;
+    bool OperSupportsReverseOpEvalOrder(Compiler* comp) const;
     static bool RequiresNonNullOp2(genTreeOps oper);
     bool IsValidCallArgument();
 #endif // DEBUG
@@ -7870,22 +7870,7 @@ inline GenTree* GenTree::gtGetOp1() const
     }
 }
 
-inline bool GenTree::OperSupportsReverseOps() const
-{
-    if (OperIsBinary() && !OperIs(GT_COMMA, GT_INTRINSIC, GT_BOUNDS_CHECK))
-    {
-        return (AsOp()->gtGetOp1() != nullptr) && (AsOp()->gtGetOp2() != nullptr);
-    }
-#if defined(FEATURE_SIMD) || defined(FEATURE_HW_INTRINSICS)
-    if (OperIsMultiOp())
-    {
-        return AsMultiOp()->GetOperandCount() == 2;
-    }
-#endif // FEATURE_SIMD || FEATURE_HW_INTRINSICS
-    return false;
-}
 #endif // DEBUG
-
 inline GenTree* GenTree::gtGetOp2() const
 {
     assert(OperIsBinary());
diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp
index 01942a2..3e84db2 100644
--- a/src/coreclr/jit/importer.cpp
+++ b/src/coreclr/jit/importer.cpp
@@ -4492,6 +4492,13 @@ GenTree* Compiler::impIntrinsic(GenTree*                newobjThis,
             case NI_System_Math_Log:
             case NI_System_Math_Log2:
             case NI_System_Math_Log10:
+#ifdef TARGET_ARM64
+            // ARM64 has fmax/fmin which are IEEE754:2019 minimum/maximum compatible
+            // TODO-XARCH-CQ: Enable this for XARCH when one of the arguments is a constant
+            // so we can then emit maxss/minss and avoid NaN/-0.0 handling
+            case NI_System_Math_Max:
+            case NI_System_Math_Min:
+#endif
             case NI_System_Math_Pow:
             case NI_System_Math_Round:
             case NI_System_Math_Sin:
@@ -5118,6 +5125,14 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method)
             {
                 result = NI_System_Math_Log10;
             }
+            else if (strcmp(methodName, "Max") == 0)
+            {
+                result = NI_System_Math_Max;
+            }
+            else if (strcmp(methodName, "Min") == 0)
+            {
+                result = NI_System_Math_Min;
+            }
             else if (strcmp(methodName, "Pow") == 0)
             {
                 result = NI_System_Math_Pow;
@@ -20549,6 +20564,8 @@ bool Compiler::IsTargetIntrinsic(NamedIntrinsic intrinsicName)
         case NI_System_Math_Truncate:
         case NI_System_Math_Round:
         case NI_System_Math_Sqrt:
+        case NI_System_Math_Max:
+        case NI_System_Math_Min:
             return true;
 
         case NI_System_Math_FusedMultiplyAdd:
@@ -20613,6 +20630,8 @@ bool Compiler::IsMathIntrinsic(NamedIntrinsic intrinsicName)
         case NI_System_Math_Log:
         case NI_System_Math_Log2:
         case NI_System_Math_Log10:
+        case NI_System_Math_Max:
+        case NI_System_Math_Min:
         case NI_System_Math_Pow:
         case NI_System_Math_Round:
         case NI_System_Math_Sin:
diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index 9a23248..091068a 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -326,22 +326,37 @@ int LinearScan::BuildNode(GenTree* tree)
 
         case GT_INTRINSIC:
         {
-            noway_assert((tree->AsIntrinsic()->gtIntrinsicName == NI_System_Math_Abs) ||
-                         (tree->AsIntrinsic()->gtIntrinsicName == NI_System_Math_Ceiling) ||
-                         (tree->AsIntrinsic()->gtIntrinsicName == NI_System_Math_Floor) ||
-                         (tree->AsIntrinsic()->gtIntrinsicName == NI_System_Math_Truncate) ||
-                         (tree->AsIntrinsic()->gtIntrinsicName == NI_System_Math_Round) ||
-                         (tree->AsIntrinsic()->gtIntrinsicName == NI_System_Math_Sqrt));
-
-            // Both operand and its result must be of the same floating point type.
-            GenTree* op1 = tree->gtGetOp1();
-            assert(varTypeIsFloating(op1));
-            assert(op1->TypeGet() == tree->TypeGet());
-
-            BuildUse(op1);
-            srcCount = 1;
-            assert(dstCount == 1);
-            BuildDef(tree);
+            switch (tree->AsIntrinsic()->gtIntrinsicName)
+            {
+                case NI_System_Math_Max:
+                case NI_System_Math_Min:
+                    assert(varTypeIsFloating(tree->gtGetOp1()));
+                    assert(varTypeIsFloating(tree->gtGetOp2()));
+                    assert(tree->gtGetOp1()->TypeIs(tree->TypeGet()));
+
+                    srcCount = BuildBinaryUses(tree->AsOp());
+                    assert(dstCount == 1);
+                    BuildDef(tree);
+                    break;
+
+                case NI_System_Math_Abs:
+                case NI_System_Math_Ceiling:
+                case NI_System_Math_Floor:
+                case NI_System_Math_Truncate:
+                case NI_System_Math_Round:
+                case NI_System_Math_Sqrt:
+                    assert(varTypeIsFloating(tree->gtGetOp1()));
+                    assert(tree->gtGetOp1()->TypeIs(tree->TypeGet()));
+
+                    BuildUse(tree->gtGetOp1());
+                    srcCount = 1;
+                    assert(dstCount == 1);
+                    BuildDef(tree);
+                    break;
+
+                default:
+                    unreached();
+            }
         }
         break;
 
diff --git a/src/coreclr/jit/namedintrinsiclist.h b/src/coreclr/jit/namedintrinsiclist.h
index d864c3f..c830a26 100644
--- a/src/coreclr/jit/namedintrinsiclist.h
+++ b/src/coreclr/jit/namedintrinsiclist.h
@@ -36,6 +36,8 @@ enum NamedIntrinsic : unsigned short
     NI_System_Math_Log,
     NI_System_Math_Log2,
     NI_System_Math_Log10,
+    NI_System_Math_Max,
+    NI_System_Math_Min,
     NI_System_Math_Pow,
     NI_System_Math_Round,
     NI_System_Math_Sin,
diff --git a/src/coreclr/jit/utils.cpp b/src/coreclr/jit/utils.cpp
index 1629b10..529c653 100644
--- a/src/coreclr/jit/utils.cpp
+++ b/src/coreclr/jit/utils.cpp
@@ -2211,6 +2211,168 @@ bool FloatingPointUtils::hasPreciseReciprocal(float x)
     return mantissa == 0 && exponent != 0 && exponent != 127;
 }
 
+//------------------------------------------------------------------------
+// isNegative: Determines whether the specified value is negative
+//
+// Arguments:
+//    val - value to check
+//
+// Return Value:
+//    True if val is negative
+//
+
+bool FloatingPointUtils::isNegative(float val)
+{
+    return *reinterpret_cast<INT32*>(&val) < 0;
+}
+
+//------------------------------------------------------------------------
+// isNegative: Determines whether the specified value is negative
+//
+// Arguments:
+//    val - value to check
+//
+// Return Value:
+//    True if val is negative
+//
+
+bool FloatingPointUtils::isNegative(double val)
+{
+    return *reinterpret_cast<INT64*>(&val) < 0;
+}
+
+//------------------------------------------------------------------------
+// isNaN: Determines whether the specified value is NaN
+//
+// Arguments:
+//    val - value to check for NaN
+//
+// Return Value:
+//    True if val is NaN
+//
+
+bool FloatingPointUtils::isNaN(float val)
+{
+    UINT32 bits = *reinterpret_cast<UINT32*>(&val);
+    return (bits & 0x7FFFFFFFU) > 0x7F800000U;
+}
+
+//------------------------------------------------------------------------
+// isNaN: Determines whether the specified value is NaN
+//
+// Arguments:
+//    val - value to check for NaN
+//
+// Return Value:
+//    True if val is NaN
+//
+
+bool FloatingPointUtils::isNaN(double val)
+{
+    UINT64 bits = *reinterpret_cast<UINT64*>(&val);
+    return (bits & 0x7FFFFFFFFFFFFFFFULL) > 0x7FF0000000000000ULL;
+}
+
+//------------------------------------------------------------------------
+// maximum: This matches the IEEE 754:2019 `maximum` function
+//    It propagates NaN inputs back to the caller and
+//    otherwise returns the larger of the inputs. It
+//    treats +0 as larger than -0 as per the specification.
+//
+// Arguments:
+//    val1 - left operand
+//    val2 - right operand
+//
+// Return Value:
+//    Either val1 or val2
+//
+
+double FloatingPointUtils::maximum(double val1, double val2)
+{
+    if (val1 != val2)
+    {
+        if (!isNaN(val1))
+        {
+            return val2 < val1 ? val1 : val2;
+        }
+        return val1;
+    }
+    return isNegative(val2) ? val1 : val2;
+}
+
+//------------------------------------------------------------------------
+// maximum: This matches the IEEE 754:2019 `maximum` function
+//    It propagates NaN inputs back to the caller and
+//    otherwise returns the larger of the inputs. It
+//    treats +0 as larger than -0 as per the specification.
+//
+// Arguments:
+//    val1 - left operand
+//    val2 - right operand
+//
+// Return Value:
+//    Either val1 or val2
+//
+
+float FloatingPointUtils::maximum(float val1, float val2)
+{
+    if (val1 != val2)
+    {
+        if (!isNaN(val1))
+        {
+            return val2 < val1 ? val1 : val2;
+        }
+        return val1;
+    }
+    return isNegative(val2) ? val1 : val2;
+}
+
+//------------------------------------------------------------------------
+// minimum: This matches the IEEE 754:2019 `minimum` function
+//    It propagates NaN inputs back to the caller and
+//    otherwise returns the larger of the inputs. It
+//    treats +0 as larger than -0 as per the specification.
+//
+// Arguments:
+//    val1 - left operand
+//    val2 - right operand
+//
+// Return Value:
+//    Either val1 or val2
+//
+
+double FloatingPointUtils::minimum(double val1, double val2)
+{
+    if (val1 != val2 && !isNaN(val1))
+    {
+        return val1 < val2 ? val1 : val2;
+    }
+    return isNegative(val1) ? val1 : val2;
+}
+
+//------------------------------------------------------------------------
+// minimum: This matches the IEEE 754:2019 `minimum` function
+//    It propagates NaN inputs back to the caller and
+//    otherwise returns the larger of the inputs. It
+//    treats +0 as larger than -0 as per the specification.
+//
+// Arguments:
+//    val1 - left operand
+//    val2 - right operand
+//
+// Return Value:
+//    Either val1 or val2
+//
+
+float FloatingPointUtils::minimum(float val1, float val2)
+{
+    if (val1 != val2 && !isNaN(val1))
+    {
+        return val1 < val2 ? val1 : val2;
+    }
+    return isNegative(val1) ? val1 : val2;
+}
+
 namespace MagicDivide
 {
 template <int TableBase = 0, int TableSize, typename Magic>
diff --git a/src/coreclr/jit/utils.h b/src/coreclr/jit/utils.h
index f0d29d0..39985f1 100644
--- a/src/coreclr/jit/utils.h
+++ b/src/coreclr/jit/utils.h
@@ -690,6 +690,22 @@ public:
     static double infinite_double();
 
     static float infinite_float();
+
+    static bool isNegative(float val);
+
+    static bool isNegative(double val);
+
+    static bool isNaN(float val);
+
+    static bool isNaN(double val);
+
+    static double maximum(double val1, double val2);
+
+    static float maximum(float val1, float val2);
+
+    static double minimum(double val1, double val2);
+
+    static float minimum(float val1, float val2);
 };
 
 // The CLR requires that critical section locks be initialized via its ClrCreateCriticalSection API...but
diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp
index 7ea3770..27e37dc 100644
--- a/src/coreclr/jit/valuenum.cpp
+++ b/src/coreclr/jit/valuenum.cpp
@@ -5726,6 +5726,22 @@ ValueNum ValueNumStore::EvalMathFuncBinary(var_types typ, NamedIntrinsic gtMathF
                     break;
                 }
 
+                case NI_System_Math_Max:
+                {
+                    assert(typ == TypeOfVN(arg1VN));
+                    double arg1Val = GetConstantDouble(arg1VN);
+                    res            = FloatingPointUtils::maximum(arg0Val, arg1Val);
+                    break;
+                }
+
+                case NI_System_Math_Min:
+                {
+                    assert(typ == TypeOfVN(arg1VN));
+                    double arg1Val = GetConstantDouble(arg1VN);
+                    res            = FloatingPointUtils::minimum(arg0Val, arg1Val);
+                    break;
+                }
+
                 default:
                     // the above are the only binary math intrinsics at the time of this writing.
                     unreached();
@@ -5759,6 +5775,22 @@ ValueNum ValueNumStore::EvalMathFuncBinary(var_types typ, NamedIntrinsic gtMathF
                     break;
                 }
 
+                case NI_System_Math_Max:
+                {
+                    assert(typ == TypeOfVN(arg1VN));
+                    float arg1Val = GetConstantSingle(arg1VN);
+                    res           = FloatingPointUtils::maximum(arg0Val, arg1Val);
+                    break;
+                }
+
+                case NI_System_Math_Min:
+                {
+                    assert(typ == TypeOfVN(arg1VN));
+                    float arg1Val = GetConstantSingle(arg1VN);
+                    res           = FloatingPointUtils::minimum(arg0Val, arg1Val);
+                    break;
+                }
+
                 case NI_System_Math_Pow:
                 {
                     assert(typ == TypeOfVN(arg1VN));
@@ -5789,6 +5821,14 @@ ValueNum ValueNumStore::EvalMathFuncBinary(var_types typ, NamedIntrinsic gtMathF
                 vnf = VNF_FMod;
                 break;
 
+            case NI_System_Math_Max:
+                vnf = VNF_Max;
+                break;
+
+            case NI_System_Math_Min:
+                vnf = VNF_Min;
+                break;
+
             case NI_System_Math_Pow:
                 vnf = VNF_Pow;
                 break;
diff --git a/src/coreclr/jit/valuenumfuncs.h b/src/coreclr/jit/valuenumfuncs.h
index 47135a0..74e8b90 100644
--- a/src/coreclr/jit/valuenumfuncs.h
+++ b/src/coreclr/jit/valuenumfuncs.h
@@ -92,6 +92,8 @@ ValueNumFuncDef(ILogB, 1, false, false, false)
 ValueNumFuncDef(Log, 1, false, false, false)
 ValueNumFuncDef(Log2, 1, false, false, false)
 ValueNumFuncDef(Log10, 1, false, false, false)
+ValueNumFuncDef(Max, 2, false, false, false)
+ValueNumFuncDef(Min, 2, false, false, false)
 ValueNumFuncDef(Pow, 2, false, false, false)
 ValueNumFuncDef(RoundDouble, 1, false, false, false)
 ValueNumFuncDef(RoundInt32, 1, false, false, false)
diff --git a/src/libraries/System.Private.CoreLib/src/System/Math.cs b/src/libraries/System.Private.CoreLib/src/System/Math.cs
index cdf6f3c..52f95fa 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Math.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Math.cs
@@ -881,6 +881,7 @@ namespace System
             return decimal.Max(val1, val2);
         }
 
+        [Intrinsic]
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static double Max(double val1, double val2)
         {
@@ -938,6 +939,7 @@ namespace System
             return (val1 >= val2) ? val1 : val2;
         }
 
+        [Intrinsic]
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static float Max(float val1, float val2)
         {
@@ -1028,6 +1030,7 @@ namespace System
             return decimal.Min(val1, val2);
         }
 
+        [Intrinsic]
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static double Min(double val1, double val2)
         {
@@ -1080,6 +1083,7 @@ namespace System
             return (val1 <= val2) ? val1 : val2;
         }
 
+        [Intrinsic]
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static float Min(float val1, float val2)
         {
-- 
2.7.4