From a1da393cac02d0588ae499f68d37a3d6e07572f7 Mon Sep 17 00:00:00 2001
From: Egor Chesakov
Date: Thu, 6 Aug 2020 14:56:56 -0700
Subject: [PATCH] [Arm64] Treat Math/MathF.FusedMultiplyAdd as intrinsics (#40124)

* Transform Math{F}.FusedMultiplyAdd(x,y,z) into
  AdvSimd.FusedMultiplyAddScalar(
      Vector64.CreateScalarUnsafe(z),
      Vector64.CreateScalarUnsafe(y),
      Vector64.CreateScalarUnsafe(x)).ToScalar()
  on Arm64 in importer.cpp

* Add containment analysis for AdvSimd_FusedMultiplyAddScalar in lower.h
  and lowerarmarch.cpp

* Set tgtPrefOp1 for intrinsics with SIMD-to-SIMD move semantics in
  lsraarm64.cpp
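Illustration (commit-message note only, not part of the diff): the managed-level
equivalence this establishes can be sketched as below. The helper names are
invented for the example; the APIs used (MathF.FusedMultiplyAdd,
AdvSimd.FusedMultiplyAddScalar, Vector64.CreateScalarUnsafe, ToScalar) are the
ones named above. Note the operand order: AdvSimd.FusedMultiplyAddScalar takes
the addend first, so z/y/x appear reversed relative to
Math{F}.FusedMultiplyAdd(x,y,z) = x * y + z.

    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.Arm;

    static class FmaSketch
    {
        // With this change the JIT should expand the call below into the same
        // node chain as HandWritten when compiling for Arm64 with AdvSimd.
        static float ViaMath(float x, float y, float z) => MathF.FusedMultiplyAdd(x, y, z);

        // Hand-written form of the chain built in importer.cpp: z + y * x == x * y + z.
        static float HandWritten(float x, float y, float z) =>
            AdvSimd.IsSupported
                ? AdvSimd.FusedMultiplyAddScalar(
                      Vector64.CreateScalarUnsafe(z),
                      Vector64.CreateScalarUnsafe(y),
                      Vector64.CreateScalarUnsafe(x)).ToScalar()
                : MathF.FusedMultiplyAdd(x, y, z);
    }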
---
 src/coreclr/src/jit/importer.cpp     | 39 +++++++++++++++++--
 src/coreclr/src/jit/lower.h          |  4 +-
 src/coreclr/src/jit/lowerarmarch.cpp | 74 ++++++++++++++++++++++++++++++++++++
 src/coreclr/src/jit/lsraarm64.cpp    | 21 ++++++++--
 4 files changed, 129 insertions(+), 9 deletions(-)

diff --git a/src/coreclr/src/jit/importer.cpp b/src/coreclr/src/jit/importer.cpp
index d111158..8d485e6 100644
--- a/src/coreclr/src/jit/importer.cpp
+++ b/src/coreclr/src/jit/importer.cpp
@@ -4201,9 +4201,9 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis,
 
                 // We are constructing a chain of intrinsics similar to:
                 //    return FMA.MultiplyAddScalar(
-                //        Vector128.CreateScalar(x),
-                //        Vector128.CreateScalar(y),
-                //        Vector128.CreateScalar(z)
+                //        Vector128.CreateScalarUnsafe(x),
+                //        Vector128.CreateScalarUnsafe(y),
+                //        Vector128.CreateScalarUnsafe(z)
                 //    ).ToScalar();
 
                 GenTree* op3 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, impPopStack().val,
@@ -4217,7 +4217,38 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis,
 
                 retNode = gtNewSimdHWIntrinsicNode(callType, res, NI_Vector128_ToScalar, callType, 16);
             }
-#endif // TARGET_XARCH
+#elif defined(TARGET_ARM64)
+            if (compExactlyDependsOn(InstructionSet_AdvSimd))
+            {
+                assert(varTypeIsFloating(callType));
+
+                // We are constructing a chain of intrinsics similar to:
+                //    return AdvSimd.FusedMultiplyAddScalar(
+                //        Vector64.Create{ScalarUnsafe}(z),
+                //        Vector64.Create{ScalarUnsafe}(y),
+                //        Vector64.Create{ScalarUnsafe}(x)
+                //    ).ToScalar();
+
+                NamedIntrinsic createVector64 =
+                    (callType == TYP_DOUBLE) ? NI_Vector64_Create : NI_Vector64_CreateScalarUnsafe;
+
+                constexpr unsigned int simdSize = 8;
+
+                GenTree* op3 =
+                    gtNewSimdHWIntrinsicNode(TYP_SIMD8, impPopStack().val, createVector64, callType, simdSize);
+                GenTree* op2 =
+                    gtNewSimdHWIntrinsicNode(TYP_SIMD8, impPopStack().val, createVector64, callType, simdSize);
+                GenTree* op1 =
+                    gtNewSimdHWIntrinsicNode(TYP_SIMD8, impPopStack().val, createVector64, callType, simdSize);
+
+                // Note that AdvSimd.FusedMultiplyAddScalar(op1,op2,op3) corresponds to op1 + op2 * op3
+                // while Math{F}.FusedMultiplyAdd(op1,op2,op3) corresponds to op1 * op2 + op3
+                retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op3, op2, op1, NI_AdvSimd_FusedMultiplyAddScalar,
+                                                   callType, simdSize);
+
+                retNode = gtNewSimdHWIntrinsicNode(callType, retNode, NI_Vector64_ToScalar, callType, simdSize);
+            }
+#endif
             break;
         }
 #endif // FEATURE_HW_INTRINSICS
diff --git a/src/coreclr/src/jit/lower.h b/src/coreclr/src/jit/lower.h
index 135446c..ff13302 100644
--- a/src/coreclr/src/jit/lower.h
+++ b/src/coreclr/src/jit/lower.h
@@ -327,12 +327,12 @@ private:
     void LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp);
     void LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node);
     void LowerHWIntrinsicDot(GenTreeHWIntrinsic* node);
-    void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
-
 #if defined(TARGET_XARCH)
+    void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
     void LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node);
 #elif defined(TARGET_ARM64)
     bool IsValidConstForMovImm(GenTreeHWIntrinsic* node);
+    void LowerHWIntrinsicFusedMultiplyAddScalar(GenTreeHWIntrinsic* node);
 #endif // !TARGET_XARCH && !TARGET_ARM64
 
     union VectorConstant {
diff --git a/src/coreclr/src/jit/lowerarmarch.cpp b/src/coreclr/src/jit/lowerarmarch.cpp
index 2ab6fa9..6d46545 100644
--- a/src/coreclr/src/jit/lowerarmarch.cpp
+++ b/src/coreclr/src/jit/lowerarmarch.cpp
@@ -517,6 +517,76 @@ void Lowering::LowerSIMD(GenTreeSIMD* simdNode)
 #endif // FEATURE_SIMD
 
 #ifdef FEATURE_HW_INTRINSICS
+
+//----------------------------------------------------------------------------------------------
+// LowerHWIntrinsicFusedMultiplyAddScalar: Lowers AdvSimd_FusedMultiplyAddScalar intrinsics
+//   when some of the operands are negated by "containing" such negation.
+//
+//  Arguments:
+//     node - The original hardware intrinsic node
+//
+// |  op1 | op2 | op3 |
+// |  +   |  +  |  +  | AdvSimd_FusedMultiplyAddScalar
+// |  +   |  +  |  -  | AdvSimd_FusedMultiplySubtractScalar
+// |  +   |  -  |  +  | AdvSimd_FusedMultiplySubtractScalar
+// |  +   |  -  |  -  | AdvSimd_FusedMultiplyAddScalar
+// |  -   |  +  |  +  | AdvSimd_FusedMultiplySubtractNegatedScalar
+// |  -   |  +  |  -  | AdvSimd_FusedMultiplyAddNegatedScalar
+// |  -   |  -  |  +  | AdvSimd_FusedMultiplyAddNegatedScalar
+// |  -   |  -  |  -  | AdvSimd_FusedMultiplySubtractNegatedScalar
+//
+void Lowering::LowerHWIntrinsicFusedMultiplyAddScalar(GenTreeHWIntrinsic* node)
+{
+    assert(node->gtHWIntrinsicId == NI_AdvSimd_FusedMultiplyAddScalar);
+
+    const HWIntrinsic intrin(node);
+
+    GenTree* op1 = intrin.op1;
+    GenTree* op2 = intrin.op2;
+    GenTree* op3 = intrin.op3;
+
+    auto lowerOperand = [this](GenTree* op) {
+        bool wasNegated = false;
+
+        if (op->OperIsHWIntrinsic() &&
+            ((op->AsHWIntrinsic()->gtHWIntrinsicId == NI_AdvSimd_Arm64_DuplicateToVector64) ||
+             (op->AsHWIntrinsic()->gtHWIntrinsicId == NI_Vector64_CreateScalarUnsafe)))
+        {
+            GenTreeHWIntrinsic* createVector64 = op->AsHWIntrinsic();
+            GenTree*            valueOp        = createVector64->gtGetOp1();
+
+            if (valueOp->OperIs(GT_NEG))
+            {
+                createVector64->gtOp1 = valueOp->gtGetOp1();
+                BlockRange().Remove(valueOp);
+                wasNegated = true;
+            }
+        }
+
+        return wasNegated;
+    };
+
+    const bool op1WasNegated = lowerOperand(op1);
+    const bool op2WasNegated = lowerOperand(op2);
+    const bool op3WasNegated = lowerOperand(op3);
+
+    if (op1WasNegated)
+    {
+        if (op2WasNegated != op3WasNegated)
+        {
+            node->gtHWIntrinsicId = NI_AdvSimd_FusedMultiplyAddNegatedScalar;
+        }
+        else
+        {
+            node->gtHWIntrinsicId = NI_AdvSimd_FusedMultiplySubtractNegatedScalar;
+        }
+    }
+    else if (op2WasNegated != op3WasNegated)
+    {
+        node->gtHWIntrinsicId = NI_AdvSimd_FusedMultiplySubtractScalar;
+    }
+}
+
 //----------------------------------------------------------------------------------------------
 // Lowering::LowerHWIntrinsic: Perform containment analysis for a hardware intrinsic node.
 //
@@ -573,6 +643,10 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
             return;
         }
 
+        case NI_AdvSimd_FusedMultiplyAddScalar:
+            LowerHWIntrinsicFusedMultiplyAddScalar(node);
+            break;
+
         default:
             break;
         }
diff --git a/src/coreclr/src/jit/lsraarm64.cpp b/src/coreclr/src/jit/lsraarm64.cpp
index bdb626d..2138b78 100644
--- a/src/coreclr/src/jit/lsraarm64.cpp
+++ b/src/coreclr/src/jit/lsraarm64.cpp
@@ -1050,9 +1050,24 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
 
     if (intrin.op1 != nullptr)
     {
-        // If we have an RMW intrinsic, we want to preference op1Reg to the target if
-        // op1 is not contained.
-        if (isRMW)
+        bool simdRegToSimdRegMove = false;
+
+        if ((intrin.id == NI_Vector64_CreateScalarUnsafe) || (intrin.id == NI_Vector128_CreateScalarUnsafe))
+        {
+            simdRegToSimdRegMove = varTypeIsFloating(intrin.op1);
+        }
+        else if (intrin.id == NI_AdvSimd_Arm64_DuplicateToVector64)
+        {
+            simdRegToSimdRegMove = (intrin.op1->TypeGet() == TYP_DOUBLE);
+        }
+        else if ((intrin.id == NI_Vector64_ToScalar) || (intrin.id == NI_Vector128_ToScalar))
+        {
+            simdRegToSimdRegMove = varTypeIsFloating(intrinsicTree);
+        }
+
+        // If we have an RMW intrinsic or an intrinsic with simple move semantic between two SIMD registers,
+        // we want to preference op1Reg to the target if op1 is not contained.
+        if (isRMW || simdRegToSimdRegMove)
         {
             tgtPrefOp1 = !intrin.op1->isContained();
         }
-- 
2.7.4
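Note (illustration appended for review, not part of the patch): the negation
table in lowerarmarch.cpp applies when an operand of the FusedMultiplyAddScalar
node is a Vector64.CreateScalarUnsafe/DuplicateToVector64 wrapping a GT_NEG. In
source terms, a call like the sketch below (helper name invented for the
example) should now lower to a single fmsub instead of an fneg followed by
fmadd: the x operand sits in the node's op3 position, so a negated x matches
the | + | + | - | row and the node is rewritten to
AdvSimd_FusedMultiplySubtractScalar.

    using System;

    static class FusedNegationSketch
    {
        // Importer builds FusedMultiplyAddScalar(Create(z), Create(y), Create(-x));
        // lowering strips the GT_NEG under Create(-x) and retypes the node to
        // FusedMultiplySubtractScalar, computing z - y * x.
        static float FusedSubtract(float x, float y, float z) => MathF.FusedMultiplyAdd(-x, y, z);
    }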