From: Tanner Gooding
Date: Wed, 7 Jun 2023 23:01:59 +0000 (-0700)
Subject: Improve codegen for AVX-512 comparisons and general handling of `TYP_MASK` (#87089)
X-Git-Tag: accepted/tizen/unified/riscv/20231226.055536~1782
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8685cacd46a484e2e861b44916e4fde09c781204;p=platform%2Fupstream%2Fdotnet%2Fruntime.git

Improve codegen for AVX-512 comparisons and general handling of `TYP_MASK` (#87089)

* Add the various base kmask instructions
* Update lowering to better handle the AVX512 comparisons that produce a mask
* Remove INS_Flags_IsMskSrcSrcEvexInstruction as it's the same as INS_Flags_IsDstDstSrcAVXInstruction
* Save 1 byte on the encoding for CompareEqualMask and signed CompareGreaterThanMask
* Apply formatting patch
* Ensure kmovd is encoded correctly and kmov is disassembled correctly
* Ensure vpcmpgtq is actually handled
* Fix the definition of a couple kmask instructions
* Ensure gtNewSimdCmpOpAll node creates correct IR for simdSize == 64
* Add `,` that were dropped when resolving merge
* Ensure that the new hwintrinsiclistxarch entries don't assert ValueNum
---

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 2fad742..e1586dc 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -18856,6 +18856,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_pcmpgtq: case INS_psadbw: case INS_vdbpsadbw: + case INS_vpcmpgtq: case INS_vpermps: case INS_vpermpd: case INS_vpermpd_reg: @@ -19290,10 +19291,6 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; } - case INS_kmovb_msk: - case INS_kmovw_msk: - case INS_kmovd_msk: - case INS_kmovq_msk: case INS_kmovb_gpr: case INS_kmovw_gpr: case INS_kmovd_gpr: @@ -19304,6 +19301,16 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; } + case INS_kmovb_msk: + case INS_kmovw_msk: + case INS_kmovd_msk: + case INS_kmovq_msk: + { + result.insLatency += PERFSCORE_LATENCY_1C; + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + break; + } + case INS_vpcmpb: case INS_vpcmpw: case INS_vpcmpd: @@ -19333,13 +19340,72 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; } + case INS_kandb: + case INS_kandd: + case INS_kandq: + case INS_kandw: + case INS_kandnb: + case INS_kandnd: + case INS_kandnq: + case INS_kandnw: + case INS_knotb: + case INS_knotd: + case INS_knotq: + case INS_knotw: + case INS_korb: + case INS_kord: + case INS_korq: + case INS_korw: + case INS_kxnorb: + case INS_kxnord: + case INS_kxnorq: + case INS_kxnorw: + case INS_kxorb: + case INS_kxord: + case INS_kxorq: + case INS_kxorw: + { + result.insLatency += PERFSCORE_LATENCY_1C; + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + break; + } + case INS_kortestb: - case INS_kortestw: case INS_kortestd: case INS_kortestq: + case INS_kortestw: + case INS_ktestb: + case INS_ktestd: + case INS_ktestq: + case INS_ktestw: { + // Keep these in a separate group as there isn't a documented latency + // Similar instructions have a 1 cycle latency, however + result.insLatency += PERFSCORE_LATENCY_1C; result.insThroughput = PERFSCORE_THROUGHPUT_1C; + + break; + } + + case INS_kaddb: + case INS_kaddd: + case INS_kaddq: + case INS_kaddw: + case INS_kshiftlb: + case INS_kshiftld: + case INS_kshiftlq: + case INS_kshiftlw: + case INS_kshiftrb: + case INS_kshiftrd: + case INS_kshiftrq: + case INS_kshiftrw: + case INS_kunpckbw: + case INS_kunpckdq: +
case INS_kunpckwd: + { + result.insLatency += PERFSCORE_LATENCY_4C; + result.insThroughput = PERFSCORE_THROUGHPUT_1C; break; } diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 6b314b0..43a2556 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20441,7 +20441,20 @@ GenTree* Compiler::gtNewSimdCmpOpNode( #if defined(TARGET_XARCH) case GT_EQ: { - if (simdSize == 32) + if (simdSize == 64) + { + assert(IsBaselineVector512IsaSupportedDebugOnly()); + + if (varTypeIsSmall(simdBaseType)) + { + intrinsic = NI_AVX512BW_CompareEqual; + } + else + { + intrinsic = NI_AVX512F_CompareEqual; + } + } + else if (simdSize == 32) { assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); @@ -20455,17 +20468,15 @@ GenTree* Compiler::gtNewSimdCmpOpNode( intrinsic = NI_AVX2_CompareEqual; } } - else if (simdSize == 64) - { - assert(IsBaselineVector512IsaSupportedDebugOnly()); - intrinsic = NI_AVX512F_CompareEqualSpecial; - } else if (simdBaseType == TYP_FLOAT) { + assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16)); intrinsic = NI_SSE_CompareEqual; } else if (varTypeIsLong(simdBaseType)) { + assert(simdSize == 16); + if (compOpportunisticallyDependsOn(InstructionSet_SSE41)) { intrinsic = NI_SSE41_CompareEqual; @@ -20494,6 +20505,7 @@ GenTree* Compiler::gtNewSimdCmpOpNode( } else { + assert(simdSize == 16); intrinsic = NI_SSE2_CompareEqual; } break; @@ -20501,6 +20513,37 @@ GenTree* Compiler::gtNewSimdCmpOpNode( case GT_GE: { + if (IsBaselineVector512IsaSupported()) + { + if (simdSize == 64) + { + if (varTypeIsSmall(simdBaseType)) + { + intrinsic = NI_AVX512BW_CompareGreaterThanOrEqual; + } + else + { + intrinsic = NI_AVX512F_CompareGreaterThanOrEqual; + } + break; + } + else if (!varTypeIsFloating(simdBaseType)) + { + assert((simdSize == 16) || (simdSize == 32)); + + if (varTypeIsSmall(simdBaseType)) + { + intrinsic = NI_AVX512BW_VL_CompareGreaterThanOrEqual; + } + else + { + intrinsic = NI_AVX512F_VL_CompareGreaterThanOrEqual; + } + + break; + } + } + if (simdSize == 32) { assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); @@ -20510,17 +20553,14 @@ GenTree* Compiler::gtNewSimdCmpOpNode( intrinsic = NI_AVX_CompareGreaterThanOrEqual; } } - else if (simdSize == 64) - { - assert(IsBaselineVector512IsaSupportedDebugOnly()); - intrinsic = NI_AVX512F_CompareGreaterThanOrEqualSpecial; - } else if (simdBaseType == TYP_FLOAT) { + assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16)); intrinsic = NI_SSE_CompareGreaterThanOrEqual; } else if (simdBaseType == TYP_DOUBLE) { + assert(simdSize == 16); intrinsic = NI_SSE2_CompareGreaterThanOrEqual; } @@ -20568,6 +20608,37 @@ GenTree* Compiler::gtNewSimdCmpOpNode( case GT_GT: { + if (IsBaselineVector512IsaSupported()) + { + if (simdSize == 64) + { + if (varTypeIsSmall(simdBaseType)) + { + intrinsic = NI_AVX512BW_CompareGreaterThan; + } + else + { + intrinsic = NI_AVX512F_CompareGreaterThan; + } + break; + } + else if (varTypeIsUnsigned(simdBaseType)) + { + assert((simdSize == 16) || (simdSize == 32)); + + if (varTypeIsSmall(simdBaseType)) + { + intrinsic = NI_AVX512BW_VL_CompareGreaterThan; + } + else + { + intrinsic = NI_AVX512F_VL_CompareGreaterThan; + } + + break; + } + } + if (varTypeIsUnsigned(simdBaseType)) { // Vector of byte, ushort, uint and ulong: @@ -20661,17 +20732,15 @@ GenTree* Compiler::gtNewSimdCmpOpNode( intrinsic = NI_AVX2_CompareGreaterThan; } } - else if (simdSize == 64) - { - assert(IsBaselineVector512IsaSupportedDebugOnly()); - intrinsic = NI_AVX512F_CompareGreaterThanSpecial; - } 
else if (simdBaseType == TYP_FLOAT) { + assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16)); intrinsic = NI_SSE_CompareGreaterThan; } else if (varTypeIsLong(simdBaseType)) { + assert(simdSize == 16); + if (compOpportunisticallyDependsOn(InstructionSet_SSE42)) { intrinsic = NI_SSE42_CompareGreaterThan; @@ -20733,6 +20802,7 @@ GenTree* Compiler::gtNewSimdCmpOpNode( } else { + assert(simdSize == 16); intrinsic = NI_SSE2_CompareGreaterThan; } break; @@ -20740,6 +20810,37 @@ GenTree* Compiler::gtNewSimdCmpOpNode( case GT_LE: { + if (IsBaselineVector512IsaSupported()) + { + if (simdSize == 64) + { + if (varTypeIsSmall(simdBaseType)) + { + intrinsic = NI_AVX512BW_CompareLessThanOrEqual; + } + else + { + intrinsic = NI_AVX512F_CompareLessThanOrEqual; + } + break; + } + else if (!varTypeIsFloating(simdBaseType)) + { + assert((simdSize == 16) || (simdSize == 32)); + + if (varTypeIsSmall(simdBaseType)) + { + intrinsic = NI_AVX512BW_VL_CompareLessThanOrEqual; + } + else + { + intrinsic = NI_AVX512F_VL_CompareLessThanOrEqual; + } + + break; + } + } + if (simdSize == 32) { assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); @@ -20749,17 +20850,14 @@ GenTree* Compiler::gtNewSimdCmpOpNode( intrinsic = NI_AVX_CompareLessThanOrEqual; } } - else if (simdSize == 64) - { - assert(IsBaselineVector512IsaSupportedDebugOnly()); - intrinsic = NI_AVX512F_CompareLessThanOrEqualSpecial; - } else if (simdBaseType == TYP_FLOAT) { + assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16)); intrinsic = NI_SSE_CompareLessThanOrEqual; } else if (simdBaseType == TYP_DOUBLE) { + assert(simdSize == 16); intrinsic = NI_SSE2_CompareLessThanOrEqual; } @@ -20807,6 +20905,37 @@ GenTree* Compiler::gtNewSimdCmpOpNode( case GT_LT: { + if (IsBaselineVector512IsaSupported()) + { + if (simdSize == 64) + { + if (varTypeIsSmall(simdBaseType)) + { + intrinsic = NI_AVX512BW_CompareLessThan; + } + else + { + intrinsic = NI_AVX512F_CompareLessThan; + } + break; + } + else if (varTypeIsUnsigned(simdBaseType)) + { + assert((simdSize == 16) || (simdSize == 32)); + + if (varTypeIsSmall(simdBaseType)) + { + intrinsic = NI_AVX512BW_VL_CompareLessThan; + } + else + { + intrinsic = NI_AVX512F_VL_CompareLessThan; + } + + break; + } + } + if (varTypeIsUnsigned(simdBaseType)) { // Vector of byte, ushort, uint and ulong: @@ -20900,17 +21029,15 @@ GenTree* Compiler::gtNewSimdCmpOpNode( intrinsic = NI_AVX2_CompareLessThan; } } - else if (simdSize == 64) - { - assert(IsBaselineVector512IsaSupportedDebugOnly()); - intrinsic = NI_AVX512F_CompareLessThanSpecial; - } else if (simdBaseType == TYP_FLOAT) { + assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16)); intrinsic = NI_SSE_CompareLessThan; } else if (varTypeIsLong(simdBaseType)) { + assert(simdSize == 16); + if (compOpportunisticallyDependsOn(InstructionSet_SSE42)) { intrinsic = NI_SSE42_CompareLessThan; @@ -20972,6 +21099,7 @@ GenTree* Compiler::gtNewSimdCmpOpNode( } else { + assert(simdSize == 16); intrinsic = NI_SSE2_CompareLessThan; } break; @@ -21057,15 +21185,7 @@ GenTree* Compiler::gtNewSimdCmpOpNode( assert(intrinsic != NI_Illegal); #if defined(TARGET_XARCH) - if (simdSize != 64) - { - return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize); - } - else - { - GenTree* cmp = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, intrinsic, simdBaseJitType, simdSize); - return gtNewSimdHWIntrinsicNode(type, cmp, NI_AVX512F_MoveMaskToVectorSpecial, simdBaseJitType, simdSize); - } + return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, 
simdBaseJitType, simdSize); #else return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize); #endif @@ -21116,125 +21236,8 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode( } case GT_GE: - { - // We want to generate a comparison along the lines of - // GT_XX(op1, op2).As() == Vector128.AllBitsSet - - if (simdSize == 32) - { - // TODO-XArch-CQ: It's a non-trivial amount of work to support these - // for floating-point while only utilizing AVX. It would require, among - // other things, inverting the comparison and potentially support for a - // new Avx.TestNotZ intrinsic to ensure the codegen remains efficient. - assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); - intrinsic = NI_Vector256_op_Equality; - } - else if (simdSize == 64) - { - assert(IsBaselineVector512IsaSupportedDebugOnly()); - intrinsic = NI_Vector512_GreaterThanOrEqualAll; - break; - } - else - { - intrinsic = NI_Vector128_op_Equality; - } - - op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize); - op2 = gtNewAllBitsSetConNode(simdType); - - if (simdBaseType == TYP_FLOAT) - { - simdBaseType = TYP_INT; - simdBaseJitType = CORINFO_TYPE_INT; - } - else if (simdBaseType == TYP_DOUBLE) - { - simdBaseType = TYP_LONG; - simdBaseJitType = CORINFO_TYPE_LONG; - } - break; - } case GT_GT: - { - // We want to generate a comparison along the lines of - // GT_XX(op1, op2).As() == Vector128.AllBitsSet - - if (simdSize == 32) - { - // TODO-XArch-CQ: It's a non-trivial amount of work to support these - // for floating-point while only utilizing AVX. It would require, among - // other things, inverting the comparison and potentially support for a - // new Avx.TestNotZ intrinsic to ensure the codegen remains efficient. - assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); - intrinsic = NI_Vector256_op_Equality; - } - else if (simdSize == 64) - { - assert(IsBaselineVector512IsaSupportedDebugOnly()); - intrinsic = NI_Vector512_GreaterThanAll; - break; - } - else - { - intrinsic = NI_Vector128_op_Equality; - } - - op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize); - op2 = gtNewAllBitsSetConNode(simdType); - - if (simdBaseType == TYP_FLOAT) - { - simdBaseType = TYP_INT; - simdBaseJitType = CORINFO_TYPE_INT; - } - else if (simdBaseType == TYP_DOUBLE) - { - simdBaseType = TYP_LONG; - simdBaseJitType = CORINFO_TYPE_LONG; - } - break; - } case GT_LE: - { - // We want to generate a comparison along the lines of - // GT_XX(op1, op2).As() == Vector128.AllBitsSet - - if (simdSize == 32) - { - // TODO-XArch-CQ: It's a non-trivial amount of work to support these - // for floating-point while only utilizing AVX. It would require, among - // other things, inverting the comparison and potentially support for a - // new Avx.TestNotZ intrinsic to ensure the codegen remains efficient. 
- assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); - intrinsic = NI_Vector256_op_Equality; - } - else if (simdSize == 64) - { - assert(IsBaselineVector512IsaSupportedDebugOnly()); - intrinsic = NI_Vector512_LessThanOrEqualAll; - break; - } - else - { - intrinsic = NI_Vector128_op_Equality; - } - - op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize); - op2 = gtNewAllBitsSetConNode(simdType); - - if (simdBaseType == TYP_FLOAT) - { - simdBaseType = TYP_INT; - simdBaseJitType = CORINFO_TYPE_INT; - } - else if (simdBaseType == TYP_DOUBLE) - { - simdBaseType = TYP_LONG; - simdBaseJitType = CORINFO_TYPE_LONG; - } - break; - } case GT_LT: { // We want to generate a comparison along the lines of @@ -21252,8 +21255,7 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode( else if (simdSize == 64) { assert(IsBaselineVector512IsaSupportedDebugOnly()); - intrinsic = NI_Vector512_LessThanAll; - break; + intrinsic = NI_Vector512_op_Equality; } else { @@ -21494,14 +21496,12 @@ GenTree* Compiler::gtNewSimdCndSelNode( return gtNewSimdTernaryLogicNode(type, op1, op2, op3, control, simdBaseJitType, simdSize); } + assert(simdSize != 64); + if (simdSize == 32) { intrinsic = NI_Vector256_ConditionalSelect; } - else if (simdSize == 64) - { - intrinsic = NI_Vector512_ConditionalSelect; - } else { intrinsic = NI_Vector128_ConditionalSelect; @@ -21931,6 +21931,7 @@ GenTree* Compiler::gtNewSimdDotProdNode( #if defined(TARGET_XARCH) assert(!varTypeIsByte(simdBaseType) && !varTypeIsLong(simdBaseType)); + assert(simdSize != 64); if (simdSize == 32) { @@ -23827,6 +23828,7 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, CorInfoType si #if defined(TARGET_XARCH) assert(!varTypeIsByte(simdBaseType) && !varTypeIsLong(simdBaseType)); + assert(simdSize != 64); // HorizontalAdd combines pairs so we need log2(vectorLength) passes to sum all elements together. unsigned vectorLength = getSIMDVectorLength(simdSize, simdBaseType); @@ -24795,6 +24797,25 @@ GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode( } //------------------------------------------------------------------------ +// OperIsHWIntrinsic: Is this a hwintrinsic with the specified id +// +// Arguments: +// intrinsicId -- the id to compare with the current node +// +// Return Value: +// true if the node is a hwintrinsic intrinsic with the specified id +// otherwise; false +// +bool GenTree::OperIsHWIntrinsic(NamedIntrinsic intrinsicId) const +{ + if (OperIsHWIntrinsic()) + { + return AsHWIntrinsic()->GetHWIntrinsicId() == intrinsicId; + } + return false; +} + +//------------------------------------------------------------------------ // OperIsMemoryLoad: Does this HWI node have memory load semantics? // // Arguments: diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 08bc535..4483cd7 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -1651,6 +1651,8 @@ public: return OperIsHWIntrinsic(gtOper); } + bool OperIsHWIntrinsic(NamedIntrinsic intrinsicId) const; + // This is here for cleaner GT_LONG #ifdefs. 
static bool OperIsLong(genTreeOps gtOper) { diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index f30f622..01b8867 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -330,6 +330,25 @@ enum class FloatRoundingMode : uint8_t NoException = 0x08, }; +enum class IntComparisonMode : uint8_t +{ + Equal = 0, + LessThan = 1, + LessThanOrEqual = 2, + False = 3, + + NotEqual = 4, + GreaterThanOrEqual = 5, + GreaterThan = 6, + True = 7, + + NotGreaterThanOrEqual = LessThan, + NotGreaterThan = LessThanOrEqual, + + NotLessThan = GreaterThanOrEqual, + NotLessThanOrEqual = GreaterThan +}; + enum class TernaryLogicUseFlags : uint8_t { // Indicates no flags are present @@ -493,217 +512,7 @@ struct HWIntrinsicInfo } #ifdef TARGET_XARCH - static int lookupIval(NamedIntrinsic id, bool opportunisticallyDependsOnAVX) - { - switch (id) - { - case NI_SSE_CompareEqual: - case NI_SSE_CompareScalarEqual: - case NI_SSE2_CompareEqual: - case NI_SSE2_CompareScalarEqual: - case NI_AVX_CompareEqual: - { - return static_cast(FloatComparisonMode::OrderedEqualNonSignaling); - } - - case NI_SSE_CompareGreaterThan: - case NI_SSE_CompareScalarGreaterThan: - case NI_SSE2_CompareGreaterThan: - case NI_SSE2_CompareScalarGreaterThan: - case NI_AVX_CompareGreaterThan: - { - if (opportunisticallyDependsOnAVX) - { - return static_cast(FloatComparisonMode::OrderedGreaterThanSignaling); - } - - // CompareGreaterThan is not directly supported in hardware without AVX support. - // We will return the inverted case here and lowering will itself swap the ops - // to ensure the emitted code remains correct. This simplifies the overall logic - // here and for other use cases. - - assert(id != NI_AVX_CompareGreaterThan); - return static_cast(FloatComparisonMode::OrderedLessThanSignaling); - } - - case NI_SSE_CompareLessThan: - case NI_SSE_CompareScalarLessThan: - case NI_SSE2_CompareLessThan: - case NI_SSE2_CompareScalarLessThan: - case NI_AVX_CompareLessThan: - { - return static_cast(FloatComparisonMode::OrderedLessThanSignaling); - } - - case NI_SSE_CompareGreaterThanOrEqual: - case NI_SSE_CompareScalarGreaterThanOrEqual: - case NI_SSE2_CompareGreaterThanOrEqual: - case NI_SSE2_CompareScalarGreaterThanOrEqual: - case NI_AVX_CompareGreaterThanOrEqual: - { - if (opportunisticallyDependsOnAVX) - { - return static_cast(FloatComparisonMode::OrderedGreaterThanOrEqualSignaling); - } - - // CompareGreaterThanOrEqual is not directly supported in hardware without AVX support. - // We will return the inverted case here and lowering will itself swap the ops - // to ensure the emitted code remains correct. This simplifies the overall logic - // here and for other use cases. 
- - assert(id != NI_AVX_CompareGreaterThanOrEqual); - return static_cast(FloatComparisonMode::OrderedLessThanOrEqualSignaling); - } - - case NI_SSE_CompareLessThanOrEqual: - case NI_SSE_CompareScalarLessThanOrEqual: - case NI_SSE2_CompareLessThanOrEqual: - case NI_SSE2_CompareScalarLessThanOrEqual: - case NI_AVX_CompareLessThanOrEqual: - { - return static_cast(FloatComparisonMode::OrderedLessThanOrEqualSignaling); - } - - case NI_SSE_CompareNotEqual: - case NI_SSE_CompareScalarNotEqual: - case NI_SSE2_CompareNotEqual: - case NI_SSE2_CompareScalarNotEqual: - case NI_AVX_CompareNotEqual: - { - return static_cast(FloatComparisonMode::UnorderedNotEqualNonSignaling); - } - - case NI_SSE_CompareNotGreaterThan: - case NI_SSE_CompareScalarNotGreaterThan: - case NI_SSE2_CompareNotGreaterThan: - case NI_SSE2_CompareScalarNotGreaterThan: - case NI_AVX_CompareNotGreaterThan: - { - if (opportunisticallyDependsOnAVX) - { - return static_cast(FloatComparisonMode::UnorderedNotGreaterThanSignaling); - } - - // CompareNotGreaterThan is not directly supported in hardware without AVX support. - // We will return the inverted case here and lowering will itself swap the ops - // to ensure the emitted code remains correct. This simplifies the overall logic - // here and for other use cases. - - assert(id != NI_AVX_CompareNotGreaterThan); - return static_cast(FloatComparisonMode::UnorderedNotLessThanSignaling); - } - - case NI_SSE_CompareNotLessThan: - case NI_SSE_CompareScalarNotLessThan: - case NI_SSE2_CompareNotLessThan: - case NI_SSE2_CompareScalarNotLessThan: - case NI_AVX_CompareNotLessThan: - { - return static_cast(FloatComparisonMode::UnorderedNotLessThanSignaling); - } - - case NI_SSE_CompareNotGreaterThanOrEqual: - case NI_SSE_CompareScalarNotGreaterThanOrEqual: - case NI_SSE2_CompareNotGreaterThanOrEqual: - case NI_SSE2_CompareScalarNotGreaterThanOrEqual: - case NI_AVX_CompareNotGreaterThanOrEqual: - { - if (opportunisticallyDependsOnAVX) - { - return static_cast(FloatComparisonMode::UnorderedNotGreaterThanOrEqualSignaling); - } - - // CompareNotGreaterThanOrEqual is not directly supported in hardware without AVX support. - // We will return the inverted case here and lowering will itself swap the ops - // to ensure the emitted code remains correct. This simplifies the overall logic - // here and for other use cases. 
- - assert(id != NI_AVX_CompareNotGreaterThanOrEqual); - return static_cast(FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling); - } - - case NI_SSE_CompareNotLessThanOrEqual: - case NI_SSE_CompareScalarNotLessThanOrEqual: - case NI_SSE2_CompareNotLessThanOrEqual: - case NI_SSE2_CompareScalarNotLessThanOrEqual: - case NI_AVX_CompareNotLessThanOrEqual: - { - return static_cast(FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling); - } - - case NI_SSE_CompareOrdered: - case NI_SSE_CompareScalarOrdered: - case NI_SSE2_CompareOrdered: - case NI_SSE2_CompareScalarOrdered: - case NI_AVX_CompareOrdered: - { - return static_cast(FloatComparisonMode::OrderedNonSignaling); - } - - case NI_SSE_CompareUnordered: - case NI_SSE_CompareScalarUnordered: - case NI_SSE2_CompareUnordered: - case NI_SSE2_CompareScalarUnordered: - case NI_AVX_CompareUnordered: - { - return static_cast(FloatComparisonMode::UnorderedNonSignaling); - } - - case NI_SSE41_Ceiling: - case NI_SSE41_CeilingScalar: - case NI_AVX_Ceiling: - { - FALLTHROUGH; - } - - case NI_SSE41_RoundToPositiveInfinity: - case NI_SSE41_RoundToPositiveInfinityScalar: - case NI_AVX_RoundToPositiveInfinity: - { - return static_cast(FloatRoundingMode::ToPositiveInfinity); - } - - case NI_SSE41_Floor: - case NI_SSE41_FloorScalar: - case NI_AVX_Floor: - { - FALLTHROUGH; - } - - case NI_SSE41_RoundToNegativeInfinity: - case NI_SSE41_RoundToNegativeInfinityScalar: - case NI_AVX_RoundToNegativeInfinity: - { - return static_cast(FloatRoundingMode::ToNegativeInfinity); - } - - case NI_SSE41_RoundCurrentDirection: - case NI_SSE41_RoundCurrentDirectionScalar: - case NI_AVX_RoundCurrentDirection: - { - return static_cast(FloatRoundingMode::CurrentDirection); - } - - case NI_SSE41_RoundToNearestInteger: - case NI_SSE41_RoundToNearestIntegerScalar: - case NI_AVX_RoundToNearestInteger: - { - return static_cast(FloatRoundingMode::ToNearestInteger); - } - - case NI_SSE41_RoundToZero: - case NI_SSE41_RoundToZeroScalar: - case NI_AVX_RoundToZero: - { - return static_cast(FloatRoundingMode::ToZero); - } - - default: - { - return -1; - } - } - } + static int lookupIval(Compiler* comp, NamedIntrinsic id, var_types simdBaseType); #endif static bool tryLookupSimdSize(NamedIntrinsic id, unsigned* pSimdSize) diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 973d417..7570765 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -97,9 +97,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) // We need to validate that other phases of the compiler haven't introduced unsupported intrinsics assert(compiler->compIsaSupportedDebugOnly(isa)); - - int ival = HWIntrinsicInfo::lookupIval(intrinsicId, compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)); - assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId)); if (genIsTableDrivenHWIntrinsic(intrinsicId, category)) @@ -119,12 +116,15 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) emitter* emit = GetEmitter(); assert(numArgs >= 0); + instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); assert(ins != INS_invalid); - emitAttr simdSize = emitActualTypeSize(Compiler::getSIMDTypeForSize(node->GetSimdSize())); + emitAttr simdSize = emitActualTypeSize(Compiler::getSIMDTypeForSize(node->GetSimdSize())); assert(simdSize != 0); + int ival = HWIntrinsicInfo::lookupIval(compiler, intrinsicId, baseType); + switch (numArgs) { case 1: @@ -144,7 +144,7 @@ void 
CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) genConsumeRegs(op1); op1Reg = op1->GetRegNum(); - if ((ival != -1) && varTypeIsFloating(baseType)) + if (ival != -1) { assert((ival >= 0) && (ival <= 127)); if (HWIntrinsicInfo::CopiesUpperBits(intrinsicId)) @@ -208,7 +208,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) op1Reg = targetReg; } - if ((ival != -1) && varTypeIsFloating(baseType)) + if (ival != -1) { assert((ival >= 0) && (ival <= 127)); genHWIntrinsic_R_R_RM_I(node, ins, simdSize, static_cast(ival)); @@ -235,7 +235,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) } else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2)) { - assert(ival == -1); auto emitSwCase = [&](int8_t i) { if (HWIntrinsicInfo::CopiesUpperBits(intrinsicId)) { @@ -293,10 +292,10 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) genConsumeRegs(op3); op3Reg = op3->GetRegNum(); + assert(ival == -1); + if (HWIntrinsicInfo::isImmOp(intrinsicId, op3)) { - assert(ival == -1); - auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, simdSize, i); }; if (op3->IsCnsIntOrI()) @@ -384,10 +383,10 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) genConsumeRegs(op4); op4Reg = op4->GetRegNum(); + assert(ival == -1); + if (HWIntrinsicInfo::isImmOp(intrinsicId, op4)) { - assert(ival == -1); - auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_R_RM_I(node, ins, simdSize, i); }; if (op4->IsCnsIntOrI()) @@ -455,6 +454,8 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case InstructionSet_AVX512F_X64: case InstructionSet_AVX512BW: case InstructionSet_AVX512BW_VL: + case InstructionSet_AVX512VBMI: + case InstructionSet_AVX512VBMI_VL: genAvxFamilyIntrinsic(node); break; case InstructionSet_AES: @@ -1913,6 +1914,7 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) instruction maskIns; instruction kmovIns; + emitAttr kmovAttr = EA_4BYTE; // TODO-XARCH-AVX512 note that this type/kmov combination assumes 512-bit vector types but would change // if used for other vector lengths, i.e., TYPE_BYTE requires kmovq for for 512-bit vector, but kmovd @@ -1921,130 +1923,78 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) { case TYP_BYTE: case TYP_UBYTE: - maskIns = INS_vpmovb2m; - kmovIns = INS_kmovq_gpr; + { + maskIns = INS_vpmovb2m; + kmovIns = INS_kmovq_gpr; + kmovAttr = EA_8BYTE; break; + } + case TYP_SHORT: case TYP_USHORT: + { maskIns = INS_vpmovw2m; kmovIns = INS_kmovd_gpr; break; + } + case TYP_INT: case TYP_UINT: case TYP_FLOAT: + { maskIns = INS_vpmovd2m; kmovIns = INS_kmovw_gpr; break; + } + case TYP_DOUBLE: case TYP_LONG: case TYP_ULONG: + { maskIns = INS_vpmovq2m; kmovIns = INS_kmovb_gpr; break; + } + default: + { unreached(); + } } assert(emitter::isMaskReg(maskReg)); emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg); - emit->emitIns_Mov(kmovIns, EA_8BYTE, targetReg, maskReg, INS_FLAGS_DONT_CARE); - break; - } - - case NI_AVX512F_CompareEqualSpecial: - { - GenTree* op2 = node->Op(2); - op1Reg = op1->GetRegNum(); - regNumber op2Reg = op2->GetRegNum(); - - instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareEqualSpecial, baseType); - - assert(compareIns != INS_invalid); - assert(emitter::isMaskReg(targetReg)); - - emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 0); + emit->emitIns_Mov(kmovIns, kmovAttr, targetReg, maskReg, INS_FLAGS_DONT_CARE); break; } - case NI_AVX512F_CompareGreaterThanOrEqualSpecial: - { - GenTree* op2 = node->Op(2); - op1Reg = op1->GetRegNum(); - regNumber op2Reg = op2->GetRegNum(); - - 
instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareGreaterThanOrEqualSpecial, baseType); - - assert(compareIns != INS_invalid); - assert(emitter::isMaskReg(targetReg)); - - emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 5); - break; - } - - case NI_AVX512F_CompareGreaterThanSpecial: - { - GenTree* op2 = node->Op(2); - op1Reg = op1->GetRegNum(); - regNumber op2Reg = op2->GetRegNum(); - - instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareGreaterThanSpecial, baseType); - - assert(compareIns != INS_invalid); - assert(emitter::isMaskReg(targetReg)); - - emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 6); - break; - } - - case NI_AVX512F_CompareLessThanOrEqualSpecial: - { - GenTree* op2 = node->Op(2); - op1Reg = op1->GetRegNum(); - regNumber op2Reg = op2->GetRegNum(); - - instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareLessThanOrEqualSpecial, baseType); - - assert(compareIns != INS_invalid); - assert(emitter::isMaskReg(targetReg)); - - emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 2); - break; - } - - case NI_AVX512F_CompareLessThanSpecial: - { - GenTree* op2 = node->Op(2); - op1Reg = op1->GetRegNum(); - regNumber op2Reg = op2->GetRegNum(); - - instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareLessThanSpecial, baseType); - - assert(compareIns != INS_invalid); - assert(emitter::isMaskReg(targetReg)); - - emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 1); - break; - } - - case NI_AVX512F_MoveMaskToVectorSpecial: + case NI_AVX512F_KORTEST: { op1Reg = op1->GetRegNum(); - instruction maskMovIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_MoveMaskToVectorSpecial, baseType); + uint32_t simdSize = node->GetSimdSize(); + uint32_t count = simdSize / genTypeSize(baseType); - assert(maskMovIns != INS_invalid); - assert(emitter::isMaskReg(op1Reg)); + instruction testIns; - emit->emitIns_R_R(maskMovIns, attr, targetReg, op1Reg); - break; - } - - case NI_AVX512F_KORTEST: - { - op1Reg = op1->GetRegNum(); - - instruction testIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_KORTEST, baseType); + if (count <= 8) + { + testIns = INS_kortestb; + } + else if (count == 16) + { + testIns = INS_kortestw; + } + else if (count == 32) + { + testIns = INS_kortestd; + } + else + { + assert(count == 64); + testIns = INS_kortestq; + } assert(testIns != INS_invalid); assert(emitter::isMaskReg(op1Reg)); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 66ad4be..e1649b2 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -836,6 +836,12 @@ HARDWARE_INTRINSIC(AVX512F, AndNot, HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_MemoryLoad, HW_Flag_NoFlag) 
+HARDWARE_INTRINSIC(AVX512F, CompareEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpeqd, INS_vpcmpeqd, INS_vpcmpeqq, INS_vpcmpeqq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpgtd, INS_vpcmpud, INS_vpcmpgtq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareLessThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareNotEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Double, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Single, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtss2usi, INS_vcvtsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) @@ -943,6 +949,11 @@ HARDWARE_INTRINSIC(AVX512F_VL, AlignRight32, HARDWARE_INTRINSIC(AVX512F_VL, AlignRight64, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F_VL, Max, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaxsq, INS_vpmaxuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512F_VL, Min, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpminsq, INS_vpminuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX512F_VL, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpud, INS_invalid, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F_VL, CompareGreaterThanOrEqual, 
-1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F_VL, CompareLessThan, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F_VL, CompareLessThanOrEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F_VL, CompareNotEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128Byte, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdb, INS_vpmovdb, INS_vpmovqb, INS_vpmovqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128ByteWithSaturation, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovusdb, INS_invalid, INS_vpmovusqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128Double, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2pd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) @@ -1004,6 +1015,12 @@ HARDWARE_INTRINSIC(AVX512BW, AddSaturate, HARDWARE_INTRINSIC(AVX512BW, AlignRight, 64, 3, false, {INS_palignr, INS_palignr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512BW, Average, 64, 2, true, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512BW, BroadcastScalarToVector512, 64, 1, true, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(AVX512BW, CompareEqual, 64, 2, true, {INS_vpcmpeqb, INS_vpcmpeqb, INS_vpcmpeqw, INS_vpcmpeqw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW, CompareGreaterThan, 64, 2, true, {INS_vpcmpgtb, INS_vpcmpub, INS_vpcmpgtw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW, CompareGreaterThanOrEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW, CompareLessThan, 64, 2, true, 
{INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW, CompareLessThanOrEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW, CompareNotEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256Byte, 64, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256ByteWithSaturation, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_vpmovuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256SByte, 64, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) @@ -1045,6 +1062,11 @@ HARDWARE_INTRINSIC(AVX512BW, UnpackLow, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512BW.VL Intrinsics +HARDWARE_INTRINSIC(AVX512BW_VL, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_vpcmpub, INS_invalid, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareGreaterThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareLessThan, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareLessThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareNotEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, 
HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(AVX512BW_VL, ConvertToVector128Byte, -1, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512BW_VL, ConvertToVector128ByteWithSaturation, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_vpmovuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512BW_VL, ConvertToVector128SByte, -1, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) @@ -1275,16 +1297,17 @@ HARDWARE_INTRINSIC(SSE, UCOMISS, HARDWARE_INTRINSIC(SSE2, COMISD, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, UCOMISD, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, false, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, false, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, false, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX512F, KORTEST, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment) -HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX512F, CompareEqualSpecial, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqualSpecial, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanSpecial, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, 
HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqualSpecial, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX512F, CompareLessThanSpecial, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX512F, MoveMaskToVectorSpecial, 64, 1, true, {INS_vpmovm2b, INS_vpmovm2b, INS_vpmovm2w, INS_vpmovm2w, INS_vpmovm2d, INS_vpmovm2d, INS_vpmovm2q, INS_vpmovm2q, INS_vpmovm2d, INS_vpmovm2q}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX512F, KORTEST, 0, 1, false, {INS_kortestq, INS_kortestq, INS_kortestd, INS_kortestd, INS_kortestw, INS_kortestw, INS_kortestb, INS_kortestb, INS_kortestw, INS_kortestb}, HW_Category_Special, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512F, CompareEqualMask, -1, 2, true, {INS_vpcmpeqb, INS_vpcmpeqb, INS_vpcmpeqw, INS_vpcmpeqw, INS_vpcmpeqd, INS_vpcmpeqd, INS_vpcmpeqq, INS_vpcmpeqq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanMask, -1, 2, true, {INS_vpcmpgtb, INS_vpcmpub, INS_vpcmpgtw, INS_vpcmpuw, INS_vpcmpgtd, INS_vpcmpud, INS_vpcmpgtq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqualMask, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512F, CompareLessThanMask, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqualMask, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512F, CompareNotEqualMask, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512F, ConvertMaskToVector, -1, 1, true, {INS_vpmovm2b, INS_vpmovm2b, INS_vpmovm2w, INS_vpmovm2w, INS_vpmovm2d, INS_vpmovm2d, INS_vpmovm2q, INS_vpmovm2q, INS_vpmovm2d, INS_vpmovm2q}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment) #endif // FEATURE_HW_INTRINSIC diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 9c38e45..dadabad 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ 
b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -554,6 +554,302 @@ bool HWIntrinsicInfo::isScalarIsa(CORINFO_InstructionSet isa) } //------------------------------------------------------------------------ +// lookupIval: Gets a the implicit immediate value for the given intrinsic +// +// Arguments: +// comp - The compiler +// id - The intrinsic for which to get the ival +// simdBaseType - The base type for the intrinsic +// +// Return Value: +// The immediate value for the given intrinsic or -1 if none exists +int HWIntrinsicInfo::lookupIval(Compiler* comp, NamedIntrinsic id, var_types simdBaseType) +{ + switch (id) + { + case NI_SSE_CompareEqual: + case NI_SSE_CompareScalarEqual: + case NI_SSE2_CompareEqual: + case NI_SSE2_CompareScalarEqual: + case NI_AVX_CompareEqual: + case NI_AVX512F_CompareEqualMask: + { + if (varTypeIsFloating(simdBaseType)) + { + return static_cast(FloatComparisonMode::OrderedEqualNonSignaling); + } + else + { + // We can emit `vpcmpeqb`, `vpcmpeqw`, `vpcmpeqd`, or `vpcmpeqq` + } + break; + } + + case NI_SSE_CompareGreaterThan: + case NI_SSE_CompareScalarGreaterThan: + case NI_SSE2_CompareGreaterThan: + case NI_SSE2_CompareScalarGreaterThan: + case NI_AVX_CompareGreaterThan: + case NI_AVX512F_CompareGreaterThanMask: + { + if (varTypeIsFloating(simdBaseType)) + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX)) + { + return static_cast(FloatComparisonMode::OrderedGreaterThanSignaling); + } + + // CompareGreaterThan is not directly supported in hardware without AVX support. + // We will return the inverted case here and lowering will itself swap the ops + // to ensure the emitted code remains correct. This simplifies the overall logic + // here and for other use cases. + + assert(id != NI_AVX_CompareGreaterThan); + return static_cast(FloatComparisonMode::OrderedLessThanSignaling); + } + else if ((id == NI_AVX512F_CompareGreaterThanMask) && varTypeIsUnsigned(simdBaseType)) + { + // TODO-XARCH-CQ: Allow the other integer paths to use the EVEX encoding + return static_cast(IntComparisonMode::GreaterThan); + } + break; + } + + case NI_SSE_CompareLessThan: + case NI_SSE_CompareScalarLessThan: + case NI_SSE2_CompareLessThan: + case NI_SSE2_CompareScalarLessThan: + case NI_AVX_CompareLessThan: + case NI_AVX512F_CompareLessThanMask: + { + if (varTypeIsFloating(simdBaseType)) + { + return static_cast(FloatComparisonMode::OrderedLessThanSignaling); + } + else if (id == NI_AVX512F_CompareLessThanMask) + { + // TODO-XARCH-CQ: Allow the other integer paths to use the EVEX encoding + return static_cast(IntComparisonMode::LessThan); + } + break; + } + + case NI_SSE_CompareGreaterThanOrEqual: + case NI_SSE_CompareScalarGreaterThanOrEqual: + case NI_SSE2_CompareGreaterThanOrEqual: + case NI_SSE2_CompareScalarGreaterThanOrEqual: + case NI_AVX_CompareGreaterThanOrEqual: + case NI_AVX512F_CompareGreaterThanOrEqualMask: + { + if (varTypeIsFloating(simdBaseType)) + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX)) + { + return static_cast(FloatComparisonMode::OrderedGreaterThanOrEqualSignaling); + } + + // CompareGreaterThanOrEqual is not directly supported in hardware without AVX support. + // We will return the inverted case here and lowering will itself swap the ops + // to ensure the emitted code remains correct. This simplifies the overall logic + // here and for other use cases. 
+ + assert(id != NI_AVX_CompareGreaterThanOrEqual); + return static_cast(FloatComparisonMode::OrderedLessThanOrEqualSignaling); + } + else + { + assert(id == NI_AVX512F_CompareGreaterThanOrEqualMask); + return static_cast(IntComparisonMode::GreaterThanOrEqual); + } + break; + } + + case NI_SSE_CompareLessThanOrEqual: + case NI_SSE_CompareScalarLessThanOrEqual: + case NI_SSE2_CompareLessThanOrEqual: + case NI_SSE2_CompareScalarLessThanOrEqual: + case NI_AVX_CompareLessThanOrEqual: + case NI_AVX512F_CompareLessThanOrEqualMask: + { + if (varTypeIsFloating(simdBaseType)) + { + return static_cast(FloatComparisonMode::OrderedLessThanOrEqualSignaling); + } + else + { + assert(id == NI_AVX512F_CompareLessThanOrEqualMask); + return static_cast(IntComparisonMode::LessThanOrEqual); + } + break; + } + + case NI_SSE_CompareNotEqual: + case NI_SSE_CompareScalarNotEqual: + case NI_SSE2_CompareNotEqual: + case NI_SSE2_CompareScalarNotEqual: + case NI_AVX_CompareNotEqual: + case NI_AVX512F_CompareNotEqualMask: + { + if (varTypeIsFloating(simdBaseType)) + { + return static_cast(FloatComparisonMode::UnorderedNotEqualNonSignaling); + } + else + { + assert(id == NI_AVX512F_CompareNotEqualMask); + return static_cast(IntComparisonMode::NotEqual); + } + break; + } + + case NI_SSE_CompareNotGreaterThan: + case NI_SSE_CompareScalarNotGreaterThan: + case NI_SSE2_CompareNotGreaterThan: + case NI_SSE2_CompareScalarNotGreaterThan: + case NI_AVX_CompareNotGreaterThan: + { + assert(varTypeIsFloating(simdBaseType)); + + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX)) + { + return static_cast(FloatComparisonMode::UnorderedNotGreaterThanSignaling); + } + + // CompareNotGreaterThan is not directly supported in hardware without AVX support. + // We will return the inverted case here and lowering will itself swap the ops + // to ensure the emitted code remains correct. This simplifies the overall logic + // here and for other use cases. + + assert(id != NI_AVX_CompareNotGreaterThan); + return static_cast(FloatComparisonMode::UnorderedNotLessThanSignaling); + } + + case NI_SSE_CompareNotLessThan: + case NI_SSE_CompareScalarNotLessThan: + case NI_SSE2_CompareNotLessThan: + case NI_SSE2_CompareScalarNotLessThan: + case NI_AVX_CompareNotLessThan: + { + assert(varTypeIsFloating(simdBaseType)); + return static_cast(FloatComparisonMode::UnorderedNotLessThanSignaling); + } + + case NI_SSE_CompareNotGreaterThanOrEqual: + case NI_SSE_CompareScalarNotGreaterThanOrEqual: + case NI_SSE2_CompareNotGreaterThanOrEqual: + case NI_SSE2_CompareScalarNotGreaterThanOrEqual: + case NI_AVX_CompareNotGreaterThanOrEqual: + { + assert(varTypeIsFloating(simdBaseType)); + + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX)) + { + return static_cast(FloatComparisonMode::UnorderedNotGreaterThanOrEqualSignaling); + } + + // CompareNotGreaterThanOrEqual is not directly supported in hardware without AVX support. + // We will return the inverted case here and lowering will itself swap the ops + // to ensure the emitted code remains correct. This simplifies the overall logic + // here and for other use cases. 
+ + assert(id != NI_AVX_CompareNotGreaterThanOrEqual); + return static_cast(FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling); + } + + case NI_SSE_CompareNotLessThanOrEqual: + case NI_SSE_CompareScalarNotLessThanOrEqual: + case NI_SSE2_CompareNotLessThanOrEqual: + case NI_SSE2_CompareScalarNotLessThanOrEqual: + case NI_AVX_CompareNotLessThanOrEqual: + { + assert(varTypeIsFloating(simdBaseType)); + return static_cast(FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling); + } + + case NI_SSE_CompareOrdered: + case NI_SSE_CompareScalarOrdered: + case NI_SSE2_CompareOrdered: + case NI_SSE2_CompareScalarOrdered: + case NI_AVX_CompareOrdered: + { + assert(varTypeIsFloating(simdBaseType)); + return static_cast(FloatComparisonMode::OrderedNonSignaling); + } + + case NI_SSE_CompareUnordered: + case NI_SSE_CompareScalarUnordered: + case NI_SSE2_CompareUnordered: + case NI_SSE2_CompareScalarUnordered: + case NI_AVX_CompareUnordered: + { + assert(varTypeIsFloating(simdBaseType)); + return static_cast(FloatComparisonMode::UnorderedNonSignaling); + } + + case NI_SSE41_Ceiling: + case NI_SSE41_CeilingScalar: + case NI_AVX_Ceiling: + { + FALLTHROUGH; + } + + case NI_SSE41_RoundToPositiveInfinity: + case NI_SSE41_RoundToPositiveInfinityScalar: + case NI_AVX_RoundToPositiveInfinity: + { + assert(varTypeIsFloating(simdBaseType)); + return static_cast(FloatRoundingMode::ToPositiveInfinity); + } + + case NI_SSE41_Floor: + case NI_SSE41_FloorScalar: + case NI_AVX_Floor: + { + FALLTHROUGH; + } + + case NI_SSE41_RoundToNegativeInfinity: + case NI_SSE41_RoundToNegativeInfinityScalar: + case NI_AVX_RoundToNegativeInfinity: + { + assert(varTypeIsFloating(simdBaseType)); + return static_cast(FloatRoundingMode::ToNegativeInfinity); + } + + case NI_SSE41_RoundCurrentDirection: + case NI_SSE41_RoundCurrentDirectionScalar: + case NI_AVX_RoundCurrentDirection: + { + assert(varTypeIsFloating(simdBaseType)); + return static_cast(FloatRoundingMode::CurrentDirection); + } + + case NI_SSE41_RoundToNearestInteger: + case NI_SSE41_RoundToNearestIntegerScalar: + case NI_AVX_RoundToNearestInteger: + { + assert(varTypeIsFloating(simdBaseType)); + return static_cast(FloatRoundingMode::ToNearestInteger); + } + + case NI_SSE41_RoundToZero: + case NI_SSE41_RoundToZeroScalar: + case NI_AVX_RoundToZero: + { + assert(varTypeIsFloating(simdBaseType)); + return static_cast(FloatRoundingMode::ToZero); + } + + default: + { + break; + } + } + + return -1; +} + +//------------------------------------------------------------------------ // impNonConstFallback: convert certain SSE2/AVX2 shift intrinsic to its semantic alternative when the imm-arg is // not a compile-time constant // @@ -2871,10 +3167,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, // These intrinsics are "special import" because the non-AVX path isn't directly // hardware supported. Instead, they start with "swapped operands" and we fix that here. 
- FloatComparisonMode comparison = - static_cast(HWIntrinsicInfo::lookupIval(intrinsic, true)); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, gtNewIconNode(static_cast(comparison)), - NI_AVX_CompareScalar, simdBaseJitType, simdSize); + int ival = HWIntrinsicInfo::lookupIval(this, intrinsic, simdBaseType); + retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, gtNewIconNode(ival), NI_AVX_CompareScalar, + simdBaseJitType, simdSize); } else { @@ -2931,10 +3226,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, // These intrinsics are "special import" because the non-AVX path isn't directly // hardware supported. Instead, they start with "swapped operands" and we fix that here. - FloatComparisonMode comparison = - static_cast(HWIntrinsicInfo::lookupIval(intrinsic, true)); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, gtNewIconNode(static_cast(comparison)), - NI_AVX_CompareScalar, simdBaseJitType, simdSize); + int ival = HWIntrinsicInfo::lookupIval(this, intrinsic, simdBaseType); + retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, gtNewIconNode(ival), NI_AVX_CompareScalar, + simdBaseJitType, simdSize); } else { diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index cb03f30..bbc8089 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -146,8 +146,7 @@ enum insFlags : uint64_t // Avx INS_Flags_IsDstDstSrcAVXInstruction = 1ULL << 26, INS_Flags_IsDstSrcSrcAVXInstruction = 1ULL << 27, - INS_Flags_IsMskSrcSrcEvexInstruction = 1ULL << 28, - INS_Flags_Is3OperandInstructionMask = (INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_IsDstSrcSrcAVXInstruction | INS_Flags_IsMskSrcSrcEvexInstruction), + INS_Flags_Is3OperandInstructionMask = (INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_IsDstSrcSrcAVXInstruction), // w and s bits INS_FLAGS_Has_Wbit = 1ULL << 29, diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 4cb9cc2..161df44 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -608,19 +608,28 @@ INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // AVX512F -INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) -INST3(kmovw_msk, "kmovw", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) -INST3(kortestw, "kortestw", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) +INST3(kandw, "kandw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x41), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical AND masks +INST3(kandnw, "kandnw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x42), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical AND NOT masks +INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers +INST3(kmovw_msk, "kmovw", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers +INST3(knotw, "knotw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x44), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // NOT mask register +INST3(korw, "korw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x45), INS_TT_NONE, REX_W0 | 
Encoding_VEX | KInstruction) // Bitwise logical OR masks +INST3(kortestw, "kortestw", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // OR masks and set flags +INST3(kshiftlw, "kshiftlw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x32), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Shift left mask registers +INST3(kshiftrw, "kshiftrw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x30), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Shift right mask registers +INST3(kunpckbw, "kunpckbw", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x4B), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Unpack for mask registers +INST3(kxnorw, "kxnorw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x46), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical XNOR masks +INST3(kxorw, "kxorw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x47), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical XOR masks INST3(valignd, "alignd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x03), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Align doubleword vectors INST3(valignq, "alignq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x03), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Align quadword vectors INST3(vbroadcastf64x2, "broadcastf64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1A), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast packed float values read from memory to entire register INST3(vbroadcasti64x2, "broadcasti64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x5A), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast packed integer values read from memory to entire register INST3(vbroadcastf64x4, "broadcastf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1B), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast packed float values read from memory to entire register INST3(vbroadcasti64x4, "broadcasti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x5B), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast packed integer values read from memory to entire register -INST3(vcmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // compare packed singles -INST3(vcmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // compare scalar singles -INST3(vcmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // compare packed doubles -INST3(vcmpsd, "cmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // compare scalar doubles +INST3(vcmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed singles +INST3(vcmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar singles +INST3(vcmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed doubles +INST3(vcmpsd, "cmpsd", IUM_WR, BAD_CODE, 
BAD_CODE, SSEDBL(0xC2), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar doubles INST3(vcvtpd2udq, "cvtpd2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x79), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt packed doubles to unsigned DWORDs INST3(vcvtps2udq, "cvtps2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x79), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed singles to unsigned DWORDs INST3(vcvtsd2usi, "cvtsd2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x79), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_WX | Encoding_EVEX) // cvt scalar double to unsigned DWORD/QWORD @@ -658,10 +667,10 @@ INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_ INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs INST3(vpbroadcastd_gpr, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7C), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast int32 value from gpr to entire register INST3(vpbroadcastq_gpr, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7C), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast int64 value from gpr to entire register -INST3(vpcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 32-bit integers for equality -INST3(vpcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 32-bit signed integers for greater than -INST3(vpcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 64-bit integers for equality -INST3(vpcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 64-bit integers for equality +INST3(vpcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit integers for equality +INST3(vpcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit signed integers for greater than +INST3(vpcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality +INST3(vpcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality INST3(vpermq_reg, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x36), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute 64-bit of input register INST3(vpermpd_reg, "permpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x16), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute 64-bit of input register INST3(vpermi2d, "permi2d", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x76), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Full Permute From Two Tables 
Overwriting the Index @@ -735,25 +744,47 @@ INST3(vshufi32x4, "shufi32x4", IUM_WR, BAD_CODE, BAD_ INST3(vshufi64x2, "shufi64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x43), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shuffle packed values at 128-bit granularity // AVX512BW -INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) -INST3(kmovd_msk, "kmovd", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) -INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) -INST3(kmovq_msk, "kmovq", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) -INST3(kortestd, "kortestd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) -INST3(kortestq, "kortestq", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) +INST3(kaddd, "kaddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x4A), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Add two masks +INST3(kaddq, "kaddq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x4A), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Add two masks +INST3(kandd, "kandd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x41), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical AND masks +INST3(kandq, "kandq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x41), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical AND masks +INST3(kandnd, "kandnd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x42), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical AND NOT masks +INST3(kandnq, "kandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x42), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical AND NOT masks +INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers +INST3(kmovd_msk, "kmovd", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Move from and to mask registers +INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Move from and to mask registers +INST3(kmovq_msk, "kmovq", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Move from and to mask registers +INST3(knotd, "knotd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x44), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // NOT mask register +INST3(knotq, "knotq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x44), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // NOT mask register +INST3(kord, "kord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x45), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical OR masks +INST3(korq, "korq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x45), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical OR masks +INST3(kortestd, "kortestd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // OR masks and set flags +INST3(kortestq, "kortestq", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W1 | 
Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // OR masks and set flags +INST3(kshiftld, "kshiftld", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x33), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Shift left mask registers +INST3(kshiftlq, "kshiftlq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x33), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Shift left mask registers +INST3(kshiftrd, "kshiftrd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x31), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Shift right mask registers +INST3(kshiftrq, "kshiftrq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x31), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Shift right mask registers +INST3(ktestd, "ktestd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x99), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // Packed bit test masks and set flags +INST3(ktestq, "ktestq", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x99), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // Packed bit test masks and set flags +INST3(kunpckdq, "kunpckdq", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x4B), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Unpack for mask registers +INST3(kunpckwd, "kunpckwd", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x4B), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Unpack for mask registers +INST3(kxnord, "kxnord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x46), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical XNOR masks +INST3(kxnorq, "kxnorq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x46), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical XNOR masks +INST3(kxord, "kxord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x47), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical XOR masks +INST3(kxorq, "kxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x47), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical XOR masks INST3(vdbpsadbw, "dbpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Double block packed Sum-Absolute-Differences (SAD) on unsigned bytes INST3(vmovdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX) INST3(vmovdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX) INST3(vpbroadcastb_gpr, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7A), INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_EVEX) // Broadcast int8 value from gpr to entire register INST3(vpbroadcastw_gpr, "pbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7B), INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_EVEX) // Broadcast int16 value from gpr to entire register -INST3(vpcmpb, "pcmpb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) -INST3(vpcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 8-bit integers for equality -INST3(vpcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 16-bit integers for equality -INST3(vpcmpgtb, 
"pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 8-bit signed integers for greater than -INST3(vpcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 16-bit signed integers for greater than -INST3(vpcmpw, "pcmpw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) -INST3(vpcmpub, "pcmpub", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) -INST3(vpcmpuw, "pcmpuw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) +INST3(vpcmpb, "pcmpb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) +INST3(vpcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality +INST3(vpcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit integers for equality +INST3(vpcmpgtb, "pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit signed integers for greater than +INST3(vpcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit signed integers for greater than +INST3(vpcmpw, "pcmpw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) +INST3(vpcmpub, "pcmpub", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) +INST3(vpcmpuw, "pcmpuw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) INST3(vpermw, "permw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x8D), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute Packed Doublewords Elements INST3(vpermi2w, "permi2w", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x75), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Full Permute From Two Tables Overwriting the Index INST3(vpermt2w, "permt2w", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7D), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Full Permute From Two Tables Overwriting one Table @@ -775,9 +806,21 @@ INST3(vplzcntd, "plzcntd", IUM_WR, BAD_CODE, BAD_ INST3(vplzcntq, "plzcntq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x44), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // Count the number of leading zero bits for packed qword values // AVX512DQ -INST3(kortestb, "kortestb", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) -INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, 
PCKDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) -INST3(kmovb_msk, "kmovb", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) +INST3(kaddb, "kaddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x4A), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Add two masks +INST3(kaddw, "kaddw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x4A), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Add two masks +INST3(kandb, "kandb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x41), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical AND masks +INST3(kandnb, "kandnb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x42), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical AND NOT masks +INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers +INST3(kmovb_msk, "kmovb", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers +INST3(knotb, "knotb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x44), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // NOT mask register +INST3(korb, "korb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x45), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical OR masks +INST3(kortestb, "kortestb", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // OR masks and set flags +INST3(kshiftlb, "kshiftlb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x32), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Shift left mask registers +INST3(kshiftrb, "kshiftrb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x30), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Shift right mask registers +INST3(ktestb, "ktestb", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x99), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // Packed bit test masks and set flags +INST3(ktestw, "ktestw", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x99), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // Packed bit test masks and set flags +INST3(kxnorb, "kxnorb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x46), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical XNOR masks +INST3(kxorb, "kxorb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x47), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical XOR masks INST3(vbroadcastf32x2, "broadcastf32x2", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x19), INS_TT_TUPLE2, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast packed float values read from memory to entire register INST3(vbroadcasti32x2, "broadcasti32x2", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_TT_TUPLE2, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast packed integer values read from memory to entire register INST3(vbroadcastf32x8, "broadcastf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1B), INS_TT_TUPLE8, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast packed float values read from memory to entire register diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index f49b024..2afbeb1 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -355,11 +355,11 @@ private: GenTree* LowerHWIntrinsic(GenTreeHWIntrinsic* node); void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, 
GenCondition condition); GenTree* LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp); - GenTree* LowerHWIntrinsicCmpOpWithKReg(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicDot(GenTreeHWIntrinsic* node); #if defined(TARGET_XARCH) void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node); + GenTree* LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 4a69c64..24a581a 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -894,12 +894,20 @@ void Lowering::LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIn } break; - case NI_AVX512F_KORTEST: case NI_SSE41_PTEST: case NI_AVX_PTEST: + { // If we need the Carry flag then we can't swap operands. canSwapOperands = (cc == nullptr) || cc->gtCondition.Is(GenCondition::EQ, GenCondition::NE); break; + } + + case NI_AVX512F_KORTEST: + { + // TODO-XARCH-AVX512 remove the KORTEST check when its promoted to 2 proper arguments + assert(HWIntrinsicInfo::lookupNumArgs(newIntrinsicId) == 1); + break; + } default: unreached(); @@ -1166,28 +1174,16 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Vector128_op_Equality: case NI_Vector256_op_Equality: + case NI_Vector512_op_Equality: { return LowerHWIntrinsicCmpOp(node, GT_EQ); } case NI_Vector128_op_Inequality: case NI_Vector256_op_Inequality: - { - return LowerHWIntrinsicCmpOp(node, GT_NE); - } - - case NI_Vector512_GreaterThanAll: - case NI_Vector512_GreaterThanAny: - case NI_Vector512_GreaterThanOrEqualAll: - case NI_Vector512_GreaterThanOrEqualAny: - case NI_Vector512_LessThanAll: - case NI_Vector512_LessThanAny: - case NI_Vector512_LessThanOrEqualAll: - case NI_Vector512_LessThanOrEqualAny: - case NI_Vector512_op_Equality: case NI_Vector512_op_Inequality: { - return LowerHWIntrinsicCmpOpWithKReg(node); + return LowerHWIntrinsicCmpOp(node, GT_NE); } case NI_Vector128_ToScalar: @@ -1614,6 +1610,32 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) LowerFusedMultiplyAdd(node); break; + case NI_AVX512F_CompareEqual: + case NI_AVX512F_CompareGreaterThan: + case NI_AVX512F_CompareGreaterThanOrEqual: + case NI_AVX512F_CompareLessThan: + case NI_AVX512F_CompareLessThanOrEqual: + case NI_AVX512F_CompareNotEqual: + case NI_AVX512F_VL_CompareGreaterThan: + case NI_AVX512F_VL_CompareGreaterThanOrEqual: + case NI_AVX512F_VL_CompareLessThan: + case NI_AVX512F_VL_CompareLessThanOrEqual: + case NI_AVX512F_VL_CompareNotEqual: + case NI_AVX512BW_CompareEqual: + case NI_AVX512BW_CompareGreaterThan: + case NI_AVX512BW_CompareGreaterThanOrEqual: + case NI_AVX512BW_CompareLessThan: + case NI_AVX512BW_CompareLessThanOrEqual: + case NI_AVX512BW_CompareNotEqual: + case NI_AVX512BW_VL_CompareGreaterThan: + case NI_AVX512BW_VL_CompareGreaterThanOrEqual: + case NI_AVX512BW_VL_CompareLessThan: + case NI_AVX512BW_VL_CompareLessThanOrEqual: + case NI_AVX512BW_VL_CompareNotEqual: + { + return LowerHWIntrinsicWithAvx512Mask(node); + } + default: break; } @@ -1638,7 +1660,8 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm var_types simdType = Compiler::getSIMDTypeForSize(simdSize); assert((intrinsicId == NI_Vector128_op_Equality) || (intrinsicId == NI_Vector128_op_Inequality) || - 
(intrinsicId == NI_Vector256_op_Equality) || (intrinsicId == NI_Vector256_op_Inequality)); + (intrinsicId == NI_Vector256_op_Equality) || (intrinsicId == NI_Vector256_op_Inequality) || + (intrinsicId == NI_Vector512_op_Equality) || (intrinsicId == NI_Vector512_op_Inequality)); assert(varTypeIsSIMD(simdType)); assert(varTypeIsArithmetic(simdBaseType)); @@ -1655,8 +1678,9 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm GenTree* op2 = node->Op(2); GenCondition cmpCnd = (cmpOp == GT_EQ) ? GenCondition::EQ : GenCondition::NE; - if (!varTypeIsFloating(simdBaseType) && op2->IsVectorZero() && - comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + if (!varTypeIsFloating(simdBaseType) && (simdSize != 64) && op2->IsVectorZero() && + comp->compOpportunisticallyDependsOn(InstructionSet_SSE41) && + !op1->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector)) { // On SSE4.1 or higher we can optimize comparisons against zero to // just use PTEST. We can't support it for floating-point, however, @@ -1681,14 +1705,269 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm } else { + assert(simdSize == 16); + // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed? node->ChangeHWIntrinsicId(NI_SSE41_TestZ); LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd); } - return node->gtNext; + return LowerNode(node); } + // TODO-XARCH-AVX512: We should handle TYP_SIMD12 here under the EVEX path, but doing + // so will require us to account for the unused 4th element. + + if ((simdType != TYP_SIMD12) && comp->IsBaselineVector512IsaSupported()) + { + // The EVEX encoded versions of the comparison instructions all return a kmask + // + // For the comparisons against zero that we normally optimize to use `PTEST` we + // have to make a decision to use EVEX and emit 2 instructions (vpcmp + kortest) + // or to continue emitting PTEST and hope that the register allocator isn't limited + // by it not supporting the extended register set. + // + // Ideally we'd opt to not use PTEST when EVEX is available, This would be done so we can + // best take advantage of EVEX exclusive features such as embedded broadcast and the + // 16 additional registers. In many cases this allows for overall denser codegen where + // we are doing more in the same number of bytes, even though the individual instruction + // is 1-2 bytes larger. Even though there may be cases where continuing to use PTEST for select- + // 128/256-bit code paths would still be beneficial, the additional complexity required + // to detect and account for those differences is not likely to be worth the tradeoff. + // + // TODO-XARCH-AVX512: Given the above don't emit the PTEST path above when AVX-512 is available + // This will require exposing `NI_AVX512F_TestZ` so that we can keep codegen optimized to just + // `vptestm` followed by `kortest`. This will be one instruction more than just `vptest` but + // it has the advantages detailed above. + // + // For other comparisons, using EVEX allows us to avoid leaving the SIMD domain, avoids + // needing to use a general-purpose register, and allows us to generate less instructions. + + GenTree* nextNode = node->gtNext; + + NamedIntrinsic maskIntrinsicId = NI_AVX512F_CompareEqualMask; + uint32_t count = simdSize / genTypeSize(simdBaseType); + + // KORTEST does a bitwise or on the result and sets ZF if it is zero and CF if it is all + // bits set. 
Because of this, when we have at least 8 elements to compare we can use a + // normal comparison alongside CF. + // + // That is, if the user wants `x == y`, we can keep it as `mask = (x == y)` and then emit + // `kortest mask, mask` and check `CF == 1`. This will be true if all elements matched and + // false otherwise. Things work out nicely and we keep readable disasm. + // + // Likewise, if the user wants `x != y`, we can keep it as `mask = (x != y)` and then emit + // `kortest mask, mask` and check `ZF != 0`. This will be true if any elements mismatched. + // + // However, if we have less than 8 elements then we have to change it up since we have less + // than 8 bits in the output mask and unused bits will be set to 0. This occurs for 32-bit + // for Vector128 and and 64-bit elements when using either Vector128 or Vector256. + // + // To account for this, we will invert the comparison being done. So if the user wants + // `x == y`, we will instead emit `mask = (x != y)`, we will still emit `kortest mask, mask`, + // but we will then check for `ZF == 0`. This works since that equates to all elements being equal + // + // Likewise for `x != y` we will instead emit `mask = (x == y)`, then `kortest mask, mask`, + // and will then check for `CF == 0` which equates to one or more elements not being equal + + // The scenarios we have to for a full mask are: + // * No matches: 0000_0000 - ZF == 1, CF == 0 + // * Partial matches: 0000_1111 - ZF == 0, CF == 0 + // * All matches: 1111_1111 - ZF == 0, CF == 1 + // + // The scenarios we have to for a partial mask are: + // * No matches: 0000_0000 - ZF == 1, CF == 0 + // * Partial matches: 0000_0011 - ZF == 0, CF == 0 + // * All matches: 0000_1111 - ZF == 0, CF == 0 + // + // When we have less than a full mask worth of elements, we need to account for the upper + // bits being implicitly zero. To do that, we may need to invert the comparison. + // + // By inverting the comparison we'll get: + // * All matches: 0000_0000 - ZF == 1, CF == 0 + // * Partial matches: 0000_0011 - ZF == 0, CF == 0 + // * No matches: 0000_1111 - ZF == 0, CF == 0 + // + // This works since the upper bits are implicitly zero and so by inverting matches also become + // zero, which in turn means that `AllBitsSet` will become `Zero` and other cases become non-zero + + if (op1->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector) && op2->IsCnsVec()) + { + // We want to specially handle the common cases of `mask op Zero` and `mask op AllBitsSet` + // + // These get created for the various `gtNewSimdCmpOpAnyNode` and `gtNewSimdCmpOpAllNode` + // scenarios and we want to ensure they still get "optimal" codegen. To handle that, we + // simply consume the mask directly and preserve the intended comparison by tweaking the + // compare condition passed down into `KORTEST` + + GenTreeHWIntrinsic* maskNode = op1->AsHWIntrinsic()->Op(1)->AsHWIntrinsic(); + assert(maskNode->TypeIs(TYP_MASK)); + + bool isHandled = false; + GenTreeVecCon* vecCon = op2->AsVecCon(); + + if (vecCon->IsZero()) + { + // We have `mask == Zero` which is the same as checking that nothing in the mask + // is set. This scenario can be handled by `kortest` and then checking that `ZF == 1` + // + // -or- + // + // We have `mask != Zero` which is the same as checking that something in the mask + // is set. This scenario can be handled by `kortest` and then checking that `ZF == 0` + // + // Since this is the default state for `CompareEqualMask` + `GT_EQ`/`GT_NE`, there is nothing + // for us to change. 
This also applies to cases where we have less than a full mask of + // elements since the upper mask bits are implicitly zero. + + isHandled = true; + } + else if (vecCon->IsAllBitsSet()) + { + // We have `mask == AllBitsSet` which is the same as checking that everything in the mask + // is set. This scenario can be handled by `kortest` and then checking that `CF == 1` for + // a full mask and checking `ZF == 1` for a partial mask using an inverted comparison + // + // -or- + // + // We have `mask != AllBitsSet` which is the same as checking that something in the mask + // is set. This scenario can be handled by `kortest` and then checking that `CF == 0` for + // a full mask and checking `ZF != 0` for a partial mask using an inverted comparison + + if (count < 8) + { + assert((count == 1) || (count == 2) || (count == 4)); + + switch (maskNode->GetHWIntrinsicId()) + { + case NI_AVX512F_CompareEqualMask: + { + maskIntrinsicId = NI_AVX512F_CompareNotEqualMask; + break; + } + + case NI_AVX512F_CompareGreaterThanMask: + { + maskIntrinsicId = NI_AVX512F_CompareLessThanOrEqualMask; + break; + } + + case NI_AVX512F_CompareGreaterThanOrEqualMask: + { + maskIntrinsicId = NI_AVX512F_CompareLessThanMask; + break; + } + + case NI_AVX512F_CompareLessThanMask: + { + maskIntrinsicId = NI_AVX512F_CompareGreaterThanOrEqualMask; + break; + } + + case NI_AVX512F_CompareLessThanOrEqualMask: + { + maskIntrinsicId = NI_AVX512F_CompareGreaterThanMask; + break; + } + + case NI_AVX512F_CompareNotEqualMask: + { + maskIntrinsicId = NI_AVX512F_CompareEqualMask; + break; + } + + default: + { + unreached(); + } + } + + maskNode->ChangeHWIntrinsicId(maskIntrinsicId); + } + else if (cmpOp == GT_EQ) + { + cmpCnd = GenCondition::C; + } + else + { + cmpCnd = GenCondition::NC; + } + isHandled = true; + } + + if (isHandled) + { + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) + { + use.ReplaceWith(maskNode); + } + else + { + maskNode->SetUnusedValue(); + } + + BlockRange().Remove(op2); + BlockRange().Remove(op1); + BlockRange().Remove(node); + + node = maskNode; + } + } + + if (node->gtType != TYP_MASK) + { + // We have `x == y` or `x != y` both of which where we want to find `AllBitsSet` in the mask since + // we can directly do the relevant comparison. Given the above tables then when we have a full mask + // we can simply check against `CF == 1` for `op_Equality` and `ZF == 0` for `op_Inequality`. + // + // For a partial mask, we need to invert the `op_Equality` comparisons which means that we now need + // to check for `ZF == 1` (we're looking for `AllBitsSet`, that is `all match`). 
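// A small standalone simulation of the flag behaviour relied on here (illustrative
// only; per the comments above, kortest sets ZF when the OR of its operands is zero
// and CF when it is all ones for the tested width):
#include <cassert>
#include <cstdint>

struct KortestFlags
{
    bool zf; // the combined mask is all zeros
    bool cf; // the combined mask is all ones
};

static KortestFlags Kortestb(uint8_t mask)
{
    // kortest mask, mask: OR-ing a mask with itself just inspects the mask.
    return {mask == 0x00, mask == 0xFF};
}

int main()
{
    // Example: a Vector256<double> compare has 4 defined mask bits; bits 4-7 stay zero.
    const uint8_t allEqual = 0x0F; // every element compared equal

    // Direct CompareEqualMask: CF can never be set because the upper bits are zero,
    // so "all elements equal" is not observable through CF.
    assert(!Kortestb(allEqual).cf);

    // Inverted CompareNotEqualMask: "all elements equal" becomes an all-zero mask,
    // which kortest reports as ZF == 1.
    const uint8_t inverted = static_cast<uint8_t>(~allEqual) & 0x0F;
    assert(Kortestb(inverted).zf);

    return 0;
}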
For `op_Inequality` + // we can keep things as is since we're looking for `any match` and just want to check `ZF == 0` + + if (count < 8) + { + assert((count == 1) || (count == 2) || (count == 4)); + maskIntrinsicId = NI_AVX512F_CompareNotEqualMask; + } + else + { + assert((count == 8) || (count == 16) || (count == 32) || (count == 64)); + + if (cmpOp == GT_EQ) + { + cmpCnd = GenCondition::C; + } + else + { + maskIntrinsicId = NI_AVX512F_CompareNotEqualMask; + } + } + + node->gtType = TYP_MASK; + node->ChangeHWIntrinsicId(maskIntrinsicId); + + LowerNode(node); + } + + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) + { + GenTreeHWIntrinsic* cc; + + cc = comp->gtNewSimdHWIntrinsicNode(simdType, node, NI_AVX512F_KORTEST, simdBaseJitType, simdSize); + BlockRange().InsertBefore(nextNode, cc); + + use.ReplaceWith(cc); + LowerHWIntrinsicCC(cc, NI_AVX512F_KORTEST, cmpCnd); + + nextNode = cc->gtNext; + } + return nextNode; + } + + assert(simdSize != 64); + NamedIntrinsic cmpIntrinsic; CorInfoType cmpJitType; NamedIntrinsic mskIntrinsic; @@ -1728,11 +2007,11 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm case TYP_ULONG: { mskJitType = CORINFO_TYPE_UBYTE; + cmpJitType = simdBaseJitType; if (simdSize == 32) { cmpIntrinsic = NI_AVX2_CompareEqual; - cmpJitType = simdBaseJitType; mskIntrinsic = NI_AVX2_MoveMask; mskConstant = -1; } @@ -1743,7 +2022,6 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) { cmpIntrinsic = NI_SSE41_CompareEqual; - cmpJitType = simdBaseJitType; } else { @@ -1856,80 +2134,6 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm } //---------------------------------------------------------------------------------------------- -// Lowering::LowerHWIntrinsicCmpOpWithKReg: Lowers a Vector512 comparison intrinsic -// -// Arguments: -// node - The hardware intrinsic node. 
-// -GenTree* Lowering::LowerHWIntrinsicCmpOpWithKReg(GenTreeHWIntrinsic* node) -{ - NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); - CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); - var_types simdBaseType = node->GetSimdBaseType(); - unsigned simdSize = node->GetSimdSize(); - var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - - assert((intrinsicId == NI_Vector512_GreaterThanAll) || (intrinsicId == NI_Vector512_GreaterThanOrEqualAll) || - (intrinsicId == NI_Vector512_LessThanAll) || (intrinsicId == NI_Vector512_LessThanOrEqualAll) || - (intrinsicId == NI_Vector512_op_Equality) || (intrinsicId == NI_Vector512_op_Inequality)); - - assert(varTypeIsSIMD(simdType)); - assert(varTypeIsArithmetic(simdBaseType)); - assert(simdSize == 64); - assert(node->gtType == TYP_BOOL); - - NamedIntrinsic newIntrinsicId = NI_Illegal; - switch (intrinsicId) - { - case NI_Vector512_GreaterThanAll: - { - newIntrinsicId = NI_AVX512F_CompareGreaterThanSpecial; - break; - } - case NI_Vector512_GreaterThanOrEqualAll: - { - newIntrinsicId = NI_AVX512F_CompareGreaterThanOrEqualSpecial; - break; - } - case NI_Vector512_LessThanAll: - { - newIntrinsicId = NI_AVX512F_CompareLessThanSpecial; - break; - } - case NI_Vector512_LessThanOrEqualAll: - { - newIntrinsicId = NI_AVX512F_CompareLessThanOrEqualSpecial; - break; - } - case NI_Vector512_op_Equality: - case NI_Vector512_op_Inequality: - { - newIntrinsicId = NI_AVX512F_CompareEqualSpecial; - break; - } - - default: - { - assert(false); - break; - } - } - - GenTree* op1 = node->Op(1); - GenTree* op2 = node->Op(2); - - GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, newIntrinsicId, simdBaseJitType, simdSize); - BlockRange().InsertBefore(node, cmp); - LowerNode(cmp); - - node->ResetHWIntrinsicId(NI_AVX512F_KORTEST, cmp); - GenCondition cmpCnd = (intrinsicId != NI_Vector512_op_Inequality) ? GenCondition::C : GenCondition::NC; - LowerHWIntrinsicCC(node, NI_AVX512F_KORTEST, cmpCnd); - - return node->gtNext; -} - -//---------------------------------------------------------------------------------------------- // Lowering::LowerHWIntrinsicCndSel: Lowers a Vector128 or Vector256 Conditional Select call // // Arguments: @@ -4826,6 +5030,125 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) } //---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicWithAvx512Mask: Lowers a HWIntrinsic node that utilizes the AVX512 KMASK registers +// +// Arguments: +// node - The hardware intrinsic node. 
+// +GenTree* Lowering::LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); + CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(simdBaseType)); + assert(simdSize != 0); + + NamedIntrinsic maskIntrinsicId; + + switch (intrinsicId) + { + case NI_AVX512F_CompareEqual: + case NI_AVX512BW_CompareEqual: + { + maskIntrinsicId = NI_AVX512F_CompareEqualMask; + break; + } + + case NI_AVX512F_VL_CompareGreaterThan: + case NI_AVX512BW_VL_CompareGreaterThan: + { + assert(varTypeIsUnsigned(simdBaseType)); + FALLTHROUGH; + } + + case NI_AVX512F_CompareGreaterThan: + case NI_AVX512BW_CompareGreaterThan: + { + maskIntrinsicId = NI_AVX512F_CompareGreaterThanMask; + break; + } + + case NI_AVX512F_VL_CompareGreaterThanOrEqual: + case NI_AVX512BW_VL_CompareGreaterThanOrEqual: + { + assert(!varTypeIsFloating(simdBaseType)); + FALLTHROUGH; + } + + case NI_AVX512F_CompareGreaterThanOrEqual: + case NI_AVX512BW_CompareGreaterThanOrEqual: + { + maskIntrinsicId = NI_AVX512F_CompareGreaterThanOrEqualMask; + break; + } + + case NI_AVX512F_VL_CompareLessThan: + case NI_AVX512BW_VL_CompareLessThan: + { + assert(varTypeIsUnsigned(simdBaseType)); + FALLTHROUGH; + } + + case NI_AVX512F_CompareLessThan: + case NI_AVX512BW_CompareLessThan: + { + maskIntrinsicId = NI_AVX512F_CompareLessThanMask; + break; + } + + case NI_AVX512F_VL_CompareLessThanOrEqual: + case NI_AVX512BW_VL_CompareLessThanOrEqual: + { + assert(!varTypeIsFloating(simdBaseType)); + FALLTHROUGH; + } + + case NI_AVX512F_CompareLessThanOrEqual: + case NI_AVX512BW_CompareLessThanOrEqual: + { + maskIntrinsicId = NI_AVX512F_CompareLessThanOrEqualMask; + break; + } + + case NI_AVX512F_VL_CompareNotEqual: + case NI_AVX512BW_VL_CompareNotEqual: + { + assert(!varTypeIsFloating(simdBaseType)); + FALLTHROUGH; + } + + case NI_AVX512F_CompareNotEqual: + case NI_AVX512BW_CompareNotEqual: + { + maskIntrinsicId = NI_AVX512F_CompareNotEqualMask; + break; + } + + default: + { + unreached(); + } + } + + node->gtType = TYP_MASK; + node->ChangeHWIntrinsicId(maskIntrinsicId); + + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) + { + GenTree* maskToVector = + comp->gtNewSimdHWIntrinsicNode(simdType, node, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + BlockRange().InsertAfter(node, maskToVector); + use.ReplaceWith(maskToVector); + } + return LowerNode(node); +} + +//---------------------------------------------------------------------------------------------- // Lowering::LowerHWIntrinsicToScalar: Lowers a Vector128 or Vector256 ToScalar call // // Arguments: