case INS_pcmpgtq:
case INS_psadbw:
case INS_vdbpsadbw:
+ case INS_vpcmpgtq:
case INS_vpermps:
case INS_vpermpd:
case INS_vpermpd_reg:
break;
}
- case INS_kmovb_msk:
- case INS_kmovw_msk:
- case INS_kmovd_msk:
- case INS_kmovq_msk:
case INS_kmovb_gpr:
case INS_kmovw_gpr:
case INS_kmovd_gpr:
break;
}
+ case INS_kmovb_msk:
+ case INS_kmovw_msk:
+ case INS_kmovd_msk:
+ case INS_kmovq_msk:
+ {
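+                // Moves between mask registers; modeled with a 1 cycle latency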
+ result.insLatency += PERFSCORE_LATENCY_1C;
+ result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+ break;
+ }
+
case INS_vpcmpb:
case INS_vpcmpw:
case INS_vpcmpd:
break;
}
+ case INS_kandb:
+ case INS_kandd:
+ case INS_kandq:
+ case INS_kandw:
+ case INS_kandnb:
+ case INS_kandnd:
+ case INS_kandnq:
+ case INS_kandnw:
+ case INS_knotb:
+ case INS_knotd:
+ case INS_knotq:
+ case INS_knotw:
+ case INS_korb:
+ case INS_kord:
+ case INS_korq:
+ case INS_korw:
+ case INS_kxnorb:
+ case INS_kxnord:
+ case INS_kxnorq:
+ case INS_kxnorw:
+ case INS_kxorb:
+ case INS_kxord:
+ case INS_kxorq:
+ case INS_kxorw:
+ {
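+            // Mask register logical operations (and, andn, not, or, xnor, xor)
+            // are modeled with a 1 cycle latency, like other simple bitwise ops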
+ result.insLatency += PERFSCORE_LATENCY_1C;
+ result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+ break;
+ }
+
case INS_kortestb:
- case INS_kortestw:
case INS_kortestd:
case INS_kortestq:
+ case INS_kortestw:
+ case INS_ktestb:
+ case INS_ktestd:
+ case INS_ktestq:
+ case INS_ktestw:
{
+                // Keep these in a separate group as there isn't a documented latency.
+                // However, similar instructions have a 1 cycle latency.
+
result.insLatency += PERFSCORE_LATENCY_1C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+
+ break;
+ }
+
+ case INS_kaddb:
+ case INS_kaddd:
+ case INS_kaddq:
+ case INS_kaddw:
+ case INS_kshiftlb:
+ case INS_kshiftld:
+ case INS_kshiftlq:
+ case INS_kshiftlw:
+ case INS_kshiftrb:
+ case INS_kshiftrd:
+ case INS_kshiftrq:
+ case INS_kshiftrw:
+ case INS_kunpckbw:
+ case INS_kunpckdq:
+ case INS_kunpckwd:
+ {
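+            // Mask register add, shift, and unpack operations are modeled with
+            // a higher, 4 cycle latency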
+ result.insLatency += PERFSCORE_LATENCY_4C;
+ result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}
#if defined(TARGET_XARCH)
case GT_EQ:
{
- if (simdSize == 32)
+ if (simdSize == 64)
+ {
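+                // Byte and short element compares require AVX512BW; the wider
+                // element types only need AVX512F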
+ assert(IsBaselineVector512IsaSupportedDebugOnly());
+
+ if (varTypeIsSmall(simdBaseType))
+ {
+ intrinsic = NI_AVX512BW_CompareEqual;
+ }
+ else
+ {
+ intrinsic = NI_AVX512F_CompareEqual;
+ }
+ }
+ else if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
intrinsic = NI_AVX2_CompareEqual;
}
}
- else if (simdSize == 64)
- {
- assert(IsBaselineVector512IsaSupportedDebugOnly());
- intrinsic = NI_AVX512F_CompareEqualSpecial;
- }
else if (simdBaseType == TYP_FLOAT)
{
+ assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
intrinsic = NI_SSE_CompareEqual;
}
else if (varTypeIsLong(simdBaseType))
{
+ assert(simdSize == 16);
+
if (compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
intrinsic = NI_SSE41_CompareEqual;
}
else
{
intrinsic = NI_SSE2_CompareEqual;
}
break;
case GT_GE:
{
+ if (IsBaselineVector512IsaSupported())
+ {
+ if (simdSize == 64)
+ {
+ if (varTypeIsSmall(simdBaseType))
+ {
+ intrinsic = NI_AVX512BW_CompareGreaterThanOrEqual;
+ }
+ else
+ {
+ intrinsic = NI_AVX512F_CompareGreaterThanOrEqual;
+ }
+ break;
+ }
+ else if (!varTypeIsFloating(simdBaseType))
+ {
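+                    // Integer GreaterThanOrEqual has no direct SSE/AVX encoding, so
+                    // use the EVEX compare-into-mask form for 16/32-byte vectors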
+ assert((simdSize == 16) || (simdSize == 32));
+
+ if (varTypeIsSmall(simdBaseType))
+ {
+ intrinsic = NI_AVX512BW_VL_CompareGreaterThanOrEqual;
+ }
+ else
+ {
+ intrinsic = NI_AVX512F_VL_CompareGreaterThanOrEqual;
+ }
+
+ break;
+ }
+ }
+
if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
intrinsic = NI_AVX_CompareGreaterThanOrEqual;
}
}
- else if (simdSize == 64)
- {
- assert(IsBaselineVector512IsaSupportedDebugOnly());
- intrinsic = NI_AVX512F_CompareGreaterThanOrEqualSpecial;
- }
else if (simdBaseType == TYP_FLOAT)
{
+ assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
intrinsic = NI_SSE_CompareGreaterThanOrEqual;
}
else if (simdBaseType == TYP_DOUBLE)
{
+ assert(simdSize == 16);
intrinsic = NI_SSE2_CompareGreaterThanOrEqual;
}
case GT_GT:
{
+ if (IsBaselineVector512IsaSupported())
+ {
+ if (simdSize == 64)
+ {
+ if (varTypeIsSmall(simdBaseType))
+ {
+ intrinsic = NI_AVX512BW_CompareGreaterThan;
+ }
+ else
+ {
+ intrinsic = NI_AVX512F_CompareGreaterThan;
+ }
+ break;
+ }
+ else if (varTypeIsUnsigned(simdBaseType))
+ {
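+                    // Unsigned GreaterThan has no direct SSE/AVX encoding, so
+                    // use the EVEX compare-into-mask form for 16/32-byte vectors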
+ assert((simdSize == 16) || (simdSize == 32));
+
+ if (varTypeIsSmall(simdBaseType))
+ {
+ intrinsic = NI_AVX512BW_VL_CompareGreaterThan;
+ }
+ else
+ {
+ intrinsic = NI_AVX512F_VL_CompareGreaterThan;
+ }
+
+ break;
+ }
+ }
+
if (varTypeIsUnsigned(simdBaseType))
{
// Vector of byte, ushort, uint and ulong:
intrinsic = NI_AVX2_CompareGreaterThan;
}
}
- else if (simdSize == 64)
- {
- assert(IsBaselineVector512IsaSupportedDebugOnly());
- intrinsic = NI_AVX512F_CompareGreaterThanSpecial;
- }
else if (simdBaseType == TYP_FLOAT)
{
+ assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
intrinsic = NI_SSE_CompareGreaterThan;
}
else if (varTypeIsLong(simdBaseType))
{
+ assert(simdSize == 16);
+
if (compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
intrinsic = NI_SSE42_CompareGreaterThan;
}
else
{
intrinsic = NI_SSE2_CompareGreaterThan;
}
break;
case GT_LE:
{
+ if (IsBaselineVector512IsaSupported())
+ {
+ if (simdSize == 64)
+ {
+ if (varTypeIsSmall(simdBaseType))
+ {
+ intrinsic = NI_AVX512BW_CompareLessThanOrEqual;
+ }
+ else
+ {
+ intrinsic = NI_AVX512F_CompareLessThanOrEqual;
+ }
+ break;
+ }
+ else if (!varTypeIsFloating(simdBaseType))
+ {
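+                    // Integer LessThanOrEqual likewise has no direct SSE/AVX encoding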
+ assert((simdSize == 16) || (simdSize == 32));
+
+ if (varTypeIsSmall(simdBaseType))
+ {
+ intrinsic = NI_AVX512BW_VL_CompareLessThanOrEqual;
+ }
+ else
+ {
+ intrinsic = NI_AVX512F_VL_CompareLessThanOrEqual;
+ }
+
+ break;
+ }
+ }
+
if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
intrinsic = NI_AVX_CompareLessThanOrEqual;
}
}
- else if (simdSize == 64)
- {
- assert(IsBaselineVector512IsaSupportedDebugOnly());
- intrinsic = NI_AVX512F_CompareLessThanOrEqualSpecial;
- }
else if (simdBaseType == TYP_FLOAT)
{
+ assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
intrinsic = NI_SSE_CompareLessThanOrEqual;
}
else if (simdBaseType == TYP_DOUBLE)
{
+ assert(simdSize == 16);
intrinsic = NI_SSE2_CompareLessThanOrEqual;
}
case GT_LT:
{
+ if (IsBaselineVector512IsaSupported())
+ {
+ if (simdSize == 64)
+ {
+ if (varTypeIsSmall(simdBaseType))
+ {
+ intrinsic = NI_AVX512BW_CompareLessThan;
+ }
+ else
+ {
+ intrinsic = NI_AVX512F_CompareLessThan;
+ }
+ break;
+ }
+ else if (varTypeIsUnsigned(simdBaseType))
+ {
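+                    // Unsigned LessThan likewise has no direct SSE/AVX encoding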
+ assert((simdSize == 16) || (simdSize == 32));
+
+ if (varTypeIsSmall(simdBaseType))
+ {
+ intrinsic = NI_AVX512BW_VL_CompareLessThan;
+ }
+ else
+ {
+ intrinsic = NI_AVX512F_VL_CompareLessThan;
+ }
+
+ break;
+ }
+ }
+
if (varTypeIsUnsigned(simdBaseType))
{
// Vector of byte, ushort, uint and ulong:
intrinsic = NI_AVX2_CompareLessThan;
}
}
- else if (simdSize == 64)
- {
- assert(IsBaselineVector512IsaSupportedDebugOnly());
- intrinsic = NI_AVX512F_CompareLessThanSpecial;
- }
else if (simdBaseType == TYP_FLOAT)
{
+ assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
intrinsic = NI_SSE_CompareLessThan;
}
else if (varTypeIsLong(simdBaseType))
{
+ assert(simdSize == 16);
+
if (compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
intrinsic = NI_SSE42_CompareLessThan;
}
else
{
intrinsic = NI_SSE2_CompareLessThan;
}
break;
assert(intrinsic != NI_Illegal);
#if defined(TARGET_XARCH)
- if (simdSize != 64)
- {
- return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
- }
- else
- {
- GenTree* cmp = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, intrinsic, simdBaseJitType, simdSize);
- return gtNewSimdHWIntrinsicNode(type, cmp, NI_AVX512F_MoveMaskToVectorSpecial, simdBaseJitType, simdSize);
- }
+ return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
#else
return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
#endif
}
case GT_GE:
- {
- // We want to generate a comparison along the lines of
- // GT_XX(op1, op2).As<T, TInteger>() == Vector128<TInteger>.AllBitsSet
-
- if (simdSize == 32)
- {
- // TODO-XArch-CQ: It's a non-trivial amount of work to support these
- // for floating-point while only utilizing AVX. It would require, among
- // other things, inverting the comparison and potentially support for a
- // new Avx.TestNotZ intrinsic to ensure the codegen remains efficient.
- assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
- intrinsic = NI_Vector256_op_Equality;
- }
- else if (simdSize == 64)
- {
- assert(IsBaselineVector512IsaSupportedDebugOnly());
- intrinsic = NI_Vector512_GreaterThanOrEqualAll;
- break;
- }
- else
- {
- intrinsic = NI_Vector128_op_Equality;
- }
-
- op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize);
- op2 = gtNewAllBitsSetConNode(simdType);
-
- if (simdBaseType == TYP_FLOAT)
- {
- simdBaseType = TYP_INT;
- simdBaseJitType = CORINFO_TYPE_INT;
- }
- else if (simdBaseType == TYP_DOUBLE)
- {
- simdBaseType = TYP_LONG;
- simdBaseJitType = CORINFO_TYPE_LONG;
- }
- break;
- }
case GT_GT:
- {
- // We want to generate a comparison along the lines of
- // GT_XX(op1, op2).As<T, TInteger>() == Vector128<TInteger>.AllBitsSet
-
- if (simdSize == 32)
- {
- // TODO-XArch-CQ: It's a non-trivial amount of work to support these
- // for floating-point while only utilizing AVX. It would require, among
- // other things, inverting the comparison and potentially support for a
- // new Avx.TestNotZ intrinsic to ensure the codegen remains efficient.
- assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
- intrinsic = NI_Vector256_op_Equality;
- }
- else if (simdSize == 64)
- {
- assert(IsBaselineVector512IsaSupportedDebugOnly());
- intrinsic = NI_Vector512_GreaterThanAll;
- break;
- }
- else
- {
- intrinsic = NI_Vector128_op_Equality;
- }
-
- op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize);
- op2 = gtNewAllBitsSetConNode(simdType);
-
- if (simdBaseType == TYP_FLOAT)
- {
- simdBaseType = TYP_INT;
- simdBaseJitType = CORINFO_TYPE_INT;
- }
- else if (simdBaseType == TYP_DOUBLE)
- {
- simdBaseType = TYP_LONG;
- simdBaseJitType = CORINFO_TYPE_LONG;
- }
- break;
- }
case GT_LE:
- {
- // We want to generate a comparison along the lines of
- // GT_XX(op1, op2).As<T, TInteger>() == Vector128<TInteger>.AllBitsSet
-
- if (simdSize == 32)
- {
- // TODO-XArch-CQ: It's a non-trivial amount of work to support these
- // for floating-point while only utilizing AVX. It would require, among
- // other things, inverting the comparison and potentially support for a
- // new Avx.TestNotZ intrinsic to ensure the codegen remains efficient.
- assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
- intrinsic = NI_Vector256_op_Equality;
- }
- else if (simdSize == 64)
- {
- assert(IsBaselineVector512IsaSupportedDebugOnly());
- intrinsic = NI_Vector512_LessThanOrEqualAll;
- break;
- }
- else
- {
- intrinsic = NI_Vector128_op_Equality;
- }
-
- op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize);
- op2 = gtNewAllBitsSetConNode(simdType);
-
- if (simdBaseType == TYP_FLOAT)
- {
- simdBaseType = TYP_INT;
- simdBaseJitType = CORINFO_TYPE_INT;
- }
- else if (simdBaseType == TYP_DOUBLE)
- {
- simdBaseType = TYP_LONG;
- simdBaseJitType = CORINFO_TYPE_LONG;
- }
- break;
- }
case GT_LT:
{
// We want to generate a comparison along the lines of
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
- intrinsic = NI_Vector512_LessThanAll;
- break;
+ intrinsic = NI_Vector512_op_Equality;
}
else
{
return gtNewSimdTernaryLogicNode(type, op1, op2, op3, control, simdBaseJitType, simdSize);
}
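+    // TYP_SIMD64 requires AVX512, so it should have been handled by the
+    // ternary logic path above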
+ assert(simdSize != 64);
+
if (simdSize == 32)
{
intrinsic = NI_Vector256_ConditionalSelect;
}
- else if (simdSize == 64)
- {
- intrinsic = NI_Vector512_ConditionalSelect;
- }
else
{
intrinsic = NI_Vector128_ConditionalSelect;
#if defined(TARGET_XARCH)
assert(!varTypeIsByte(simdBaseType) && !varTypeIsLong(simdBaseType));
+ assert(simdSize != 64);
if (simdSize == 32)
{
#if defined(TARGET_XARCH)
assert(!varTypeIsByte(simdBaseType) && !varTypeIsLong(simdBaseType));
+ assert(simdSize != 64);
// HorizontalAdd combines pairs so we need log2(vectorLength) passes to sum all elements together.
unsigned vectorLength = getSIMDVectorLength(simdSize, simdBaseType);
}
//------------------------------------------------------------------------
+// OperIsHWIntrinsic: Is this node a hwintrinsic with the specified id?
+//
+// Arguments:
+//    intrinsicId -- the id to compare with the current node
+//
+// Return Value:
+//    true if the node is a hwintrinsic with the specified id; otherwise, false
+//
+bool GenTree::OperIsHWIntrinsic(NamedIntrinsic intrinsicId) const
+{
+ if (OperIsHWIntrinsic())
+ {
+ return AsHWIntrinsic()->GetHWIntrinsicId() == intrinsicId;
+ }
+ return false;
+}
+
+//------------------------------------------------------------------------
// OperIsMemoryLoad: Does this HWI node have memory load semantics?
//
// Arguments:
return OperIsHWIntrinsic(gtOper);
}
+ bool OperIsHWIntrinsic(NamedIntrinsic intrinsicId) const;
+
// This is here for cleaner GT_LONG #ifdefs.
static bool OperIsLong(genTreeOps gtOper)
{
NoException = 0x08,
};
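+// IntComparisonMode encodes the 3-bit immediate used by the EVEX
+// vpcmp[u]{b,w,d,q} instructions to select the per-element integer
+// comparison; the aliases below name the inverted forms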
+enum class IntComparisonMode : uint8_t
+{
+ Equal = 0,
+ LessThan = 1,
+ LessThanOrEqual = 2,
+ False = 3,
+
+ NotEqual = 4,
+ GreaterThanOrEqual = 5,
+ GreaterThan = 6,
+ True = 7,
+
+ NotGreaterThanOrEqual = LessThan,
+ NotGreaterThan = LessThanOrEqual,
+
+ NotLessThan = GreaterThanOrEqual,
+ NotLessThanOrEqual = GreaterThan
+};
+
enum class TernaryLogicUseFlags : uint8_t
{
// Indicates no flags are present
}
#ifdef TARGET_XARCH
- static int lookupIval(NamedIntrinsic id, bool opportunisticallyDependsOnAVX)
- {
- switch (id)
- {
- case NI_SSE_CompareEqual:
- case NI_SSE_CompareScalarEqual:
- case NI_SSE2_CompareEqual:
- case NI_SSE2_CompareScalarEqual:
- case NI_AVX_CompareEqual:
- {
- return static_cast<int>(FloatComparisonMode::OrderedEqualNonSignaling);
- }
-
- case NI_SSE_CompareGreaterThan:
- case NI_SSE_CompareScalarGreaterThan:
- case NI_SSE2_CompareGreaterThan:
- case NI_SSE2_CompareScalarGreaterThan:
- case NI_AVX_CompareGreaterThan:
- {
- if (opportunisticallyDependsOnAVX)
- {
- return static_cast<int>(FloatComparisonMode::OrderedGreaterThanSignaling);
- }
-
- // CompareGreaterThan is not directly supported in hardware without AVX support.
- // We will return the inverted case here and lowering will itself swap the ops
- // to ensure the emitted code remains correct. This simplifies the overall logic
- // here and for other use cases.
-
- assert(id != NI_AVX_CompareGreaterThan);
- return static_cast<int>(FloatComparisonMode::OrderedLessThanSignaling);
- }
-
- case NI_SSE_CompareLessThan:
- case NI_SSE_CompareScalarLessThan:
- case NI_SSE2_CompareLessThan:
- case NI_SSE2_CompareScalarLessThan:
- case NI_AVX_CompareLessThan:
- {
- return static_cast<int>(FloatComparisonMode::OrderedLessThanSignaling);
- }
-
- case NI_SSE_CompareGreaterThanOrEqual:
- case NI_SSE_CompareScalarGreaterThanOrEqual:
- case NI_SSE2_CompareGreaterThanOrEqual:
- case NI_SSE2_CompareScalarGreaterThanOrEqual:
- case NI_AVX_CompareGreaterThanOrEqual:
- {
- if (opportunisticallyDependsOnAVX)
- {
- return static_cast<int>(FloatComparisonMode::OrderedGreaterThanOrEqualSignaling);
- }
-
- // CompareGreaterThanOrEqual is not directly supported in hardware without AVX support.
- // We will return the inverted case here and lowering will itself swap the ops
- // to ensure the emitted code remains correct. This simplifies the overall logic
- // here and for other use cases.
-
- assert(id != NI_AVX_CompareGreaterThanOrEqual);
- return static_cast<int>(FloatComparisonMode::OrderedLessThanOrEqualSignaling);
- }
-
- case NI_SSE_CompareLessThanOrEqual:
- case NI_SSE_CompareScalarLessThanOrEqual:
- case NI_SSE2_CompareLessThanOrEqual:
- case NI_SSE2_CompareScalarLessThanOrEqual:
- case NI_AVX_CompareLessThanOrEqual:
- {
- return static_cast<int>(FloatComparisonMode::OrderedLessThanOrEqualSignaling);
- }
-
- case NI_SSE_CompareNotEqual:
- case NI_SSE_CompareScalarNotEqual:
- case NI_SSE2_CompareNotEqual:
- case NI_SSE2_CompareScalarNotEqual:
- case NI_AVX_CompareNotEqual:
- {
- return static_cast<int>(FloatComparisonMode::UnorderedNotEqualNonSignaling);
- }
-
- case NI_SSE_CompareNotGreaterThan:
- case NI_SSE_CompareScalarNotGreaterThan:
- case NI_SSE2_CompareNotGreaterThan:
- case NI_SSE2_CompareScalarNotGreaterThan:
- case NI_AVX_CompareNotGreaterThan:
- {
- if (opportunisticallyDependsOnAVX)
- {
- return static_cast<int>(FloatComparisonMode::UnorderedNotGreaterThanSignaling);
- }
-
- // CompareNotGreaterThan is not directly supported in hardware without AVX support.
- // We will return the inverted case here and lowering will itself swap the ops
- // to ensure the emitted code remains correct. This simplifies the overall logic
- // here and for other use cases.
-
- assert(id != NI_AVX_CompareNotGreaterThan);
- return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanSignaling);
- }
-
- case NI_SSE_CompareNotLessThan:
- case NI_SSE_CompareScalarNotLessThan:
- case NI_SSE2_CompareNotLessThan:
- case NI_SSE2_CompareScalarNotLessThan:
- case NI_AVX_CompareNotLessThan:
- {
- return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanSignaling);
- }
-
- case NI_SSE_CompareNotGreaterThanOrEqual:
- case NI_SSE_CompareScalarNotGreaterThanOrEqual:
- case NI_SSE2_CompareNotGreaterThanOrEqual:
- case NI_SSE2_CompareScalarNotGreaterThanOrEqual:
- case NI_AVX_CompareNotGreaterThanOrEqual:
- {
- if (opportunisticallyDependsOnAVX)
- {
- return static_cast<int>(FloatComparisonMode::UnorderedNotGreaterThanOrEqualSignaling);
- }
-
- // CompareNotGreaterThanOrEqual is not directly supported in hardware without AVX support.
- // We will return the inverted case here and lowering will itself swap the ops
- // to ensure the emitted code remains correct. This simplifies the overall logic
- // here and for other use cases.
-
- assert(id != NI_AVX_CompareNotGreaterThanOrEqual);
- return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling);
- }
-
- case NI_SSE_CompareNotLessThanOrEqual:
- case NI_SSE_CompareScalarNotLessThanOrEqual:
- case NI_SSE2_CompareNotLessThanOrEqual:
- case NI_SSE2_CompareScalarNotLessThanOrEqual:
- case NI_AVX_CompareNotLessThanOrEqual:
- {
- return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling);
- }
-
- case NI_SSE_CompareOrdered:
- case NI_SSE_CompareScalarOrdered:
- case NI_SSE2_CompareOrdered:
- case NI_SSE2_CompareScalarOrdered:
- case NI_AVX_CompareOrdered:
- {
- return static_cast<int>(FloatComparisonMode::OrderedNonSignaling);
- }
-
- case NI_SSE_CompareUnordered:
- case NI_SSE_CompareScalarUnordered:
- case NI_SSE2_CompareUnordered:
- case NI_SSE2_CompareScalarUnordered:
- case NI_AVX_CompareUnordered:
- {
- return static_cast<int>(FloatComparisonMode::UnorderedNonSignaling);
- }
-
- case NI_SSE41_Ceiling:
- case NI_SSE41_CeilingScalar:
- case NI_AVX_Ceiling:
- {
- FALLTHROUGH;
- }
-
- case NI_SSE41_RoundToPositiveInfinity:
- case NI_SSE41_RoundToPositiveInfinityScalar:
- case NI_AVX_RoundToPositiveInfinity:
- {
- return static_cast<int>(FloatRoundingMode::ToPositiveInfinity);
- }
-
- case NI_SSE41_Floor:
- case NI_SSE41_FloorScalar:
- case NI_AVX_Floor:
- {
- FALLTHROUGH;
- }
-
- case NI_SSE41_RoundToNegativeInfinity:
- case NI_SSE41_RoundToNegativeInfinityScalar:
- case NI_AVX_RoundToNegativeInfinity:
- {
- return static_cast<int>(FloatRoundingMode::ToNegativeInfinity);
- }
-
- case NI_SSE41_RoundCurrentDirection:
- case NI_SSE41_RoundCurrentDirectionScalar:
- case NI_AVX_RoundCurrentDirection:
- {
- return static_cast<int>(FloatRoundingMode::CurrentDirection);
- }
-
- case NI_SSE41_RoundToNearestInteger:
- case NI_SSE41_RoundToNearestIntegerScalar:
- case NI_AVX_RoundToNearestInteger:
- {
- return static_cast<int>(FloatRoundingMode::ToNearestInteger);
- }
-
- case NI_SSE41_RoundToZero:
- case NI_SSE41_RoundToZeroScalar:
- case NI_AVX_RoundToZero:
- {
- return static_cast<int>(FloatRoundingMode::ToZero);
- }
-
- default:
- {
- return -1;
- }
- }
- }
+ static int lookupIval(Compiler* comp, NamedIntrinsic id, var_types simdBaseType);
#endif
static bool tryLookupSimdSize(NamedIntrinsic id, unsigned* pSimdSize)
// We need to validate that other phases of the compiler haven't introduced unsupported intrinsics
assert(compiler->compIsaSupportedDebugOnly(isa));
-
- int ival = HWIntrinsicInfo::lookupIval(intrinsicId, compiler->compOpportunisticallyDependsOn(InstructionSet_AVX));
-
assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));
if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
emitter* emit = GetEmitter();
assert(numArgs >= 0);
+
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
assert(ins != INS_invalid);
- emitAttr simdSize = emitActualTypeSize(Compiler::getSIMDTypeForSize(node->GetSimdSize()));
+ emitAttr simdSize = emitActualTypeSize(Compiler::getSIMDTypeForSize(node->GetSimdSize()));
assert(simdSize != 0);
+ int ival = HWIntrinsicInfo::lookupIval(compiler, intrinsicId, baseType);
+
switch (numArgs)
{
case 1:
genConsumeRegs(op1);
op1Reg = op1->GetRegNum();
- if ((ival != -1) && varTypeIsFloating(baseType))
+ if (ival != -1)
{
assert((ival >= 0) && (ival <= 127));
if (HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
op1Reg = targetReg;
}
- if ((ival != -1) && varTypeIsFloating(baseType))
+ if (ival != -1)
{
assert((ival >= 0) && (ival <= 127));
genHWIntrinsic_R_R_RM_I(node, ins, simdSize, static_cast<int8_t>(ival));
}
else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
{
- assert(ival == -1);
auto emitSwCase = [&](int8_t i) {
if (HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
{
genConsumeRegs(op3);
op3Reg = op3->GetRegNum();
+ assert(ival == -1);
+
if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
{
- assert(ival == -1);
-
auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, simdSize, i); };
if (op3->IsCnsIntOrI())
genConsumeRegs(op4);
op4Reg = op4->GetRegNum();
+ assert(ival == -1);
+
if (HWIntrinsicInfo::isImmOp(intrinsicId, op4))
{
- assert(ival == -1);
-
auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_R_RM_I(node, ins, simdSize, i); };
if (op4->IsCnsIntOrI())
case InstructionSet_AVX512F_X64:
case InstructionSet_AVX512BW:
case InstructionSet_AVX512BW_VL:
+ case InstructionSet_AVX512VBMI:
+ case InstructionSet_AVX512VBMI_VL:
genAvxFamilyIntrinsic(node);
break;
case InstructionSet_AES:
instruction maskIns;
instruction kmovIns;
+ emitAttr kmovAttr = EA_4BYTE;
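+    // Only the byte-element case below needs a full 8 byte kmovq; the other
+    // mask widths fit within a 4 byte move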
// TODO-XARCH-AVX512 note that this type/kmov combination assumes 512-bit vector types but would change
// if used for other vector lengths, i.e., TYP_BYTE requires kmovq for a 512-bit vector, but kmovd
{
case TYP_BYTE:
case TYP_UBYTE:
- maskIns = INS_vpmovb2m;
- kmovIns = INS_kmovq_gpr;
+ {
+ maskIns = INS_vpmovb2m;
+ kmovIns = INS_kmovq_gpr;
+ kmovAttr = EA_8BYTE;
break;
+ }
+
case TYP_SHORT:
case TYP_USHORT:
+ {
maskIns = INS_vpmovw2m;
kmovIns = INS_kmovd_gpr;
break;
+ }
+
case TYP_INT:
case TYP_UINT:
case TYP_FLOAT:
+ {
maskIns = INS_vpmovd2m;
kmovIns = INS_kmovw_gpr;
break;
+ }
+
case TYP_DOUBLE:
case TYP_LONG:
case TYP_ULONG:
+ {
maskIns = INS_vpmovq2m;
kmovIns = INS_kmovb_gpr;
break;
+ }
+
default:
+ {
unreached();
+ }
}
assert(emitter::isMaskReg(maskReg));
emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg);
- emit->emitIns_Mov(kmovIns, EA_8BYTE, targetReg, maskReg, INS_FLAGS_DONT_CARE);
- break;
- }
-
- case NI_AVX512F_CompareEqualSpecial:
- {
- GenTree* op2 = node->Op(2);
- op1Reg = op1->GetRegNum();
- regNumber op2Reg = op2->GetRegNum();
-
- instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareEqualSpecial, baseType);
-
- assert(compareIns != INS_invalid);
- assert(emitter::isMaskReg(targetReg));
-
- emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 0);
+ emit->emitIns_Mov(kmovIns, kmovAttr, targetReg, maskReg, INS_FLAGS_DONT_CARE);
break;
}
- case NI_AVX512F_CompareGreaterThanOrEqualSpecial:
- {
- GenTree* op2 = node->Op(2);
- op1Reg = op1->GetRegNum();
- regNumber op2Reg = op2->GetRegNum();
-
- instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareGreaterThanOrEqualSpecial, baseType);
-
- assert(compareIns != INS_invalid);
- assert(emitter::isMaskReg(targetReg));
-
- emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 5);
- break;
- }
-
- case NI_AVX512F_CompareGreaterThanSpecial:
- {
- GenTree* op2 = node->Op(2);
- op1Reg = op1->GetRegNum();
- regNumber op2Reg = op2->GetRegNum();
-
- instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareGreaterThanSpecial, baseType);
-
- assert(compareIns != INS_invalid);
- assert(emitter::isMaskReg(targetReg));
-
- emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 6);
- break;
- }
-
- case NI_AVX512F_CompareLessThanOrEqualSpecial:
- {
- GenTree* op2 = node->Op(2);
- op1Reg = op1->GetRegNum();
- regNumber op2Reg = op2->GetRegNum();
-
- instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareLessThanOrEqualSpecial, baseType);
-
- assert(compareIns != INS_invalid);
- assert(emitter::isMaskReg(targetReg));
-
- emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 2);
- break;
- }
-
- case NI_AVX512F_CompareLessThanSpecial:
- {
- GenTree* op2 = node->Op(2);
- op1Reg = op1->GetRegNum();
- regNumber op2Reg = op2->GetRegNum();
-
- instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareLessThanSpecial, baseType);
-
- assert(compareIns != INS_invalid);
- assert(emitter::isMaskReg(targetReg));
-
- emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 1);
- break;
- }
-
- case NI_AVX512F_MoveMaskToVectorSpecial:
+ case NI_AVX512F_KORTEST:
{
op1Reg = op1->GetRegNum();
- instruction maskMovIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_MoveMaskToVectorSpecial, baseType);
+ uint32_t simdSize = node->GetSimdSize();
+ uint32_t count = simdSize / genTypeSize(baseType);
- assert(maskMovIns != INS_invalid);
- assert(emitter::isMaskReg(op1Reg));
+ instruction testIns;
- emit->emitIns_R_R(maskMovIns, attr, targetReg, op1Reg);
- break;
- }
-
- case NI_AVX512F_KORTEST:
- {
- op1Reg = op1->GetRegNum();
-
- instruction testIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_KORTEST, baseType);
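+            // Pick the kortest width from the mask's element count: up to 8
+            // elements use kortestb, 16 use kortestw, 32 use kortestd, and
+            // 64 use kortestq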
+ if (count <= 8)
+ {
+ testIns = INS_kortestb;
+ }
+ else if (count == 16)
+ {
+ testIns = INS_kortestw;
+ }
+ else if (count == 32)
+ {
+ testIns = INS_kortestd;
+ }
+ else
+ {
+ assert(count == 64);
+ testIns = INS_kortestq;
+ }
assert(testIns != INS_invalid);
assert(emitter::isMaskReg(op1Reg));
HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX512F, CompareEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpeqd, INS_vpcmpeqd, INS_vpcmpeqq, INS_vpcmpeqq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F, CompareGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpgtd, INS_vpcmpud, INS_vpcmpgtq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F, CompareLessThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F, CompareNotEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Double, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Single, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtss2usi, INS_vcvtsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512F_VL, AlignRight64, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX512F_VL, Max, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaxsq, INS_vpmaxuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX512F_VL, Min, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpminsq, INS_vpminuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX512F_VL, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpud, INS_invalid, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F_VL, CompareGreaterThanOrEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F_VL, CompareLessThan, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F_VL, CompareLessThanOrEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F_VL, CompareNotEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128Byte, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdb, INS_vpmovdb, INS_vpmovqb, INS_vpmovqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128ByteWithSaturation, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovusdb, INS_invalid, INS_vpmovusqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128Double, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2pd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(AVX512BW, AlignRight, 64, 3, false, {INS_palignr, INS_palignr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX512BW, Average, 64, 2, true, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX512BW, BroadcastScalarToVector512, 64, 1, true, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
+HARDWARE_INTRINSIC(AVX512BW, CompareEqual, 64, 2, true, {INS_vpcmpeqb, INS_vpcmpeqb, INS_vpcmpeqw, INS_vpcmpeqw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW, CompareGreaterThan, 64, 2, true, {INS_vpcmpgtb, INS_vpcmpub, INS_vpcmpgtw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW, CompareGreaterThanOrEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW, CompareLessThan, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW, CompareLessThanOrEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW, CompareNotEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256Byte, 64, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256ByteWithSaturation, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_vpmovuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256SByte, 64, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// AVX512BW.VL Intrinsics
+HARDWARE_INTRINSIC(AVX512BW_VL, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_vpcmpub, INS_invalid, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW_VL, CompareGreaterThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW_VL, CompareLessThan, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW_VL, CompareLessThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW_VL, CompareNotEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(AVX512BW_VL, ConvertToVector128Byte, -1, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512BW_VL, ConvertToVector128ByteWithSaturation, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_vpmovuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512BW_VL, ConvertToVector128SByte, -1, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(SSE2, COMISD, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE2, UCOMISD, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, false, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, false, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, false, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX512F, KORTEST, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F, CompareEqualSpecial, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqualSpecial, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanSpecial, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqualSpecial, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F, CompareLessThanSpecial, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F, MoveMaskToVectorSpecial, 64, 1, true, {INS_vpmovm2b, INS_vpmovm2b, INS_vpmovm2w, INS_vpmovm2w, INS_vpmovm2d, INS_vpmovm2d, INS_vpmovm2q, INS_vpmovm2q, INS_vpmovm2d, INS_vpmovm2q}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F, KORTEST, 0, 1, false, {INS_kortestq, INS_kortestq, INS_kortestd, INS_kortestd, INS_kortestw, INS_kortestw, INS_kortestb, INS_kortestb, INS_kortestw, INS_kortestb}, HW_Category_Special, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX512F, CompareEqualMask, -1, 2, true, {INS_vpcmpeqb, INS_vpcmpeqb, INS_vpcmpeqw, INS_vpcmpeqw, INS_vpcmpeqd, INS_vpcmpeqd, INS_vpcmpeqq, INS_vpcmpeqq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanMask, -1, 2, true, {INS_vpcmpgtb, INS_vpcmpub, INS_vpcmpgtw, INS_vpcmpuw, INS_vpcmpgtd, INS_vpcmpud, INS_vpcmpgtq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqualMask, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F, CompareLessThanMask, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqualMask, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F, CompareNotEqualMask, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F, ConvertMaskToVector, -1, 1, true, {INS_vpmovm2b, INS_vpmovm2b, INS_vpmovm2w, INS_vpmovm2w, INS_vpmovm2d, INS_vpmovm2d, INS_vpmovm2q, INS_vpmovm2q, INS_vpmovm2d, INS_vpmovm2q}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment)
#endif // FEATURE_HW_INTRINSIC
}
//------------------------------------------------------------------------
+// lookupIval: Gets the implicit immediate value for the given intrinsic
+//
+// Arguments:
+// comp - The compiler
+// id - The intrinsic for which to get the ival
+// simdBaseType - The base type for the intrinsic
+//
+// Return Value:
+//    The immediate value for the given intrinsic, or -1 if none exists
+//
+int HWIntrinsicInfo::lookupIval(Compiler* comp, NamedIntrinsic id, var_types simdBaseType)
+{
+ switch (id)
+ {
+ case NI_SSE_CompareEqual:
+ case NI_SSE_CompareScalarEqual:
+ case NI_SSE2_CompareEqual:
+ case NI_SSE2_CompareScalarEqual:
+ case NI_AVX_CompareEqual:
+ case NI_AVX512F_CompareEqualMask:
+ {
+ if (varTypeIsFloating(simdBaseType))
+ {
+ return static_cast<int>(FloatComparisonMode::OrderedEqualNonSignaling);
+ }
+ else
+ {
+ // We can emit `vpcmpeqb`, `vpcmpeqw`, `vpcmpeqd`, or `vpcmpeqq`
+ }
+ break;
+ }
+
+ case NI_SSE_CompareGreaterThan:
+ case NI_SSE_CompareScalarGreaterThan:
+ case NI_SSE2_CompareGreaterThan:
+ case NI_SSE2_CompareScalarGreaterThan:
+ case NI_AVX_CompareGreaterThan:
+ case NI_AVX512F_CompareGreaterThanMask:
+ {
+ if (varTypeIsFloating(simdBaseType))
+ {
+ if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX))
+ {
+ return static_cast<int>(FloatComparisonMode::OrderedGreaterThanSignaling);
+ }
+
+ // CompareGreaterThan is not directly supported in hardware without AVX support.
+ // We will return the inverted case here and lowering will itself swap the ops
+ // to ensure the emitted code remains correct. This simplifies the overall logic
+ // here and for other use cases.
+
+ assert(id != NI_AVX_CompareGreaterThan);
+ return static_cast<int>(FloatComparisonMode::OrderedLessThanSignaling);
+ }
+ else if ((id == NI_AVX512F_CompareGreaterThanMask) && varTypeIsUnsigned(simdBaseType))
+ {
+ // TODO-XARCH-CQ: Allow the other integer paths to use the EVEX encoding
+ return static_cast<int>(IntComparisonMode::GreaterThan);
+ }
+ break;
+ }
+
+ case NI_SSE_CompareLessThan:
+ case NI_SSE_CompareScalarLessThan:
+ case NI_SSE2_CompareLessThan:
+ case NI_SSE2_CompareScalarLessThan:
+ case NI_AVX_CompareLessThan:
+ case NI_AVX512F_CompareLessThanMask:
+ {
+ if (varTypeIsFloating(simdBaseType))
+ {
+ return static_cast<int>(FloatComparisonMode::OrderedLessThanSignaling);
+ }
+ else if (id == NI_AVX512F_CompareLessThanMask)
+ {
+ // TODO-XARCH-CQ: Allow the other integer paths to use the EVEX encoding
+ return static_cast<int>(IntComparisonMode::LessThan);
+ }
+ break;
+ }
+
+ case NI_SSE_CompareGreaterThanOrEqual:
+ case NI_SSE_CompareScalarGreaterThanOrEqual:
+ case NI_SSE2_CompareGreaterThanOrEqual:
+ case NI_SSE2_CompareScalarGreaterThanOrEqual:
+ case NI_AVX_CompareGreaterThanOrEqual:
+ case NI_AVX512F_CompareGreaterThanOrEqualMask:
+ {
+ if (varTypeIsFloating(simdBaseType))
+ {
+ if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX))
+ {
+ return static_cast<int>(FloatComparisonMode::OrderedGreaterThanOrEqualSignaling);
+ }
+
+ // CompareGreaterThanOrEqual is not directly supported in hardware without AVX support.
+ // We will return the inverted case here and lowering will itself swap the ops
+ // to ensure the emitted code remains correct. This simplifies the overall logic
+ // here and for other use cases.
+
+ assert(id != NI_AVX_CompareGreaterThanOrEqual);
+ return static_cast<int>(FloatComparisonMode::OrderedLessThanOrEqualSignaling);
+ }
+ else
+ {
+ assert(id == NI_AVX512F_CompareGreaterThanOrEqualMask);
+ return static_cast<int>(IntComparisonMode::GreaterThanOrEqual);
+ }
+ break;
+ }
+
+ case NI_SSE_CompareLessThanOrEqual:
+ case NI_SSE_CompareScalarLessThanOrEqual:
+ case NI_SSE2_CompareLessThanOrEqual:
+ case NI_SSE2_CompareScalarLessThanOrEqual:
+ case NI_AVX_CompareLessThanOrEqual:
+ case NI_AVX512F_CompareLessThanOrEqualMask:
+ {
+ if (varTypeIsFloating(simdBaseType))
+ {
+ return static_cast<int>(FloatComparisonMode::OrderedLessThanOrEqualSignaling);
+ }
+ else
+ {
+ assert(id == NI_AVX512F_CompareLessThanOrEqualMask);
+ return static_cast<int>(IntComparisonMode::LessThanOrEqual);
+ }
+ break;
+ }
+
+ case NI_SSE_CompareNotEqual:
+ case NI_SSE_CompareScalarNotEqual:
+ case NI_SSE2_CompareNotEqual:
+ case NI_SSE2_CompareScalarNotEqual:
+ case NI_AVX_CompareNotEqual:
+ case NI_AVX512F_CompareNotEqualMask:
+ {
+ if (varTypeIsFloating(simdBaseType))
+ {
+ return static_cast<int>(FloatComparisonMode::UnorderedNotEqualNonSignaling);
+ }
+ else
+ {
+ assert(id == NI_AVX512F_CompareNotEqualMask);
+ return static_cast<int>(IntComparisonMode::NotEqual);
+ }
+ break;
+ }
+
+ case NI_SSE_CompareNotGreaterThan:
+ case NI_SSE_CompareScalarNotGreaterThan:
+ case NI_SSE2_CompareNotGreaterThan:
+ case NI_SSE2_CompareScalarNotGreaterThan:
+ case NI_AVX_CompareNotGreaterThan:
+ {
+ assert(varTypeIsFloating(simdBaseType));
+
+ if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX))
+ {
+ return static_cast<int>(FloatComparisonMode::UnorderedNotGreaterThanSignaling);
+ }
+
+ // CompareNotGreaterThan is not directly supported in hardware without AVX support.
+ // We will return the inverted case here and lowering will itself swap the ops
+ // to ensure the emitted code remains correct. This simplifies the overall logic
+ // here and for other use cases.
+
+ assert(id != NI_AVX_CompareNotGreaterThan);
+ return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanSignaling);
+ }
+
+ case NI_SSE_CompareNotLessThan:
+ case NI_SSE_CompareScalarNotLessThan:
+ case NI_SSE2_CompareNotLessThan:
+ case NI_SSE2_CompareScalarNotLessThan:
+ case NI_AVX_CompareNotLessThan:
+ {
+ assert(varTypeIsFloating(simdBaseType));
+ return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanSignaling);
+ }
+
+ case NI_SSE_CompareNotGreaterThanOrEqual:
+ case NI_SSE_CompareScalarNotGreaterThanOrEqual:
+ case NI_SSE2_CompareNotGreaterThanOrEqual:
+ case NI_SSE2_CompareScalarNotGreaterThanOrEqual:
+ case NI_AVX_CompareNotGreaterThanOrEqual:
+ {
+ assert(varTypeIsFloating(simdBaseType));
+
+ if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX))
+ {
+ return static_cast<int>(FloatComparisonMode::UnorderedNotGreaterThanOrEqualSignaling);
+ }
+
+ // CompareNotGreaterThanOrEqual is not directly supported in hardware without AVX support.
+ // We will return the inverted case here and lowering will itself swap the ops
+ // to ensure the emitted code remains correct. This simplifies the overall logic
+ // here and for other use cases.
+
+ assert(id != NI_AVX_CompareNotGreaterThanOrEqual);
+ return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling);
+ }
+
+ case NI_SSE_CompareNotLessThanOrEqual:
+ case NI_SSE_CompareScalarNotLessThanOrEqual:
+ case NI_SSE2_CompareNotLessThanOrEqual:
+ case NI_SSE2_CompareScalarNotLessThanOrEqual:
+ case NI_AVX_CompareNotLessThanOrEqual:
+ {
+ assert(varTypeIsFloating(simdBaseType));
+ return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling);
+ }
+
+ case NI_SSE_CompareOrdered:
+ case NI_SSE_CompareScalarOrdered:
+ case NI_SSE2_CompareOrdered:
+ case NI_SSE2_CompareScalarOrdered:
+ case NI_AVX_CompareOrdered:
+ {
+ assert(varTypeIsFloating(simdBaseType));
+ return static_cast<int>(FloatComparisonMode::OrderedNonSignaling);
+ }
+
+ case NI_SSE_CompareUnordered:
+ case NI_SSE_CompareScalarUnordered:
+ case NI_SSE2_CompareUnordered:
+ case NI_SSE2_CompareScalarUnordered:
+ case NI_AVX_CompareUnordered:
+ {
+ assert(varTypeIsFloating(simdBaseType));
+ return static_cast<int>(FloatComparisonMode::UnorderedNonSignaling);
+ }
+
+ case NI_SSE41_Ceiling:
+ case NI_SSE41_CeilingScalar:
+ case NI_AVX_Ceiling:
+ {
+ FALLTHROUGH;
+ }
+
+ case NI_SSE41_RoundToPositiveInfinity:
+ case NI_SSE41_RoundToPositiveInfinityScalar:
+ case NI_AVX_RoundToPositiveInfinity:
+ {
+ assert(varTypeIsFloating(simdBaseType));
+ return static_cast<int>(FloatRoundingMode::ToPositiveInfinity);
+ }
+
+ case NI_SSE41_Floor:
+ case NI_SSE41_FloorScalar:
+ case NI_AVX_Floor:
+ {
+ FALLTHROUGH;
+ }
+
+ case NI_SSE41_RoundToNegativeInfinity:
+ case NI_SSE41_RoundToNegativeInfinityScalar:
+ case NI_AVX_RoundToNegativeInfinity:
+ {
+ assert(varTypeIsFloating(simdBaseType));
+ return static_cast<int>(FloatRoundingMode::ToNegativeInfinity);
+ }
+
+ case NI_SSE41_RoundCurrentDirection:
+ case NI_SSE41_RoundCurrentDirectionScalar:
+ case NI_AVX_RoundCurrentDirection:
+ {
+ assert(varTypeIsFloating(simdBaseType));
+ return static_cast<int>(FloatRoundingMode::CurrentDirection);
+ }
+
+ case NI_SSE41_RoundToNearestInteger:
+ case NI_SSE41_RoundToNearestIntegerScalar:
+ case NI_AVX_RoundToNearestInteger:
+ {
+ assert(varTypeIsFloating(simdBaseType));
+ return static_cast<int>(FloatRoundingMode::ToNearestInteger);
+ }
+
+ case NI_SSE41_RoundToZero:
+ case NI_SSE41_RoundToZeroScalar:
+ case NI_AVX_RoundToZero:
+ {
+ assert(varTypeIsFloating(simdBaseType));
+ return static_cast<int>(FloatRoundingMode::ToZero);
+ }
+
+ default:
+ {
+ break;
+ }
+ }
+
+ return -1;
+}
+
+//------------------------------------------------------------------------
// impNonConstFallback: convert certain SSE2/AVX2 shift intrinsic to its semantic alternative when the imm-arg is
// not a compile-time constant
//
// These intrinsics are "special import" because the non-AVX path isn't directly
// hardware supported. Instead, they start with "swapped operands" and we fix that here.
- FloatComparisonMode comparison =
- static_cast<FloatComparisonMode>(HWIntrinsicInfo::lookupIval(intrinsic, true));
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, gtNewIconNode(static_cast<int>(comparison)),
- NI_AVX_CompareScalar, simdBaseJitType, simdSize);
+ int ival = HWIntrinsicInfo::lookupIval(this, intrinsic, simdBaseType);
+ retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, gtNewIconNode(ival), NI_AVX_CompareScalar,
+ simdBaseJitType, simdSize);
}
else
{
// These intrinsics are "special import" because the non-AVX path isn't directly
// hardware supported. Instead, they start with "swapped operands" and we fix that here.
- FloatComparisonMode comparison =
- static_cast<FloatComparisonMode>(HWIntrinsicInfo::lookupIval(intrinsic, true));
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, gtNewIconNode(static_cast<int>(comparison)),
- NI_AVX_CompareScalar, simdBaseJitType, simdSize);
+ int ival = HWIntrinsicInfo::lookupIval(this, intrinsic, simdBaseType);
+ retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, gtNewIconNode(ival), NI_AVX_CompareScalar,
+ simdBaseJitType, simdSize);
}
else
{
// Avx
INS_Flags_IsDstDstSrcAVXInstruction = 1ULL << 26,
INS_Flags_IsDstSrcSrcAVXInstruction = 1ULL << 27,
- INS_Flags_IsMskSrcSrcEvexInstruction = 1ULL << 28,
- INS_Flags_Is3OperandInstructionMask = (INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_IsDstSrcSrcAVXInstruction | INS_Flags_IsMskSrcSrcEvexInstruction),
+ INS_Flags_Is3OperandInstructionMask = (INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_IsDstSrcSrcAVXInstruction),
// w and s bits
INS_FLAGS_Has_Wbit = 1ULL << 29,
INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
// AVX512F
-INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction)
-INST3(kmovw_msk, "kmovw", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction)
-INST3(kortestw, "kortestw", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction)
+INST3(kandw, "kandw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x41), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical AND masks
+INST3(kandnw, "kandnw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x42), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical AND NOT masks
+INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers
+INST3(kmovw_msk, "kmovw", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers
+INST3(knotw, "knotw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x44), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // NOT mask register
+INST3(korw, "korw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x45), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical OR masks
+INST3(kortestw, "kortestw", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // OR masks and set flags
+INST3(kshiftlw, "kshiftlw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x32), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Shift left mask registers
+INST3(kshiftrw, "kshiftrw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x30), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Shift right mask registers
+INST3(kunpckbw, "kunpckbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x4B), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Unpack for mask registers
+INST3(kxnorw, "kxnorw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x46), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical XNOR masks
+INST3(kxorw, "kxorw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x47), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical XOR masks
INST3(valignd, "alignd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x03), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Align doubleword vectors
INST3(valignq, "alignq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x03), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Align quadword vectors
INST3(vbroadcastf64x2, "broadcastf64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1A), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast packed float values read from memory to entire register
INST3(vbroadcasti64x2, "broadcasti64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x5A), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast packed integer values read from memory to entire register
INST3(vbroadcastf64x4, "broadcastf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1B), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast packed float values read from memory to entire register
INST3(vbroadcasti64x4, "broadcasti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x5B), INS_TT_TUPLE2, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast packed integer values read from memory to entire register
-INST3(vcmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // compare packed singles
-INST3(vcmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // compare scalar singles
-INST3(vcmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // compare packed doubles
-INST3(vcmpsd, "cmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // compare scalar doubles
+INST3(vcmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed singles
+INST3(vcmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar singles
+INST3(vcmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare packed doubles
+INST3(vcmpsd, "cmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar doubles
INST3(vcvtpd2udq, "cvtpd2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x79), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt packed doubles to unsigned DWORDs
INST3(vcvtps2udq, "cvtps2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x79), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed singles to unsigned DWORDs
INST3(vcvtsd2usi, "cvtsd2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x79), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_WX | Encoding_EVEX) // cvt scalar double to unsigned DWORD/QWORD
INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
INST3(vpbroadcastd_gpr, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7C), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast int32 value from gpr to entire register
INST3(vpbroadcastq_gpr, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7C), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX) // Broadcast int64 value from gpr to entire register
-INST3(vpcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 32-bit integers for equality
-INST3(vpcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 32-bit signed integers for greater than
-INST3(vpcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 64-bit integers for equality
-INST3(vpcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 64-bit integers for equality
+INST3(vpcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit integers for equality
+INST3(vpcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit signed integers for greater than
+INST3(vpcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality
+INST3(vpcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit signed integers for greater than
INST3(vpermq_reg, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x36), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute 64-bit of input register
INST3(vpermpd_reg, "permpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x16), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute 64-bit of input register
INST3(vpermi2d, "permi2d", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x76), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Full Permute From Two Tables Overwriting the Index
INST3(vshufi64x2, "shufi64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x43), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shuffle packed values at 128-bit granularity
// AVX512BW
-INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction)
-INST3(kmovd_msk, "kmovd", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction)
-INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction)
-INST3(kmovq_msk, "kmovq", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction)
-INST3(kortestd, "kortestd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction)
-INST3(kortestq, "kortestq", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction)
+INST3(kaddd, "kaddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x4A), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Add two masks
+INST3(kaddq, "kaddq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x4A), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Add two masks
+INST3(kandd, "kandd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x41), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical AND masks
+INST3(kandq, "kandq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x41), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical AND masks
+INST3(kandnd, "kandnd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x42), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical AND NOT masks
+INST3(kandnq, "kandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x42), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical AND NOT masks
+INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers
+INST3(kmovd_msk, "kmovd", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Move from and to mask registers
+INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Move from and to mask registers
+INST3(kmovq_msk, "kmovq", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Move from and to mask registers
+INST3(knotd, "knotd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x44), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // NOT mask register
+INST3(knotq, "knotq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x44), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // NOT mask register
+INST3(kord, "kord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x45), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical OR masks
+INST3(korq, "korq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x45), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical OR masks
+INST3(kortestd, "kortestd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // OR masks and set flags
+INST3(kortestq, "kortestq", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // OR masks and set flags
+INST3(kshiftld, "kshiftld", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x33), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Shift left mask registers
+INST3(kshiftlq, "kshiftlq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x33), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Shift left mask registers
+INST3(kshiftrd, "kshiftrd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x31), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Shift right mask registers
+INST3(kshiftrq, "kshiftrq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x31), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Shift right mask registers
+INST3(ktestd, "ktestd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x99), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // Packed bit test masks and set flags
+INST3(ktestq, "ktestq", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x99), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // Packed bit test masks and set flags
+INST3(kunpckdq, "kunpckdq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x4B), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Unpack for mask registers
+INST3(kunpckwd, "kunpckwd", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x4B), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Unpack for mask registers
+INST3(kxnord, "kxnord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x46), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical XNOR masks
+INST3(kxnorq, "kxnorq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x46), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical XNOR masks
+INST3(kxord, "kxord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x47), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical XOR masks
+INST3(kxorq, "kxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x47), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Bitwise logical XOR masks
INST3(vdbpsadbw, "dbpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Double block packed Sum-Absolute-Differences (SAD) on unsigned bytes
INST3(vmovdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX)
INST3(vmovdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX)
INST3(vpbroadcastb_gpr, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7A), INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_EVEX) // Broadcast int8 value from gpr to entire register
INST3(vpbroadcastw_gpr, "pbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7B), INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_EVEX) // Broadcast int16 value from gpr to entire register
-INST3(vpcmpb, "pcmpb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
-INST3(vpcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 8-bit integers for equality
-INST3(vpcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 16-bit integers for equality
-INST3(vpcmpgtb, "pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 8-bit signed integers for greater than
-INST3(vpcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) // Packed compare 16-bit signed integers for greater than
-INST3(vpcmpw, "pcmpw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
-INST3(vpcmpub, "pcmpub", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
-INST3(vpcmpuw, "pcmpuw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
+INST3(vpcmpb, "pcmpb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(vpcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality
+INST3(vpcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit integers for equality
+INST3(vpcmpgtb, "pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit signed integers for greater than
+INST3(vpcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit signed integers for greater than
+INST3(vpcmpw, "pcmpw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(vpcmpub, "pcmpub", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(vpcmpuw, "pcmpuw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
INST3(vpermw, "permw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x8D), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute Packed Word Elements
INST3(vpermi2w, "permi2w", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x75), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Full Permute From Two Tables Overwriting the Index
INST3(vpermt2w, "permt2w", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7D), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Full Permute From Two Tables Overwriting one Table
INST3(vplzcntq, "plzcntq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x44), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // Count the number of leading zero bits for packed qword values
// AVX512DQ
-INST3(kortestb, "kortestb", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction)
-INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction)
-INST3(kmovb_msk, "kmovb", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction)
+INST3(kaddb, "kaddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x4A), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Add two masks
+INST3(kaddw, "kaddw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x4A), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Add two masks
+INST3(kandb, "kandb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x41), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical AND masks
+INST3(kandnb, "kandnb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x42), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical AND NOT masks
+INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers
+INST3(kmovb_msk, "kmovb", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers
+INST3(knotb, "knotb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x44), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // NOT mask register
+INST3(korb, "korb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x45), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical OR masks
+INST3(kortestb, "kortestb", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // OR masks and set flags
+INST3(kshiftlb, "kshiftlb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x32), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Shift left mask registers
+INST3(kshiftrb, "kshiftrb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x30), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Shift right mask registers
+INST3(ktestb, "ktestb", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x99), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // Packed bit test masks and set flags
+INST3(ktestw, "ktestw", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x99), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // Packed bit test masks and set flags
+INST3(kxnorb, "kxnorb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x46), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical XNOR masks
+INST3(kxorb, "kxorb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x47), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Bitwise logical XOR masks
INST3(vbroadcastf32x2, "broadcastf32x2", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x19), INS_TT_TUPLE2, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast packed float values read from memory to entire register
INST3(vbroadcasti32x2, "broadcasti32x2", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_TT_TUPLE2, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast packed integer values read from memory to entire register
INST3(vbroadcastf32x8, "broadcastf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1B), INS_TT_TUPLE8, Input_32Bit | REX_W0 | Encoding_EVEX) // Broadcast packed float values read from memory to entire register
GenTree* LowerHWIntrinsic(GenTreeHWIntrinsic* node);
void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition);
GenTree* LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp);
- GenTree* LowerHWIntrinsicCmpOpWithKReg(GenTreeHWIntrinsic* node);
GenTree* LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node);
GenTree* LowerHWIntrinsicDot(GenTreeHWIntrinsic* node);
#if defined(TARGET_XARCH)
void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
+ GenTree* LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node);
GenTree* LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node);
GenTree* LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node);
GenTree* LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node);
}
break;
- case NI_AVX512F_KORTEST:
case NI_SSE41_PTEST:
case NI_AVX_PTEST:
+ {
// If we need the Carry flag then we can't swap operands.
canSwapOperands = (cc == nullptr) || cc->gtCondition.Is(GenCondition::EQ, GenCondition::NE);
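+ // (For PTEST, ZF is computed from `src1 & src2`, which is symmetric in its operands, while
+ // CF is computed from `~src1 & src2`, which is not; swapping would change the CF result.)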
break;
+ }
+
+ case NI_AVX512F_KORTEST:
+ {
+ // TODO-XARCH-AVX512: remove the KORTEST check when it's promoted to 2 proper arguments
+ assert(HWIntrinsicInfo::lookupNumArgs(newIntrinsicId) == 1);
+ break;
+ }
default:
unreached();
case NI_Vector128_op_Equality:
case NI_Vector256_op_Equality:
+ case NI_Vector512_op_Equality:
{
return LowerHWIntrinsicCmpOp(node, GT_EQ);
}
case NI_Vector128_op_Inequality:
case NI_Vector256_op_Inequality:
- {
- return LowerHWIntrinsicCmpOp(node, GT_NE);
- }
-
- case NI_Vector512_GreaterThanAll:
- case NI_Vector512_GreaterThanAny:
- case NI_Vector512_GreaterThanOrEqualAll:
- case NI_Vector512_GreaterThanOrEqualAny:
- case NI_Vector512_LessThanAll:
- case NI_Vector512_LessThanAny:
- case NI_Vector512_LessThanOrEqualAll:
- case NI_Vector512_LessThanOrEqualAny:
- case NI_Vector512_op_Equality:
case NI_Vector512_op_Inequality:
{
- return LowerHWIntrinsicCmpOpWithKReg(node);
+ return LowerHWIntrinsicCmpOp(node, GT_NE);
}
case NI_Vector128_ToScalar:
LowerFusedMultiplyAdd(node);
break;
+ case NI_AVX512F_CompareEqual:
+ case NI_AVX512F_CompareGreaterThan:
+ case NI_AVX512F_CompareGreaterThanOrEqual:
+ case NI_AVX512F_CompareLessThan:
+ case NI_AVX512F_CompareLessThanOrEqual:
+ case NI_AVX512F_CompareNotEqual:
+ case NI_AVX512F_VL_CompareGreaterThan:
+ case NI_AVX512F_VL_CompareGreaterThanOrEqual:
+ case NI_AVX512F_VL_CompareLessThan:
+ case NI_AVX512F_VL_CompareLessThanOrEqual:
+ case NI_AVX512F_VL_CompareNotEqual:
+ case NI_AVX512BW_CompareEqual:
+ case NI_AVX512BW_CompareGreaterThan:
+ case NI_AVX512BW_CompareGreaterThanOrEqual:
+ case NI_AVX512BW_CompareLessThan:
+ case NI_AVX512BW_CompareLessThanOrEqual:
+ case NI_AVX512BW_CompareNotEqual:
+ case NI_AVX512BW_VL_CompareGreaterThan:
+ case NI_AVX512BW_VL_CompareGreaterThanOrEqual:
+ case NI_AVX512BW_VL_CompareLessThan:
+ case NI_AVX512BW_VL_CompareLessThanOrEqual:
+ case NI_AVX512BW_VL_CompareNotEqual:
+ {
+ return LowerHWIntrinsicWithAvx512Mask(node);
+ }
+
default:
break;
}
var_types simdType = Compiler::getSIMDTypeForSize(simdSize);
assert((intrinsicId == NI_Vector128_op_Equality) || (intrinsicId == NI_Vector128_op_Inequality) ||
- (intrinsicId == NI_Vector256_op_Equality) || (intrinsicId == NI_Vector256_op_Inequality));
+ (intrinsicId == NI_Vector256_op_Equality) || (intrinsicId == NI_Vector256_op_Inequality) ||
+ (intrinsicId == NI_Vector512_op_Equality) || (intrinsicId == NI_Vector512_op_Inequality));
assert(varTypeIsSIMD(simdType));
assert(varTypeIsArithmetic(simdBaseType));
GenTree* op2 = node->Op(2);
GenCondition cmpCnd = (cmpOp == GT_EQ) ? GenCondition::EQ : GenCondition::NE;
- if (!varTypeIsFloating(simdBaseType) && op2->IsVectorZero() &&
- comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
+ if (!varTypeIsFloating(simdBaseType) && (simdSize != 64) && op2->IsVectorZero() &&
+ comp->compOpportunisticallyDependsOn(InstructionSet_SSE41) &&
+ !op1->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector))
{
// On SSE4.1 or higher we can optimize comparisons against zero to
// just use PTEST. We can't support it for floating-point, however,
}
else
{
+ assert(simdSize == 16);
+
// TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
node->ChangeHWIntrinsicId(NI_SSE41_TestZ);
LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd);
}
- return node->gtNext;
+ return LowerNode(node);
}
+ // TODO-XARCH-AVX512: We should handle TYP_SIMD12 here under the EVEX path, but doing
+ // so will require us to account for the unused 4th element.
+
+ if ((simdType != TYP_SIMD12) && comp->IsBaselineVector512IsaSupported())
+ {
+ // The EVEX encoded versions of the comparison instructions all return a kmask
+ //
+ // For the comparisons against zero that we normally optimize to use `PTEST` we
+ // have to decide whether to use EVEX and emit 2 instructions (vpcmp + kortest)
+ // or to continue emitting PTEST and hope that the register allocator isn't limited
+ // by PTEST not supporting the extended register set.
+ //
+ // Ideally we'd opt to not use PTEST when EVEX is available. This would let us best
+ // take advantage of EVEX exclusive features such as embedded broadcast and the
+ // 16 additional registers. In many cases this allows for overall denser codegen where
+ // we are doing more in the same number of bytes, even though the individual instruction
+ // is 1-2 bytes larger. Even though there may be cases where continuing to use PTEST for
+ // select 128/256-bit code paths would still be beneficial, the additional complexity
+ // required to detect and account for those differences is not likely to be worth the tradeoff.
+ //
+ // TODO-XARCH-AVX512: Given the above don't emit the PTEST path above when AVX-512 is available
+ // This will require exposing `NI_AVX512F_TestZ` so that we can keep codegen optimized to just
+ // `vptestm` followed by `kortest`. This will be one instruction more than just `vptest` but
+ // it has the advantages detailed above.
+ //
+ // For other comparisons, using EVEX allows us to avoid leaving the SIMD domain, to avoid
+ // needing a general-purpose register, and to generate fewer instructions.
+
+ GenTree* nextNode = node->gtNext;
+
+ NamedIntrinsic maskIntrinsicId = NI_AVX512F_CompareEqualMask;
+ uint32_t count = simdSize / genTypeSize(simdBaseType);
+
+ // KORTEST does a bitwise OR of its operands and sets ZF if the result is zero and CF if it is all
+ // bits set. Because of this, when we have at least 8 elements to compare we can use a
+ // normal comparison alongside CF.
+ //
+ // That is, if the user wants `x == y`, we can keep it as `mask = (x == y)` and then emit
+ // `kortest mask, mask` and check `CF == 1`. This will be true if all elements matched and
+ // false otherwise. Things work out nicely and we keep readable disasm.
+ //
+ // Likewise, if the user wants `x != y`, we can keep it as `mask = (x != y)` and then emit
+ // `kortest mask, mask` and check `ZF == 0`. This will be true if any elements mismatched.
+ //
+ // However, if we have fewer than 8 elements then we have to change it up since we have fewer
+ // than 8 bits in the output mask and unused bits will be set to 0. This occurs for 32-bit
+ // elements with Vector128 and for 64-bit elements with either Vector128 or Vector256.
+ //
+ // To account for this, we will invert the comparison being done. So if the user wants
+ // `x == y`, we will instead emit `mask = (x != y)`, we will still emit `kortest mask, mask`,
+ // but we will then check for `ZF == 1`. This works since that equates to all elements being equal
+ //
+ // Likewise for `x != y` we will instead emit `mask = (x != y)`, then `kortest mask, mask`,
+ // and will then check for `ZF == 0` which equates to one or more elements not being equal
+
+ // The scenarios we have to handle for a full mask are:
+ // * No matches: 0000_0000 - ZF == 1, CF == 0
+ // * Partial matches: 0000_1111 - ZF == 0, CF == 0
+ // * All matches: 1111_1111 - ZF == 0, CF == 1
+ //
+ // The scenarios we have to handle for a partial mask are:
+ // * No matches: 0000_0000 - ZF == 1, CF == 0
+ // * Partial matches: 0000_0011 - ZF == 0, CF == 0
+ // * All matches: 0000_1111 - ZF == 0, CF == 0
+ //
+ // When we have less than a full mask worth of elements, we need to account for the upper
+ // bits being implicitly zero. To do that, we may need to invert the comparison.
+ //
+ // By inverting the comparison we'll get:
+ // * All matches: 0000_0000 - ZF == 1, CF == 0
+ // * Partial matches: 0000_0011 - ZF == 0, CF == 0
+ // * No matches: 0000_1111 - ZF == 0, CF == 0
+ //
+ // This works since the upper bits are implicitly zero and so, by inverting, matches also become
+ // zero, which in turn means that `AllBitsSet` will become `Zero` and the other cases become non-zero
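+ //
+ // As an illustrative sketch (not byte-exact), `Vector256<long>.op_Equality` (count == 4,
+ // a partial mask) is expected to lower to something like:
+ //
+ //   vpcmpq  k1, ymm0, ymm1, 4   ; mask = (x != y), the inverted comparison (4 == NEQ)
+ //   kortest k1, k1              ; ORs the mask bits together, setting ZF/CF
+ //   sete    al                  ; ZF == 1 => no mismatches => all elements were equal
+ //
+ // (the exact kortest width used depends on the mask size and the available ISAs)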
+
+ if (op1->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector) && op2->IsCnsVec())
+ {
+ // We want to specially handle the common cases of `mask op Zero` and `mask op AllBitsSet`
+ //
+ // These get created for the various `gtNewSimdCmpOpAnyNode` and `gtNewSimdCmpOpAllNode`
+ // scenarios and we want to ensure they still get "optimal" codegen. To handle that, we
+ // simply consume the mask directly and preserve the intended comparison by tweaking the
+ // compare condition passed down into `KORTEST`
+
+ GenTreeHWIntrinsic* maskNode = op1->AsHWIntrinsic()->Op(1)->AsHWIntrinsic();
+ assert(maskNode->TypeIs(TYP_MASK));
+
+ bool isHandled = false;
+ GenTreeVecCon* vecCon = op2->AsVecCon();
+
+ if (vecCon->IsZero())
+ {
+ // We have `mask == Zero` which is the same as checking that nothing in the mask
+ // is set. This scenario can be handled by `kortest` and then checking that `ZF == 1`
+ //
+ // -or-
+ //
+ // We have `mask != Zero` which is the same as checking that something in the mask
+ // is set. This scenario can be handled by `kortest` and then checking that `ZF == 0`
+ //
+ // Since this is the default state for `CompareEqualMask` + `GT_EQ`/`GT_NE`, there is nothing
+ // for us to change. This also applies to cases where we have less than a full mask of
+ // elements since the upper mask bits are implicitly zero.
+
+ isHandled = true;
+ }
+ else if (vecCon->IsAllBitsSet())
+ {
+ // We have `mask == AllBitsSet` which is the same as checking that everything in the mask
+ // is set. This scenario can be handled by `kortest` and then checking that `CF == 1` for
+ // a full mask and checking `ZF == 1` for a partial mask using an inverted comparison
+ //
+ // -or-
+ //
+ // We have `mask != AllBitsSet` which is the same as checking that something in the mask
+ // is set. This scenario can be handled by `kortest` and then checking that `CF == 0` for
+ // a full mask and checking `ZF == 0` for a partial mask using an inverted comparison
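+ //
+ // For example (illustrative), `Vector512.EqualsAll(x, y)` is imported as the vector expression
+ // `(x == y) == AllBitsSet`; once the inner compare has been lowered to a kmask, op1 here is
+ // `ConvertMaskToVector(CompareEqualMask(x, y))` and op2 is AllBitsSet, so we can consume the
+ // mask directly and just tweak the condition checked after `kortest`.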
+
+ if (count < 8)
+ {
+ assert((count == 1) || (count == 2) || (count == 4));
+
+ switch (maskNode->GetHWIntrinsicId())
+ {
+ case NI_AVX512F_CompareEqualMask:
+ {
+ maskIntrinsicId = NI_AVX512F_CompareNotEqualMask;
+ break;
+ }
+
+ case NI_AVX512F_CompareGreaterThanMask:
+ {
+ maskIntrinsicId = NI_AVX512F_CompareLessThanOrEqualMask;
+ break;
+ }
+
+ case NI_AVX512F_CompareGreaterThanOrEqualMask:
+ {
+ maskIntrinsicId = NI_AVX512F_CompareLessThanMask;
+ break;
+ }
+
+ case NI_AVX512F_CompareLessThanMask:
+ {
+ maskIntrinsicId = NI_AVX512F_CompareGreaterThanOrEqualMask;
+ break;
+ }
+
+ case NI_AVX512F_CompareLessThanOrEqualMask:
+ {
+ maskIntrinsicId = NI_AVX512F_CompareGreaterThanMask;
+ break;
+ }
+
+ case NI_AVX512F_CompareNotEqualMask:
+ {
+ maskIntrinsicId = NI_AVX512F_CompareEqualMask;
+ break;
+ }
+
+ default:
+ {
+ unreached();
+ }
+ }
+
+ maskNode->ChangeHWIntrinsicId(maskIntrinsicId);
+ }
+ else if (cmpOp == GT_EQ)
+ {
+ cmpCnd = GenCondition::C;
+ }
+ else
+ {
+ cmpCnd = GenCondition::NC;
+ }
+ isHandled = true;
+ }
+
+ if (isHandled)
+ {
+ LIR::Use use;
+ if (BlockRange().TryGetUse(node, &use))
+ {
+ use.ReplaceWith(maskNode);
+ }
+ else
+ {
+ maskNode->SetUnusedValue();
+ }
+
+ BlockRange().Remove(op2);
+ BlockRange().Remove(op1);
+ BlockRange().Remove(node);
+
+ node = maskNode;
+ }
+ }
+
+ if (node->gtType != TYP_MASK)
+ {
+ // We have `x == y` or `x != y`, both of which are cases where we want to find `AllBitsSet` in the
+ // mask since we can directly do the relevant comparison. Given the above tables, when we have a full
+ // mask we can simply check against `CF == 1` for `op_Equality` and `ZF == 0` for `op_Inequality`.
+ //
+ // For a partial mask, we need to invert the `op_Equality` comparisons which means that we now need
+ // to check for `ZF == 1` (we're looking for `AllBitsSet`, that is `all match`). For `op_Inequality`
+ // we can keep things as is since we're looking for `any match` and just want to check `ZF == 0`
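+ //
+ // As an illustrative sketch, `Vector128<double>.op_Inequality` (count == 2) is expected to
+ // end up as something like:
+ //
+ //   vcmppd  k1, xmm0, xmm1, 4   ; mask = (x != y), using the NEQ predicate
+ //   kortest k1, k1
+ //   setne   al                  ; ZF == 0 => at least one element differed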
+
+ if (count < 8)
+ {
+ assert((count == 1) || (count == 2) || (count == 4));
+ maskIntrinsicId = NI_AVX512F_CompareNotEqualMask;
+ }
+ else
+ {
+ assert((count == 8) || (count == 16) || (count == 32) || (count == 64));
+
+ if (cmpOp == GT_EQ)
+ {
+ cmpCnd = GenCondition::C;
+ }
+ else
+ {
+ maskIntrinsicId = NI_AVX512F_CompareNotEqualMask;
+ }
+ }
+
+ node->gtType = TYP_MASK;
+ node->ChangeHWIntrinsicId(maskIntrinsicId);
+
+ LowerNode(node);
+ }
+
+ LIR::Use use;
+ if (BlockRange().TryGetUse(node, &use))
+ {
+ GenTreeHWIntrinsic* cc;
+
+ cc = comp->gtNewSimdHWIntrinsicNode(simdType, node, NI_AVX512F_KORTEST, simdBaseJitType, simdSize);
+ BlockRange().InsertBefore(nextNode, cc);
+
+ use.ReplaceWith(cc);
+ LowerHWIntrinsicCC(cc, NI_AVX512F_KORTEST, cmpCnd);
+
+ nextNode = cc->gtNext;
+ }
+ return nextNode;
+ }
+
+ assert(simdSize != 64);
+
NamedIntrinsic cmpIntrinsic;
CorInfoType cmpJitType;
NamedIntrinsic mskIntrinsic;
case TYP_ULONG:
{
mskJitType = CORINFO_TYPE_UBYTE;
+ cmpJitType = simdBaseJitType;
if (simdSize == 32)
{
cmpIntrinsic = NI_AVX2_CompareEqual;
- cmpJitType = simdBaseJitType;
mskIntrinsic = NI_AVX2_MoveMask;
mskConstant = -1;
}
if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
cmpIntrinsic = NI_SSE41_CompareEqual;
- cmpJitType = simdBaseJitType;
}
else
{
}
//----------------------------------------------------------------------------------------------
-// Lowering::LowerHWIntrinsicCmpOpWithKReg: Lowers a Vector512 comparison intrinsic
-//
-// Arguments:
-// node - The hardware intrinsic node.
-//
-GenTree* Lowering::LowerHWIntrinsicCmpOpWithKReg(GenTreeHWIntrinsic* node)
-{
- NamedIntrinsic intrinsicId = node->GetHWIntrinsicId();
- CorInfoType simdBaseJitType = node->GetSimdBaseJitType();
- var_types simdBaseType = node->GetSimdBaseType();
- unsigned simdSize = node->GetSimdSize();
- var_types simdType = Compiler::getSIMDTypeForSize(simdSize);
-
- assert((intrinsicId == NI_Vector512_GreaterThanAll) || (intrinsicId == NI_Vector512_GreaterThanOrEqualAll) ||
- (intrinsicId == NI_Vector512_LessThanAll) || (intrinsicId == NI_Vector512_LessThanOrEqualAll) ||
- (intrinsicId == NI_Vector512_op_Equality) || (intrinsicId == NI_Vector512_op_Inequality));
-
- assert(varTypeIsSIMD(simdType));
- assert(varTypeIsArithmetic(simdBaseType));
- assert(simdSize == 64);
- assert(node->gtType == TYP_BOOL);
-
- NamedIntrinsic newIntrinsicId = NI_Illegal;
- switch (intrinsicId)
- {
- case NI_Vector512_GreaterThanAll:
- {
- newIntrinsicId = NI_AVX512F_CompareGreaterThanSpecial;
- break;
- }
- case NI_Vector512_GreaterThanOrEqualAll:
- {
- newIntrinsicId = NI_AVX512F_CompareGreaterThanOrEqualSpecial;
- break;
- }
- case NI_Vector512_LessThanAll:
- {
- newIntrinsicId = NI_AVX512F_CompareLessThanSpecial;
- break;
- }
- case NI_Vector512_LessThanOrEqualAll:
- {
- newIntrinsicId = NI_AVX512F_CompareLessThanOrEqualSpecial;
- break;
- }
- case NI_Vector512_op_Equality:
- case NI_Vector512_op_Inequality:
- {
- newIntrinsicId = NI_AVX512F_CompareEqualSpecial;
- break;
- }
-
- default:
- {
- assert(false);
- break;
- }
- }
-
- GenTree* op1 = node->Op(1);
- GenTree* op2 = node->Op(2);
-
- GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, newIntrinsicId, simdBaseJitType, simdSize);
- BlockRange().InsertBefore(node, cmp);
- LowerNode(cmp);
-
- node->ResetHWIntrinsicId(NI_AVX512F_KORTEST, cmp);
- GenCondition cmpCnd = (intrinsicId != NI_Vector512_op_Inequality) ? GenCondition::C : GenCondition::NC;
- LowerHWIntrinsicCC(node, NI_AVX512F_KORTEST, cmpCnd);
-
- return node->gtNext;
-}
-
-//----------------------------------------------------------------------------------------------
// Lowering::LowerHWIntrinsicCndSel: Lowers a Vector128 or Vector256 Conditional Select call
//
// Arguments:
}
//----------------------------------------------------------------------------------------------
+// Lowering::LowerHWIntrinsicWithAvx512Mask: Lowers a HWIntrinsic node that utilizes the AVX512 KMASK registers
+//
+// Arguments:
+// node - The hardware intrinsic node.
+//
+GenTree* Lowering::LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node)
+{
+ NamedIntrinsic intrinsicId = node->GetHWIntrinsicId();
+ CorInfoType simdBaseJitType = node->GetSimdBaseJitType();
+ var_types simdBaseType = node->GetSimdBaseType();
+ unsigned simdSize = node->GetSimdSize();
+ var_types simdType = Compiler::getSIMDTypeForSize(simdSize);
+
+ assert(varTypeIsSIMD(simdType));
+ assert(varTypeIsArithmetic(simdBaseType));
+ assert(simdSize != 0);
+
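+ // We rewrite the vector-returning comparison into its kmask-returning equivalent and, if the
+ // result is actually consumed as a vector, make that explicit by inserting a ConvertMaskToVector
+ // node on the use. E.g. (illustrative): `AVX512F.CompareGreaterThan(x, y)` becomes
+ // `ConvertMaskToVector(CompareGreaterThanMask(x, y))`.
+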
+ NamedIntrinsic maskIntrinsicId;
+
+ switch (intrinsicId)
+ {
+ case NI_AVX512F_CompareEqual:
+ case NI_AVX512BW_CompareEqual:
+ {
+ maskIntrinsicId = NI_AVX512F_CompareEqualMask;
+ break;
+ }
+
+ case NI_AVX512F_VL_CompareGreaterThan:
+ case NI_AVX512BW_VL_CompareGreaterThan:
+ {
+ assert(varTypeIsUnsigned(simdBaseType));
+ FALLTHROUGH;
+ }
+
+ case NI_AVX512F_CompareGreaterThan:
+ case NI_AVX512BW_CompareGreaterThan:
+ {
+ maskIntrinsicId = NI_AVX512F_CompareGreaterThanMask;
+ break;
+ }
+
+ case NI_AVX512F_VL_CompareGreaterThanOrEqual:
+ case NI_AVX512BW_VL_CompareGreaterThanOrEqual:
+ {
+ assert(!varTypeIsFloating(simdBaseType));
+ FALLTHROUGH;
+ }
+
+ case NI_AVX512F_CompareGreaterThanOrEqual:
+ case NI_AVX512BW_CompareGreaterThanOrEqual:
+ {
+ maskIntrinsicId = NI_AVX512F_CompareGreaterThanOrEqualMask;
+ break;
+ }
+
+ case NI_AVX512F_VL_CompareLessThan:
+ case NI_AVX512BW_VL_CompareLessThan:
+ {
+ assert(varTypeIsUnsigned(simdBaseType));
+ FALLTHROUGH;
+ }
+
+ case NI_AVX512F_CompareLessThan:
+ case NI_AVX512BW_CompareLessThan:
+ {
+ maskIntrinsicId = NI_AVX512F_CompareLessThanMask;
+ break;
+ }
+
+ case NI_AVX512F_VL_CompareLessThanOrEqual:
+ case NI_AVX512BW_VL_CompareLessThanOrEqual:
+ {
+ assert(!varTypeIsFloating(simdBaseType));
+ FALLTHROUGH;
+ }
+
+ case NI_AVX512F_CompareLessThanOrEqual:
+ case NI_AVX512BW_CompareLessThanOrEqual:
+ {
+ maskIntrinsicId = NI_AVX512F_CompareLessThanOrEqualMask;
+ break;
+ }
+
+ case NI_AVX512F_VL_CompareNotEqual:
+ case NI_AVX512BW_VL_CompareNotEqual:
+ {
+ assert(!varTypeIsFloating(simdBaseType));
+ FALLTHROUGH;
+ }
+
+ case NI_AVX512F_CompareNotEqual:
+ case NI_AVX512BW_CompareNotEqual:
+ {
+ maskIntrinsicId = NI_AVX512F_CompareNotEqualMask;
+ break;
+ }
+
+ default:
+ {
+ unreached();
+ }
+ }
+
+ node->gtType = TYP_MASK;
+ node->ChangeHWIntrinsicId(maskIntrinsicId);
+
+ LIR::Use use;
+ if (BlockRange().TryGetUse(node, &use))
+ {
+ GenTree* maskToVector =
+ comp->gtNewSimdHWIntrinsicNode(simdType, node, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize);
+ BlockRange().InsertAfter(node, maskToVector);
+ use.ReplaceWith(maskToVector);
+ }
+ return LowerNode(node);
+}
+
+//----------------------------------------------------------------------------------------------
// Lowering::LowerHWIntrinsicToScalar: Lowers a Vector128 or Vector256 ToScalar call
//
// Arguments: