* fixing the JITDbl2Ulng helper function. The new AVX512 instruction vcvtsd2usi uses ulong.max_value to show FPE for negative, NAN and ulong_max + 1 values.
* Making changes to the library test case expected output based on the architecture. This is because we have changed the JITDbl2Ulng helper function to mimic the new IEEE compliant AVX512 instruction vcvtsd2usi. In the process, we needed to update the library test case because the default Floating Point Error (FPE) value for the new instruction is different from the default MSVC FPE value i.e. 0.
* Fixing the JITDbl2Ulng helper function. Also making sure that we are not changing the library test case but the API to make sure NaN cases are handled.
* reverting jitformat
* Adding a truncate function to the Dbl2Ulng helper to make sure we avoid handling edge cases (-1,0) separately inside the helper.
* Adding code to handle vectorized conversion for float/double to/from ulong/uint
* reverting changes for float to ulong
* enabling float to ulong conversion
* Making change to set w1 bit for evex
* merging with main. Picking up hwintrinsiclistxarch from main
* trying to return EA_4BYTE for INS_vcvttss2usi to make sure that we read dword and not qword for float to ulong
* jit format
* Splitting vcvttss2usi to vcvttss2usi32 and vcvttss2usi64. Also adding a special handling for vcvttss2usi64 to make sure we read only dword instead of qword for float to ulong conversion
* undoing jitformat changes due to merge error
* removing unused code and correcting throughput and latency information for vcvttsd2usi, vcvttusi2sd32/64
* correcting throughput and latency for vcvttss2usi32 and placing it with other similar instructions
* formatting
* formatting
* updating comments
* updating code for github comments. Using compIsaSupportedDebugOnly for nowayasserts and also checking for float and double both in LowerCast for overflow and conversion to ulong
* reverting to original checks for ISA supported Debug only because they are not available in release mode
* running jitformat
* running jitformat
* combine the 2 nodes GT_CAST(GT_CAST(TYP_ULONG, TYP_DOUBLE), TYP_FLOAT) into a single node i.e. GT_CAST(TYP_ULONG, TYP_FLOAT)
* merging with main and updating hwintrinsiclistxarch to take into consideration 32bit and 64 bit version of vcvttss2usi.
* Changing noway_assert to assert to make sure compOpportunisticallyDependsOn only runs in debug mode.
* running jitformat
* Changing compOpportunisticallyDependsOn to compIsaSupportedDebugOnly in asserts aka code review changes
* Making code review changes. Moving around the compOpportunisticallyDependsOn checks to make sure they are run only if we need AVX512. These checks being costly, moving them to the innermost checks in nested if checks.
* FCALL_CONTRACT should be only used on FCalls itself
* Making parallel changes to JITHelper in MathHelper for native AOT
* resolving regression issues
* Rolling back changes for double/float -> ulong
* Rolling back changes for double/float -> ulong
* Reverting out_of_range_fp_conversion to original version
* Reverting out_of_range_fp_conversion to original version
* Reverting jithelpers.cpp to original version
* Reverting jithelpers.cpp to original version
* Changing comments, reverting asserts, skipping to change node for cast
* addressing review comments
* Update src/coreclr/jit/morph.cpp
---------
Co-authored-by: Tanner Gooding <tagoo@outlook.com>
// Also we don't expect to see uint32 -> float/double and uint64 -> float conversions
// here since they should have been lowered appropriately.
noway_assert(srcType != TYP_UINT);
- noway_assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT));
+ assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT) ||
+ compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F));
+
+ if ((srcType == TYP_ULONG) && varTypeIsFloating(dstType) &&
+ compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F))
+ {
+ assert(compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F));
+ genConsumeOperands(treeNode->AsOp());
+ instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType));
+ GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1);
+ genProduceReg(treeNode);
+ return;
+ }
// To convert int to a float/double, cvtsi2ss/sd SSE2 instruction is used
// which does a partial write to lower 4/8 bytes of xmm register keeping the other
// We shouldn't be seeing uint64 here as it should have been converted
// into a helper call by either front-end or lowering phase.
- noway_assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG))));
+ assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG))));
// If the dstType is TYP_UINT, we have 32-bits to encode the
// float number. Any of 33rd or above bits can be the sign bit.
case INS_vcvtsd2usi:
case INS_vcvtss2usi:
case INS_vcvttsd2usi:
- case INS_vcvttss2usi:
{
if (attr == EA_8BYTE)
{
case INS_vcvtsd2usi:
case INS_vcvtss2usi:
case INS_vcvttsd2usi:
- case INS_vcvttss2usi:
+ case INS_vcvttss2usi32:
+ case INS_vcvttss2usi64:
{
// These SSE instructions write to a general purpose integer register.
return false;
case INS_vcvtsd2usi:
case INS_vcvtss2usi:
case INS_vcvttsd2usi:
- case INS_vcvttss2usi:
{
printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_16BYTE));
break;
}
+ case INS_vcvttss2usi32:
+ case INS_vcvttss2usi64:
+ {
+ printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_4BYTE));
+ break;
+ }
+
#ifdef TARGET_AMD64
case INS_movsxd:
{
case INS_cvtsi2sd64:
case INS_cvtsi2ss64:
case INS_vcvtsd2usi:
- case INS_vcvttsd2usi:
- case INS_vcvtusi2sd32:
- case INS_vcvtusi2sd64:
case INS_vcvtusi2ss32:
case INS_vcvtusi2ss64:
+ case INS_vcvttsd2usi:
+ case INS_vcvttss2usi32:
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += PERFSCORE_LATENCY_7C;
break;
+ case INS_vcvtusi2sd64:
+ case INS_vcvtusi2sd32:
+ result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+ result.insLatency += PERFSCORE_LATENCY_5C;
+ break;
+
case INS_cvttss2si:
case INS_cvtss2si:
case INS_vcvtss2usi:
- case INS_vcvttss2usi:
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_8C : PERFSCORE_LATENCY_7C;
break;
+ case INS_vcvttss2usi64:
+ result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+ result.insLatency += PERFSCORE_LATENCY_8C;
+ break;
+
case INS_cvtss2sd:
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += PERFSCORE_LATENCY_5C;
HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Double, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Single, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtss2usi, INS_vcvtsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
-HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi32, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512F, ConvertToVector128Byte, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdb, INS_vpmovdb, INS_vpmovqb, INS_vpmovqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512F, ConvertToVector128ByteWithSaturation, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovusdb, INS_invalid, INS_vpmovusqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512F, ConvertToVector128Int16, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovqw, INS_vpmovqw, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512F_X64, ConvertScalarToVector128Double, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2sd64, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512F_X64, ConvertScalarToVector128Single, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2ss64, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtss2usi, INS_vcvtsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
-HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi64, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size NumArg EncodesExtraTypeArg Instructions Category Flags
//
instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
{
- // AVX: For now we support only conversion from Int/Long -> float
+ // AVX: Supports following conversions
+ // srcType = int16/int64 castToType = float
+ // AVX512: Supports following conversions
+ // srcType = ulong castToType = double/float
switch (from)
{
}
break;
+ case TYP_ULONG:
+ switch (to)
+ {
+ case TYP_DOUBLE:
+ return INS_vcvtusi2sd64;
+ case TYP_FLOAT:
+ return INS_vcvtusi2ss64;
+ default:
+ unreached();
+ }
+
default:
unreached();
}
INST3(vcvttpd2udq, "cvttpd2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x78), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt w/ truncation packed doubles to unsigned DWORDs
INST3(vcvttps2udq, "cvttps2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x78), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt w/ truncation packed singles to unsigned DWORDs
INST3(vcvttsd2usi, "cvttsd2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x78), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_WX | Encoding_EVEX) // cvt w/ truncation scalar double to unsigned DWORD/QWORD
-INST3(vcvttss2usi, "cvttss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x78), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_WX | Encoding_EVEX) // cvt w/ truncation scalar single to unsigned DWORD/QWORD
+INST3(vcvttss2usi32, "cvttss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x78), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt w/ truncation scalar single to unsigned DWORD/QWORD
+INST3(vcvttss2usi64, "cvttss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x78), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W1 | Encoding_EVEX) // cvt w/ truncation scalar single to unsigned DWORD/QWORD
INST3(vcvtudq2pd, "cvtudq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x7A), INS_TT_HALF, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed unsigned DWORDs to doubles
INST3(vcvtudq2ps, "cvtudq2ps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7A), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed unsigned DWORDs to singles
INST3(vcvtusi2sd32, "cvtusi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7B), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar unsigned DWORD to double
}
else if (srcType == TYP_ULONG)
{
- noway_assert(castToType != TYP_FLOAT);
+ assert(castToType != TYP_FLOAT || comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F));
}
// Case of src is a small type and dst is a floating point type.
var_types dstType = tree->CastToType();
unsigned dstSize = genTypeSize(dstType);
+#if defined(TARGET_AMD64)
+ // If AVX512 is present, we have intrinsic available to convert
+ // ulong directly to float. Hence, we need to combine the 2 nodes
+ // GT_CAST(GT_CAST(TYP_ULONG, TYP_DOUBLE), TYP_FLOAT) into a single
+ // node i.e. GT_CAST(TYP_ULONG, TYP_FLOAT). At this point, we already
+ // have the 2 GT_CAST nodes in the tree and we are combining them below.
+ if (oper->OperIs(GT_CAST))
+ {
+ GenTreeCast* innerCast = oper->AsCast();
+
+ if (innerCast->IsUnsigned())
+ {
+ GenTree* innerOper = innerCast->CastOp();
+ var_types innerSrcType = genActualType(innerOper);
+ var_types innerDstType = innerCast->CastToType();
+ unsigned innerDstSize = genTypeSize(innerDstType);
+ innerSrcType = varTypeToUnsigned(innerSrcType);
+
+ // Check if we are going from ulong->double->float
+ if ((innerSrcType == TYP_ULONG) && (innerDstType == TYP_DOUBLE) && (dstType == TYP_FLOAT))
+ {
+ if (compOpportunisticallyDependsOn(InstructionSet_AVX512F))
+ {
+ // One optimized (combined) cast here
+ tree = gtNewCastNode(TYP_FLOAT, innerOper, true, TYP_FLOAT);
+ return fgMorphTree(tree);
+ }
+ }
+ }
+ }
+#endif // TARGET_AMD64
+
// See if the cast has to be done in two steps. R -> I
if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType))
{
{
srcType = varTypeToUnsigned(srcType);
- if (srcType == TYP_ULONG)
+ if (srcType == TYP_ULONG && !compOpportunisticallyDependsOn(InstructionSet_AVX512F))
{
if (dstType == TYP_FLOAT)
{