break;
}
+ case NI_AVX512F_ConvertToVector128Int16:
+ case NI_AVX512F_ConvertToVector128Int32:
+ case NI_AVX512F_ConvertToVector128UInt16:
+ case NI_AVX512F_ConvertToVector128UInt32:
+ case NI_AVX512F_ConvertToVector256Int16:
+ case NI_AVX512F_ConvertToVector256Int32:
+ case NI_AVX512F_ConvertToVector256UInt16:
+ case NI_AVX512F_ConvertToVector256UInt32:
+ case NI_AVX512BW_ConvertToVector128Byte:
+ case NI_AVX512BW_ConvertToVector128SByte:
+ case NI_AVX512BW_ConvertToVector256Byte:
+ case NI_AVX512BW_ConvertToVector256SByte:
+ {
+ // These intrinsics are "ins reg/mem, xmm"
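+                // (e.g. vpmovdw): the reg/mem destination is half the width of the
+                // vector source, so only the narrowed result is stored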
+ ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
+ attr = emitActualTypeSize(Compiler::getSIMDTypeForSize(hwintrinsic->GetSimdSize()));
+ break;
+ }
+
default:
{
unreached();
case INS_cvtdq2pd:
case INS_cvtps2pd:
+ case INS_vpmovdw:
+ case INS_vpmovqd:
+ case INS_vpmovwb:
{
- if (defaultSize == 32)
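+                // For these instructions the memory operand is half the vector
+                // register size: cvtdq2pd/cvtps2pd read a half-width source, while
+                // the vpmov* narrowing stores write a half-width result.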
+ if (defaultSize == 64)
+ {
+ return EA_32BYTE;
+ }
+ else if (defaultSize == 32)
{
return EA_16BYTE;
}
case INS_cvttps2dq:
case INS_cvtps2dq:
case INS_cvtdq2ps:
+ case INS_vpmovdw:
+ case INS_vpmovqd:
+ case INS_vpmovwb:
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += PERFSCORE_LATENCY_4C;
break;
case NI_AVX2_ConvertToUInt32:
case NI_AVX2_ExtractVector128:
case NI_AVX512F_ExtractVector256:
+ case NI_AVX512F_ConvertToVector128Int16:
+ case NI_AVX512F_ConvertToVector128Int32:
+ case NI_AVX512F_ConvertToVector128UInt16:
+ case NI_AVX512F_ConvertToVector128UInt32:
+ case NI_AVX512F_ConvertToVector256Int16:
+ case NI_AVX512F_ConvertToVector256Int32:
+ case NI_AVX512F_ConvertToVector256UInt16:
+ case NI_AVX512F_ConvertToVector256UInt32:
+ case NI_AVX512BW_ConvertToVector128Byte:
+ case NI_AVX512BW_ConvertToVector128SByte:
+ case NI_AVX512BW_ConvertToVector256Byte:
+ case NI_AVX512BW_ConvertToVector256SByte:
{
// These HWIntrinsic operations are contained as part of a store
return true;
GenTree* tmp3;
GenTree* tmp4;
- if (simdSize == 32)
+ if (IsBaselineVector512IsaSupported())
+ {
+        // This is the same in principle as the commented examples in the
+        // branches below; however, due to code formatting, it is too long
+        // to reasonably display here.
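+        //
+        // In brief, for simdSize == 64 and simdBaseType == TYP_SHORT the
+        // expansion is equivalent to:
+        //   var tmp1 = Avx512F.ConvertToVector256Int16(op1);
+        //   var tmp2 = Avx512F.ConvertToVector256Int16(op2);
+        //   return tmp1.ToVector512Unsafe().WithUpper(tmp2);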
+
+ assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64));
+ var_types tmpSimdType = (simdSize == 64) ? TYP_SIMD32 : TYP_SIMD16;
+
+ NamedIntrinsic intrinsicId;
+ CorInfoType opBaseJitType;
+
+ switch (simdBaseType)
+ {
+ case TYP_BYTE:
+ {
+ if (simdSize == 64)
+ {
+ intrinsicId = NI_AVX512BW_ConvertToVector256SByte;
+ }
+ else
+ {
+ intrinsicId = NI_AVX512BW_ConvertToVector128SByte;
+ }
+
+ opBaseJitType = CORINFO_TYPE_SHORT;
+ break;
+ }
+
+ case TYP_UBYTE:
+ {
+ if (simdSize == 64)
+ {
+ intrinsicId = NI_AVX512BW_ConvertToVector256Byte;
+ }
+ else
+ {
+ intrinsicId = NI_AVX512BW_ConvertToVector128Byte;
+ }
+
+ opBaseJitType = CORINFO_TYPE_USHORT;
+ break;
+ }
+
+ case TYP_SHORT:
+ {
+ if (simdSize == 64)
+ {
+ intrinsicId = NI_AVX512F_ConvertToVector256Int16;
+ }
+ else
+ {
+ intrinsicId = NI_AVX512F_ConvertToVector128Int16;
+ }
+
+ opBaseJitType = CORINFO_TYPE_INT;
+ break;
+ }
+
+ case TYP_USHORT:
+ {
+ if (simdSize == 64)
+ {
+ intrinsicId = NI_AVX512F_ConvertToVector256UInt16;
+ }
+ else
+ {
+ intrinsicId = NI_AVX512F_ConvertToVector128UInt16;
+ }
+
+ opBaseJitType = CORINFO_TYPE_UINT;
+ break;
+ }
+
+ case TYP_INT:
+ {
+ if (simdSize == 64)
+ {
+ intrinsicId = NI_AVX512F_ConvertToVector256Int32;
+ }
+ else
+ {
+ intrinsicId = NI_AVX512F_ConvertToVector128Int32;
+ }
+
+ opBaseJitType = CORINFO_TYPE_LONG;
+ break;
+ }
+
+ case TYP_UINT:
+ {
+ if (simdSize == 64)
+ {
+ intrinsicId = NI_AVX512F_ConvertToVector256UInt32;
+ }
+ else
+ {
+ intrinsicId = NI_AVX512F_ConvertToVector128UInt32;
+ }
+
+ opBaseJitType = CORINFO_TYPE_ULONG;
+ break;
+ }
+
+ case TYP_FLOAT:
+ {
+ if (simdSize == 64)
+ {
+ intrinsicId = NI_AVX512F_ConvertToVector256Single;
+ }
+ else if (simdSize == 32)
+ {
+ intrinsicId = NI_AVX_ConvertToVector128Single;
+ }
+ else
+ {
+ intrinsicId = NI_SSE2_ConvertToVector128Single;
+ }
+
+ opBaseJitType = CORINFO_TYPE_DOUBLE;
+ break;
+ }
+
+ default:
+ {
+ unreached();
+ }
+ }
+
+ tmp1 = gtNewSimdHWIntrinsicNode(tmpSimdType, op1, intrinsicId, opBaseJitType, simdSize, isSimdAsHWIntrinsic);
+ tmp2 = gtNewSimdHWIntrinsicNode(tmpSimdType, op2, intrinsicId, opBaseJitType, simdSize, isSimdAsHWIntrinsic);
+
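+        // For simdSize == 16 each conversion leaves an 8-byte result in the
+        // lower half of the register, so MoveLowToHigh packs the two halves
+        // into a single Vector128.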
+ if (simdSize == 16)
+ {
+ return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE_MoveLowToHigh, CORINFO_TYPE_FLOAT, simdSize,
+ isSimdAsHWIntrinsic);
+ }
+
+ intrinsicId = (simdSize == 64) ? NI_Vector256_ToVector512Unsafe : NI_Vector128_ToVector256Unsafe;
+
+ tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, intrinsicId, simdBaseJitType, simdSize / 2, isSimdAsHWIntrinsic);
+ return gtNewSimdWithUpperNode(type, tmp1, tmp2, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
+ }
+ else if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
// var tmp2 = Avx.ConvertToVector128Single(op2);
// return Avx.InsertVector128(tmp1, tmp2, 1);
- tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_AVX_ConvertToVector128Single, simdBaseJitType,
+ CorInfoType opBaseJitType = CORINFO_TYPE_DOUBLE;
+
+ tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_AVX_ConvertToVector128Single, opBaseJitType,
simdSize, isSimdAsHWIntrinsic);
- tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_AVX_ConvertToVector128Single, simdBaseJitType,
+ tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_AVX_ConvertToVector128Single, opBaseJitType,
simdSize, isSimdAsHWIntrinsic);
tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, NI_Vector128_ToVector256Unsafe, simdBaseJitType, 16,
}
else
{
+ assert(simdSize == 16);
+
switch (simdBaseType)
{
case TYP_BYTE:
GenTree* tmp1;
#if defined(TARGET_XARCH)
- if (simdSize == 32)
+ if (simdSize == 64)
+ {
+ assert(IsBaselineVector512IsaSupportedDebugOnly());
+
+ tmp1 = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
+
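+        // Widen the lower half via the sign/zero-extending (or float->double)
+        // conversion, e.g. Vector512<short>.WidenLower is equivalent to:
+        //   Avx512F.ConvertToVector512Int32(value.GetLower())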
+ switch (simdBaseType)
+ {
+ case TYP_BYTE:
+ {
+ intrinsic = NI_AVX512BW_ConvertToVector512Int16;
+ break;
+ }
+
+ case TYP_UBYTE:
+ {
+ intrinsic = NI_AVX512BW_ConvertToVector512UInt16;
+ break;
+ }
+
+ case TYP_SHORT:
+ {
+ intrinsic = NI_AVX512F_ConvertToVector512Int32;
+ break;
+ }
+
+ case TYP_USHORT:
+ {
+ intrinsic = NI_AVX512F_ConvertToVector512UInt32;
+ break;
+ }
+
+ case TYP_INT:
+ {
+ intrinsic = NI_AVX512F_ConvertToVector512Int64;
+ break;
+ }
+
+ case TYP_UINT:
+ {
+ intrinsic = NI_AVX512F_ConvertToVector512UInt64;
+ break;
+ }
+
+ case TYP_FLOAT:
+ {
+ intrinsic = NI_AVX512F_ConvertToVector512Double;
+ break;
+ }
+
+ default:
+ {
+ unreached();
+ }
+ }
+
+ assert(intrinsic != NI_Illegal);
+ return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
+ }
+ else if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
GenTree* tmp1;
#if defined(TARGET_XARCH)
- if (simdSize == 32)
+ if (simdSize == 64)
+ {
+ assert(IsBaselineVector512IsaSupportedDebugOnly());
+
+ tmp1 = gtNewSimdGetUpperNode(TYP_SIMD32, op1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
+
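+        // Same as WidenLower above, but widening the upper 256 bits, e.g.:
+        //   Avx512F.ConvertToVector512Int32(value.GetUpper())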
+ switch (simdBaseType)
+ {
+ case TYP_BYTE:
+ {
+ intrinsic = NI_AVX512BW_ConvertToVector512Int16;
+ break;
+ }
+
+ case TYP_UBYTE:
+ {
+ intrinsic = NI_AVX512BW_ConvertToVector512UInt16;
+ break;
+ }
+
+ case TYP_SHORT:
+ {
+ intrinsic = NI_AVX512F_ConvertToVector512Int32;
+ break;
+ }
+
+ case TYP_USHORT:
+ {
+ intrinsic = NI_AVX512F_ConvertToVector512UInt32;
+ break;
+ }
+
+ case TYP_INT:
+ {
+ intrinsic = NI_AVX512F_ConvertToVector512Int64;
+ break;
+ }
+
+ case TYP_UINT:
+ {
+ intrinsic = NI_AVX512F_ConvertToVector512UInt64;
+ break;
+ }
+
+ case TYP_FLOAT:
+ {
+ intrinsic = NI_AVX512F_ConvertToVector512Double;
+ break;
+ }
+
+ default:
+ {
+ unreached();
+ }
+ }
+
+ assert(intrinsic != NI_Illegal);
+ return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
+ }
+ else if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
case InstructionSet_AVX:
case InstructionSet_AVX2:
case InstructionSet_AVX512F:
+ case InstructionSet_AVX512BW:
genAvxFamilyIntrinsic(node);
break;
case InstructionSet_AES:
break;
}
+ case NI_AVX512F_ConvertToVector128Int16:
+ case NI_AVX512F_ConvertToVector128Int32:
+ case NI_AVX512F_ConvertToVector128UInt16:
+ case NI_AVX512F_ConvertToVector128UInt32:
+ case NI_AVX512F_ConvertToVector256Int16:
+ case NI_AVX512F_ConvertToVector256Int32:
+ case NI_AVX512F_ConvertToVector256UInt16:
+ case NI_AVX512F_ConvertToVector256UInt32:
+ case NI_AVX512BW_ConvertToVector128Byte:
+ case NI_AVX512BW_ConvertToVector128SByte:
+ case NI_AVX512BW_ConvertToVector256Byte:
+ case NI_AVX512BW_ConvertToVector256SByte:
+ {
+ // These instructions are RM_R and so we need to ensure the targetReg
+ // is passed in as the RM register and op1 is passed as the R register
+
+ op1Reg = op1->GetRegNum();
+ instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
+
+ emit->emitIns_R_R(ins, attr, op1Reg, targetReg);
+ break;
+ }
+
default:
unreached();
break;
HARDWARE_INTRINSIC(Vector512, LoadAligned, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, LoadAlignedNonTemporal, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, LoadUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, Narrow, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, OnesComplement, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, op_Addition, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, op_BitwiseAnd, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative)
HARDWARE_INTRINSIC(Vector512, StoreAlignedNonTemporal, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, StoreUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, Subtract, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, WidenLower, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(Vector512, WidenUpper, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector512, WithLower, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, WithUpper, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, Xor, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(AVX, CompareUnordered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(AVX, CompareScalar, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, ConvertToVector128Single, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX, ConvertToVector128Single, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(AVX, ConvertToVector256Int32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX, ConvertToVector256Single, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2ps, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX, ConvertToVector256Double, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2pd, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2pd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX512F, And, 64, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector128Int16, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdw, INS_vpmovdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector128Int32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovqd, INS_vpmovqd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector128UInt16, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdw, INS_vpmovdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector128UInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovqd, INS_vpmovqd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector256Int16, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdw, INS_vpmovdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector256Int32, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovqd, INS_vpmovqd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector256Single, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector256UInt16, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdw, INS_vpmovdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector256UInt32, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovqd, INS_vpmovqd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector512Double, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2pd, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2pd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector512Int32, 64, 1, {INS_invalid, INS_invalid, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector512Int64, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector512UInt32, 64, 1, {INS_invalid, INS_invalid, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512F, ConvertToVector512UInt64, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(AVX512F, ExtractVector256, 64, 2, {INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextractf64x4, INS_vextractf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512, 64, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_vmovdqa64, INS_vmovdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
// ISA Function name SIMD size NumArg Instructions Category Flags
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
-// AVX512BW Intrinsics
-HARDWARE_INTRINSIC(AVX512BW, Add, 64, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX512BW, BroadcastScalarToVector512, 64, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX512BW, Subtract, 64, 2, {INS_psubb, INS_psubb, INS_psubw, INS_psubw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+//  AVX512BW Intrinsics
+HARDWARE_INTRINSIC(AVX512BW, Add, 64, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX512BW, BroadcastScalarToVector512, 64, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
+HARDWARE_INTRINSIC(AVX512BW, ConvertToVector128Byte, -1, 1, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512BW, ConvertToVector128SByte, -1, 1, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256Byte, 64, 1, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256SByte, 64, 1, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512BW, ConvertToVector512Int16, 64, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512BW, ConvertToVector512UInt16, 64, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512BW, Subtract, 64, 2, {INS_psubb, INS_psubb, INS_psubw, INS_psubw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size NumArg Instructions Category Flags
case NI_Vector128_Narrow:
case NI_Vector256_Narrow:
+ case NI_Vector512_Narrow:
{
assert(sig->numArgs == 2);
if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
{
+ assert((simdSize != 64) || IsBaselineVector512IsaSupportedDebugOnly());
+
op2 = impSIMDPopStack(retType);
op1 = impSIMDPopStack(retType);
case NI_Vector128_WidenLower:
case NI_Vector256_WidenLower:
+ case NI_Vector512_WidenLower:
{
assert(sig->numArgs == 1);
if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
{
+ assert((simdSize != 64) || IsBaselineVector512IsaSupportedDebugOnly());
+
op1 = impSIMDPopStack(retType);
retNode =
case NI_Vector128_WidenUpper:
case NI_Vector256_WidenUpper:
+ case NI_Vector512_WidenUpper:
{
assert(sig->numArgs == 1);
if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
{
+ assert((simdSize != 64) || IsBaselineVector512IsaSupportedDebugOnly());
+
op1 = impSIMDPopStack(retType);
retNode =
// These macros encode an extra byte that is implicit in the macro.
#define PACK4(byte1,byte2,byte3,byte4) (((byte1) << 16) | ((byte2) << 24) | (byte3) | ((byte4) << 8))
-#define SSE38(c) PACK4(0x66, 0x0f, 0x38, c)
-#define SSE3A(c) PACK4(0x66, 0x0f, 0x3A, c)
+
+#define PSSE38(p, c) PACK4(p, 0x0f, 0x38, c)
+#define PSSE3A(p, c) PACK4(p, 0x0f, 0x3A, c)
+
+#define SSE38(c) PSSE38(0x66, c)
+#define SSE3A(c) PSSE3A(0x66, c)
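+
+// For example, PSSE38(0xF3, 0x33) packs the F3 0F 38 33 byte sequence used by
+// the EVEX-encoded vpmovdw instruction below.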
// VEX* encodes the implied leading opcode bytes in c1:
// 1: implied 0f, 2: implied 0f 38, 3: implied 0f 3a
INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract
INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), INS_TT_NONE, REX_WX | Encoding_VEX)
#ifdef TARGET_AMD64
-INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags
+INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags
INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags
-INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags
+INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags
#endif
INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
// AVX512F
-INST3(vextractf64x4, "extractf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1B), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed double-precision floating point values
-INST3(vextracti64x4, "extracti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3B), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed quadword integer values
-INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
-INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
-INST3(vmovdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_FLAGS_None)
-INST3(vmovdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_FLAGS_None)
-INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs
-INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
-INST3(vporq, "porq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs
-INST3(vpternlogd, "pternlogd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(vpxorq, "pxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs
+INST3(vextractf64x4, "extractf64x4", IUM_WR, SSE3A(0x1B), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed double-precision floating point values
+INST3(vextracti64x4, "extracti64x4", IUM_WR, SSE3A(0x3B), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed quadword integer values
+INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
+INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
+INST3(vmovdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_FLAGS_None)
+INST3(vmovdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_FLAGS_None)
+INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs
+INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
+INST3(vpmovdw,         "pmovdw",             IUM_WR, PSSE38(0xF3, 0x33),    BAD_CODE,  PSSE38(0xF3, 0x33),  INS_TT_HALF_MEM,  Input_32Bit | REX_W0_EVEX | Encoding_EVEX)    // Narrow packed doublewords to words (truncating)
+INST3(vpmovqd,         "pmovqd",             IUM_WR, PSSE38(0xF3, 0x35),    BAD_CODE,  PSSE38(0xF3, 0x35),  INS_TT_HALF_MEM,  Input_64Bit | REX_W0_EVEX | Encoding_EVEX)    // Narrow packed quadwords to doublewords (truncating)
+INST3(vporq, "porq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs
+INST3(vpternlogd, "pternlogd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(vpxorq, "pxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs
// AVX512BW
-INST3(vmovdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX)
-INST3(vmovdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX)
-INST3(vpmovb2m, "pmovb2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_8Bit | REX_W0_EVEX | Encoding_EVEX)
-INST3(vpmovw2m, "pmovw2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_16Bit | REX_W1_EVEX | Encoding_EVEX)
+INST3(vmovdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vmovdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX)
+INST3(vpmovb2m, "pmovb2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x29), INS_TT_NONE, Input_8Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vpmovw2m, "pmovw2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x29), INS_TT_NONE, Input_16Bit | REX_W1_EVEX | Encoding_EVEX)
+INST3(vpmovwb,         "pmovwb",             IUM_WR, PSSE38(0xF3, 0x30),    BAD_CODE,  PSSE38(0xF3, 0x30),  INS_TT_HALF_MEM,  Input_16Bit | REX_W0_EVEX | Encoding_EVEX)    // Narrow packed words to bytes (truncating)
// AVX512DQ
-INST3(vextractf32x8, "extractf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1B), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed double-precision floating point values
-INST3(vextracti32x8, "extracti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3B), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed quadword integer values
-INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
-INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
-INST3(vpmovd2m, "pmovd2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_32Bit | REX_W0_EVEX | Encoding_EVEX)
-INST3(vpmovq2m, "pmovq2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_64Bit | REX_W1_EVEX | Encoding_EVEX)
+INST3(vextractf32x8, "extractf32x8", IUM_WR, SSE3A(0x1B), BAD_CODE, BAD_CODE, INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed double-precision floating point values
+INST3(vextracti32x8, "extracti32x8", IUM_WR, SSE3A(0x3B), BAD_CODE, BAD_CODE, INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed quadword integer values
+INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
+INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
+INST3(vpmovd2m, "pmovd2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x39), INS_TT_NONE, Input_32Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vpmovq2m, "pmovq2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x39), INS_TT_NONE, Input_64Bit | REX_W1_EVEX | Encoding_EVEX)
INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
// Scalar instructions in SSE4.2
-INST3(crc32, "crc32", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF0), INS_TT_NONE, INS_FLAGS_None)
+INST3(crc32, "crc32", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF0), INS_TT_NONE, INS_FLAGS_None)
// BMI1
INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBC), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Count the Number of Trailing Zero Bits
break;
}
+ case NI_AVX512F_ConvertToVector128Int16:
+ case NI_AVX512F_ConvertToVector128Int32:
+ case NI_AVX512F_ConvertToVector128UInt16:
+ case NI_AVX512F_ConvertToVector128UInt32:
+ case NI_AVX512F_ConvertToVector256Int16:
+ case NI_AVX512F_ConvertToVector256Int32:
+ case NI_AVX512F_ConvertToVector256UInt16:
+ case NI_AVX512F_ConvertToVector256UInt32:
+ case NI_AVX512BW_ConvertToVector128Byte:
+ case NI_AVX512BW_ConvertToVector128SByte:
+ case NI_AVX512BW_ConvertToVector256Byte:
+ case NI_AVX512BW_ConvertToVector256SByte:
+ {
+ // These intrinsics are "ins reg/mem, xmm"
+ unsigned simdSize = hwintrinsic->GetSimdSize();
+
+ if (simdSize == 16)
+ {
+ // For TYP_SIMD16, we produce a TYP_SIMD16 register
+ // but only store TYP_SIMD8 to memory and so we cannot
+ // contain without additional work.
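+                    // (a 16-byte store cannot be satisfied by the instruction's
+                    // 8-byte memory form).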
+ isContainable = false;
+ }
+ else
+ {
+ assert((simdSize == 32) || (simdSize == 64));
+ isContainable = true;
+ }
+ break;
+ }
+
default:
{
break;
case NI_AVX_ExtractVector128:
case NI_AVX2_ExtractVector128:
case NI_AVX512F_ExtractVector256:
+ case NI_AVX512F_ConvertToVector128Int16:
+ case NI_AVX512F_ConvertToVector128Int32:
+ case NI_AVX512F_ConvertToVector128UInt16:
+ case NI_AVX512F_ConvertToVector128UInt32:
+ case NI_AVX512F_ConvertToVector256Int16:
+ case NI_AVX512F_ConvertToVector256Int32:
+ case NI_AVX512F_ConvertToVector256UInt16:
+ case NI_AVX512F_ConvertToVector256UInt32:
+ case NI_AVX512BW_ConvertToVector128Byte:
+ case NI_AVX512BW_ConvertToVector128SByte:
+ case NI_AVX512BW_ConvertToVector256Byte:
+ case NI_AVX512BW_ConvertToVector256SByte:
{
// These are only containable as part of a store
return false;
break;
}
+ case NI_AVX512F_ConvertToVector128Int16:
+ case NI_AVX512F_ConvertToVector128Int32:
+ case NI_AVX512F_ConvertToVector128UInt16:
+ case NI_AVX512F_ConvertToVector128UInt32:
+ case NI_AVX512F_ConvertToVector256Int16:
+ case NI_AVX512F_ConvertToVector256Int32:
+ case NI_AVX512F_ConvertToVector256UInt16:
+ case NI_AVX512F_ConvertToVector256UInt32:
+ case NI_AVX512BW_ConvertToVector128Byte:
+ case NI_AVX512BW_ConvertToVector128SByte:
+ case NI_AVX512BW_ConvertToVector256Byte:
+ case NI_AVX512BW_ConvertToVector256SByte:
+ {
+ // These intrinsics are "ins reg/mem, xmm" and get
+ // contained by the relevant store operation instead.
+ return;
+ }
+
default:
{
break;