Updating the JIT to support Narrow, WidenLower, and WidenUpper for Vector512 (#84498)
authorTanner Gooding <tagoo@outlook.com>
Sun, 9 Apr 2023 04:09:45 +0000 (21:09 -0700)
committerGitHub <noreply@github.com>
Sun, 9 Apr 2023 04:09:45 +0000 (21:09 -0700)
* Updating the JIT to support Narrow, WidenLower, and WidenUpper for Vector512

* Ensure tgtReg is passed in the RM slot for the AVX512 narrowing instructions

* Applying formatting patch

* Update src/coreclr/jit/lowerxarch.cpp

src/coreclr/jit/codegenxarch.cpp
src/coreclr/jit/emit.h
src/coreclr/jit/emitxarch.cpp
src/coreclr/jit/gentree.cpp
src/coreclr/jit/hwintrinsiccodegenxarch.cpp
src/coreclr/jit/hwintrinsiclistxarch.h
src/coreclr/jit/hwintrinsicxarch.cpp
src/coreclr/jit/instrsxarch.h
src/coreclr/jit/lowerxarch.cpp

index 7dee61b6177af4f941befeaf331d48b2f6ce476c..898d3ea312376d021da6770aa0f21d9695fed763 100644 (file)
@@ -5649,6 +5649,25 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
                             break;
                         }
 
+                        case NI_AVX512F_ConvertToVector128Int16:
+                        case NI_AVX512F_ConvertToVector128Int32:
+                        case NI_AVX512F_ConvertToVector128UInt16:
+                        case NI_AVX512F_ConvertToVector128UInt32:
+                        case NI_AVX512F_ConvertToVector256Int16:
+                        case NI_AVX512F_ConvertToVector256Int32:
+                        case NI_AVX512F_ConvertToVector256UInt16:
+                        case NI_AVX512F_ConvertToVector256UInt32:
+                        case NI_AVX512BW_ConvertToVector128Byte:
+                        case NI_AVX512BW_ConvertToVector128SByte:
+                        case NI_AVX512BW_ConvertToVector256Byte:
+                        case NI_AVX512BW_ConvertToVector256SByte:
+                        {
+                            // These intrinsics are "ins reg/mem, xmm"
+                            ins  = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
+                            attr = emitActualTypeSize(Compiler::getSIMDTypeForSize(hwintrinsic->GetSimdSize()));
+                            break;
+                        }
+
                         default:
                         {
                             unreached();
index fefbc7e514dcbffc3919e89afb0c000072c821ba..33f3570deae2e29587d9d4d004d8ec7b8d8a6175 100644 (file)
@@ -3570,8 +3570,15 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
 
         case INS_cvtdq2pd:
         case INS_cvtps2pd:
+        case INS_vpmovdw:
+        case INS_vpmovqd:
+        case INS_vpmovwb:
         {
-            if (defaultSize == 32)
+            if (defaultSize == 64)
+            {
+                return EA_32BYTE;
+            }
+            else if (defaultSize == 32)
             {
                 return EA_16BYTE;
             }
index 88150b42d8a475f113d19483b6147681eeaf2a4a..359571e686f27e94eca2580abd5782176c2b4ab4 100644 (file)
@@ -17704,6 +17704,9 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
         case INS_cvttps2dq:
         case INS_cvtps2dq:
         case INS_cvtdq2ps:
+        case INS_vpmovdw:
+        case INS_vpmovqd:
+        case INS_vpmovwb:
             result.insThroughput = PERFSCORE_THROUGHPUT_2X;
             result.insLatency += PERFSCORE_LATENCY_4C;
             break;
index b7e875b844e5408f8c5e3ca71f342b021896cede..bdefd38fb427981eba476fd9b19c61a612f6178e 100644 (file)
@@ -19129,6 +19129,18 @@ bool GenTree::isContainableHWIntrinsic() const
         case NI_AVX2_ConvertToUInt32:
         case NI_AVX2_ExtractVector128:
         case NI_AVX512F_ExtractVector256:
+        case NI_AVX512F_ConvertToVector128Int16:
+        case NI_AVX512F_ConvertToVector128Int32:
+        case NI_AVX512F_ConvertToVector128UInt16:
+        case NI_AVX512F_ConvertToVector128UInt32:
+        case NI_AVX512F_ConvertToVector256Int16:
+        case NI_AVX512F_ConvertToVector256Int32:
+        case NI_AVX512F_ConvertToVector256UInt16:
+        case NI_AVX512F_ConvertToVector256UInt32:
+        case NI_AVX512BW_ConvertToVector128Byte:
+        case NI_AVX512BW_ConvertToVector128SByte:
+        case NI_AVX512BW_ConvertToVector256Byte:
+        case NI_AVX512BW_ConvertToVector256SByte:
         {
             // These HWIntrinsic operations are contained as part of a store
             return true;
@@ -22393,7 +22405,149 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types   type,
     GenTree* tmp3;
     GenTree* tmp4;
 
-    if (simdSize == 32)
+    if (IsBaselineVector512IsaSupported())
+    {
+        // This is the same in principle to the other comments below, however due to
+        // code formatting, its too long to reasonably display here.
+
+        assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64));
+        var_types tmpSimdType = (simdSize == 64) ? TYP_SIMD32 : TYP_SIMD16;
+
+        NamedIntrinsic intrinsicId;
+        CorInfoType    opBaseJitType;
+
+        switch (simdBaseType)
+        {
+            case TYP_BYTE:
+            {
+                if (simdSize == 64)
+                {
+                    intrinsicId = NI_AVX512BW_ConvertToVector256SByte;
+                }
+                else
+                {
+                    intrinsicId = NI_AVX512BW_ConvertToVector128SByte;
+                }
+
+                opBaseJitType = CORINFO_TYPE_SHORT;
+                break;
+            }
+
+            case TYP_UBYTE:
+            {
+                if (simdSize == 64)
+                {
+                    intrinsicId = NI_AVX512BW_ConvertToVector256Byte;
+                }
+                else
+                {
+                    intrinsicId = NI_AVX512BW_ConvertToVector128Byte;
+                }
+
+                opBaseJitType = CORINFO_TYPE_USHORT;
+                break;
+            }
+
+            case TYP_SHORT:
+            {
+                if (simdSize == 64)
+                {
+                    intrinsicId = NI_AVX512F_ConvertToVector256Int16;
+                }
+                else
+                {
+                    intrinsicId = NI_AVX512F_ConvertToVector128Int16;
+                }
+
+                opBaseJitType = CORINFO_TYPE_INT;
+                break;
+            }
+
+            case TYP_USHORT:
+            {
+                if (simdSize == 64)
+                {
+                    intrinsicId = NI_AVX512F_ConvertToVector256UInt16;
+                }
+                else
+                {
+                    intrinsicId = NI_AVX512F_ConvertToVector128UInt16;
+                }
+
+                opBaseJitType = CORINFO_TYPE_UINT;
+                break;
+            }
+
+            case TYP_INT:
+            {
+                if (simdSize == 64)
+                {
+                    intrinsicId = NI_AVX512F_ConvertToVector256Int32;
+                }
+                else
+                {
+                    intrinsicId = NI_AVX512F_ConvertToVector128Int32;
+                }
+
+                opBaseJitType = CORINFO_TYPE_LONG;
+                break;
+            }
+
+            case TYP_UINT:
+            {
+                if (simdSize == 64)
+                {
+                    intrinsicId = NI_AVX512F_ConvertToVector256UInt32;
+                }
+                else
+                {
+                    intrinsicId = NI_AVX512F_ConvertToVector128UInt32;
+                }
+
+                opBaseJitType = CORINFO_TYPE_ULONG;
+                break;
+            }
+
+            case TYP_FLOAT:
+            {
+                if (simdSize == 64)
+                {
+                    intrinsicId = NI_AVX512F_ConvertToVector256Single;
+                }
+                else if (simdSize == 32)
+                {
+                    intrinsicId = NI_AVX_ConvertToVector128Single;
+                }
+                else
+                {
+                    intrinsicId = NI_SSE2_ConvertToVector128Single;
+                }
+
+                opBaseJitType = CORINFO_TYPE_DOUBLE;
+                break;
+            }
+
+            default:
+            {
+                unreached();
+            }
+        }
+
+        tmp1 = gtNewSimdHWIntrinsicNode(tmpSimdType, op1, intrinsicId, opBaseJitType, simdSize, isSimdAsHWIntrinsic);
+        tmp2 = gtNewSimdHWIntrinsicNode(tmpSimdType, op2, intrinsicId, opBaseJitType, simdSize, isSimdAsHWIntrinsic);
+
+        if (simdSize == 16)
+        {
+            return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE_MoveLowToHigh, CORINFO_TYPE_FLOAT, simdSize,
+                                            isSimdAsHWIntrinsic);
+        }
+
+        intrinsicId = (simdSize == 64) ? NI_Vector256_ToVector512Unsafe : NI_Vector128_ToVector256Unsafe;
+
+        tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, intrinsicId, simdBaseJitType, simdSize / 2, isSimdAsHWIntrinsic);
+        return gtNewSimdWithUpperNode(type, tmp1, tmp2, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
+    }
+    else if (simdSize == 32)
     {
         assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
 
@@ -22518,9 +22672,11 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types   type,
                 // var tmp2 = Avx.ConvertToVector128Single(op2);
                 // return Avx.InsertVector128(tmp1, tmp2, 1);
 
-                tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_AVX_ConvertToVector128Single, simdBaseJitType,
+                CorInfoType opBaseJitType = CORINFO_TYPE_DOUBLE;
+
+                tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_AVX_ConvertToVector128Single, opBaseJitType,
                                                 simdSize, isSimdAsHWIntrinsic);
-                tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_AVX_ConvertToVector128Single, simdBaseJitType,
+                tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_AVX_ConvertToVector128Single, opBaseJitType,
                                                 simdSize, isSimdAsHWIntrinsic);
 
                 tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, NI_Vector128_ToVector256Unsafe, simdBaseJitType, 16,
@@ -22536,6 +22692,8 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types   type,
     }
     else
     {
+        assert(simdSize == 16);
+
         switch (simdBaseType)
         {
             case TYP_BYTE:
@@ -23521,7 +23679,66 @@ GenTree* Compiler::gtNewSimdWidenLowerNode(
     GenTree* tmp1;
 
 #if defined(TARGET_XARCH)
-    if (simdSize == 32)
+    if (simdSize == 64)
+    {
+        assert(IsBaselineVector512IsaSupportedDebugOnly());
+
+        tmp1 = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
+
+        switch (simdBaseType)
+        {
+            case TYP_BYTE:
+            {
+                intrinsic = NI_AVX512BW_ConvertToVector512Int16;
+                break;
+            }
+
+            case TYP_UBYTE:
+            {
+                intrinsic = NI_AVX512BW_ConvertToVector512UInt16;
+                break;
+            }
+
+            case TYP_SHORT:
+            {
+                intrinsic = NI_AVX512F_ConvertToVector512Int32;
+                break;
+            }
+
+            case TYP_USHORT:
+            {
+                intrinsic = NI_AVX512F_ConvertToVector512UInt32;
+                break;
+            }
+
+            case TYP_INT:
+            {
+                intrinsic = NI_AVX512F_ConvertToVector512Int64;
+                break;
+            }
+
+            case TYP_UINT:
+            {
+                intrinsic = NI_AVX512F_ConvertToVector512UInt64;
+                break;
+            }
+
+            case TYP_FLOAT:
+            {
+                intrinsic = NI_AVX512F_ConvertToVector512Double;
+                break;
+            }
+
+            default:
+            {
+                unreached();
+            }
+        }
+
+        assert(intrinsic != NI_Illegal);
+        return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
+    }
+    else if (simdSize == 32)
     {
         assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
         assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
@@ -23681,7 +23898,66 @@ GenTree* Compiler::gtNewSimdWidenUpperNode(
     GenTree* tmp1;
 
 #if defined(TARGET_XARCH)
-    if (simdSize == 32)
+    if (simdSize == 64)
+    {
+        assert(IsBaselineVector512IsaSupportedDebugOnly());
+
+        tmp1 = gtNewSimdGetUpperNode(TYP_SIMD32, op1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
+
+        switch (simdBaseType)
+        {
+            case TYP_BYTE:
+            {
+                intrinsic = NI_AVX512BW_ConvertToVector512Int16;
+                break;
+            }
+
+            case TYP_UBYTE:
+            {
+                intrinsic = NI_AVX512BW_ConvertToVector512UInt16;
+                break;
+            }
+
+            case TYP_SHORT:
+            {
+                intrinsic = NI_AVX512F_ConvertToVector512Int32;
+                break;
+            }
+
+            case TYP_USHORT:
+            {
+                intrinsic = NI_AVX512F_ConvertToVector512UInt32;
+                break;
+            }
+
+            case TYP_INT:
+            {
+                intrinsic = NI_AVX512F_ConvertToVector512Int64;
+                break;
+            }
+
+            case TYP_UINT:
+            {
+                intrinsic = NI_AVX512F_ConvertToVector512UInt64;
+                break;
+            }
+
+            case TYP_FLOAT:
+            {
+                intrinsic = NI_AVX512F_ConvertToVector512Double;
+                break;
+            }
+
+            default:
+            {
+                unreached();
+            }
+        }
+
+        assert(intrinsic != NI_Illegal);
+        return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
+    }
+    else if (simdSize == 32)
     {
         assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
         assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
index 7b344ee615203c200f2f0ac2f48a283f9997e874..9448ba8b8d148d064057d2f44d3e80a56bf64307 100644 (file)
@@ -386,6 +386,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
         case InstructionSet_AVX:
         case InstructionSet_AVX2:
         case InstructionSet_AVX512F:
+        case InstructionSet_AVX512BW:
             genAvxFamilyIntrinsic(node);
             break;
         case InstructionSet_AES:
@@ -1746,6 +1747,29 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
             break;
         }
 
+        case NI_AVX512F_ConvertToVector128Int16:
+        case NI_AVX512F_ConvertToVector128Int32:
+        case NI_AVX512F_ConvertToVector128UInt16:
+        case NI_AVX512F_ConvertToVector128UInt32:
+        case NI_AVX512F_ConvertToVector256Int16:
+        case NI_AVX512F_ConvertToVector256Int32:
+        case NI_AVX512F_ConvertToVector256UInt16:
+        case NI_AVX512F_ConvertToVector256UInt32:
+        case NI_AVX512BW_ConvertToVector128Byte:
+        case NI_AVX512BW_ConvertToVector128SByte:
+        case NI_AVX512BW_ConvertToVector256Byte:
+        case NI_AVX512BW_ConvertToVector256SByte:
+        {
+            // These instructions are RM_R and so we need to ensure the targetReg
+            // is passed in as the RM register and op1 is passed as the R register
+
+            op1Reg          = op1->GetRegNum();
+            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
+
+            emit->emitIns_R_R(ins, attr, op1Reg, targetReg);
+            break;
+        }
+
         default:
             unreached();
             break;
index 43e8841eddcd21c759fde7d3c745279577e8d7e4..53ffea32e6f514d5822152fb47db876283bbe489 100644 (file)
@@ -274,6 +274,7 @@ HARDWARE_INTRINSIC(Vector512,       Load,
 HARDWARE_INTRINSIC(Vector512,       LoadAligned,                                64,             1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512,       LoadAlignedNonTemporal,                     64,             1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512,       LoadUnsafe,                                 64,            -1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512,       Narrow,                                     64,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512,       OnesComplement,                             64,             1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512,       op_Addition,                                64,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512,       op_BitwiseAnd,                              64,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative)
@@ -286,6 +287,8 @@ HARDWARE_INTRINSIC(Vector512,       StoreAligned,
 HARDWARE_INTRINSIC(Vector512,       StoreAlignedNonTemporal,                    64,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512,       StoreUnsafe,                                64,            -1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512,       Subtract,                                   64,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512,       WidenLower,                                 64,             1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(Vector512,       WidenUpper,                                 64,             1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector512,       WithLower,                                  64,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512,       WithUpper,                                  64,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512,       Xor,                                        64,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
@@ -654,7 +657,7 @@ HARDWARE_INTRINSIC(AVX,             CompareOrdered,
 HARDWARE_INTRINSIC(AVX,             CompareUnordered,                           32,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cmpps,              INS_cmppd},             HW_Category_SimpleSIMD,             HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX,             CompareScalar,                              16,              3,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cmpss,              INS_cmpsd},             HW_Category_IMM,                    HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX,             ConvertToVector128Int32,                    32,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtpd2dq,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX,             ConvertToVector128Single,                   32,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtpd2ps,           INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX,             ConvertToVector128Single,                   32,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtpd2ps},          HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX,             ConvertToVector256Int32,                    32,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtps2dq,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX,             ConvertToVector256Single,                   32,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtdq2ps,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX,             ConvertToVector256Double,                   32,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtdq2pd,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtps2pd,           INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
@@ -783,6 +786,20 @@ HARDWARE_INTRINSIC(AVX512F,         Add,
 HARDWARE_INTRINSIC(AVX512F,         And,                                        64,              2,     {INS_pand,              INS_pand,               INS_pand,               INS_pand,               INS_pand,               INS_pand,               INS_vpandq,             INS_vpandq,             INS_andps,              INS_andpd},             HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX512F,         AndNot,                                     64,              2,     {INS_pandn,             INS_pandn,              INS_pandn,              INS_pandn,              INS_pandn,              INS_pandn,              INS_vpandnq,            INS_vpandnq,            INS_andnps,             INS_andnpd},            HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX512F,         BroadcastScalarToVector512,                 64,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpbroadcastd,       INS_vpbroadcastd,       INS_vpbroadcastq,       INS_vpbroadcastq,       INS_vbroadcastss,       INS_vbroadcastsd},      HW_Category_SIMDScalar,             HW_Flag_MaybeMemoryLoad)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector128Int16,                    -1,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpmovdw,            INS_vpmovdw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector128Int32,                    -1,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vpmovqd,            INS_vpmovqd,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector128UInt16,                   -1,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpmovdw,            INS_vpmovdw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector128UInt32,                   -1,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vpmovqd,            INS_vpmovqd,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector256Int16,                    64,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpmovdw,            INS_vpmovdw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector256Int32,                    64,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vpmovqd,            INS_vpmovqd,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector256Single,                   64,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtpd2ps},          HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector256UInt16,                   64,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpmovdw,            INS_vpmovdw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector256UInt32,                   64,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vpmovqd,            INS_vpmovqd,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector512Double,                   64,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtdq2pd,           INS_invalid,            INS_invalid,            INS_invalid,            INS_cvtps2pd,           INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector512Int32,                    64,              1,     {INS_invalid,           INS_invalid,            INS_pmovsxwd,           INS_pmovzxwd,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector512Int64,                    64,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_pmovsxdq,           INS_pmovzxdq,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector512UInt32,                   64,              1,     {INS_invalid,           INS_invalid,            INS_pmovsxwd,           INS_pmovzxwd,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512F,         ConvertToVector512UInt64,                   64,              1,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_pmovsxdq,           INS_pmovzxdq,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX512F,         ExtractVector256,                           64,              2,     {INS_vextracti64x4,     INS_vextracti64x4,      INS_vextracti64x4,      INS_vextracti64x4,      INS_vextracti64x4,      INS_vextracti64x4,      INS_vextracti64x4,      INS_vextracti64x4,      INS_vextractf64x4,      INS_vextractf64x4},     HW_Category_IMM,                    HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX512F,         InsertVector256,                            64,              3,     {INS_vinserti64x4,      INS_vinserti64x4,       INS_vinserti64x4,       INS_vinserti64x4,       INS_vinserti64x4,       INS_vinserti64x4,       INS_vinserti64x4,       INS_vinserti64x4,       INS_vinsertf64x4,       INS_vinsertf64x4},      HW_Category_IMM,                    HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX512F,         LoadAlignedVector512,                       64,              1,     {INS_movdqa,            INS_movdqa,             INS_movdqa,             INS_movdqa,             INS_movdqa,             INS_movdqa,             INS_vmovdqa64,          INS_vmovdqa64,          INS_movaps,             INS_movapd},            HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
@@ -799,10 +816,16 @@ HARDWARE_INTRINSIC(AVX512F,         Xor,
 //                 ISA              Function name                               SIMD size       NumArg                                                                                                         Instructions                                                                                                                             Category                            Flags
 //                                                                                                      {TYP_BYTE,              TYP_UBYTE,              TYP_SHORT,              TYP_USHORT,             TYP_INT,                TYP_UINT,               TYP_LONG,               TYP_ULONG,              TYP_FLOAT,              TYP_DOUBLE}
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
-//  AVX512BW Intrinsics
-HARDWARE_INTRINSIC(AVX512BW,         Add,                                        64,              2,     {INS_paddb,            INS_paddb,              INS_paddw,              INS_paddw,              INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX512BW,         BroadcastScalarToVector512,                 64,              1,     {INS_vpbroadcastb,     INS_vpbroadcastb,       INS_vpbroadcastw,       INS_vpbroadcastw,       INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX512BW,         Subtract,                                   64,              2,     {INS_psubb,            INS_psubb,              INS_psubw,              INS_psubw,              INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
+//  AVX512F Intrinsics
+HARDWARE_INTRINSIC(AVX512BW,        Add,                                        64,              2,     {INS_paddb,            INS_paddb,              INS_paddw,              INS_paddw,              INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX512BW,        BroadcastScalarToVector512,                 64,              1,     {INS_vpbroadcastb,      INS_vpbroadcastb,       INS_vpbroadcastw,       INS_vpbroadcastw,       INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MaybeMemoryLoad)
+HARDWARE_INTRINSIC(AVX512BW,        ConvertToVector128Byte,                     -1,              1,     {INS_invalid,           INS_invalid,            INS_vpmovwb,            INS_vpmovwb,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512BW,        ConvertToVector128SByte,                    -1,              1,     {INS_invalid,           INS_invalid,            INS_vpmovwb,            INS_vpmovwb,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512BW,        ConvertToVector256Byte,                     64,              1,     {INS_invalid,           INS_invalid,            INS_vpmovwb,            INS_vpmovwb,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512BW,        ConvertToVector256SByte,                    64,              1,     {INS_invalid,           INS_invalid,            INS_vpmovwb,            INS_vpmovwb,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512BW,        ConvertToVector512Int16,                    64,              1,     {INS_pmovsxbw,          INS_pmovzxbw,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512BW,        ConvertToVector512UInt16,                   64,              1,     {INS_pmovsxbw,          INS_pmovzxbw,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX512BW,        Subtract,                                   64,              2,     {INS_psubb,            INS_psubb,              INS_psubw,              INS_psubw,              INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //                 ISA              Function name                               SIMD size       NumArg                                                                                                         Instructions                                                                                                                             Category                            Flags
index b26ba16adb44e4f71ded0f419b4164acfd9b932e..961c7741ade211e8ab234861422b9a3c70f00d3f 100644 (file)
@@ -1984,11 +1984,14 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic        intrinsic,
 
         case NI_Vector128_Narrow:
         case NI_Vector256_Narrow:
+        case NI_Vector512_Narrow:
         {
             assert(sig->numArgs == 2);
 
             if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
             {
+                assert((simdSize != 64) || IsBaselineVector512IsaSupportedDebugOnly());
+
                 op2 = impSIMDPopStack(retType);
                 op1 = impSIMDPopStack(retType);
 
@@ -2485,11 +2488,14 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic        intrinsic,
 
         case NI_Vector128_WidenLower:
         case NI_Vector256_WidenLower:
+        case NI_Vector512_WidenLower:
         {
             assert(sig->numArgs == 1);
 
             if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
             {
+                assert((simdSize != 64) || IsBaselineVector512IsaSupportedDebugOnly());
+
                 op1 = impSIMDPopStack(retType);
 
                 retNode =
@@ -2500,11 +2506,14 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic        intrinsic,
 
         case NI_Vector128_WidenUpper:
         case NI_Vector256_WidenUpper:
+        case NI_Vector512_WidenUpper:
         {
             assert(sig->numArgs == 1);
 
             if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
             {
+                assert((simdSize != 64) || IsBaselineVector512IsaSupportedDebugOnly());
+
                 op1 = impSIMDPopStack(retType);
 
                 retNode =
index 3253471c94550ec02e4b035009bffc718795ab59..207525db40a503fb702a370c46e41973b483d508 100644 (file)
@@ -175,8 +175,12 @@ INSTMUL(imul_15,        "imul",             IUM_RD, BAD_CODE,     0x4400003868,
 
 // These macros encode extra byte that is implicit in the macro.
 #define PACK4(byte1,byte2,byte3,byte4) (((byte1) << 16) | ((byte2) << 24) | (byte3) | ((byte4) << 8))
-#define SSE38(c)   PACK4(0x66, 0x0f, 0x38, c)
-#define SSE3A(c)   PACK4(0x66, 0x0f, 0x3A, c)
+
+#define PSSE38(p, c)     PACK4(p, 0x0f, 0x38, c)
+#define PSSE3A(p, c)     PACK4(p, 0x0f, 0x3A, c)
+
+#define SSE38(c)         PSSE38(0x66, c)
+#define SSE3A(c)         PSSE3A(0x66, c)
 
 // VEX* encodes the implied leading opcode bytes in c1:
 // 1: implied 0f, 2: implied 0f 38, 3: implied 0f 3a
@@ -592,9 +596,9 @@ INST3(pdep,             "pdep",             IUM_WR, BAD_CODE,     BAD_CODE,
 INST3(pext,             "pext",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF5),                             INS_TT_NONE,                                          REX_WX       | Encoding_VEX                   | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Parallel Bits Extract
 INST3(rorx,             "rorx",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0xF0),                             INS_TT_NONE,                                          REX_WX       | Encoding_VEX)
 #ifdef TARGET_AMD64
-INST3(sarx,             "sarx",             IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF3, 0x0F, 0x38, 0xF7),           INS_TT_NONE,                                          REX_WX       | Encoding_VEX                   | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           //  Shift Arithmetic Right Without Affecting Flags
+INST3(sarx,             "sarx",             IUM_WR, BAD_CODE,     BAD_CODE,     PSSE38(0xF3, 0xF7),                      INS_TT_NONE,                                          REX_WX       | Encoding_VEX                   | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           //  Shift Arithmetic Right Without Affecting Flags
 INST3(shlx,             "shlx",             IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0xF7),                             INS_TT_NONE,                                          REX_WX       | Encoding_VEX                   | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           //  Shift Logical Left Without Affecting Flags
-INST3(shrx,             "shrx",             IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF2, 0x0F, 0x38, 0xF7),           INS_TT_NONE,                                          REX_WX       | Encoding_VEX                   | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           //  Shift Logical Right Without Affecting Flags
+INST3(shrx,             "shrx",             IUM_WR, BAD_CODE,     BAD_CODE,     PSSE38(0xF2, 0xF7),                      INS_TT_NONE,                                          REX_WX       | Encoding_VEX                   | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           //  Shift Logical Right Without Affecting Flags
 #endif
 
 INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
@@ -618,36 +622,39 @@ INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE,
 INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 // AVX512F
-INST3(vextractf64x4,     "extractf64x4",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x1B),                             INS_TT_TUPLE4,                       Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Extract 256-bit packed double-precision floating point values
-INST3(vextracti64x4,     "extracti64x4",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x3B),                             INS_TT_TUPLE4,                       Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Extract 256-bit packed quadword integer values
-INST3(vinsertf64x4,      "insertf64x4",     IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x1A),                             INS_TT_TUPLE4,                       Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Insert 256-bit packed double-precision floating point values
-INST3(vinserti64x4,      "inserti64x4",     IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x3A),                             INS_TT_TUPLE4,                       Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Insert 256-bit packed quadword integer values
-INST3(vmovdqa64,         "movdqa64",        IUM_WR, PCKDBL(0x7F), BAD_CODE,     PCKDBL(0x6F),                            INS_TT_FULL_MEM,                     Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_FLAGS_None)
-INST3(vmovdqu64,         "movdqu64",        IUM_WR, SSEFLT(0x7F), BAD_CODE,     SSEFLT(0x6F),                            INS_TT_FULL_MEM,                     Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_FLAGS_None)
-INST3(vpandq,            "pandq",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDB),                            INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Packed bit-wise AND of two xmm regs
-INST3(vpandnq,           "pandnq",          IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xDF),                            INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Packed bit-wise AND NOT of two xmm regs
-INST3(vporq,             "porq",            IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xEB),                            INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Packed bit-wise OR of two xmm regs
-INST3(vpternlogd,        "pternlogd",       IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x25),                             INS_TT_FULL,                         Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(vpxorq,            "pxorq",           IUM_WR, BAD_CODE,     BAD_CODE,     PCKDBL(0xEF),                            INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Packed bit-wise XOR of two xmm regs
+INST3(vextractf64x4,     "extractf64x4",    IUM_WR, SSE3A(0x1B),            BAD_CODE,     BAD_CODE,                      INS_TT_TUPLE4,                       Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Extract 256-bit packed double-precision floating point values
+INST3(vextracti64x4,     "extracti64x4",    IUM_WR, SSE3A(0x3B),            BAD_CODE,     BAD_CODE,                      INS_TT_TUPLE4,                       Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Extract 256-bit packed quadword integer values
+INST3(vinsertf64x4,      "insertf64x4",     IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x1A),                   INS_TT_TUPLE4,                       Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Insert 256-bit packed double-precision floating point values
+INST3(vinserti64x4,      "inserti64x4",     IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3A),                   INS_TT_TUPLE4,                       Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Insert 256-bit packed quadword integer values
+INST3(vmovdqa64,         "movdqa64",        IUM_WR, PCKDBL(0x7F),           BAD_CODE,     PCKDBL(0x6F),                  INS_TT_FULL_MEM,                     Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_FLAGS_None)
+INST3(vmovdqu64,         "movdqu64",        IUM_WR, SSEFLT(0x7F),           BAD_CODE,     SSEFLT(0x6F),                  INS_TT_FULL_MEM,                     Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_FLAGS_None)
+INST3(vpandq,            "pandq",           IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0xDB),                  INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Packed bit-wise AND of two xmm regs
+INST3(vpandnq,           "pandnq",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0xDF),                  INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Packed bit-wise AND NOT of two xmm regs
+INST3(vpmovdw,           "pmovdw",          IUM_WR, PSSE38(0xF3, 0x33),     BAD_CODE,     PSSE38(0xF3, 0x33),            INS_TT_HALF_MEM,                     Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX)
+INST3(vpmovqd,           "pmovqd",          IUM_WR, PSSE38(0xF3, 0x35),     BAD_CODE,     PSSE38(0xF3, 0x35),            INS_TT_HALF_MEM,                     Input_64Bit    | REX_W0_EVEX                  | Encoding_EVEX)
+INST3(vporq,             "porq",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0xEB),                  INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Packed bit-wise OR of two xmm regs
+INST3(vpternlogd,        "pternlogd",       IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x25),                   INS_TT_FULL,                         Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(vpxorq,            "pxorq",           IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0xEF),                  INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Packed bit-wise XOR of two xmm regs
 
 // AVX512BW
-INST3(vmovdqu8,          "movdqu8",         IUM_WR, SSEFLT(0x7F), BAD_CODE,     SSEFLT(0x6F),                            INS_TT_FULL_MEM,                     Input_8Bit     | REX_W0_EVEX                  | Encoding_EVEX)
-INST3(vmovdqu16,         "movdqu16",        IUM_WR, SSEFLT(0x7F), BAD_CODE,     SSEFLT(0x6F),                            INS_TT_FULL_MEM,                     Input_16Bit    | REX_W1_EVEX                  | Encoding_EVEX)
-INST3(vpmovb2m,          "pmovb2m",         IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF3, 0x0F, 0x38, 0x29),           INS_TT_NONE,                         Input_8Bit     | REX_W0_EVEX                  | Encoding_EVEX)
-INST3(vpmovw2m,          "pmovw2m",         IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF3, 0x0F, 0x38, 0x29),           INS_TT_NONE,                         Input_16Bit    | REX_W1_EVEX                  | Encoding_EVEX)
+INST3(vmovdqu8,          "movdqu8",         IUM_WR, SSEFLT(0x7F),           BAD_CODE,     SSEFLT(0x6F),                  INS_TT_FULL_MEM,                     Input_8Bit     | REX_W0_EVEX                  | Encoding_EVEX)
+INST3(vmovdqu16,         "movdqu16",        IUM_WR, SSEFLT(0x7F),           BAD_CODE,     SSEFLT(0x6F),                  INS_TT_FULL_MEM,                     Input_16Bit    | REX_W1_EVEX                  | Encoding_EVEX)
+INST3(vpmovb2m,          "pmovb2m",         IUM_WR, BAD_CODE,               BAD_CODE,     PSSE38(0xF3, 0x29),            INS_TT_NONE,                         Input_8Bit     | REX_W0_EVEX                  | Encoding_EVEX)
+INST3(vpmovw2m,          "pmovw2m",         IUM_WR, BAD_CODE,               BAD_CODE,     PSSE38(0xF3, 0x29),            INS_TT_NONE,                         Input_16Bit    | REX_W1_EVEX                  | Encoding_EVEX)
+INST3(vpmovwb,           "pmovwb",          IUM_WR, PSSE38(0xF3, 0x30),     BAD_CODE,     PSSE38(0xF3, 0x30),            INS_TT_HALF_MEM,                     Input_16Bit    | REX_W0_EVEX                  | Encoding_EVEX)
 
 // AVX512DQ
-INST3(vextractf32x8,     "extractf32x8",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x1B),                             INS_TT_TUPLE8,                       Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Extract 256-bit packed double-precision floating point values
-INST3(vextracti32x8,     "extracti32x8",    IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x3B),                             INS_TT_TUPLE8,                       Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Extract 256-bit packed quadword integer values
-INST3(vinsertf32x8,      "insertf32x8",     IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x1A),                             INS_TT_TUPLE8,                       Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Insert 256-bit packed double-precision floating point values
-INST3(vinserti32x8,      "inserti32x8",     IUM_WR, BAD_CODE,     BAD_CODE,     SSE3A(0x3A),                             INS_TT_TUPLE8,                       Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Insert 256-bit packed quadword integer values
-INST3(vpmovd2m,          "pmovd2m",         IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF3, 0x0F, 0x38, 0x39),           INS_TT_NONE,                         Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX)
-INST3(vpmovq2m,          "pmovq2m",         IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF3, 0x0F, 0x38, 0x39),           INS_TT_NONE,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX)
+INST3(vextractf32x8,     "extractf32x8",    IUM_WR, SSE3A(0x1B),            BAD_CODE,     BAD_CODE,                      INS_TT_TUPLE8,                       Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Extract 256-bit packed double-precision floating point values
+INST3(vextracti32x8,     "extracti32x8",    IUM_WR, SSE3A(0x3B),            BAD_CODE,     BAD_CODE,                      INS_TT_TUPLE8,                       Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Extract 256-bit packed quadword integer values
+INST3(vinsertf32x8,      "insertf32x8",     IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x1A),                   INS_TT_TUPLE8,                       Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Insert 256-bit packed double-precision floating point values
+INST3(vinserti32x8,      "inserti32x8",     IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3A),                   INS_TT_TUPLE8,                       Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Insert 256-bit packed quadword integer values
+INST3(vpmovd2m,          "pmovd2m",         IUM_WR, BAD_CODE,               BAD_CODE,     PSSE38(0xF3, 0x39),            INS_TT_NONE,                         Input_32Bit    | REX_W0_EVEX                  | Encoding_EVEX)
+INST3(vpmovq2m,          "pmovq2m",         IUM_WR, BAD_CODE,               BAD_CODE,     PSSE38(0xF3, 0x39),            INS_TT_NONE,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX)
 
 INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 // Scalar instructions in SSE4.2
-INST3(crc32,            "crc32",            IUM_WR, BAD_CODE,     BAD_CODE,     PACK4(0xF2, 0x0F, 0x38, 0xF0),           INS_TT_NONE,    INS_FLAGS_None)
+INST3(crc32,            "crc32",            IUM_WR, BAD_CODE,     BAD_CODE,     PSSE38(0xF2, 0xF0),                      INS_TT_NONE,    INS_FLAGS_None)
 
 // BMI1
 INST3(tzcnt,            "tzcnt",            IUM_WR, BAD_CODE,     BAD_CODE,     SSEFLT(0xBC),                            INS_TT_NONE,    Undefined_OF   | Undefined_SF  | Writes_ZF     | Undefined_AF  | Undefined_PF  | Writes_CF)    // Count the Number of Trailing Zero Bits
index d8ccb43a76af309b3d3d1499bcfa7b63908685ae..cc3732238bf1abb7d7903f5aa638d422fa021570 100644 (file)
@@ -5583,6 +5583,37 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node)
                     break;
                 }
 
+                case NI_AVX512F_ConvertToVector128Int16:
+                case NI_AVX512F_ConvertToVector128Int32:
+                case NI_AVX512F_ConvertToVector128UInt16:
+                case NI_AVX512F_ConvertToVector128UInt32:
+                case NI_AVX512F_ConvertToVector256Int16:
+                case NI_AVX512F_ConvertToVector256Int32:
+                case NI_AVX512F_ConvertToVector256UInt16:
+                case NI_AVX512F_ConvertToVector256UInt32:
+                case NI_AVX512BW_ConvertToVector128Byte:
+                case NI_AVX512BW_ConvertToVector128SByte:
+                case NI_AVX512BW_ConvertToVector256Byte:
+                case NI_AVX512BW_ConvertToVector256SByte:
+                {
+                    // These intrinsics are "ins reg/mem, xmm"
+                    unsigned simdSize = hwintrinsic->GetSimdSize();
+
+                    if (simdSize == 16)
+                    {
+                        // For TYP_SIMD16, we produce a TYP_SIMD16 register
+                        // but only store TYP_SIMD8 to memory and so we cannot
+                        // contain without additional work.
+                        isContainable = false;
+                    }
+                    else
+                    {
+                        assert((simdSize == 32) || (simdSize == 64));
+                        isContainable = true;
+                    }
+                    break;
+                }
+
                 default:
                 {
                     break;
@@ -6965,6 +6996,18 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
         case NI_AVX_ExtractVector128:
         case NI_AVX2_ExtractVector128:
         case NI_AVX512F_ExtractVector256:
+        case NI_AVX512F_ConvertToVector128Int16:
+        case NI_AVX512F_ConvertToVector128Int32:
+        case NI_AVX512F_ConvertToVector128UInt16:
+        case NI_AVX512F_ConvertToVector128UInt32:
+        case NI_AVX512F_ConvertToVector256Int16:
+        case NI_AVX512F_ConvertToVector256Int32:
+        case NI_AVX512F_ConvertToVector256UInt16:
+        case NI_AVX512F_ConvertToVector256UInt32:
+        case NI_AVX512BW_ConvertToVector128Byte:
+        case NI_AVX512BW_ConvertToVector128SByte:
+        case NI_AVX512BW_ConvertToVector256Byte:
+        case NI_AVX512BW_ConvertToVector256SByte:
         {
             // These are only containable as part of a store
             return false;
@@ -7164,6 +7207,24 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                         break;
                     }
 
+                    case NI_AVX512F_ConvertToVector128Int16:
+                    case NI_AVX512F_ConvertToVector128Int32:
+                    case NI_AVX512F_ConvertToVector128UInt16:
+                    case NI_AVX512F_ConvertToVector128UInt32:
+                    case NI_AVX512F_ConvertToVector256Int16:
+                    case NI_AVX512F_ConvertToVector256Int32:
+                    case NI_AVX512F_ConvertToVector256UInt16:
+                    case NI_AVX512F_ConvertToVector256UInt32:
+                    case NI_AVX512BW_ConvertToVector128Byte:
+                    case NI_AVX512BW_ConvertToVector128SByte:
+                    case NI_AVX512BW_ConvertToVector256Byte:
+                    case NI_AVX512BW_ConvertToVector256SByte:
+                    {
+                        // These intrinsics are "ins reg/mem, xmm" and get
+                        // contained by the relevant store operation instead.
+                        return;
+                    }
+
                     default:
                     {
                         break;