Improve codegen for AVX-512 comparisons and general handling of `TYP_MASK` (#87089)
author Tanner Gooding <tagoo@outlook.com>
Wed, 7 Jun 2023 23:01:59 +0000 (16:01 -0700)
committer GitHub <noreply@github.com>
Wed, 7 Jun 2023 23:01:59 +0000 (16:01 -0700)
* Add the various base kmask instructions

* Update lowering to better handle the AVX512 comparisons that produce a mask

* Remove INS_Flags_IsMskSrcSrcEvexInstruction as it's the same as INS_Flags_IsDstDstSrcAVXInstruction

* Save 1 byte on the encoding for CompareEqualMask and signed CompareGreaterThanMask (see the encoding sketch after this list)

* Apply formatting patch

* Ensure kmovd is encoded correctly and kmov is disassembled correctly

* Ensure vpcmpgtq is actually handled

* Fix the definition of a couple kmask instructions

* Ensure gtNewSimdCmpOpAllNode creates correct IR for simdSize == 64

* Add commas that were dropped when resolving merge

* Ensure that the new hwintrinsiclistxarch entries don't assert in ValueNum
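
The 1-byte saving mentioned above comes from preferring the dedicated
equality/signed-greater-than opcodes over the generic EVEX compare, whose
trailing imm8 selects the predicate. A minimal standalone sketch of the
arithmetic, assuming the plain reg-reg forms with a 4-byte EVEX prefix and no
displacement:

    // Not JIT code; shows why vpcmpeqd is one byte shorter than vpcmpd
    // carrying an explicit predicate immediate.
    #include <cstdio>

    int main()
    {
        const int evexPrefix = 4, opcode = 1, modrm = 1, imm8 = 1;

        int vpcmpd   = evexPrefix + opcode + modrm + imm8; // vpcmpd   k1, zmm0, zmm1, 0
        int vpcmpeqd = evexPrefix + opcode + modrm;        // vpcmpeqd k1, zmm0, zmm1
        printf("vpcmpd: %d bytes, vpcmpeqd: %d bytes\n", vpcmpd, vpcmpeqd); // 7 vs 6
        return 0;
    }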

src/coreclr/jit/emitxarch.cpp
src/coreclr/jit/gentree.cpp
src/coreclr/jit/gentree.h
src/coreclr/jit/hwintrinsic.h
src/coreclr/jit/hwintrinsiccodegenxarch.cpp
src/coreclr/jit/hwintrinsiclistxarch.h
src/coreclr/jit/hwintrinsicxarch.cpp
src/coreclr/jit/instr.h
src/coreclr/jit/instrsxarch.h
src/coreclr/jit/lower.h
src/coreclr/jit/lowerxarch.cpp

index 2fad742..e1586dc 100644 (file)
@@ -18856,6 +18856,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
         case INS_pcmpgtq:
         case INS_psadbw:
         case INS_vdbpsadbw:
+        case INS_vpcmpgtq:
         case INS_vpermps:
         case INS_vpermpd:
         case INS_vpermpd_reg:
@@ -19290,10 +19291,6 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             break;
         }
 
-        case INS_kmovb_msk:
-        case INS_kmovw_msk:
-        case INS_kmovd_msk:
-        case INS_kmovq_msk:
         case INS_kmovb_gpr:
         case INS_kmovw_gpr:
         case INS_kmovd_gpr:
@@ -19304,6 +19301,16 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             break;
         }
 
+        case INS_kmovb_msk:
+        case INS_kmovw_msk:
+        case INS_kmovd_msk:
+        case INS_kmovq_msk:
+        {
+            result.insLatency += PERFSCORE_LATENCY_1C;
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            break;
+        }
+
         case INS_vpcmpb:
         case INS_vpcmpw:
         case INS_vpcmpd:
@@ -19333,13 +19340,72 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             break;
         }
 
+        case INS_kandb:
+        case INS_kandd:
+        case INS_kandq:
+        case INS_kandw:
+        case INS_kandnb:
+        case INS_kandnd:
+        case INS_kandnq:
+        case INS_kandnw:
+        case INS_knotb:
+        case INS_knotd:
+        case INS_knotq:
+        case INS_knotw:
+        case INS_korb:
+        case INS_kord:
+        case INS_korq:
+        case INS_korw:
+        case INS_kxnorb:
+        case INS_kxnord:
+        case INS_kxnorq:
+        case INS_kxnorw:
+        case INS_kxorb:
+        case INS_kxord:
+        case INS_kxorq:
+        case INS_kxorw:
+        {
+            result.insLatency += PERFSCORE_LATENCY_1C;
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            break;
+        }
+
         case INS_kortestb:
-        case INS_kortestw:
         case INS_kortestd:
         case INS_kortestq:
+        case INS_kortestw:
+        case INS_ktestb:
+        case INS_ktestd:
+        case INS_ktestq:
+        case INS_ktestw:
         {
+            // Keep these in a separate group as there isn't a documented latency
+            // for them; similar instructions have a 1-cycle latency, however.
+
             result.insLatency += PERFSCORE_LATENCY_1C;
             result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+
+            break;
+        }
+
+        case INS_kaddb:
+        case INS_kaddd:
+        case INS_kaddq:
+        case INS_kaddw:
+        case INS_kshiftlb:
+        case INS_kshiftld:
+        case INS_kshiftlq:
+        case INS_kshiftlw:
+        case INS_kshiftrb:
+        case INS_kshiftrd:
+        case INS_kshiftrq:
+        case INS_kshiftrw:
+        case INS_kunpckbw:
+        case INS_kunpckdq:
+        case INS_kunpckwd:
+        {
+            result.insLatency += PERFSCORE_LATENCY_4C;
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
             break;
         }
 
index 6b314b0..43a2556 100644 (file)
@@ -20441,7 +20441,20 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
 #if defined(TARGET_XARCH)
         case GT_EQ:
         {
-            if (simdSize == 32)
+            if (simdSize == 64)
+            {
+                assert(IsBaselineVector512IsaSupportedDebugOnly());
+
+                if (varTypeIsSmall(simdBaseType))
+                {
+                    intrinsic = NI_AVX512BW_CompareEqual;
+                }
+                else
+                {
+                    intrinsic = NI_AVX512F_CompareEqual;
+                }
+            }
+            else if (simdSize == 32)
             {
                 assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
 
@@ -20455,17 +20468,15 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
                     intrinsic = NI_AVX2_CompareEqual;
                 }
             }
-            else if (simdSize == 64)
-            {
-                assert(IsBaselineVector512IsaSupportedDebugOnly());
-                intrinsic = NI_AVX512F_CompareEqualSpecial;
-            }
             else if (simdBaseType == TYP_FLOAT)
             {
+                assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
                 intrinsic = NI_SSE_CompareEqual;
             }
             else if (varTypeIsLong(simdBaseType))
             {
+                assert(simdSize == 16);
+
                 if (compOpportunisticallyDependsOn(InstructionSet_SSE41))
                 {
                     intrinsic = NI_SSE41_CompareEqual;
@@ -20494,6 +20505,7 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
             }
             else
             {
+                assert(simdSize == 16);
                 intrinsic = NI_SSE2_CompareEqual;
             }
             break;
@@ -20501,6 +20513,37 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
 
         case GT_GE:
         {
+            if (IsBaselineVector512IsaSupported())
+            {
+                if (simdSize == 64)
+                {
+                    if (varTypeIsSmall(simdBaseType))
+                    {
+                        intrinsic = NI_AVX512BW_CompareGreaterThanOrEqual;
+                    }
+                    else
+                    {
+                        intrinsic = NI_AVX512F_CompareGreaterThanOrEqual;
+                    }
+                    break;
+                }
+                else if (!varTypeIsFloating(simdBaseType))
+                {
+                    assert((simdSize == 16) || (simdSize == 32));
+
+                    if (varTypeIsSmall(simdBaseType))
+                    {
+                        intrinsic = NI_AVX512BW_VL_CompareGreaterThanOrEqual;
+                    }
+                    else
+                    {
+                        intrinsic = NI_AVX512F_VL_CompareGreaterThanOrEqual;
+                    }
+
+                    break;
+                }
+            }
+
             if (simdSize == 32)
             {
                 assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
@@ -20510,17 +20553,14 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
                     intrinsic = NI_AVX_CompareGreaterThanOrEqual;
                 }
             }
-            else if (simdSize == 64)
-            {
-                assert(IsBaselineVector512IsaSupportedDebugOnly());
-                intrinsic = NI_AVX512F_CompareGreaterThanOrEqualSpecial;
-            }
             else if (simdBaseType == TYP_FLOAT)
             {
+                assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
                 intrinsic = NI_SSE_CompareGreaterThanOrEqual;
             }
             else if (simdBaseType == TYP_DOUBLE)
             {
+                assert(simdSize == 16);
                 intrinsic = NI_SSE2_CompareGreaterThanOrEqual;
             }
 
@@ -20568,6 +20608,37 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
 
         case GT_GT:
         {
+            if (IsBaselineVector512IsaSupported())
+            {
+                if (simdSize == 64)
+                {
+                    if (varTypeIsSmall(simdBaseType))
+                    {
+                        intrinsic = NI_AVX512BW_CompareGreaterThan;
+                    }
+                    else
+                    {
+                        intrinsic = NI_AVX512F_CompareGreaterThan;
+                    }
+                    break;
+                }
+                else if (varTypeIsUnsigned(simdBaseType))
+                {
+                    assert((simdSize == 16) || (simdSize == 32));
+
+                    if (varTypeIsSmall(simdBaseType))
+                    {
+                        intrinsic = NI_AVX512BW_VL_CompareGreaterThan;
+                    }
+                    else
+                    {
+                        intrinsic = NI_AVX512F_VL_CompareGreaterThan;
+                    }
+
+                    break;
+                }
+            }
+
             if (varTypeIsUnsigned(simdBaseType))
             {
                 // Vector of byte, ushort, uint and ulong:
@@ -20661,17 +20732,15 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
                     intrinsic = NI_AVX2_CompareGreaterThan;
                 }
             }
-            else if (simdSize == 64)
-            {
-                assert(IsBaselineVector512IsaSupportedDebugOnly());
-                intrinsic = NI_AVX512F_CompareGreaterThanSpecial;
-            }
             else if (simdBaseType == TYP_FLOAT)
             {
+                assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
                 intrinsic = NI_SSE_CompareGreaterThan;
             }
             else if (varTypeIsLong(simdBaseType))
             {
+                assert(simdSize == 16);
+
                 if (compOpportunisticallyDependsOn(InstructionSet_SSE42))
                 {
                     intrinsic = NI_SSE42_CompareGreaterThan;
@@ -20733,6 +20802,7 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
             }
             else
             {
+                assert(simdSize == 16);
                 intrinsic = NI_SSE2_CompareGreaterThan;
             }
             break;
@@ -20740,6 +20810,37 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
 
         case GT_LE:
         {
+            if (IsBaselineVector512IsaSupported())
+            {
+                if (simdSize == 64)
+                {
+                    if (varTypeIsSmall(simdBaseType))
+                    {
+                        intrinsic = NI_AVX512BW_CompareLessThanOrEqual;
+                    }
+                    else
+                    {
+                        intrinsic = NI_AVX512F_CompareLessThanOrEqual;
+                    }
+                    break;
+                }
+                else if (!varTypeIsFloating(simdBaseType))
+                {
+                    assert((simdSize == 16) || (simdSize == 32));
+
+                    if (varTypeIsSmall(simdBaseType))
+                    {
+                        intrinsic = NI_AVX512BW_VL_CompareLessThanOrEqual;
+                    }
+                    else
+                    {
+                        intrinsic = NI_AVX512F_VL_CompareLessThanOrEqual;
+                    }
+
+                    break;
+                }
+            }
+
             if (simdSize == 32)
             {
                 assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
@@ -20749,17 +20850,14 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
                     intrinsic = NI_AVX_CompareLessThanOrEqual;
                 }
             }
-            else if (simdSize == 64)
-            {
-                assert(IsBaselineVector512IsaSupportedDebugOnly());
-                intrinsic = NI_AVX512F_CompareLessThanOrEqualSpecial;
-            }
             else if (simdBaseType == TYP_FLOAT)
             {
+                assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
                 intrinsic = NI_SSE_CompareLessThanOrEqual;
             }
             else if (simdBaseType == TYP_DOUBLE)
             {
+                assert(simdSize == 16);
                 intrinsic = NI_SSE2_CompareLessThanOrEqual;
             }
 
@@ -20807,6 +20905,37 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
 
         case GT_LT:
         {
+            if (IsBaselineVector512IsaSupported())
+            {
+                if (simdSize == 64)
+                {
+                    if (varTypeIsSmall(simdBaseType))
+                    {
+                        intrinsic = NI_AVX512BW_CompareLessThan;
+                    }
+                    else
+                    {
+                        intrinsic = NI_AVX512F_CompareLessThan;
+                    }
+                    break;
+                }
+                else if (varTypeIsUnsigned(simdBaseType))
+                {
+                    assert((simdSize == 16) || (simdSize == 32));
+
+                    if (varTypeIsSmall(simdBaseType))
+                    {
+                        intrinsic = NI_AVX512BW_VL_CompareLessThan;
+                    }
+                    else
+                    {
+                        intrinsic = NI_AVX512F_VL_CompareLessThan;
+                    }
+
+                    break;
+                }
+            }
+
             if (varTypeIsUnsigned(simdBaseType))
             {
                 // Vector of byte, ushort, uint and ulong:
@@ -20900,17 +21029,15 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
                     intrinsic = NI_AVX2_CompareLessThan;
                 }
             }
-            else if (simdSize == 64)
-            {
-                assert(IsBaselineVector512IsaSupportedDebugOnly());
-                intrinsic = NI_AVX512F_CompareLessThanSpecial;
-            }
             else if (simdBaseType == TYP_FLOAT)
             {
+                assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16));
                 intrinsic = NI_SSE_CompareLessThan;
             }
             else if (varTypeIsLong(simdBaseType))
             {
+                assert(simdSize == 16);
+
                 if (compOpportunisticallyDependsOn(InstructionSet_SSE42))
                 {
                     intrinsic = NI_SSE42_CompareLessThan;
@@ -20972,6 +21099,7 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
             }
             else
             {
+                assert(simdSize == 16);
                 intrinsic = NI_SSE2_CompareLessThan;
             }
             break;
@@ -21057,15 +21185,7 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
     assert(intrinsic != NI_Illegal);
 
 #if defined(TARGET_XARCH)
-    if (simdSize != 64)
-    {
-        return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
-    }
-    else
-    {
-        GenTree* cmp = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, intrinsic, simdBaseJitType, simdSize);
-        return gtNewSimdHWIntrinsicNode(type, cmp, NI_AVX512F_MoveMaskToVectorSpecial, simdBaseJitType, simdSize);
-    }
+    return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
 #else
     return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
 #endif
@@ -21116,125 +21236,8 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(
         }
 
         case GT_GE:
-        {
-            // We want to generate a comparison along the lines of
-            // GT_XX(op1, op2).As<T, TInteger>() == Vector128<TInteger>.AllBitsSet
-
-            if (simdSize == 32)
-            {
-                // TODO-XArch-CQ: It's a non-trivial amount of work to support these
-                // for floating-point while only utilizing AVX. It would require, among
-                // other things, inverting the comparison and potentially support for a
-                // new Avx.TestNotZ intrinsic to ensure the codegen remains efficient.
-                assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
-                intrinsic = NI_Vector256_op_Equality;
-            }
-            else if (simdSize == 64)
-            {
-                assert(IsBaselineVector512IsaSupportedDebugOnly());
-                intrinsic = NI_Vector512_GreaterThanOrEqualAll;
-                break;
-            }
-            else
-            {
-                intrinsic = NI_Vector128_op_Equality;
-            }
-
-            op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize);
-            op2 = gtNewAllBitsSetConNode(simdType);
-
-            if (simdBaseType == TYP_FLOAT)
-            {
-                simdBaseType    = TYP_INT;
-                simdBaseJitType = CORINFO_TYPE_INT;
-            }
-            else if (simdBaseType == TYP_DOUBLE)
-            {
-                simdBaseType    = TYP_LONG;
-                simdBaseJitType = CORINFO_TYPE_LONG;
-            }
-            break;
-        }
         case GT_GT:
-        {
-            // We want to generate a comparison along the lines of
-            // GT_XX(op1, op2).As<T, TInteger>() == Vector128<TInteger>.AllBitsSet
-
-            if (simdSize == 32)
-            {
-                // TODO-XArch-CQ: It's a non-trivial amount of work to support these
-                // for floating-point while only utilizing AVX. It would require, among
-                // other things, inverting the comparison and potentially support for a
-                // new Avx.TestNotZ intrinsic to ensure the codegen remains efficient.
-                assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
-                intrinsic = NI_Vector256_op_Equality;
-            }
-            else if (simdSize == 64)
-            {
-                assert(IsBaselineVector512IsaSupportedDebugOnly());
-                intrinsic = NI_Vector512_GreaterThanAll;
-                break;
-            }
-            else
-            {
-                intrinsic = NI_Vector128_op_Equality;
-            }
-
-            op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize);
-            op2 = gtNewAllBitsSetConNode(simdType);
-
-            if (simdBaseType == TYP_FLOAT)
-            {
-                simdBaseType    = TYP_INT;
-                simdBaseJitType = CORINFO_TYPE_INT;
-            }
-            else if (simdBaseType == TYP_DOUBLE)
-            {
-                simdBaseType    = TYP_LONG;
-                simdBaseJitType = CORINFO_TYPE_LONG;
-            }
-            break;
-        }
         case GT_LE:
-        {
-            // We want to generate a comparison along the lines of
-            // GT_XX(op1, op2).As<T, TInteger>() == Vector128<TInteger>.AllBitsSet
-
-            if (simdSize == 32)
-            {
-                // TODO-XArch-CQ: It's a non-trivial amount of work to support these
-                // for floating-point while only utilizing AVX. It would require, among
-                // other things, inverting the comparison and potentially support for a
-                // new Avx.TestNotZ intrinsic to ensure the codegen remains efficient.
-                assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
-                intrinsic = NI_Vector256_op_Equality;
-            }
-            else if (simdSize == 64)
-            {
-                assert(IsBaselineVector512IsaSupportedDebugOnly());
-                intrinsic = NI_Vector512_LessThanOrEqualAll;
-                break;
-            }
-            else
-            {
-                intrinsic = NI_Vector128_op_Equality;
-            }
-
-            op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize);
-            op2 = gtNewAllBitsSetConNode(simdType);
-
-            if (simdBaseType == TYP_FLOAT)
-            {
-                simdBaseType    = TYP_INT;
-                simdBaseJitType = CORINFO_TYPE_INT;
-            }
-            else if (simdBaseType == TYP_DOUBLE)
-            {
-                simdBaseType    = TYP_LONG;
-                simdBaseJitType = CORINFO_TYPE_LONG;
-            }
-            break;
-        }
         case GT_LT:
         {
             // We want to generate a comparison along the lines of
@@ -21252,8 +21255,7 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(
             else if (simdSize == 64)
             {
                 assert(IsBaselineVector512IsaSupportedDebugOnly());
-                intrinsic = NI_Vector512_LessThanAll;
-                break;
+                intrinsic = NI_Vector512_op_Equality;
             }
             else
             {
@@ -21494,14 +21496,12 @@ GenTree* Compiler::gtNewSimdCndSelNode(
         return gtNewSimdTernaryLogicNode(type, op1, op2, op3, control, simdBaseJitType, simdSize);
     }
 
+    assert(simdSize != 64);
+
     if (simdSize == 32)
     {
         intrinsic = NI_Vector256_ConditionalSelect;
     }
-    else if (simdSize == 64)
-    {
-        intrinsic = NI_Vector512_ConditionalSelect;
-    }
     else
     {
         intrinsic = NI_Vector128_ConditionalSelect;
@@ -21931,6 +21931,7 @@ GenTree* Compiler::gtNewSimdDotProdNode(
 
 #if defined(TARGET_XARCH)
     assert(!varTypeIsByte(simdBaseType) && !varTypeIsLong(simdBaseType));
+    assert(simdSize != 64);
 
     if (simdSize == 32)
     {
@@ -23827,6 +23828,7 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, CorInfoType si
 
 #if defined(TARGET_XARCH)
     assert(!varTypeIsByte(simdBaseType) && !varTypeIsLong(simdBaseType));
+    assert(simdSize != 64);
 
     // HorizontalAdd combines pairs so we need log2(vectorLength) passes to sum all elements together.
     unsigned vectorLength = getSIMDVectorLength(simdSize, simdBaseType);
@@ -24795,6 +24797,25 @@ GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(
 }
 
 //------------------------------------------------------------------------
+// OperIsHWIntrinsic: Is this node a hwintrinsic with the specified id?
+//
+// Arguments:
+//    intrinsicId -- the id to compare with the current node
+//
+// Return Value:
+//    true if the node is a hwintrinsic with the specified id;
+//    otherwise, false
+//
+bool GenTree::OperIsHWIntrinsic(NamedIntrinsic intrinsicId) const
+{
+    if (OperIsHWIntrinsic())
+    {
+        return AsHWIntrinsic()->GetHWIntrinsicId() == intrinsicId;
+    }
+    return false;
+}
+
+//------------------------------------------------------------------------
 // OperIsMemoryLoad: Does this HWI node have memory load semantics?
 //
 // Arguments:
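
The new GenTree::OperIsHWIntrinsic(NamedIntrinsic) overload folds the usual
"is it a hwintrinsic, then compare the id" check into one call. A hedged usage
fragment (JIT-internal, not standalone-compilable; the surrounding scenario is
illustrative, not taken from this change):

    // e.g. while lowering, recognize a compare that produces a TYP_MASK value
    // (HW_Flag_ReturnsPerElementMask) so the kmask can be consumed directly.
    if (op1->OperIsHWIntrinsic(NI_AVX512F_CompareEqual))
    {
        // op1 is an AVX512F.CompareEqual node; its result lives in a kmask
        // register rather than a vector register.
    }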
index 08bc535..4483cd7 100644 (file)
@@ -1651,6 +1651,8 @@ public:
         return OperIsHWIntrinsic(gtOper);
     }
 
+    bool OperIsHWIntrinsic(NamedIntrinsic intrinsicId) const;
+
     // This is here for cleaner GT_LONG #ifdefs.
     static bool OperIsLong(genTreeOps gtOper)
     {
index f30f622..01b8867 100644 (file)
@@ -330,6 +330,25 @@ enum class FloatRoundingMode : uint8_t
     NoException = 0x08,
 };
 
+enum class IntComparisonMode : uint8_t
+{
+    Equal           = 0,
+    LessThan        = 1,
+    LessThanOrEqual = 2,
+    False           = 3,
+
+    NotEqual           = 4,
+    GreaterThanOrEqual = 5,
+    GreaterThan        = 6,
+    True               = 7,
+
+    NotGreaterThanOrEqual = LessThan,
+    NotGreaterThan        = LessThanOrEqual,
+
+    NotLessThan        = GreaterThanOrEqual,
+    NotLessThanOrEqual = GreaterThan
+};
+
 enum class TernaryLogicUseFlags : uint8_t
 {
     // Indicates no flags are present
@@ -493,217 +512,7 @@ struct HWIntrinsicInfo
     }
 
 #ifdef TARGET_XARCH
-    static int lookupIval(NamedIntrinsic id, bool opportunisticallyDependsOnAVX)
-    {
-        switch (id)
-        {
-            case NI_SSE_CompareEqual:
-            case NI_SSE_CompareScalarEqual:
-            case NI_SSE2_CompareEqual:
-            case NI_SSE2_CompareScalarEqual:
-            case NI_AVX_CompareEqual:
-            {
-                return static_cast<int>(FloatComparisonMode::OrderedEqualNonSignaling);
-            }
-
-            case NI_SSE_CompareGreaterThan:
-            case NI_SSE_CompareScalarGreaterThan:
-            case NI_SSE2_CompareGreaterThan:
-            case NI_SSE2_CompareScalarGreaterThan:
-            case NI_AVX_CompareGreaterThan:
-            {
-                if (opportunisticallyDependsOnAVX)
-                {
-                    return static_cast<int>(FloatComparisonMode::OrderedGreaterThanSignaling);
-                }
-
-                // CompareGreaterThan is not directly supported in hardware without AVX support.
-                // We will return the inverted case here and lowering will itself swap the ops
-                // to ensure the emitted code remains correct. This simplifies the overall logic
-                // here and for other use cases.
-
-                assert(id != NI_AVX_CompareGreaterThan);
-                return static_cast<int>(FloatComparisonMode::OrderedLessThanSignaling);
-            }
-
-            case NI_SSE_CompareLessThan:
-            case NI_SSE_CompareScalarLessThan:
-            case NI_SSE2_CompareLessThan:
-            case NI_SSE2_CompareScalarLessThan:
-            case NI_AVX_CompareLessThan:
-            {
-                return static_cast<int>(FloatComparisonMode::OrderedLessThanSignaling);
-            }
-
-            case NI_SSE_CompareGreaterThanOrEqual:
-            case NI_SSE_CompareScalarGreaterThanOrEqual:
-            case NI_SSE2_CompareGreaterThanOrEqual:
-            case NI_SSE2_CompareScalarGreaterThanOrEqual:
-            case NI_AVX_CompareGreaterThanOrEqual:
-            {
-                if (opportunisticallyDependsOnAVX)
-                {
-                    return static_cast<int>(FloatComparisonMode::OrderedGreaterThanOrEqualSignaling);
-                }
-
-                // CompareGreaterThanOrEqual is not directly supported in hardware without AVX support.
-                // We will return the inverted case here and lowering will itself swap the ops
-                // to ensure the emitted code remains correct. This simplifies the overall logic
-                // here and for other use cases.
-
-                assert(id != NI_AVX_CompareGreaterThanOrEqual);
-                return static_cast<int>(FloatComparisonMode::OrderedLessThanOrEqualSignaling);
-            }
-
-            case NI_SSE_CompareLessThanOrEqual:
-            case NI_SSE_CompareScalarLessThanOrEqual:
-            case NI_SSE2_CompareLessThanOrEqual:
-            case NI_SSE2_CompareScalarLessThanOrEqual:
-            case NI_AVX_CompareLessThanOrEqual:
-            {
-                return static_cast<int>(FloatComparisonMode::OrderedLessThanOrEqualSignaling);
-            }
-
-            case NI_SSE_CompareNotEqual:
-            case NI_SSE_CompareScalarNotEqual:
-            case NI_SSE2_CompareNotEqual:
-            case NI_SSE2_CompareScalarNotEqual:
-            case NI_AVX_CompareNotEqual:
-            {
-                return static_cast<int>(FloatComparisonMode::UnorderedNotEqualNonSignaling);
-            }
-
-            case NI_SSE_CompareNotGreaterThan:
-            case NI_SSE_CompareScalarNotGreaterThan:
-            case NI_SSE2_CompareNotGreaterThan:
-            case NI_SSE2_CompareScalarNotGreaterThan:
-            case NI_AVX_CompareNotGreaterThan:
-            {
-                if (opportunisticallyDependsOnAVX)
-                {
-                    return static_cast<int>(FloatComparisonMode::UnorderedNotGreaterThanSignaling);
-                }
-
-                // CompareNotGreaterThan is not directly supported in hardware without AVX support.
-                // We will return the inverted case here and lowering will itself swap the ops
-                // to ensure the emitted code remains correct. This simplifies the overall logic
-                // here and for other use cases.
-
-                assert(id != NI_AVX_CompareNotGreaterThan);
-                return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanSignaling);
-            }
-
-            case NI_SSE_CompareNotLessThan:
-            case NI_SSE_CompareScalarNotLessThan:
-            case NI_SSE2_CompareNotLessThan:
-            case NI_SSE2_CompareScalarNotLessThan:
-            case NI_AVX_CompareNotLessThan:
-            {
-                return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanSignaling);
-            }
-
-            case NI_SSE_CompareNotGreaterThanOrEqual:
-            case NI_SSE_CompareScalarNotGreaterThanOrEqual:
-            case NI_SSE2_CompareNotGreaterThanOrEqual:
-            case NI_SSE2_CompareScalarNotGreaterThanOrEqual:
-            case NI_AVX_CompareNotGreaterThanOrEqual:
-            {
-                if (opportunisticallyDependsOnAVX)
-                {
-                    return static_cast<int>(FloatComparisonMode::UnorderedNotGreaterThanOrEqualSignaling);
-                }
-
-                // CompareNotGreaterThanOrEqual is not directly supported in hardware without AVX support.
-                // We will return the inverted case here and lowering will itself swap the ops
-                // to ensure the emitted code remains correct. This simplifies the overall logic
-                // here and for other use cases.
-
-                assert(id != NI_AVX_CompareNotGreaterThanOrEqual);
-                return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling);
-            }
-
-            case NI_SSE_CompareNotLessThanOrEqual:
-            case NI_SSE_CompareScalarNotLessThanOrEqual:
-            case NI_SSE2_CompareNotLessThanOrEqual:
-            case NI_SSE2_CompareScalarNotLessThanOrEqual:
-            case NI_AVX_CompareNotLessThanOrEqual:
-            {
-                return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling);
-            }
-
-            case NI_SSE_CompareOrdered:
-            case NI_SSE_CompareScalarOrdered:
-            case NI_SSE2_CompareOrdered:
-            case NI_SSE2_CompareScalarOrdered:
-            case NI_AVX_CompareOrdered:
-            {
-                return static_cast<int>(FloatComparisonMode::OrderedNonSignaling);
-            }
-
-            case NI_SSE_CompareUnordered:
-            case NI_SSE_CompareScalarUnordered:
-            case NI_SSE2_CompareUnordered:
-            case NI_SSE2_CompareScalarUnordered:
-            case NI_AVX_CompareUnordered:
-            {
-                return static_cast<int>(FloatComparisonMode::UnorderedNonSignaling);
-            }
-
-            case NI_SSE41_Ceiling:
-            case NI_SSE41_CeilingScalar:
-            case NI_AVX_Ceiling:
-            {
-                FALLTHROUGH;
-            }
-
-            case NI_SSE41_RoundToPositiveInfinity:
-            case NI_SSE41_RoundToPositiveInfinityScalar:
-            case NI_AVX_RoundToPositiveInfinity:
-            {
-                return static_cast<int>(FloatRoundingMode::ToPositiveInfinity);
-            }
-
-            case NI_SSE41_Floor:
-            case NI_SSE41_FloorScalar:
-            case NI_AVX_Floor:
-            {
-                FALLTHROUGH;
-            }
-
-            case NI_SSE41_RoundToNegativeInfinity:
-            case NI_SSE41_RoundToNegativeInfinityScalar:
-            case NI_AVX_RoundToNegativeInfinity:
-            {
-                return static_cast<int>(FloatRoundingMode::ToNegativeInfinity);
-            }
-
-            case NI_SSE41_RoundCurrentDirection:
-            case NI_SSE41_RoundCurrentDirectionScalar:
-            case NI_AVX_RoundCurrentDirection:
-            {
-                return static_cast<int>(FloatRoundingMode::CurrentDirection);
-            }
-
-            case NI_SSE41_RoundToNearestInteger:
-            case NI_SSE41_RoundToNearestIntegerScalar:
-            case NI_AVX_RoundToNearestInteger:
-            {
-                return static_cast<int>(FloatRoundingMode::ToNearestInteger);
-            }
-
-            case NI_SSE41_RoundToZero:
-            case NI_SSE41_RoundToZeroScalar:
-            case NI_AVX_RoundToZero:
-            {
-                return static_cast<int>(FloatRoundingMode::ToZero);
-            }
-
-            default:
-            {
-                return -1;
-            }
-        }
-    }
+    static int lookupIval(Compiler* comp, NamedIntrinsic id, var_types simdBaseType);
 #endif
 
     static bool tryLookupSimdSize(NamedIntrinsic id, unsigned* pSimdSize)
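
The immediates the removed NI_AVX512F_Compare*Special handlers hard-coded into
emitIns_R_R_R_I (0, 5, 6, 2, 1) line up with the new IntComparisonMode values
from instr.h. A self-contained sketch of that correspondence (the printout is
illustrative only):

    #include <cstdint>
    #include <cstdio>

    // Copied from the instr.h change above; the imm8 operand of
    // vpcmp[u]{b,w,d,q} selects one of these predicates.
    enum class IntComparisonMode : uint8_t
    {
        Equal              = 0, // was the imm 0 in CompareEqualSpecial
        LessThan           = 1, // was the imm 1 in CompareLessThanSpecial
        LessThanOrEqual    = 2, // was the imm 2 in CompareLessThanOrEqualSpecial
        False              = 3,
        NotEqual           = 4,
        GreaterThanOrEqual = 5, // was the imm 5 in CompareGreaterThanOrEqualSpecial
        GreaterThan        = 6, // was the imm 6 in CompareGreaterThanSpecial
        True               = 7,
    };

    int main()
    {
        printf("GE -> imm8 %u, GT -> imm8 %u, LE -> imm8 %u, LT -> imm8 %u\n",
               (unsigned)IntComparisonMode::GreaterThanOrEqual,
               (unsigned)IntComparisonMode::GreaterThan,
               (unsigned)IntComparisonMode::LessThanOrEqual,
               (unsigned)IntComparisonMode::LessThan);
        return 0;
    }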
index 973d417..7570765 100644 (file)
@@ -97,9 +97,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
 
     // We need to validate that other phases of the compiler haven't introduced unsupported intrinsics
     assert(compiler->compIsaSupportedDebugOnly(isa));
-
-    int ival = HWIntrinsicInfo::lookupIval(intrinsicId, compiler->compOpportunisticallyDependsOn(InstructionSet_AVX));
-
     assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));
 
     if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
@@ -119,12 +116,15 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
         emitter*  emit   = GetEmitter();
 
         assert(numArgs >= 0);
+
         instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
         assert(ins != INS_invalid);
-        emitAttr simdSize = emitActualTypeSize(Compiler::getSIMDTypeForSize(node->GetSimdSize()));
 
+        emitAttr simdSize = emitActualTypeSize(Compiler::getSIMDTypeForSize(node->GetSimdSize()));
         assert(simdSize != 0);
 
+        int ival = HWIntrinsicInfo::lookupIval(compiler, intrinsicId, baseType);
+
         switch (numArgs)
         {
             case 1:
@@ -144,7 +144,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                     genConsumeRegs(op1);
                     op1Reg = op1->GetRegNum();
 
-                    if ((ival != -1) && varTypeIsFloating(baseType))
+                    if (ival != -1)
                     {
                         assert((ival >= 0) && (ival <= 127));
                         if (HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
@@ -208,7 +208,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                     op1Reg = targetReg;
                 }
 
-                if ((ival != -1) && varTypeIsFloating(baseType))
+                if (ival != -1)
                 {
                     assert((ival >= 0) && (ival <= 127));
                     genHWIntrinsic_R_R_RM_I(node, ins, simdSize, static_cast<int8_t>(ival));
@@ -235,7 +235,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                 }
                 else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
                 {
-                    assert(ival == -1);
                     auto emitSwCase = [&](int8_t i) {
                         if (HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
                         {
@@ -293,10 +292,10 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                 genConsumeRegs(op3);
                 op3Reg = op3->GetRegNum();
 
+                assert(ival == -1);
+
                 if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
                 {
-                    assert(ival == -1);
-
                     auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, simdSize, i); };
 
                     if (op3->IsCnsIntOrI())
@@ -384,10 +383,10 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                 genConsumeRegs(op4);
                 op4Reg = op4->GetRegNum();
 
+                assert(ival == -1);
+
                 if (HWIntrinsicInfo::isImmOp(intrinsicId, op4))
                 {
-                    assert(ival == -1);
-
                     auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_R_RM_I(node, ins, simdSize, i); };
 
                     if (op4->IsCnsIntOrI())
@@ -455,6 +454,8 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
         case InstructionSet_AVX512F_X64:
         case InstructionSet_AVX512BW:
         case InstructionSet_AVX512BW_VL:
+        case InstructionSet_AVX512VBMI:
+        case InstructionSet_AVX512VBMI_VL:
             genAvxFamilyIntrinsic(node);
             break;
         case InstructionSet_AES:
@@ -1913,6 +1914,7 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
 
             instruction maskIns;
             instruction kmovIns;
+            emitAttr    kmovAttr = EA_4BYTE;
 
             // TODO-XARCH-AVX512 note that this type/kmov combination assumes 512-bit vector types but would change
             // if used for other vector lengths, i.e., TYP_BYTE requires kmovq for a 512-bit vector, but kmovd
@@ -1921,130 +1923,78 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
             {
                 case TYP_BYTE:
                 case TYP_UBYTE:
-                    maskIns = INS_vpmovb2m;
-                    kmovIns = INS_kmovq_gpr;
+                {
+                    maskIns  = INS_vpmovb2m;
+                    kmovIns  = INS_kmovq_gpr;
+                    kmovAttr = EA_8BYTE;
                     break;
+                }
+
                 case TYP_SHORT:
                 case TYP_USHORT:
+                {
                     maskIns = INS_vpmovw2m;
                     kmovIns = INS_kmovd_gpr;
                     break;
+                }
+
                 case TYP_INT:
                 case TYP_UINT:
                 case TYP_FLOAT:
+                {
                     maskIns = INS_vpmovd2m;
                     kmovIns = INS_kmovw_gpr;
                     break;
+                }
+
                 case TYP_DOUBLE:
                 case TYP_LONG:
                 case TYP_ULONG:
+                {
                     maskIns = INS_vpmovq2m;
                     kmovIns = INS_kmovb_gpr;
                     break;
+                }
+
                 default:
+                {
                     unreached();
+                }
             }
 
             assert(emitter::isMaskReg(maskReg));
 
             emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg);
-            emit->emitIns_Mov(kmovIns, EA_8BYTE, targetReg, maskReg, INS_FLAGS_DONT_CARE);
-            break;
-        }
-
-        case NI_AVX512F_CompareEqualSpecial:
-        {
-            GenTree* op2     = node->Op(2);
-            op1Reg           = op1->GetRegNum();
-            regNumber op2Reg = op2->GetRegNum();
-
-            instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareEqualSpecial, baseType);
-
-            assert(compareIns != INS_invalid);
-            assert(emitter::isMaskReg(targetReg));
-
-            emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 0);
+            emit->emitIns_Mov(kmovIns, kmovAttr, targetReg, maskReg, INS_FLAGS_DONT_CARE);
             break;
         }
 
-        case NI_AVX512F_CompareGreaterThanOrEqualSpecial:
-        {
-            GenTree* op2     = node->Op(2);
-            op1Reg           = op1->GetRegNum();
-            regNumber op2Reg = op2->GetRegNum();
-
-            instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareGreaterThanOrEqualSpecial, baseType);
-
-            assert(compareIns != INS_invalid);
-            assert(emitter::isMaskReg(targetReg));
-
-            emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 5);
-            break;
-        }
-
-        case NI_AVX512F_CompareGreaterThanSpecial:
-        {
-            GenTree* op2     = node->Op(2);
-            op1Reg           = op1->GetRegNum();
-            regNumber op2Reg = op2->GetRegNum();
-
-            instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareGreaterThanSpecial, baseType);
-
-            assert(compareIns != INS_invalid);
-            assert(emitter::isMaskReg(targetReg));
-
-            emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 6);
-            break;
-        }
-
-        case NI_AVX512F_CompareLessThanOrEqualSpecial:
-        {
-            GenTree* op2     = node->Op(2);
-            op1Reg           = op1->GetRegNum();
-            regNumber op2Reg = op2->GetRegNum();
-
-            instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareLessThanOrEqualSpecial, baseType);
-
-            assert(compareIns != INS_invalid);
-            assert(emitter::isMaskReg(targetReg));
-
-            emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 2);
-            break;
-        }
-
-        case NI_AVX512F_CompareLessThanSpecial:
-        {
-            GenTree* op2     = node->Op(2);
-            op1Reg           = op1->GetRegNum();
-            regNumber op2Reg = op2->GetRegNum();
-
-            instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareLessThanSpecial, baseType);
-
-            assert(compareIns != INS_invalid);
-            assert(emitter::isMaskReg(targetReg));
-
-            emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 1);
-            break;
-        }
-
-        case NI_AVX512F_MoveMaskToVectorSpecial:
+        case NI_AVX512F_KORTEST:
         {
             op1Reg = op1->GetRegNum();
 
-            instruction maskMovIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_MoveMaskToVectorSpecial, baseType);
+            uint32_t simdSize = node->GetSimdSize();
+            uint32_t count    = simdSize / genTypeSize(baseType);
 
-            assert(maskMovIns != INS_invalid);
-            assert(emitter::isMaskReg(op1Reg));
+            instruction testIns;
 
-            emit->emitIns_R_R(maskMovIns, attr, targetReg, op1Reg);
-            break;
-        }
-
-        case NI_AVX512F_KORTEST:
-        {
-            op1Reg = op1->GetRegNum();
-
-            instruction testIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_KORTEST, baseType);
+            if (count <= 8)
+            {
+                testIns = INS_kortestb;
+            }
+            else if (count == 16)
+            {
+                testIns = INS_kortestw;
+            }
+            else if (count == 32)
+            {
+                testIns = INS_kortestd;
+            }
+            else
+            {
+                assert(count == 64);
+                testIns = INS_kortestq;
+            }
 
             assert(testIns != INS_invalid);
             assert(emitter::isMaskReg(op1Reg));
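
The rewritten NI_AVX512F_KORTEST handler above picks the kortest width from the
element count rather than a per-type instruction table: the mask holds one bit
per element, so simdSize / elementSize decides the test width. A standalone
restatement of that selection (function name is illustrative):

    #include <cstdint>
    #include <cstdio>

    static const char* kortestFor(uint32_t simdSize, uint32_t elementSize)
    {
        uint32_t count = simdSize / elementSize; // one mask bit per element
        if (count <= 8)  return "kortestb";
        if (count == 16) return "kortestw";
        if (count == 32) return "kortestd";
        return "kortestq";                       // count == 64
    }

    int main()
    {
        printf("Vector512<long> : %s\n", kortestFor(64, 8)); //  8 bits -> kortestb
        printf("Vector512<int>  : %s\n", kortestFor(64, 4)); // 16 bits -> kortestw
        printf("Vector512<short>: %s\n", kortestFor(64, 2)); // 32 bits -> kortestd
        printf("Vector512<byte> : %s\n", kortestFor(64, 1)); // 64 bits -> kortestq
        return 0;
    }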
index 66ad4be..e1649b2 100644 (file)
@@ -836,6 +836,12 @@ HARDWARE_INTRINSIC(AVX512F,         AndNot,
 HARDWARE_INTRINSIC(AVX512F,         BroadcastScalarToVector512,                 64,              1,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpbroadcastd,       INS_vpbroadcastd,       INS_vpbroadcastq,       INS_vpbroadcastq,       INS_vbroadcastss,       INS_vbroadcastsd},      HW_Category_SIMDScalar,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX512F,         BroadcastVector128ToVector512,              64,              1,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vbroadcasti128,     INS_vbroadcasti128,     INS_invalid,            INS_invalid,            INS_vbroadcastf128,     INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX512F,         BroadcastVector256ToVector512,              64,              1,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vbroadcasti64x4,    INS_vbroadcasti64x4,    INS_invalid,            INS_vbroadcastf64x4},   HW_Category_MemoryLoad,             HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX512F,         CompareEqual,                               64,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpcmpeqd,           INS_vpcmpeqd,           INS_vpcmpeqq,           INS_vpcmpeqq,           INS_vcmpps,             INS_vcmppd},            HW_Category_SimpleSIMD,             HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         CompareGreaterThan,                         64,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpcmpgtd,           INS_vpcmpud,            INS_vpcmpgtq,           INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         CompareGreaterThanOrEqual,                  64,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         CompareLessThan,                            64,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         CompareLessThanOrEqual,                     64,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F,         CompareNotEqual,                            64,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_SimpleSIMD,             HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX512F,         ConvertScalarToVector128Double,             16,              2,     false,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vcvtusi2sd32,       INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(AVX512F,         ConvertScalarToVector128Single,             16,              2,     false,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vcvtusi2ss32,       INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(AVX512F,         ConvertToUInt32,                            16,              1,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vcvtss2usi,         INS_vcvtsd2usi},        HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
@@ -943,6 +949,11 @@ HARDWARE_INTRINSIC(AVX512F_VL,      AlignRight32,
 HARDWARE_INTRINSIC(AVX512F_VL,      AlignRight64,                               -1,              3,     false,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_valignq,            INS_valignq,            INS_invalid,            INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX512F_VL,      Max,                                        -1,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vpmaxsq,            INS_vpmaxuq,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX512F_VL,      Min,                                        -1,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vpminsq,            INS_vpminuq,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX512F_VL,      CompareGreaterThan,                         -1,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vpcmpud,            INS_invalid,            INS_vpcmpuq,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F_VL,      CompareGreaterThanOrEqual,                  -1,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F_VL,      CompareLessThan,                            -1,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F_VL,      CompareLessThanOrEqual,                     -1,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512F_VL,      CompareNotEqual,                            -1,              2,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX512F_VL,      ConvertToVector128Byte,                     -1,              1,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpmovdb,            INS_vpmovdb,            INS_vpmovqb,            INS_vpmovqb,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AVX512F_VL,      ConvertToVector128ByteWithSaturation,       -1,              1,      true,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vpmovusdb,          INS_invalid,            INS_vpmovusqb,          INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AVX512F_VL,      ConvertToVector128Double,                   16,              1,     false,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vcvtudq2pd,         INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
@@ -1004,6 +1015,12 @@ HARDWARE_INTRINSIC(AVX512BW,        AddSaturate,
 HARDWARE_INTRINSIC(AVX512BW,        AlignRight,                                 64,              3,     false,  {INS_palignr,           INS_palignr,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX512BW,        Average,                                    64,              2,      true,  {INS_invalid,           INS_pavgb,              INS_invalid,            INS_pavgw,              INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX512BW,        BroadcastScalarToVector512,                 64,              1,      true,  {INS_vpbroadcastb,      INS_vpbroadcastb,       INS_vpbroadcastw,       INS_vpbroadcastw,       INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MaybeMemoryLoad)
+HARDWARE_INTRINSIC(AVX512BW,        CompareEqual,                               64,              2,      true,  {INS_vpcmpeqb,          INS_vpcmpeqb,           INS_vpcmpeqw,           INS_vpcmpeqw,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW,        CompareGreaterThan,                         64,              2,      true,  {INS_vpcmpgtb,          INS_vpcmpub,            INS_vpcmpgtw,           INS_vpcmpuw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW,        CompareGreaterThanOrEqual,                  64,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW,        CompareLessThan,                            64,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW,        CompareLessThanOrEqual,                     64,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW,        CompareNotEqual,                            64,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX512BW,        ConvertToVector256Byte,                     64,              1,     false,  {INS_invalid,           INS_invalid,            INS_vpmovwb,            INS_vpmovwb,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AVX512BW,        ConvertToVector256ByteWithSaturation,       64,              1,     false,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_vpmovuswb,          INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AVX512BW,        ConvertToVector256SByte,                    64,              1,     false,  {INS_invalid,           INS_invalid,            INS_vpmovwb,            INS_vpmovwb,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
@@ -1045,6 +1062,11 @@ HARDWARE_INTRINSIC(AVX512BW,        UnpackLow,
 //                                                                                                              {TYP_BYTE,              TYP_UBYTE,              TYP_SHORT,              TYP_USHORT,             TYP_INT,                TYP_UINT,               TYP_LONG,               TYP_ULONG,              TYP_FLOAT,              TYP_DOUBLE}
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //  AVX512BW.VL Intrinsics
+HARDWARE_INTRINSIC(AVX512BW_VL,     CompareGreaterThan,                         -1,              2,      true,  {INS_invalid,           INS_vpcmpub,            INS_invalid,            INS_vpcmpuw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW_VL,     CompareGreaterThanOrEqual,                  -1,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW_VL,     CompareLessThan,                            -1,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW_VL,     CompareLessThanOrEqual,                     -1,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(AVX512BW_VL,     CompareNotEqual,                            -1,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX512BW_VL,     ConvertToVector128Byte,                     -1,              1,     false,  {INS_invalid,           INS_invalid,            INS_vpmovwb,            INS_vpmovwb,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AVX512BW_VL,     ConvertToVector128ByteWithSaturation,       -1,              1,     false,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_vpmovuswb,          INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AVX512BW_VL,     ConvertToVector128SByte,                    -1,              1,     false,  {INS_invalid,           INS_invalid,            INS_vpmovwb,            INS_vpmovwb,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
@@ -1275,16 +1297,17 @@ HARDWARE_INTRINSIC(SSE,             UCOMISS,
 HARDWARE_INTRINSIC(SSE2,            COMISD,                                     16,              2,     false,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2,            UCOMISD,                                    16,              2,     false,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE41,           PTEST,                                      16,              2,     false,  {INS_ptest,             INS_ptest,              INS_ptest,              INS_ptest,              INS_ptest,              INS_ptest,              INS_ptest,              INS_ptest,              INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
-HARDWARE_INTRINSIC(AVX,             PTEST,                                       0,              2,     false,  {INS_ptest,             INS_ptest,              INS_ptest,              INS_ptest,              INS_ptest,              INS_ptest,              INS_ptest,              INS_ptest,              INS_vtestps,            INS_vtestpd},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX,             PTEST,                                       0,              2,     false,  {INS_ptest,             INS_ptest,              INS_ptest,              INS_ptest,              INS_ptest,              INS_ptest,              INS_ptest,              INS_ptest,              INS_vtestps,            INS_vtestpd},           HW_Category_SimpleSIMD,             HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX512F,         KORTEST,                                     0,              1,     false,  {INS_kortestq,          INS_kortestq,           INS_kortestd,           INS_kortestd,           INS_kortestw,           INS_kortestw,           INS_kortestb,           INS_kortestb,           INS_kortestw,           INS_kortestb},          HW_Category_Special,                HW_Flag_NoRMWSemantics)
 
-HARDWARE_INTRINSIC(AVX512F,         MoveMaskSpecial,                            64,              1,     false,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_movd,               INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Special,                HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F,         CompareEqualSpecial,                        64,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_Special,                HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F,         CompareGreaterThanOrEqualSpecial,           64,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_Special,                HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F,         CompareGreaterThanSpecial,                  64,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_Special,                HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F,         CompareLessThanOrEqualSpecial,              64,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_Special,                HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F,         CompareLessThanSpecial,                     64,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_Special,                HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F,         MoveMaskToVectorSpecial,                    64,              1,      true,  {INS_vpmovm2b,          INS_vpmovm2b,           INS_vpmovm2w,           INS_vpmovm2w,           INS_vpmovm2d,           INS_vpmovm2d,           INS_vpmovm2q,           INS_vpmovm2q,           INS_vpmovm2d,           INS_vpmovm2q},          HW_Category_Special,                HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX512F,         KORTEST,                                    0,               1,     false,  {INS_kortestq,          INS_kortestq,           INS_kortestd,           INS_kortestd,           INS_kortestw,           INS_kortestw,           INS_kortestb,           INS_kortestb,           INS_kortestw,           INS_kortestb},          HW_Category_Special,                HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX512F,         CompareEqualMask,                           -1,              2,      true,  {INS_vpcmpeqb,          INS_vpcmpeqb,           INS_vpcmpeqw,           INS_vpcmpeqw,           INS_vpcmpeqd,           INS_vpcmpeqd,           INS_vpcmpeqq,           INS_vpcmpeqq,           INS_vcmpps,             INS_vcmppd},            HW_Category_SimpleSIMD,             HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F,         CompareGreaterThanMask,                     -1,              2,      true,  {INS_vpcmpgtb,          INS_vpcmpub,            INS_vpcmpgtw,           INS_vpcmpuw,            INS_vpcmpgtd,           INS_vpcmpud,            INS_vpcmpgtq,           INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F,         CompareGreaterThanOrEqualMask,              -1,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F,         CompareLessThanMask,                        -1,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F,         CompareLessThanOrEqualMask,                 -1,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F,         CompareNotEqualMask,                        -1,              2,      true,  {INS_vpcmpb,            INS_vpcmpub,            INS_vpcmpw,             INS_vpcmpuw,            INS_vpcmpd,             INS_vpcmpud,            INS_vpcmpq,             INS_vpcmpuq,            INS_vcmpps,             INS_vcmppd},            HW_Category_SimpleSIMD,             HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F,         ConvertMaskToVector,                        -1,              1,      true,  {INS_vpmovm2b,          INS_vpmovm2b,           INS_vpmovm2w,           INS_vpmovm2w,           INS_vpmovm2d,           INS_vpmovm2d,           INS_vpmovm2q,           INS_vpmovm2q,           INS_vpmovm2d,           INS_vpmovm2q},          HW_Category_SimpleSIMD,             HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F,         MoveMaskSpecial,                            -1,              1,     false,  {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Special,                HW_Flag_NoContainment)
 
 #endif // FEATURE_HW_INTRINSIC
 
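The rows above drive per-base-type instruction selection: each column is one simdBaseType, and codegen picks the column matching the node's base type. A minimal sketch of that lookup, assuming the existing HWIntrinsicInfo::lookupIns accessor:

    // CompareGreaterThanMask selects the signed compare (vpcmpgtd) for TYP_INT,
    // but the unsigned form (vpcmpud, plus an imm8 from lookupIval) for TYP_UINT.
    instruction ins = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareGreaterThanMask, TYP_UINT);
    assert(ins == INS_vpcmpud);
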
index 9c38e45..dadabad 100644 (file)
@@ -554,6 +554,302 @@ bool HWIntrinsicInfo::isScalarIsa(CORINFO_InstructionSet isa)
 }
 
 //------------------------------------------------------------------------
+// lookupIval: Gets the implicit immediate value for the given intrinsic
+//
+// Arguments:
+//    comp         - The compiler
+//    id           - The intrinsic for which to get the ival
+//    simdBaseType - The base type for the intrinsic
+//
+// Return Value:
+//    The immediate value for the given intrinsic, or -1 if none exists
+int HWIntrinsicInfo::lookupIval(Compiler* comp, NamedIntrinsic id, var_types simdBaseType)
+{
+    switch (id)
+    {
+        case NI_SSE_CompareEqual:
+        case NI_SSE_CompareScalarEqual:
+        case NI_SSE2_CompareEqual:
+        case NI_SSE2_CompareScalarEqual:
+        case NI_AVX_CompareEqual:
+        case NI_AVX512F_CompareEqualMask:
+        {
+            if (varTypeIsFloating(simdBaseType))
+            {
+                return static_cast<int>(FloatComparisonMode::OrderedEqualNonSignaling);
+            }
+            else
+            {
+                // We can emit `vpcmpeqb`, `vpcmpeqw`, `vpcmpeqd`, or `vpcmpeqq`
+            }
+            break;
+        }
+
+        case NI_SSE_CompareGreaterThan:
+        case NI_SSE_CompareScalarGreaterThan:
+        case NI_SSE2_CompareGreaterThan:
+        case NI_SSE2_CompareScalarGreaterThan:
+        case NI_AVX_CompareGreaterThan:
+        case NI_AVX512F_CompareGreaterThanMask:
+        {
+            if (varTypeIsFloating(simdBaseType))
+            {
+                if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX))
+                {
+                    return static_cast<int>(FloatComparisonMode::OrderedGreaterThanSignaling);
+                }
+
+                // CompareGreaterThan is not directly supported in hardware without AVX support.
+                // We will return the inverted case here and lowering will itself swap the ops
+                // to ensure the emitted code remains correct. This simplifies the overall logic
+                // here and for other use cases.
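+                //
+                // For example, `x > y` comes back as OrderedLessThanSignaling and
+                // is ultimately emitted as `y < x` once the operands are swapped.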
+
+                assert(id != NI_AVX_CompareGreaterThan);
+                return static_cast<int>(FloatComparisonMode::OrderedLessThanSignaling);
+            }
+            else if ((id == NI_AVX512F_CompareGreaterThanMask) && varTypeIsUnsigned(simdBaseType))
+            {
+                // TODO-XARCH-CQ: Allow the other integer paths to use the EVEX encoding
+                return static_cast<int>(IntComparisonMode::GreaterThan);
+            }
+            break;
+        }
+
+        case NI_SSE_CompareLessThan:
+        case NI_SSE_CompareScalarLessThan:
+        case NI_SSE2_CompareLessThan:
+        case NI_SSE2_CompareScalarLessThan:
+        case NI_AVX_CompareLessThan:
+        case NI_AVX512F_CompareLessThanMask:
+        {
+            if (varTypeIsFloating(simdBaseType))
+            {
+                return static_cast<int>(FloatComparisonMode::OrderedLessThanSignaling);
+            }
+            else if (id == NI_AVX512F_CompareLessThanMask)
+            {
+                // TODO-XARCH-CQ: Allow the other integer paths to use the EVEX encoding
+                return static_cast<int>(IntComparisonMode::LessThan);
+            }
+            break;
+        }
+
+        case NI_SSE_CompareGreaterThanOrEqual:
+        case NI_SSE_CompareScalarGreaterThanOrEqual:
+        case NI_SSE2_CompareGreaterThanOrEqual:
+        case NI_SSE2_CompareScalarGreaterThanOrEqual:
+        case NI_AVX_CompareGreaterThanOrEqual:
+        case NI_AVX512F_CompareGreaterThanOrEqualMask:
+        {
+            if (varTypeIsFloating(simdBaseType))
+            {
+                if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX))
+                {
+                    return static_cast<int>(FloatComparisonMode::OrderedGreaterThanOrEqualSignaling);
+                }
+
+                // CompareGreaterThanOrEqual is not directly supported in hardware without AVX support.
+                // We will return the inverted case here and lowering will itself swap the ops
+                // to ensure the emitted code remains correct. This simplifies the overall logic
+                // here and for other use cases.
+
+                assert(id != NI_AVX_CompareGreaterThanOrEqual);
+                return static_cast<int>(FloatComparisonMode::OrderedLessThanOrEqualSignaling);
+            }
+            else
+            {
+                assert(id == NI_AVX512F_CompareGreaterThanOrEqualMask);
+                return static_cast<int>(IntComparisonMode::GreaterThanOrEqual);
+            }
+            break;
+        }
+
+        case NI_SSE_CompareLessThanOrEqual:
+        case NI_SSE_CompareScalarLessThanOrEqual:
+        case NI_SSE2_CompareLessThanOrEqual:
+        case NI_SSE2_CompareScalarLessThanOrEqual:
+        case NI_AVX_CompareLessThanOrEqual:
+        case NI_AVX512F_CompareLessThanOrEqualMask:
+        {
+            if (varTypeIsFloating(simdBaseType))
+            {
+                return static_cast<int>(FloatComparisonMode::OrderedLessThanOrEqualSignaling);
+            }
+            else
+            {
+                assert(id == NI_AVX512F_CompareLessThanOrEqualMask);
+                return static_cast<int>(IntComparisonMode::LessThanOrEqual);
+            }
+            break;
+        }
+
+        case NI_SSE_CompareNotEqual:
+        case NI_SSE_CompareScalarNotEqual:
+        case NI_SSE2_CompareNotEqual:
+        case NI_SSE2_CompareScalarNotEqual:
+        case NI_AVX_CompareNotEqual:
+        case NI_AVX512F_CompareNotEqualMask:
+        {
+            if (varTypeIsFloating(simdBaseType))
+            {
+                return static_cast<int>(FloatComparisonMode::UnorderedNotEqualNonSignaling);
+            }
+            else
+            {
+                assert(id == NI_AVX512F_CompareNotEqualMask);
+                return static_cast<int>(IntComparisonMode::NotEqual);
+            }
+            break;
+        }
+
+        case NI_SSE_CompareNotGreaterThan:
+        case NI_SSE_CompareScalarNotGreaterThan:
+        case NI_SSE2_CompareNotGreaterThan:
+        case NI_SSE2_CompareScalarNotGreaterThan:
+        case NI_AVX_CompareNotGreaterThan:
+        {
+            assert(varTypeIsFloating(simdBaseType));
+
+            if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX))
+            {
+                return static_cast<int>(FloatComparisonMode::UnorderedNotGreaterThanSignaling);
+            }
+
+            // CompareNotGreaterThan is not directly supported in hardware without AVX support.
+            // We will return the inverted case here and lowering will itself swap the ops
+            // to ensure the emitted code remains correct. This simplifies the overall logic
+            // here and for other use cases.
+
+            assert(id != NI_AVX_CompareNotGreaterThan);
+            return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanSignaling);
+        }
+
+        case NI_SSE_CompareNotLessThan:
+        case NI_SSE_CompareScalarNotLessThan:
+        case NI_SSE2_CompareNotLessThan:
+        case NI_SSE2_CompareScalarNotLessThan:
+        case NI_AVX_CompareNotLessThan:
+        {
+            assert(varTypeIsFloating(simdBaseType));
+            return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanSignaling);
+        }
+
+        case NI_SSE_CompareNotGreaterThanOrEqual:
+        case NI_SSE_CompareScalarNotGreaterThanOrEqual:
+        case NI_SSE2_CompareNotGreaterThanOrEqual:
+        case NI_SSE2_CompareScalarNotGreaterThanOrEqual:
+        case NI_AVX_CompareNotGreaterThanOrEqual:
+        {
+            assert(varTypeIsFloating(simdBaseType));
+
+            if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX))
+            {
+                return static_cast<int>(FloatComparisonMode::UnorderedNotGreaterThanOrEqualSignaling);
+            }
+
+            // CompareNotGreaterThanOrEqual is not directly supported in hardware without AVX support.
+            // We will return the inverted case here and lowering will itself swap the ops
+            // to ensure the emitted code remains correct. This simplifies the overall logic
+            // here and for other use cases.
+
+            assert(id != NI_AVX_CompareNotGreaterThanOrEqual);
+            return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling);
+        }
+
+        case NI_SSE_CompareNotLessThanOrEqual:
+        case NI_SSE_CompareScalarNotLessThanOrEqual:
+        case NI_SSE2_CompareNotLessThanOrEqual:
+        case NI_SSE2_CompareScalarNotLessThanOrEqual:
+        case NI_AVX_CompareNotLessThanOrEqual:
+        {
+            assert(varTypeIsFloating(simdBaseType));
+            return static_cast<int>(FloatComparisonMode::UnorderedNotLessThanOrEqualSignaling);
+        }
+
+        case NI_SSE_CompareOrdered:
+        case NI_SSE_CompareScalarOrdered:
+        case NI_SSE2_CompareOrdered:
+        case NI_SSE2_CompareScalarOrdered:
+        case NI_AVX_CompareOrdered:
+        {
+            assert(varTypeIsFloating(simdBaseType));
+            return static_cast<int>(FloatComparisonMode::OrderedNonSignaling);
+        }
+
+        case NI_SSE_CompareUnordered:
+        case NI_SSE_CompareScalarUnordered:
+        case NI_SSE2_CompareUnordered:
+        case NI_SSE2_CompareScalarUnordered:
+        case NI_AVX_CompareUnordered:
+        {
+            assert(varTypeIsFloating(simdBaseType));
+            return static_cast<int>(FloatComparisonMode::UnorderedNonSignaling);
+        }
+
+        case NI_SSE41_Ceiling:
+        case NI_SSE41_CeilingScalar:
+        case NI_AVX_Ceiling:
+        {
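+            // Ceiling is RoundToPositiveInfinity, so it shares that imm8.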
+            FALLTHROUGH;
+        }
+
+        case NI_SSE41_RoundToPositiveInfinity:
+        case NI_SSE41_RoundToPositiveInfinityScalar:
+        case NI_AVX_RoundToPositiveInfinity:
+        {
+            assert(varTypeIsFloating(simdBaseType));
+            return static_cast<int>(FloatRoundingMode::ToPositiveInfinity);
+        }
+
+        case NI_SSE41_Floor:
+        case NI_SSE41_FloorScalar:
+        case NI_AVX_Floor:
+        {
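+            // Floor is RoundToNegativeInfinity, so it shares that imm8.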
+            FALLTHROUGH;
+        }
+
+        case NI_SSE41_RoundToNegativeInfinity:
+        case NI_SSE41_RoundToNegativeInfinityScalar:
+        case NI_AVX_RoundToNegativeInfinity:
+        {
+            assert(varTypeIsFloating(simdBaseType));
+            return static_cast<int>(FloatRoundingMode::ToNegativeInfinity);
+        }
+
+        case NI_SSE41_RoundCurrentDirection:
+        case NI_SSE41_RoundCurrentDirectionScalar:
+        case NI_AVX_RoundCurrentDirection:
+        {
+            assert(varTypeIsFloating(simdBaseType));
+            return static_cast<int>(FloatRoundingMode::CurrentDirection);
+        }
+
+        case NI_SSE41_RoundToNearestInteger:
+        case NI_SSE41_RoundToNearestIntegerScalar:
+        case NI_AVX_RoundToNearestInteger:
+        {
+            assert(varTypeIsFloating(simdBaseType));
+            return static_cast<int>(FloatRoundingMode::ToNearestInteger);
+        }
+
+        case NI_SSE41_RoundToZero:
+        case NI_SSE41_RoundToZeroScalar:
+        case NI_AVX_RoundToZero:
+        {
+            assert(varTypeIsFloating(simdBaseType));
+            return static_cast<int>(FloatRoundingMode::ToZero);
+        }
+
+        default:
+        {
+            break;
+        }
+    }
+
+    return -1;
+}
+
+//------------------------------------------------------------------------
 // impNonConstFallback: convert certain SSE2/AVX2 shift intrinsic to its semantic alternative when the imm-arg is
 // not a compile-time constant
 //
@@ -2871,10 +3167,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic        intrinsic,
                 // These intrinsics are "special import" because the non-AVX path isn't directly
                 // hardware supported. Instead, they start with "swapped operands" and we fix that here.
 
-                FloatComparisonMode comparison =
-                    static_cast<FloatComparisonMode>(HWIntrinsicInfo::lookupIval(intrinsic, true));
-                retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, gtNewIconNode(static_cast<int>(comparison)),
-                                                   NI_AVX_CompareScalar, simdBaseJitType, simdSize);
+                int ival = HWIntrinsicInfo::lookupIval(this, intrinsic, simdBaseType);
+                retNode  = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, gtNewIconNode(ival), NI_AVX_CompareScalar,
+                                                   simdBaseJitType, simdSize);
             }
             else
             {
@@ -2931,10 +3226,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic        intrinsic,
                 // These intrinsics are "special import" because the non-AVX path isn't directly
                 // hardware supported. Instead, they start with "swapped operands" and we fix that here.
 
-                FloatComparisonMode comparison =
-                    static_cast<FloatComparisonMode>(HWIntrinsicInfo::lookupIval(intrinsic, true));
-                retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, gtNewIconNode(static_cast<int>(comparison)),
-                                                   NI_AVX_CompareScalar, simdBaseJitType, simdSize);
+                int ival = HWIntrinsicInfo::lookupIval(this, intrinsic, simdBaseType);
+                retNode  = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, gtNewIconNode(ival), NI_AVX_CompareScalar,
+                                                   simdBaseJitType, simdSize);
             }
             else
             {
index cb03f30..bbc8089 100644 (file)
@@ -146,8 +146,7 @@ enum insFlags : uint64_t
     // Avx
     INS_Flags_IsDstDstSrcAVXInstruction = 1ULL << 26,
     INS_Flags_IsDstSrcSrcAVXInstruction = 1ULL << 27,
-    INS_Flags_IsMskSrcSrcEvexInstruction = 1ULL << 28,
-    INS_Flags_Is3OperandInstructionMask = (INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_IsDstSrcSrcAVXInstruction | INS_Flags_IsMskSrcSrcEvexInstruction),
+    INS_Flags_Is3OperandInstructionMask = (INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_IsDstSrcSrcAVXInstruction),
 
     // w and s bits
     INS_FLAGS_Has_Wbit = 1ULL << 29,
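
With the dedicated mask flag gone, a mask-producing EVEX compare answers the same three-operand query as any VEX dst/dst/src instruction. A sketch of the test, with insFlagsOf standing in for however the emitter fetches an instruction's insFlags:

    // True for vcmpps/vpcmpd/etc. as well as the classic VEX 3-operand forms.
    bool is3Operand = (insFlagsOf(ins) & INS_Flags_Is3OperandInstructionMask) != 0;
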
index 4cb9cc2..161df44 100644 (file)
@@ -608,19 +608,28 @@ INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE,
 INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 // AVX512F
-INST3(kmovw_gpr,        "kmovw",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x92),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)
-INST3(kmovw_msk,        "kmovw",            IUM_WR, PCKFLT(0x91),           BAD_CODE,     PCKFLT(0x90),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)
-INST3(kortestw,         "kortestw",         IUM_RD, BAD_CODE,               BAD_CODE,     PCKFLT(0x98),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)
+INST3(kandw,            "kandw",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x41),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical AND masks
+INST3(kandnw,           "kandnw",           IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x42),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical AND NOT masks
+INST3(kmovw_gpr,        "kmovw",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x92),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Move from and to mask registers
+INST3(kmovw_msk,        "kmovw",            IUM_WR, PCKFLT(0x91),           BAD_CODE,     PCKFLT(0x90),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Move from and to mask registers
+INST3(knotw,            "knotw",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x44),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // NOT mask register
+INST3(korw,             "korw",             IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x45),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical OR masks
+INST3(kortestw,         "kortestw",         IUM_RD, BAD_CODE,               BAD_CODE,     PCKFLT(0x98),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)                  // OR masks and set flags
+INST3(kshiftlw,         "kshiftlw",         IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x32),                   INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Shift left mask registers
+INST3(kshiftrw,         "kshiftrw",         IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x30),                   INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Shift right mask registers
+INST3(kunpckbw,         "kunpckbw",         IUM_RD, BAD_CODE,               BAD_CODE,     PCKDBL(0x4B),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Unpack for mask registers
+INST3(kxnorw,           "kxnorw",           IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x46),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical XNOR masks
+INST3(kxorw,            "kxorw",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x47),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical XOR masks
 INST3(valignd,          "alignd",           IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x03),                   INS_TT_FULL,                         Input_32Bit    | REX_W0                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Align doubleword vectors
 INST3(valignq,          "alignq",           IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x03),                   INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Align quadword vectors
 INST3(vbroadcastf64x2,  "broadcastf64x2",   IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x1A),                   INS_TT_TUPLE2,                       Input_64Bit    | REX_W1                       | Encoding_EVEX)                                                                                                                                  // Broadcast packed float values read from memory to entire register
 INST3(vbroadcasti64x2,  "broadcasti64x2",   IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x5A),                   INS_TT_TUPLE2,                       Input_64Bit    | REX_W1                       | Encoding_EVEX)                                                                                                                                  // Broadcast packed integer values read from memory to entire register
 INST3(vbroadcastf64x4,  "broadcastf64x4",   IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x1B),                   INS_TT_TUPLE2,                       Input_64Bit    | REX_W1                       | Encoding_EVEX)                                                                                                                                  // Broadcast packed float values read from memory to entire register
 INST3(vbroadcasti64x4,  "broadcasti64x4",   IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x5B),                   INS_TT_TUPLE2,                       Input_64Bit    | REX_W1                       | Encoding_EVEX)                                                                                                                                  // Broadcast packed integer values read from memory to entire register
-INST3(vcmpps,           "cmpps",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0xC2),                  INS_TT_FULL,                         Input_32Bit    | REX_W0                       | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)                                                                                          // compare packed singles
-INST3(vcmpss,           "cmpss",            IUM_WR, BAD_CODE,               BAD_CODE,     SSEFLT(0xC2),                  INS_TT_TUPLE1_SCALAR,                Input_32Bit    | REX_W0                       | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)                                                                                          // compare scalar singles
-INST3(vcmppd,           "cmppd",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0xC2),                  INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)                                                                                          // compare packed doubles
-INST3(vcmpsd,           "cmpsd",            IUM_WR, BAD_CODE,               BAD_CODE,     SSEDBL(0xC2),                  INS_TT_TUPLE1_SCALAR,                Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)                                                                                          // compare scalar doubles
+INST3(vcmpps,           "cmpps",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0xC2),                  INS_TT_FULL,                         Input_32Bit    | REX_W0                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                          // compare packed singles
+INST3(vcmpss,           "cmpss",            IUM_WR, BAD_CODE,               BAD_CODE,     SSEFLT(0xC2),                  INS_TT_TUPLE1_SCALAR,                Input_32Bit    | REX_W0                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                          // compare scalar singles
+INST3(vcmppd,           "cmppd",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0xC2),                  INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                          // compare packed doubles
+INST3(vcmpsd,           "cmpsd",            IUM_WR, BAD_CODE,               BAD_CODE,     SSEDBL(0xC2),                  INS_TT_TUPLE1_SCALAR,                Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                          // compare scalar doubles
 INST3(vcvtpd2udq,       "cvtpd2udq",        IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x79),                  INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX)                                                                                                                                  // cvt packed doubles to unsigned DWORDs
 INST3(vcvtps2udq,       "cvtps2udq",        IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x79),                  INS_TT_FULL,                         Input_32Bit    | REX_W0                       | Encoding_EVEX)                                                                                                                                  // cvt packed singles to unsigned DWORDs
 INST3(vcvtsd2usi,       "cvtsd2usi",        IUM_WR, BAD_CODE,               BAD_CODE,     SSEDBL(0x79),                  INS_TT_TUPLE1_FIXED,                 Input_64Bit    | REX_WX                       | Encoding_EVEX)                                                                                                                                  // cvt scalar double to unsigned DWORD/QWORD
@@ -658,10 +667,10 @@ INST3(vpandq,           "pandq",            IUM_WR, BAD_CODE,               BAD_
 INST3(vpandnq,          "pandnq",           IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0xDF),                  INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Packed bit-wise AND NOT of two xmm regs
 INST3(vpbroadcastd_gpr, "pbroadcastd",      IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x7C),                   INS_TT_TUPLE1_SCALAR,                Input_32Bit    | REX_W0                       | Encoding_EVEX)                                                                                                                                  // Broadcast int32 value from gpr to entire register
 INST3(vpbroadcastq_gpr, "pbroadcastq",      IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x7C),                   INS_TT_TUPLE1_SCALAR,                Input_64Bit    | REX_W1                       | Encoding_EVEX)                                                                                                                                  // Broadcast int64 value from gpr to entire register
-INST3(vpcmpeqd,         "pcmpeqd",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x76),                  INS_TT_FULL,                         Input_32Bit    | REX_W0                       | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)                                                                                          // Packed compare 32-bit integers for equality
-INST3(vpcmpgtd,         "pcmpgtd",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x66),                  INS_TT_FULL,                         Input_32Bit    | REX_W0                       | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)                                                                                          // Packed compare 32-bit signed integers for greater than
-INST3(vpcmpeqq,         "pcmpeqq",          IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x29),                   INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)                                                                                          // Packed compare 64-bit integers for equality
-INST3(vpcmpgtq,         "pcmpgtq",          IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x37),                   INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)                                                                                          // Packed compare 64-bit integers for equality
+INST3(vpcmpeqd,         "pcmpeqd",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x76),                  INS_TT_FULL,                         Input_32Bit    | REX_W0                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                          // Packed compare 32-bit integers for equality
+INST3(vpcmpgtd,         "pcmpgtd",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x66),                  INS_TT_FULL,                         Input_32Bit    | REX_W0                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                          // Packed compare 32-bit signed integers for greater than
+INST3(vpcmpeqq,         "pcmpeqq",          IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x29),                   INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                          // Packed compare 64-bit integers for equality
+INST3(vpcmpgtq,         "pcmpgtq",          IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x37),                   INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                          // Packed compare 64-bit signed integers for greater than
 INST3(vpermq_reg,       "permq",            IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x36),                   INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstSrcSrcAVXInstruction)                                                                                           // Permute 64-bit of input register
 INST3(vpermpd_reg,      "permpd",           IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x16),                   INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstSrcSrcAVXInstruction)                                                                                           // Permute 64-bit of input register
 INST3(vpermi2d,         "permi2d",          IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x76),                   INS_TT_FULL,                         Input_32Bit    | REX_W0                       | Encoding_EVEX  | INS_Flags_IsDstSrcSrcAVXInstruction)                                                                                           // Full Permute From Two Tables Overwriting the Index
@@ -735,25 +744,47 @@ INST3(vshufi32x4,       "shufi32x4",        IUM_WR, BAD_CODE,               BAD_
 INST3(vshufi64x2,       "shufi64x2",        IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x43),                   INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Shuffle packed values at 128-bit granularity
 
 // AVX512BW
-INST3(kmovd_gpr,        "kmovd",            IUM_WR, BAD_CODE,               BAD_CODE,     SSEDBL(0x92),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)
-INST3(kmovd_msk,        "kmovd",            IUM_WR, PCKDBL(0x91),           BAD_CODE,     PCKDBL(0x90),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)
-INST3(kmovq_gpr,        "kmovq",            IUM_WR, BAD_CODE,               BAD_CODE,     SSEDBL(0x92),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)
-INST3(kmovq_msk,        "kmovq",            IUM_WR, PCKFLT(0x91),           BAD_CODE,     PCKFLT(0x90),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)
-INST3(kortestd,         "kortestd",         IUM_RD, BAD_CODE,               BAD_CODE,     PCKDBL(0x98),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)
-INST3(kortestq,         "kortestq",         IUM_RD, BAD_CODE,               BAD_CODE,     PCKFLT(0x98),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)
+INST3(kaddd,            "kaddd",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x4A),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Add two masks
+INST3(kaddq,            "kaddq",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x4A),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Add two masks
+INST3(kandd,            "kandd",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x41),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical AND masks
+INST3(kandq,            "kandq",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x41),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical AND masks
+INST3(kandnd,           "kandnd",           IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x42),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical AND NOT masks
+INST3(kandnq,           "kandnq",           IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x42),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical AND NOT masks
+INST3(kmovd_gpr,        "kmovd",            IUM_WR, BAD_CODE,               BAD_CODE,     SSEDBL(0x92),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Move from and to mask registers
+INST3(kmovd_msk,        "kmovd",            IUM_WR, PCKDBL(0x91),           BAD_CODE,     PCKDBL(0x90),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Move from and to mask registers
+INST3(kmovq_gpr,        "kmovq",            IUM_WR, BAD_CODE,               BAD_CODE,     SSEDBL(0x92),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Move from and to mask registers
+INST3(kmovq_msk,        "kmovq",            IUM_WR, PCKFLT(0x91),           BAD_CODE,     PCKFLT(0x90),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Move from and to mask registers
+INST3(knotd,            "knotd",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x44),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // NOT mask register
+INST3(knotq,            "knotq",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x44),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // NOT mask register
+INST3(kord,             "kord",             IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x45),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical OR masks
+INST3(korq,             "korq",             IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x45),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical OR masks
+INST3(kortestd,         "kortestd",         IUM_RD, BAD_CODE,               BAD_CODE,     PCKDBL(0x98),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)                  // OR masks and set flags
+INST3(kortestq,         "kortestq",         IUM_RD, BAD_CODE,               BAD_CODE,     PCKFLT(0x98),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)                  // OR masks and set flags
+INST3(kshiftld,         "kshiftld",         IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x33),                   INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Shift left mask registers
+INST3(kshiftlq,         "kshiftlq",         IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x33),                   INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Shift left mask registers
+INST3(kshiftrd,         "kshiftrd",         IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x31),                   INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Shift right mask registers
+INST3(kshiftrq,         "kshiftrq",         IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x31),                   INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Shift right mask registers
+INST3(ktestd,           "ktestd",           IUM_RD, BAD_CODE,               BAD_CODE,     PCKDBL(0x99),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)                  // Packed bit test masks and set flags
+INST3(ktestq,           "ktestq",           IUM_RD, BAD_CODE,               BAD_CODE,     PCKFLT(0x99),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)                  // Packed bit test masks and set flags
+INST3(kunpckdq,         "kunpckdq",         IUM_RD, BAD_CODE,               BAD_CODE,     PCKFLT(0x4B),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Unpack for mask registers
+INST3(kunpckwd,         "kunpckwd",         IUM_RD, BAD_CODE,               BAD_CODE,     PCKFLT(0x4B),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Unpack for mask registers
+INST3(kxnord,           "kxnord",           IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x46),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical XNOR masks
+INST3(kxnorq,           "kxnorq",           IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x46),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical XNOR masks
+INST3(kxord,            "kxord",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x47),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical XOR masks
+INST3(kxorq,            "kxorq",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x47),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical XOR masks
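+// Note: most kmask ALU instructions (kand, kandn, kor, kxnor, kxor, knot, kadd, kortest,
+// ktest) share a single opcode byte across operand widths; the width is selected by the
+// SIMD prefix and the VEX.W bit. As encoded above: byte forms use 66 + W0 (PCKDBL), word
+// forms use no prefix + W0 (PCKFLT), dword forms use 66 + W1, and qword forms use no
+// prefix + W1. The kshift and kmov gpr forms deviate from this pattern (kshift always
+// carries 66 0F 3A, and the kmov d/q gpr forms use the F2 prefix, i.e. SSEDBL).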
 INST3(vdbpsadbw,        "dbpsadbw",         IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x42),                   INS_TT_FULL_MEM,                     Input_8Bit     | REX_W0                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Double block packed Sum-Absolute-Differences (SAD) on unsigned bytes
 INST3(vmovdqu8,         "movdqu8",          IUM_WR, SSEFLT(0x7F),           BAD_CODE,     SSEFLT(0x6F),                  INS_TT_FULL_MEM,                     Input_8Bit     | REX_W0                       | Encoding_EVEX)
 INST3(vmovdqu16,        "movdqu16",         IUM_WR, SSEFLT(0x7F),           BAD_CODE,     SSEFLT(0x6F),                  INS_TT_FULL_MEM,                     Input_16Bit    | REX_W1                       | Encoding_EVEX)
 INST3(vpbroadcastb_gpr, "pbroadcastb",      IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x7A),                   INS_TT_TUPLE1_SCALAR,                Input_8Bit     | REX_W0                       | Encoding_EVEX)                                                                                                                                  // Broadcast int8 value from gpr to entire register
 INST3(vpbroadcastw_gpr, "pbroadcastw",      IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x7B),                   INS_TT_TUPLE1_SCALAR,                Input_16Bit    | REX_W0                       | Encoding_EVEX)                                                                                                                                  // Broadcast int16 value from gpr to entire register
-INST3(vpcmpb,           "pcmpb",            IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3F),                   INS_TT_FULL_MEM,                     Input_8Bit     | REX_W0                       | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)
-INST3(vpcmpeqb,         "pcmpeqb",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x74),                  INS_TT_FULL_MEM,                     Input_8Bit     | REX_WIG                      | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)                                                                                          // Packed compare 8-bit integers for equality
-INST3(vpcmpeqw,         "pcmpeqw",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x75),                  INS_TT_FULL_MEM,                     Input_16Bit    | REX_WIG                      | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)                                                                                          // Packed compare 16-bit integers for equality
-INST3(vpcmpgtb,         "pcmpgtb",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x64),                  INS_TT_FULL_MEM,                     Input_8Bit     | REX_WIG                      | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)                                                                                          // Packed compare 8-bit signed integers for greater than
-INST3(vpcmpgtw,         "pcmpgtw",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x65),                  INS_TT_FULL_MEM,                     Input_16Bit    | REX_WIG                      | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)                                                                                          // Packed compare 16-bit signed integers for greater than
-INST3(vpcmpw,           "pcmpw",            IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3F),                   INS_TT_FULL_MEM,                     Input_16Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)
-INST3(vpcmpub,          "pcmpub",           IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3E),                   INS_TT_FULL_MEM,                     Input_8Bit     | REX_W0                       | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)
-INST3(vpcmpuw,          "pcmpuw",           IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3E),                   INS_TT_FULL_MEM,                     Input_16Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)
+INST3(vpcmpb,           "pcmpb",            IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3F),                   INS_TT_FULL_MEM,                     Input_8Bit     | REX_W0                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(vpcmpeqb,         "pcmpeqb",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x74),                  INS_TT_FULL_MEM,                     Input_8Bit     | REX_WIG                      | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                          // Packed compare 8-bit integers for equality
+INST3(vpcmpeqw,         "pcmpeqw",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x75),                  INS_TT_FULL_MEM,                     Input_16Bit    | REX_WIG                      | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                          // Packed compare 16-bit integers for equality
+INST3(vpcmpgtb,         "pcmpgtb",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x64),                  INS_TT_FULL_MEM,                     Input_8Bit     | REX_WIG                      | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                          // Packed compare 8-bit signed integers for greater than
+INST3(vpcmpgtw,         "pcmpgtw",          IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x65),                  INS_TT_FULL_MEM,                     Input_16Bit    | REX_WIG                      | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                          // Packed compare 16-bit signed integers for greater than
+INST3(vpcmpw,           "pcmpw",            IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3F),                   INS_TT_FULL_MEM,                     Input_16Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(vpcmpub,          "pcmpub",           IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3E),                   INS_TT_FULL_MEM,                     Input_8Bit     | REX_W0                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(vpcmpuw,          "pcmpuw",           IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3E),                   INS_TT_FULL_MEM,                     Input_16Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)
 INST3(vpermw,           "permw",            IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x8D),                   INS_TT_FULL_MEM,                     Input_16Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstSrcSrcAVXInstruction)                                                                                           // Permute Packed Doublewords Elements
 INST3(vpermi2w,         "permi2w",          IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x75),                   INS_TT_FULL_MEM,                     Input_16Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstSrcSrcAVXInstruction)                                                                                           // Full Permute From Two Tables Overwriting the Index
 INST3(vpermt2w,         "permt2w",          IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x7D),                   INS_TT_FULL_MEM,                     Input_16Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstSrcSrcAVXInstruction)                                                                                           // Full Permute From Two Tables Overwriting one Table
@@ -775,9 +806,21 @@ INST3(vplzcntd,         "plzcntd",          IUM_WR, BAD_CODE,               BAD_
 INST3(vplzcntq,         "plzcntq",          IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x44),                   INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX)                                                                                                                                  // Count the number of leading zero bits for packed qword values
 
 // AVX512DQ
-INST3(kortestb,         "kortestb",         IUM_RD, BAD_CODE,               BAD_CODE,     PCKDBL(0x98),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)
-INST3(kmovb_gpr,        "kmovb",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x92),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)
-INST3(kmovb_msk,        "kmovb",            IUM_WR, PCKDBL(0x91),           BAD_CODE,     PCKDBL(0x90),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)
+INST3(kaddb,            "kaddb",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x4A),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Add two masks
+INST3(kaddw,            "kaddw",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x4A),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Add two masks
+INST3(kandb,            "kandb",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x41),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical AND masks
+INST3(kandnb,           "kandnb",           IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x42),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical AND NOT masks
+INST3(kmovb_gpr,        "kmovb",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x92),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Move from and to mask registers
+INST3(kmovb_msk,        "kmovb",            IUM_WR, PCKDBL(0x91),           BAD_CODE,     PCKDBL(0x90),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Move from and to mask registers
+INST3(knotb,            "knotb",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x44),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // NOT mask register
+INST3(korb,             "korb",             IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x45),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical OR masks
+INST3(kortestb,         "kortestb",         IUM_RD, BAD_CODE,               BAD_CODE,     PCKDBL(0x98),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)                  // OR masks and set flags
+INST3(kshiftlb,         "kshiftlb",         IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x32),                   INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Shift left mask registers
+INST3(kshiftrb,         "kshiftrb",         IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x30),                   INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Shift right mask registers
+INST3(ktestb,           "ktestb",           IUM_RD, BAD_CODE,               BAD_CODE,     PCKDBL(0x99),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)                  // Packed bit test masks and set flags
+INST3(ktestw,           "ktestw",           IUM_RD, BAD_CODE,               BAD_CODE,     PCKFLT(0x99),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)                  // Packed bit test masks and set flags
+INST3(kxnorb,           "kxnorb",           IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x46),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical XNOR masks
+INST3(kxorb,            "kxorb",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0x47),                  INS_TT_NONE,                                          REX_W0                       | Encoding_VEX                                                                                                   | KInstruction)                  // Bitwise logical XOR masks
 INST3(vbroadcastf32x2,  "broadcastf32x2",   IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x19),                   INS_TT_TUPLE2,                       Input_32Bit    | REX_W0                       | Encoding_EVEX)                                                                                                                                  // Broadcast packed float values read from memory to entire register
 INST3(vbroadcasti32x2,  "broadcasti32x2",   IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x59),                   INS_TT_TUPLE2,                       Input_32Bit    | REX_W0                       | Encoding_EVEX)                                                                                                                                  // Broadcast packed integer values read from memory to entire register
 INST3(vbroadcastf32x8,  "broadcastf32x8",   IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x1B),                   INS_TT_TUPLE8,                       Input_32Bit    | REX_W0                       | Encoding_EVEX)                                                                                                                                  // Broadcast packed float values read from memory to entire register
index f49b024..2afbeb1 100644 (file)
@@ -355,11 +355,11 @@ private:
     GenTree* LowerHWIntrinsic(GenTreeHWIntrinsic* node);
     void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition);
     GenTree* LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp);
-    GenTree* LowerHWIntrinsicCmpOpWithKReg(GenTreeHWIntrinsic* node);
     GenTree* LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node);
     GenTree* LowerHWIntrinsicDot(GenTreeHWIntrinsic* node);
 #if defined(TARGET_XARCH)
     void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
+    GenTree* LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node);
     GenTree* LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node);
     GenTree* LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node);
     GenTree* LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node);
index 4a69c64..24a581a 100644 (file)
@@ -894,12 +894,20 @@ void Lowering::LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIn
             }
             break;
 
-        case NI_AVX512F_KORTEST:
         case NI_SSE41_PTEST:
         case NI_AVX_PTEST:
+        {
             // If we need the Carry flag then we can't swap operands.
             canSwapOperands = (cc == nullptr) || cc->gtCondition.Is(GenCondition::EQ, GenCondition::NE);
             break;
+        }
+
+        case NI_AVX512F_KORTEST:
+        {
+            // TODO-XARCH-AVX512 remove the KORTEST check when it's promoted to 2 proper arguments
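+            // For now it is modeled as taking a single operand that supplies both sources (it is
+            // emitted as `kortest k1, k1`), so there is no operand swapping to consider here.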
+            assert(HWIntrinsicInfo::lookupNumArgs(newIntrinsicId) == 1);
+            break;
+        }
 
         default:
             unreached();
@@ -1166,28 +1174,16 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
 
         case NI_Vector128_op_Equality:
         case NI_Vector256_op_Equality:
+        case NI_Vector512_op_Equality:
         {
             return LowerHWIntrinsicCmpOp(node, GT_EQ);
         }
 
         case NI_Vector128_op_Inequality:
         case NI_Vector256_op_Inequality:
-        {
-            return LowerHWIntrinsicCmpOp(node, GT_NE);
-        }
-
-        case NI_Vector512_GreaterThanAll:
-        case NI_Vector512_GreaterThanAny:
-        case NI_Vector512_GreaterThanOrEqualAll:
-        case NI_Vector512_GreaterThanOrEqualAny:
-        case NI_Vector512_LessThanAll:
-        case NI_Vector512_LessThanAny:
-        case NI_Vector512_LessThanOrEqualAll:
-        case NI_Vector512_LessThanOrEqualAny:
-        case NI_Vector512_op_Equality:
         case NI_Vector512_op_Inequality:
         {
-            return LowerHWIntrinsicCmpOpWithKReg(node);
+            return LowerHWIntrinsicCmpOp(node, GT_NE);
         }
 
         case NI_Vector128_ToScalar:
@@ -1614,6 +1610,32 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
             LowerFusedMultiplyAdd(node);
             break;
 
+        case NI_AVX512F_CompareEqual:
+        case NI_AVX512F_CompareGreaterThan:
+        case NI_AVX512F_CompareGreaterThanOrEqual:
+        case NI_AVX512F_CompareLessThan:
+        case NI_AVX512F_CompareLessThanOrEqual:
+        case NI_AVX512F_CompareNotEqual:
+        case NI_AVX512F_VL_CompareGreaterThan:
+        case NI_AVX512F_VL_CompareGreaterThanOrEqual:
+        case NI_AVX512F_VL_CompareLessThan:
+        case NI_AVX512F_VL_CompareLessThanOrEqual:
+        case NI_AVX512F_VL_CompareNotEqual:
+        case NI_AVX512BW_CompareEqual:
+        case NI_AVX512BW_CompareGreaterThan:
+        case NI_AVX512BW_CompareGreaterThanOrEqual:
+        case NI_AVX512BW_CompareLessThan:
+        case NI_AVX512BW_CompareLessThanOrEqual:
+        case NI_AVX512BW_CompareNotEqual:
+        case NI_AVX512BW_VL_CompareGreaterThan:
+        case NI_AVX512BW_VL_CompareGreaterThanOrEqual:
+        case NI_AVX512BW_VL_CompareLessThan:
+        case NI_AVX512BW_VL_CompareLessThanOrEqual:
+        case NI_AVX512BW_VL_CompareNotEqual:
+        {
+            return LowerHWIntrinsicWithAvx512Mask(node);
+        }
+
         default:
             break;
     }
@@ -1638,7 +1660,8 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
     var_types      simdType        = Compiler::getSIMDTypeForSize(simdSize);
 
     assert((intrinsicId == NI_Vector128_op_Equality) || (intrinsicId == NI_Vector128_op_Inequality) ||
-           (intrinsicId == NI_Vector256_op_Equality) || (intrinsicId == NI_Vector256_op_Inequality));
+           (intrinsicId == NI_Vector256_op_Equality) || (intrinsicId == NI_Vector256_op_Inequality) ||
+           (intrinsicId == NI_Vector512_op_Equality) || (intrinsicId == NI_Vector512_op_Inequality));
 
     assert(varTypeIsSIMD(simdType));
     assert(varTypeIsArithmetic(simdBaseType));
@@ -1655,8 +1678,9 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
     GenTree*     op2    = node->Op(2);
     GenCondition cmpCnd = (cmpOp == GT_EQ) ? GenCondition::EQ : GenCondition::NE;
 
-    if (!varTypeIsFloating(simdBaseType) && op2->IsVectorZero() &&
-        comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
+    if (!varTypeIsFloating(simdBaseType) && (simdSize != 64) && op2->IsVectorZero() &&
+        comp->compOpportunisticallyDependsOn(InstructionSet_SSE41) &&
+        !op1->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector))
     {
         // On SSE4.1 or higher we can optimize comparisons against zero to
         // just use PTEST. We can't support it for floating-point, however,
@@ -1681,14 +1705,269 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
         }
         else
         {
+            assert(simdSize == 16);
+
             // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
             node->ChangeHWIntrinsicId(NI_SSE41_TestZ);
             LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd);
         }
 
-        return node->gtNext;
+        return LowerNode(node);
     }
 
+    // TODO-XARCH-AVX512: We should handle TYP_SIMD12 here under the EVEX path, but doing
+    // so will require us to account for the unused 4th element.
+
+    if ((simdType != TYP_SIMD12) && comp->IsBaselineVector512IsaSupported())
+    {
+        // The EVEX-encoded versions of the comparison instructions all return a kmask
+        //
+        // For the comparisons against zero that we normally optimize to use `PTEST`, we
+        // have to decide whether to use EVEX and emit 2 instructions (vpcmp + kortest)
+        // or to continue emitting PTEST and hope that the register allocator isn't limited
+        // by it not supporting the extended register set.
+        //
+        // Ideally we'd opt to not use PTEST when EVEX is available. This would let us best
+        // take advantage of EVEX-exclusive features such as embedded broadcast and the
+        // 16 additional registers. In many cases this allows for overall denser codegen where
+        // we are doing more in the same number of bytes, even though the individual instruction
+        // is 1-2 bytes larger. Even though there may be cases where continuing to use PTEST for
+        // select 128/256-bit code paths would still be beneficial, the additional complexity
+        // required to detect and account for those differences is not likely to be worth the tradeoff.
+        //
+        // TODO-XARCH-AVX512: Given the above, don't emit the PTEST path above when AVX-512 is available
+        // This will require exposing `NI_AVX512F_TestZ` so that we can keep codegen optimized to just
+        // `vptestm` followed by `kortest`. This will be one instruction more than just `vptest` but
+        // it has the advantages detailed above.
+        //
+        // For other comparisons, using EVEX allows us to avoid leaving the SIMD domain, avoids
+        // needing to use a general-purpose register, and allows us to generate fewer instructions.
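+        //
+        // As an illustrative sketch (register choices arbitrary), the two shapes being weighed
+        // for a comparison against zero are:
+        //
+        //   vptest   ymm0, ymm0          ; SSE41/AVX path: ZF == 1 iff the vector is zero
+        //   sete     al
+        //
+        // versus the EVEX path that keeps the extended register set and embedded broadcast
+        // available:
+        //
+        //   vptestmd k1, zmm0, zmm0      ; kmask bit i is set iff element i is non-zero
+        //   kortestw k1, k1              ; ZF == 1 iff no mask bit was set
+        //   sete     al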
+
+        GenTree* nextNode = node->gtNext;
+
+        NamedIntrinsic maskIntrinsicId = NI_AVX512F_CompareEqualMask;
+        uint32_t       count           = simdSize / genTypeSize(simdBaseType);
+
+        // KORTEST does a bitwise OR of its two operands and sets ZF if the result is zero and CF
+        // if it is all bits set. Because of this, when we have at least 8 elements to compare we
+        // can use a normal comparison alongside CF.
+        //
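+        // In pseudocode (shown for the 8-bit mask used by KORTESTB; wider forms are analogous):
+        //
+        //   tmp = k1 | k2
+        //   ZF  = (tmp == 0x00)
+        //   CF  = (tmp == 0xFF)
+        //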
+        // That is, if the user wants `x == y`, we can keep it as `mask = (x == y)` and then emit
+        // `kortest mask, mask` and check `CF == 1`. This will be true if all elements matched and
+        // false otherwise. Things work out nicely and we keep readable disasm.
+        //
+        // Likewise, if the user wants `x != y`, we can keep it as `mask = (x != y)` and then emit
+        // `kortest mask, mask` and check `ZF == 0`. This will be true if any elements mismatched.
+        //
+        // However, if we have fewer than 8 elements then we have to change things up, since we have
+        // fewer than 8 bits in the output mask and the unused bits will be set to 0. This occurs for
+        // 32-bit elements when using Vector128 and for 64-bit elements when using either Vector128
+        // or Vector256.
+        //
+        // To account for this, we will invert the comparison being done. So if the user wants
+        // `x == y`, we will instead emit `mask = (x != y)`, we will still emit `kortest mask, mask`,
+        // but we will then check for `ZF == 1`. This works since all elements being equal leaves
+        // no bits set in the mask
+        //
+        // Likewise for `x != y` nothing needs to be inverted: we still emit `mask = (x != y)`, then
+        // `kortest mask, mask`, and check for `ZF == 0`, which equates to one or more elements not
+        // being equal
+
+        // The scenarios we have to handle for a full mask are:
+        // * No matches:      0000_0000 - ZF == 1, CF == 0
+        // * Partial matches: 0000_1111 - ZF == 0, CF == 0
+        // * All matches:     1111_1111 - ZF == 0, CF == 1
+        //
+        // The scenarios we have to handle for a partial mask are:
+        // * No matches:      0000_0000 - ZF == 1, CF == 0
+        // * Partial matches: 0000_0011 - ZF == 0, CF == 0
+        // * All matches:     0000_1111 - ZF == 0, CF == 0
+        //
+        // When we have less than a full mask's worth of elements, we need to account for the upper
+        // bits being implicitly zero. To do that, we may need to invert the comparison.
+        //
+        // By inverting the comparison we'll get:
+        // * All matches:     0000_0000 - ZF == 1, CF == 0
+        // * Partial matches: 0000_0011 - ZF == 0, CF == 0
+        // * No matches:      0000_1111 - ZF == 0, CF == 0
+        //
+        // This works since the upper bits are implicitly zero, and so by inverting, matches also
+        // become zero, which in turn means that `AllBitsSet` becomes `Zero` and the other cases
+        // become non-zero
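+        //
+        // As a concrete example (disassembly illustrative), `Vector128<long>` equality has
+        // count == 2, so rather than emitting:
+        //
+        //   vpcmpeqq k1, xmm0, xmm1
+        //   kortestb k1, k1              ; CF can never be 1 here, only 2 of the 8 bits are used
+        //
+        // we emit the inverted comparison and check ZF instead:
+        //
+        //   vpcmpneqq k1, xmm0, xmm1     ; encoded as vpcmpq with the NEQ predicate
+        //   kortestb  k1, k1
+        //   sete      al                 ; ZF == 1 iff no element differed, i.e. all were equal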
+
+        if (op1->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector) && op2->IsCnsVec())
+        {
+            // We want to specially handle the common cases of `mask op Zero` and `mask op AllBitsSet`
+            //
+            // These get created for the various `gtNewSimdCmpOpAnyNode` and `gtNewSimdCmpOpAllNode`
+            // scenarios and we want to ensure they still get "optimal" codegen. To handle that, we
+            // simply consume the mask directly and preserve the intended comparison by tweaking the
+            // compare condition passed down into `KORTEST`
+
+            GenTreeHWIntrinsic* maskNode = op1->AsHWIntrinsic()->Op(1)->AsHWIntrinsic();
+            assert(maskNode->TypeIs(TYP_MASK));
+
+            bool           isHandled = false;
+            GenTreeVecCon* vecCon    = op2->AsVecCon();
+
+            if (vecCon->IsZero())
+            {
+                // We have `mask == Zero` which is the same as checking that nothing in the mask
+                // is set. This scenario can be handled by `kortest` and then checking that `ZF == 1`
+                //
+                // -or-
+                //
+                // We have `mask != Zero` which is the same as checking that something in the mask
+                // is set. This scenario can be handled by `kortest` and then checking that `ZF == 0`
+                //
+                // Since this is the default state for `CompareEqualMask` + `GT_EQ`/`GT_NE`, there is nothing
+                // for us to change. This also applies to cases where we have less than a full mask of
+                // elements since the upper mask bits are implicitly zero.
+
+                isHandled = true;
+            }
+            else if (vecCon->IsAllBitsSet())
+            {
+                // We have `mask == AllBitsSet` which is the same as checking that everything in the mask
+                // is set. This scenario can be handled by `kortest` and then checking that `CF == 1` for
+                // a full mask and checking `ZF == 1` for a partial mask using an inverted comparison
+                //
+                // -or-
+                //
+                // We have `mask != AllBitsSet` which is the same as checking that something in the mask
+                // is set. This scenario can be handled by `kortest` and then checking that `CF == 0` for
+                // a full mask and checking `ZF != 0` for a partial mask using an inverted comparison
+
+                if (count < 8)
+                {
+                    assert((count == 1) || (count == 2) || (count == 4));
+
+                    switch (maskNode->GetHWIntrinsicId())
+                    {
+                        case NI_AVX512F_CompareEqualMask:
+                        {
+                            maskIntrinsicId = NI_AVX512F_CompareNotEqualMask;
+                            break;
+                        }
+
+                        case NI_AVX512F_CompareGreaterThanMask:
+                        {
+                            maskIntrinsicId = NI_AVX512F_CompareLessThanOrEqualMask;
+                            break;
+                        }
+
+                        case NI_AVX512F_CompareGreaterThanOrEqualMask:
+                        {
+                            maskIntrinsicId = NI_AVX512F_CompareLessThanMask;
+                            break;
+                        }
+
+                        case NI_AVX512F_CompareLessThanMask:
+                        {
+                            maskIntrinsicId = NI_AVX512F_CompareGreaterThanOrEqualMask;
+                            break;
+                        }
+
+                        case NI_AVX512F_CompareLessThanOrEqualMask:
+                        {
+                            maskIntrinsicId = NI_AVX512F_CompareGreaterThanMask;
+                            break;
+                        }
+
+                        case NI_AVX512F_CompareNotEqualMask:
+                        {
+                            maskIntrinsicId = NI_AVX512F_CompareEqualMask;
+                            break;
+                        }
+
+                        default:
+                        {
+                            unreached();
+                        }
+                    }
+
+                    maskNode->ChangeHWIntrinsicId(maskIntrinsicId);
+                }
+                else if (cmpOp == GT_EQ)
+                {
+                    cmpCnd = GenCondition::C;
+                }
+                else
+                {
+                    cmpCnd = GenCondition::NC;
+                }
+                isHandled = true;
+            }
+
+            if (isHandled)
+            {
+                LIR::Use use;
+                if (BlockRange().TryGetUse(node, &use))
+                {
+                    use.ReplaceWith(maskNode);
+                }
+                else
+                {
+                    maskNode->SetUnusedValue();
+                }
+
+                BlockRange().Remove(op2);
+                BlockRange().Remove(op1);
+                BlockRange().Remove(node);
+
+                node = maskNode;
+            }
+        }
+
+        if (node->gtType != TYP_MASK)
+        {
+            // We have `x == y` or `x != y`, both of which are cases where we want to find `AllBitsSet`
+            // in the mask since we can directly do the relevant comparison. Given the above tables,
+            // when we have a full mask we can simply check `CF == 1` for `op_Equality` and `ZF == 0`
+            // for `op_Inequality`.
+            //
+            // For a partial mask, we need to invert the `op_Equality` comparison, which means that we
+            // now need to check for `ZF == 1` (we're looking for `AllBitsSet`, that is `all match`).
+            // For `op_Inequality` we can keep things as is, since we're looking for `any mismatch` and
+            // just want to check `ZF == 0`
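+            //
+            // For example (disassembly illustrative), `Vector512<int>` equality has count == 16
+            // and takes the full-mask path:
+            //
+            //   vpcmpeqd k1, zmm0, zmm1
+            //   kortestw k1, k1
+            //   setc     al              ; CF == 1 iff all 16 mask bits are set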
+
+            if (count < 8)
+            {
+                assert((count == 1) || (count == 2) || (count == 4));
+                maskIntrinsicId = NI_AVX512F_CompareNotEqualMask;
+            }
+            else
+            {
+                assert((count == 8) || (count == 16) || (count == 32) || (count == 64));
+
+                if (cmpOp == GT_EQ)
+                {
+                    cmpCnd = GenCondition::C;
+                }
+                else
+                {
+                    maskIntrinsicId = NI_AVX512F_CompareNotEqualMask;
+                }
+            }
+
+            node->gtType = TYP_MASK;
+            node->ChangeHWIntrinsicId(maskIntrinsicId);
+
+            LowerNode(node);
+        }
+
+        LIR::Use use;
+        if (BlockRange().TryGetUse(node, &use))
+        {
+            GenTreeHWIntrinsic* cc;
+
+            cc = comp->gtNewSimdHWIntrinsicNode(simdType, node, NI_AVX512F_KORTEST, simdBaseJitType, simdSize);
+            BlockRange().InsertBefore(nextNode, cc);
+
+            use.ReplaceWith(cc);
+            LowerHWIntrinsicCC(cc, NI_AVX512F_KORTEST, cmpCnd);
+
+            nextNode = cc->gtNext;
+        }
+        return nextNode;
+    }
+
+    assert(simdSize != 64);
+
     NamedIntrinsic cmpIntrinsic;
     CorInfoType    cmpJitType;
     NamedIntrinsic mskIntrinsic;
@@ -1728,11 +2007,11 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
         case TYP_ULONG:
         {
             mskJitType = CORINFO_TYPE_UBYTE;
+            cmpJitType = simdBaseJitType;
 
             if (simdSize == 32)
             {
                 cmpIntrinsic = NI_AVX2_CompareEqual;
-                cmpJitType   = simdBaseJitType;
                 mskIntrinsic = NI_AVX2_MoveMask;
                 mskConstant  = -1;
             }
@@ -1743,7 +2022,6 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
                 if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
                 {
                     cmpIntrinsic = NI_SSE41_CompareEqual;
-                    cmpJitType   = simdBaseJitType;
                 }
                 else
                 {
@@ -1856,80 +2134,6 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
 }
 
 //----------------------------------------------------------------------------------------------
-// Lowering::LowerHWIntrinsicCmpOpWithKReg: Lowers a Vector512 comparison intrinsic
-//
-//  Arguments:
-//     node  - The hardware intrinsic node.
-//
-GenTree* Lowering::LowerHWIntrinsicCmpOpWithKReg(GenTreeHWIntrinsic* node)
-{
-    NamedIntrinsic intrinsicId     = node->GetHWIntrinsicId();
-    CorInfoType    simdBaseJitType = node->GetSimdBaseJitType();
-    var_types      simdBaseType    = node->GetSimdBaseType();
-    unsigned       simdSize        = node->GetSimdSize();
-    var_types      simdType        = Compiler::getSIMDTypeForSize(simdSize);
-
-    assert((intrinsicId == NI_Vector512_GreaterThanAll) || (intrinsicId == NI_Vector512_GreaterThanOrEqualAll) ||
-           (intrinsicId == NI_Vector512_LessThanAll) || (intrinsicId == NI_Vector512_LessThanOrEqualAll) ||
-           (intrinsicId == NI_Vector512_op_Equality) || (intrinsicId == NI_Vector512_op_Inequality));
-
-    assert(varTypeIsSIMD(simdType));
-    assert(varTypeIsArithmetic(simdBaseType));
-    assert(simdSize == 64);
-    assert(node->gtType == TYP_BOOL);
-
-    NamedIntrinsic newIntrinsicId = NI_Illegal;
-    switch (intrinsicId)
-    {
-        case NI_Vector512_GreaterThanAll:
-        {
-            newIntrinsicId = NI_AVX512F_CompareGreaterThanSpecial;
-            break;
-        }
-        case NI_Vector512_GreaterThanOrEqualAll:
-        {
-            newIntrinsicId = NI_AVX512F_CompareGreaterThanOrEqualSpecial;
-            break;
-        }
-        case NI_Vector512_LessThanAll:
-        {
-            newIntrinsicId = NI_AVX512F_CompareLessThanSpecial;
-            break;
-        }
-        case NI_Vector512_LessThanOrEqualAll:
-        {
-            newIntrinsicId = NI_AVX512F_CompareLessThanOrEqualSpecial;
-            break;
-        }
-        case NI_Vector512_op_Equality:
-        case NI_Vector512_op_Inequality:
-        {
-            newIntrinsicId = NI_AVX512F_CompareEqualSpecial;
-            break;
-        }
-
-        default:
-        {
-            assert(false);
-            break;
-        }
-    }
-
-    GenTree* op1 = node->Op(1);
-    GenTree* op2 = node->Op(2);
-
-    GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, newIntrinsicId, simdBaseJitType, simdSize);
-    BlockRange().InsertBefore(node, cmp);
-    LowerNode(cmp);
-
-    node->ResetHWIntrinsicId(NI_AVX512F_KORTEST, cmp);
-    GenCondition cmpCnd = (intrinsicId != NI_Vector512_op_Inequality) ? GenCondition::C : GenCondition::NC;
-    LowerHWIntrinsicCC(node, NI_AVX512F_KORTEST, cmpCnd);
-
-    return node->gtNext;
-}
-
-//----------------------------------------------------------------------------------------------
 // Lowering::LowerHWIntrinsicCndSel: Lowers a Vector128 or Vector256 Conditional Select call
 //
 //  Arguments:
@@ -4826,6 +5030,125 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
 }
 
 //----------------------------------------------------------------------------------------------
+// Lowering::LowerHWIntrinsicWithAvx512Mask: Lowers a HWIntrinsic node that utilizes the AVX512 KMASK registers
+//
+//  Arguments:
+//     node - The hardware intrinsic node.
+//
+GenTree* Lowering::LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node)
+{
+    NamedIntrinsic intrinsicId     = node->GetHWIntrinsicId();
+    CorInfoType    simdBaseJitType = node->GetSimdBaseJitType();
+    var_types      simdBaseType    = node->GetSimdBaseType();
+    unsigned       simdSize        = node->GetSimdSize();
+    var_types      simdType        = Compiler::getSIMDTypeForSize(simdSize);
+
+    assert(varTypeIsSIMD(simdType));
+    assert(varTypeIsArithmetic(simdBaseType));
+    assert(simdSize != 0);
+
+    NamedIntrinsic maskIntrinsicId;
+
+    switch (intrinsicId)
+    {
+        case NI_AVX512F_CompareEqual:
+        case NI_AVX512BW_CompareEqual:
+        {
+            maskIntrinsicId = NI_AVX512F_CompareEqualMask;
+            break;
+        }
+
+        case NI_AVX512F_VL_CompareGreaterThan:
+        case NI_AVX512BW_VL_CompareGreaterThan:
+        {
+            assert(varTypeIsUnsigned(simdBaseType));
+            FALLTHROUGH;
+        }
+
+        case NI_AVX512F_CompareGreaterThan:
+        case NI_AVX512BW_CompareGreaterThan:
+        {
+            maskIntrinsicId = NI_AVX512F_CompareGreaterThanMask;
+            break;
+        }
+
+        case NI_AVX512F_VL_CompareGreaterThanOrEqual:
+        case NI_AVX512BW_VL_CompareGreaterThanOrEqual:
+        {
+            assert(!varTypeIsFloating(simdBaseType));
+            FALLTHROUGH;
+        }
+
+        case NI_AVX512F_CompareGreaterThanOrEqual:
+        case NI_AVX512BW_CompareGreaterThanOrEqual:
+        {
+            maskIntrinsicId = NI_AVX512F_CompareGreaterThanOrEqualMask;
+            break;
+        }
+
+        case NI_AVX512F_VL_CompareLessThan:
+        case NI_AVX512BW_VL_CompareLessThan:
+        {
+            assert(varTypeIsUnsigned(simdBaseType));
+            FALLTHROUGH;
+        }
+
+        case NI_AVX512F_CompareLessThan:
+        case NI_AVX512BW_CompareLessThan:
+        {
+            maskIntrinsicId = NI_AVX512F_CompareLessThanMask;
+            break;
+        }
+
+        case NI_AVX512F_VL_CompareLessThanOrEqual:
+        case NI_AVX512BW_VL_CompareLessThanOrEqual:
+        {
+            assert(!varTypeIsFloating(simdBaseType));
+            FALLTHROUGH;
+        }
+
+        case NI_AVX512F_CompareLessThanOrEqual:
+        case NI_AVX512BW_CompareLessThanOrEqual:
+        {
+            maskIntrinsicId = NI_AVX512F_CompareLessThanOrEqualMask;
+            break;
+        }
+
+        case NI_AVX512F_VL_CompareNotEqual:
+        case NI_AVX512BW_VL_CompareNotEqual:
+        {
+            assert(!varTypeIsFloating(simdBaseType));
+            FALLTHROUGH;
+        }
+
+        case NI_AVX512F_CompareNotEqual:
+        case NI_AVX512BW_CompareNotEqual:
+        {
+            maskIntrinsicId = NI_AVX512F_CompareNotEqualMask;
+            break;
+        }
+
+        default:
+        {
+            unreached();
+        }
+    }
+
+    node->gtType = TYP_MASK;
+    node->ChangeHWIntrinsicId(maskIntrinsicId);
+
+    LIR::Use use;
+    if (BlockRange().TryGetUse(node, &use))
+    {
+        GenTree* maskToVector =
+            comp->gtNewSimdHWIntrinsicNode(simdType, node, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize);
+        BlockRange().InsertAfter(node, maskToVector);
+        use.ReplaceWith(maskToVector);
+    }
+    return LowerNode(node);
+}
+
+//----------------------------------------------------------------------------------------------
 // Lowering::LowerHWIntrinsicToScalar: Lowers a Vector128 or Vector256 ToScalar call
 //
 //  Arguments: