Adding containment support to more x86 HWIntrinsics (dotnet/coreclr#18349)
author Tanner Gooding <tagoo@outlook.com>
Tue, 12 Jun 2018 02:58:34 +0000 (19:58 -0700)
committer GitHub <noreply@github.com>
Tue, 12 Jun 2018 02:58:34 +0000 (19:58 -0700)
* Adding containment support for the Sse42.Crc32 intrinsic

* Adding containment support for the x86 Compare*OrderedScalar and Compare*UnorderedScalar HWIntrinsics

* Adding containment support to several intrinsics that were marked NoContainment

Commit migrated from https://github.com/dotnet/coreclr/commit/7005f768866f3e0a2e7df3b67884bb5a98c92ef2

src/coreclr/src/jit/emitxarch.cpp
src/coreclr/src/jit/hwintrinsiccodegenxarch.cpp
src/coreclr/src/jit/hwintrinsiclistxarch.h
src/coreclr/src/jit/lowerxarch.cpp

index b3011fc..1211256 100644 (file)
@@ -6512,6 +6512,11 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber ireg, int va
         sz += emitGetRexPrefixSize(ins);
     }
 
+    if (ins == INS_crc32)
+    {
+        sz += 1;
+    }
+
     id->idIns(ins);
     id->idInsFmt(fmt);
     id->idReg1(ireg);
@@ -8368,6 +8373,12 @@ void emitter::emitDispIns(
             {
                 printf("%s, %s", emitRegName(id->idReg1(), EA_PTRSIZE), sstr);
             }
+            else if ((ins == INS_crc32) && (attr != EA_8BYTE))
+            {
+                // The idReg1 is always 4 bytes, so print it as such (`eax`, not `ax`). The sibling branches
+                // of this chain print the second operand via sstr, so do the same rather than printing idReg2.
+                printf("%s, %s", emitRegName(id->idReg1(), EA_4BYTE), sstr);
+            }
             else
             {
                 printf("%s, %s", emitRegName(id->idReg1(), attr), sstr);
@@ -8583,6 +8594,12 @@ void emitter::emitDispIns(
             {
                 printf("%s, %s", emitRegName(id->idReg1(), EA_PTRSIZE), sstr);
             }
+            else if ((ins == INS_crc32) && (attr != EA_8BYTE))
+            {
+                // The idReg1 is always 4 bytes, so print it as such (`eax`, not `ax`). The sibling branches
+                // of this chain print the second operand via sstr, so do the same rather than printing idReg2.
+                printf("%s, %s", emitRegName(id->idReg1(), EA_4BYTE), sstr);
+            }
             else
             {
                 printf("%s, %s", emitRegName(id->idReg1(), attr), sstr);
@@ -8696,6 +8713,8 @@ void emitter::emitDispIns(
 #ifdef FEATURE_HW_INTRINSICS
             else if (ins == INS_crc32 && attr != EA_8BYTE)
             {
+                // The idReg1 is always 4 bytes, but the size of idReg2 can vary.
+                // This logic ensures that we print `crc32 eax, bx` instead of `crc32 ax, bx`
                 printf("%s, %s", emitRegName(id->idReg1(), EA_4BYTE), emitRegName(id->idReg2(), attr));
             }
 #endif // FEATURE_HW_INTRINSICS
@@ -8780,6 +8799,12 @@ void emitter::emitDispIns(
                 attr = EA_PTRSIZE;
             }
 #endif
+            else if ((ins == INS_crc32) && (attr != EA_8BYTE))
+            {
+                // The idReg1 is always 4 bytes. Only adjust attr here: the printf that follows this chain
+                // emits the operands, so printing in this branch as well would output them twice.
+                attr = EA_4BYTE;
+            }
             printf("%s, %s", emitRegName(id->idReg1(), attr), sstr);
             offs = emitGetInsDsp(id);
             emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC);
@@ -9319,12 +9344,22 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     }
 
     // Special case emitting AVX instructions
-    if (Is4ByteSSE4OrAVXInstruction(ins))
+    if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
     {
+        if ((ins == INS_crc32) && (size > EA_1BYTE))
+        {
+            code |= 0x0100;
+
+            if (size == EA_2BYTE)
+            {
+                dst += emitOutputByte(dst, 0x66);
+            }
+        }
+
         unsigned regcode = insEncodeReg345(ins, id->idReg1(), size, &code);
         dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
 
-        if (UseVEXEncoding())
+        if (UseVEXEncoding() && (ins != INS_crc32))
         {
             // Emit last opcode byte
             // TODO-XArch-CQ: Right now support 4-byte opcode instructions only
@@ -9450,6 +9485,7 @@ GOT_DSP:
         switch (reg)
         {
             case REG_NA:
+            {
                 if (id->idIsDspReloc())
                 {
                     INT32 addlDelta = 0;
@@ -9457,7 +9493,7 @@ GOT_DSP:
                     // The address is of the form "[disp]"
                     // On x86 - disp is relative to zero
                     // On Amd64 - disp is relative to RIP
-                    if (Is4ByteSSE4OrAVXInstruction(ins))
+                    if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
                     {
                         dst += emitOutputByte(dst, code | 0x05);
                     }
@@ -9513,7 +9549,7 @@ GOT_DSP:
                 else
                 {
 #ifdef _TARGET_X86_
-                    if (Is4ByteSSE4OrAVXInstruction(ins))
+                    if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
                     {
                         dst += emitOutputByte(dst, code | 0x05);
                     }
@@ -9543,9 +9579,11 @@ GOT_DSP:
                     dst += emitOutputLong(dst, dsp);
                 }
                 break;
+            }
 
             case REG_EBP:
-                if (Is4ByteSSE4OrAVXInstruction(ins))
+            {
+                if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
                 {
                     // Does the offset fit in a byte?
                     if (dspInByte)
@@ -9584,9 +9622,11 @@ GOT_DSP:
                     }
                 }
                 break;
+            }
 
             case REG_ESP:
-                if (Is4ByteSSE4OrAVXInstruction(ins))
+            {
+                if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
                 {
                     // Is the offset 0 or does it at least fit in a byte?
                     if (dspIsZero)
@@ -9637,9 +9677,11 @@ GOT_DSP:
                     }
                 }
                 break;
+            }
 
             default:
-                if (Is4ByteSSE4OrAVXInstruction(ins))
+            {
+                if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
                 {
                     // Put the register in the opcode
                     code |= insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr);
@@ -9701,6 +9743,7 @@ GOT_DSP:
                 }
 
                 break;
+            }
         }
     }
     else
@@ -9720,7 +9763,7 @@ GOT_DSP:
                 regByte = insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) |
                           insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr) | insSSval(mul);
 
-                if (Is4ByteSSE4OrAVXInstruction(ins))
+                if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
                 {
                     // Emit [ebp + {2/4/8} * rgz] as [ebp + {2/4/8} * rgx + 0]
                     if (dspIsZero && reg != REG_EBP)
@@ -9787,7 +9830,7 @@ GOT_DSP:
                 regByte = insEncodeReg012(ins, REG_EBP, EA_PTRSIZE, nullptr) |
                           insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr) | insSSval(mul);
 
-                if (Is4ByteSSE4OrAVXInstruction(ins))
+                if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
                 {
                     dst += emitOutputByte(dst, code | 0x04);
                 }
@@ -9816,7 +9859,7 @@ GOT_DSP:
             // The address is "[reg+rgx+dsp]"
             regByte = insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) | insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr);
 
-            if (Is4ByteSSE4OrAVXInstruction(ins))
+            if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
             {
                 if (dspIsZero && reg != REG_EBP)
                 {
@@ -10043,12 +10086,22 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     }
 
     // Special case emitting AVX instructions
-    if (Is4ByteSSE4OrAVXInstruction(ins))
+    if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
     {
+        if ((ins == INS_crc32) && (size > EA_1BYTE))
+        {
+            code |= 0x0100;
+
+            if (size == EA_2BYTE)
+            {
+                dst += emitOutputByte(dst, 0x66);
+            }
+        }
+
         unsigned regcode = insEncodeReg345(ins, id->idReg1(), size, &code);
         dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
 
-        if (UseVEXEncoding())
+        if (UseVEXEncoding() && (ins != INS_crc32))
         {
             // Emit last opcode byte
             // TODO-XArch-CQ: Right now support 4-byte opcode instructions only
@@ -10174,7 +10227,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
     if (EBPbased)
     {
         // EBP-based variable: does the offset fit in a byte?
-        if (Is4ByteSSE4OrAVXInstruction(ins))
+        if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
         {
             if (dspInByte)
             {
@@ -10213,7 +10266,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
         dspIsZero = (dsp == 0);
 
         // Does the offset fit in a byte?
-        if (Is4ByteSSE4OrAVXInstruction(ins))
+        if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
         {
             if (dspInByte)
             {
@@ -10483,12 +10536,22 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
 #endif //_TARGET_X86_
 
     // Special case emitting AVX instructions
-    if (Is4ByteSSE4OrAVXInstruction(ins))
+    if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
     {
+        if ((ins == INS_crc32) && (size > EA_1BYTE))
+        {
+            code |= 0x0100;
+
+            if (size == EA_2BYTE)
+            {
+                dst += emitOutputByte(dst, 0x66);
+            }
+        }
+
         unsigned regcode = insEncodeReg345(ins, id->idReg1(), size, &code);
         dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
 
-        if (UseVEXEncoding())
+        if (UseVEXEncoding() && (ins != INS_crc32))
         {
             // Emit last opcode byte
             // TODO-XArch-CQ: Right now support 4-byte opcode instructions only
@@ -12813,8 +12876,9 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
         case IF_RWR_ARD:
         case IF_RRW_ARD:
         case IF_RWR_RRD_ARD:
+        {
             code = insCodeRM(ins);
-            if (Is4ByteSSE4OrAVXInstruction(ins))
+            if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
             {
                 dst = emitOutputAM(dst, id, code);
             }
@@ -12826,6 +12890,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             }
             sz = emitSizeOfInsDsc(id);
             break;
+        }
 
         case IF_RWR_RRD_ARD_CNS:
         case IF_RWR_RRD_ARD_RRD:
@@ -12954,11 +13019,12 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
         case IF_RRD_SRD:
         case IF_RWR_SRD:
         case IF_RRW_SRD:
+        {
             code = insCodeRM(ins);
 
             // 4-byte AVX instructions are special cased inside emitOutputSV
             // since they do not have space to encode ModRM byte.
-            if (Is4ByteSSE4OrAVXInstruction(ins))
+            if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
             {
                 dst = emitOutputSV(dst, id, code);
             }
@@ -12975,7 +13041,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8);
                 dst     = emitOutputSV(dst, id, code | regcode);
             }
+
+            sz = emitSizeOfInsDsc(id);
             break;
+        }
 
         case IF_RWR_RRD_SRD:
         {
@@ -13115,9 +13184,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
         case IF_RRD_MRD:
         case IF_RWR_MRD:
         case IF_RRW_MRD:
+        {
             code = insCodeRM(ins);
+
             // Special case 4-byte AVX instructions
-            if (Is4ByteSSE4OrAVXInstruction(ins))
+            if (Is4ByteSSE4OrAVXInstruction(ins) || (ins == INS_crc32))
             {
                 dst = emitOutputCV(dst, id, code);
             }
@@ -13134,8 +13205,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8);
                 dst     = emitOutputCV(dst, id, code | regcode | 0x0500);
             }
+
             sz = emitSizeOfInsDsc(id);
             break;
+        }
 
         case IF_RWR_RRD_MRD:
         {
index 1468f03..0107273 100644 (file)
@@ -25,6 +25,33 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #include "gcinfoencoder.h"
 
 //------------------------------------------------------------------------
+// assertIsContainableHWIntrinsicOp: Asserts, in DEBUG builds only, that op is containable by node
+//
+// Arguments:
+//    lowering - The lowering phase from the compiler
+//    node     - The HWIntrinsic node that contains op
+//    op       - The op that is contained
+//
+static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op)
+{
+#if DEBUG
+    // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register allocation
+    // logic. However, this check is still important due to the various containment rules that SIMD intrinsics follow.
+    //
+    // We use isContainable to track the special HWIntrinsic node containment rules (for things like LoadAligned and
+    // LoadUnaligned) and we use the supportsRegOptional check to support general-purpose loads (both from stack
+    // spillage and for isUsedFromMemory contained nodes, in the case where the register allocator decided to not
+    // allocate a register in the first place).
+    //
+    // The op is accepted when either of the two holds.
+
+    bool supportsRegOptional = false;
+    bool isContainable       = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional);
+    assert(isContainable || supportsRegOptional);
+#endif // DEBUG
+}
+
+//------------------------------------------------------------------------
 // genIsTableDrivenHWIntrinsic:
 //
 // Arguments:
@@ -342,21 +369,34 @@ void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emi
     var_types targetType = node->TypeGet();
     regNumber targetReg  = node->gtRegNum;
     GenTree*  op1        = node->gtGetOp1();
+    GenTree*  op2        = node->gtGetOp2();
     emitter*  emit       = getEmitter();
 
+    if (op2 != nullptr)
+    {
+        // The Compare*OrderedScalar and Compare*UnorderedScalar intrinsics come down this
+        // code path. They are all MultiIns, as the return value comes from the flags and
+        // we have two operands instead.
+
+        assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId));
+        assert(targetReg != REG_NA);
+
+        targetReg = op1->gtRegNum;
+        op1       = op2;
+        op2       = nullptr;
+    }
+    else
+    {
+        assert(!node->OperIsCommutative());
+    }
+
     assert(targetReg != REG_NA);
-    assert(node->gtGetOp2() == nullptr);
-    assert(!node->OperIsCommutative());
+    assert(op2 == nullptr);
 
     if (op1->isContained() || op1->isUsedFromSpillTemp())
     {
         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
-
-#if DEBUG
-        bool supportsRegOptional = false;
-        bool isContainable       = compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional);
-        assert(isContainable || (supportsRegOptional && op1->IsRegOptional()));
-#endif // DEBUG
+        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
 
         TempDsc* tmpDsc = nullptr;
         unsigned varNum = BAD_VAR_NUM;
@@ -480,12 +520,7 @@ void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, i
     if (op1->isContained() || op1->isUsedFromSpillTemp())
     {
         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
-
-#if DEBUG
-        bool supportsRegOptional = false;
-        bool isContainable       = compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional);
-        assert(isContainable || (supportsRegOptional && op1->IsRegOptional()));
-#endif // DEBUG
+        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
 
         TempDsc* tmpDsc = nullptr;
         unsigned varNum = BAD_VAR_NUM;
@@ -609,12 +644,7 @@ void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
     if (op2->isContained() || op2->isUsedFromSpillTemp())
     {
         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
-
-#if DEBUG
-        bool supportsRegOptional = false;
-        bool isContainable       = compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional);
-        assert(isContainable || (supportsRegOptional && op2->IsRegOptional()));
-#endif // DEBUG
+        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
 
         TempDsc* tmpDsc = nullptr;
         unsigned varNum = BAD_VAR_NUM;
@@ -770,12 +800,7 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins,
     if (op2->isContained() || op2->isUsedFromSpillTemp())
     {
         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
-
-#if DEBUG
-        bool supportsRegOptional = false;
-        bool isContainable       = compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional);
-        assert(isContainable || (supportsRegOptional && op2->IsRegOptional()));
-#endif // DEBUG
+        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
 
         TempDsc* tmpDsc = nullptr;
         unsigned varNum = BAD_VAR_NUM;
@@ -928,6 +953,9 @@ void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins)
 
     if (op2->isContained() || op2->isUsedFromSpillTemp())
     {
+        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
+        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
+
         TempDsc* tmpDsc = nullptr;
         unsigned varNum = BAD_VAR_NUM;
         unsigned offset = (unsigned)-1;
@@ -1245,14 +1273,13 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE_CompareEqualUnorderedScalar:
         {
             assert(baseType == TYP_FLOAT);
-            op2Reg             = op2->gtRegNum;
             regNumber   tmpReg = node->GetSingleTempReg();
             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
 
             // Ensure we aren't overwriting targetReg
             assert(tmpReg != targetReg);
 
-            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
+            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
@@ -1264,10 +1291,9 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE_CompareGreaterThanUnorderedScalar:
         {
             assert(baseType == TYP_FLOAT);
-            op2Reg = op2->gtRegNum;
-
             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
-            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
+
+            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
@@ -1277,10 +1303,9 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
         {
             assert(baseType == TYP_FLOAT);
-            op2Reg = op2->gtRegNum;
-
             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
-            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
+
+            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
@@ -1290,10 +1315,9 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE_CompareLessThanUnorderedScalar:
         {
             assert(baseType == TYP_FLOAT);
-            op2Reg = op2->gtRegNum;
-
             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
-            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
+
+            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
@@ -1303,10 +1327,9 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
         {
             assert(baseType == TYP_FLOAT);
-            op2Reg = op2->gtRegNum;
-
             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
-            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
+
+            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
@@ -1316,14 +1339,13 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE_CompareNotEqualUnorderedScalar:
         {
             assert(baseType == TYP_FLOAT);
-            op2Reg             = op2->gtRegNum;
             regNumber   tmpReg = node->GetSingleTempReg();
             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
 
             // Ensure we aren't overwriting targetReg
             assert(tmpReg != targetReg);
 
-            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
+            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
@@ -1461,14 +1483,13 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE2_CompareEqualUnorderedScalar:
         {
             assert(baseType == TYP_DOUBLE);
-            op2Reg             = op2->gtRegNum;
             regNumber   tmpReg = node->GetSingleTempReg();
             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
 
             // Ensure we aren't overwriting targetReg
             assert(tmpReg != targetReg);
 
-            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
+            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
@@ -1480,10 +1501,9 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE2_CompareGreaterThanUnorderedScalar:
         {
             assert(baseType == TYP_DOUBLE);
-            op2Reg          = op2->gtRegNum;
             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
 
-            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
+            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
@@ -1493,10 +1513,9 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
         {
             assert(baseType == TYP_DOUBLE);
-            op2Reg          = op2->gtRegNum;
             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
 
-            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
+            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
@@ -1506,10 +1525,9 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE2_CompareLessThanUnorderedScalar:
         {
             assert(baseType == TYP_DOUBLE);
-            op2Reg          = op2->gtRegNum;
             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
 
-            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
+            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
@@ -1519,10 +1537,9 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
         {
             assert(baseType == TYP_DOUBLE);
-            op2Reg          = op2->gtRegNum;
             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
 
-            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
+            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
@@ -1532,14 +1549,13 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE2_CompareNotEqualUnorderedScalar:
         {
             assert(baseType == TYP_DOUBLE);
-            op2Reg             = op2->gtRegNum;
             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
             regNumber   tmpReg = node->GetSingleTempReg();
 
             // Ensure we aren't overwriting targetReg
             assert(tmpReg != targetReg);
 
-            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
+            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
@@ -1720,9 +1736,9 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
             regNumber tmpReg = node->GetSingleTempReg();
             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
             emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
-            emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
         }
 
@@ -1730,18 +1746,18 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE41_TestZ:
         {
             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
-            emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
-            emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
+            genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
         }
 
         case NI_SSE41_TestC:
         {
             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
-            emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
-            emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
+            genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
         }
 
@@ -1749,9 +1765,9 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE41_TestNotZAndNotC:
         {
             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
-            emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
-            emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
+            genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
         }
 
@@ -1812,43 +1828,58 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
 {
     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
+    regNumber      targetReg   = node->gtRegNum;
     GenTree*       op1         = node->gtGetOp1();
     GenTree*       op2         = node->gtGetOp2();
-    regNumber      targetReg   = node->gtRegNum;
-    assert(targetReg != REG_NA);
-    var_types targetType = node->TypeGet();
-    var_types baseType   = node->gtSIMDBaseType;
+    var_types      baseType    = node->gtSIMDBaseType;
+    var_types      targetType  = node->TypeGet();
+    emitter*       emit        = getEmitter();
 
     regNumber op1Reg = op1->gtRegNum;
-    regNumber op2Reg = op2->gtRegNum;
     genConsumeOperands(node);
 
+    assert(targetReg != REG_NA);
+    assert(op1Reg != REG_NA);
+    assert(op2 != nullptr);
+    assert(!node->OperIsCommutative());
+
     switch (intrinsicId)
     {
         case NI_SSE42_Crc32:
+        {
             if (op1Reg != targetReg)
             {
-                assert(op2Reg != targetReg);
-                inst_RV_RV(INS_mov, targetReg, op1Reg, targetType, emitTypeSize(targetType));
+                assert(op2->gtRegNum != targetReg);
+                emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg);
             }
 
-            if (baseType == TYP_UBYTE || baseType == TYP_USHORT) // baseType is the type of the second argument
+            // op1 is already in targetReg at this point, so rewrite node to be unary over op2. This lets us
+            // reuse genHWIntrinsic_R_RM as-is, without an overload that explicitly takes the operands.
+            node->gtOp1 = op2;
+            node->gtOp2 = nullptr;
+
+            if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument
             {
                 assert(targetType == TYP_INT);
-                inst_RV_RV(INS_crc32, targetReg, op2Reg, baseType, emitTypeSize(baseType));
+                genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType));
             }
             else
             {
                 assert(op1->TypeGet() == op2->TypeGet());
-                assert(targetType == TYP_INT || targetType == TYP_LONG);
-                inst_RV_RV(INS_crc32, targetReg, op2Reg, targetType, emitTypeSize(targetType));
+                assert((targetType == TYP_INT) || (targetType == TYP_LONG));
+                genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType));
             }
 
             break;
+        }
+
         default:
+        {
             unreached();
             break;
+        }
     }
+
     genProduceReg(node);
 }
 
@@ -1985,25 +2016,25 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
 
         case NI_AVX_TestC:
         {
-            emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
-            emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum);
+            genHWIntrinsic_R_RM(node, ins, attr);
             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
         }
 
         case NI_AVX_TestNotZAndNotC:
         {
-            emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
-            emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum);
+            genHWIntrinsic_R_RM(node, ins, attr);
             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
         }
 
         case NI_AVX_TestZ:
         {
-            emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
-            emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum);
+            genHWIntrinsic_R_RM(node, ins, attr);
             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
         }
 
index 5c91e2e..3f2f08b 100644 (file)
@@ -34,29 +34,29 @@ HARDWARE_INTRINSIC(SSE_AddScalar,                                   "AddScalar",
 HARDWARE_INTRINSIC(SSE_And,                                         "And",                                          SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_andps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE_AndNot,                                      "AndNot",                                       SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_andnps,         INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_CompareEqual,                                "CompareEqual",                                 SSE,           0,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
-HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar,                   "CompareEqualOrderedScalar",                    SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comiss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar,                   "CompareEqualOrderedScalar",                    SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comiss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_CompareEqualScalar,                          "CompareEqualScalar",                           SSE,           0,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpss,          INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar,                 "CompareEqualUnorderedScalar",                  SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomiss,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar,                 "CompareEqualUnorderedScalar",                  SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomiss,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_CompareGreaterThan,                          "CompareGreaterThan",                           SSE,           6,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar,             "CompareGreaterThanOrderedScalar",              SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comiss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar,             "CompareGreaterThanOrderedScalar",              SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comiss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar,                    "CompareGreaterThanScalar",                     SSE,           6,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpss,          INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar,           "CompareGreaterThanUnorderedScalar",            SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomiss,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar,           "CompareGreaterThanUnorderedScalar",            SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomiss,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqual,                   "CompareGreaterThanOrEqual",                    SSE,           5,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar,      "CompareGreaterThanOrEqualOrderedScalar",       SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comiss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar,      "CompareGreaterThanOrEqualOrderedScalar",       SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comiss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar,             "CompareGreaterThanOrEqualScalar",              SSE,           5,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpss,          INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar,    "CompareGreaterThanOrEqualUnorderedScalar",     SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomiss,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar,    "CompareGreaterThanOrEqualUnorderedScalar",     SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomiss,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_CompareLessThan,                             "CompareLessThan",                              SSE,           1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar,                "CompareLessThanOrderedScalar",                 SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comiss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar,                "CompareLessThanOrderedScalar",                 SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comiss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_CompareLessThanScalar,                       "CompareLessThanScalar",                        SSE,           1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpss,          INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar,              "CompareLessThanUnorderedScalar",               SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomiss,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar,              "CompareLessThanUnorderedScalar",               SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomiss,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqual,                      "CompareLessThanOrEqual",                       SSE,           2,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar,         "CompareLessThanOrEqualOrderedScalar",          SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comiss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar,         "CompareLessThanOrEqualOrderedScalar",          SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comiss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar,                "CompareLessThanOrEqualScalar",                 SSE,           2,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpss,          INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar,       "CompareLessThanOrEqualUnorderedScalar",        SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomiss,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar,       "CompareLessThanOrEqualUnorderedScalar",        SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomiss,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_CompareNotEqual,                             "CompareNotEqual",                              SSE,           4,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar,                "CompareNotEqualOrderedScalar",                 SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comiss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar,                "CompareNotEqualOrderedScalar",                 SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comiss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar,                       "CompareNotEqualScalar",                        SSE,           4,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpss,          INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar,              "CompareNotEqualUnorderedScalar",               SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomiss,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar,              "CompareNotEqualUnorderedScalar",               SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomiss,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_CompareNotGreaterThan,                       "CompareNotGreaterThan",                        SSE,           2,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar,                 "CompareNotGreaterThanScalar",                  SSE,           2,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpss,          INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqual,                "CompareNotGreaterThanOrEqual",                 SSE,           1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
@@ -133,29 +133,29 @@ HARDWARE_INTRINSIC(SSE2_And,                                        "And",
 HARDWARE_INTRINSIC(SSE2_AndNot,                                     "AndNot",                                       SSE2,         -1,              16,           2,     {INS_pandn,             INS_pandn,          INS_pandn,          INS_pandn,          INS_pandn,          INS_pandn,          INS_pandn,          INS_pandn,          INS_invalid,        INS_andnpd},            HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_Average,                                    "Average",                                      SSE2,         -1,              16,           2,     {INS_invalid,           INS_pavgb,          INS_invalid,        INS_pavgw,          INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2_CompareEqual,                               "CompareEqual",                                 SSE2,          0,              16,           2,     {INS_pcmpeqb,           INS_pcmpeqb,        INS_pcmpeqw,        INS_pcmpeqw,        INS_pcmpeqd,        INS_pcmpeqd,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmppd},             HW_Category_SimpleSIMD,             HW_Flag_Commutative)
-HARDWARE_INTRINSIC(SSE2_CompareEqualOrderedScalar,                  "CompareEqualOrderedScalar",                    SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_CompareEqualOrderedScalar,                  "CompareEqualOrderedScalar",                    SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_CompareEqualScalar,                         "CompareEqualScalar",                           SSE2,          0,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpsd},             HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2_CompareEqualUnorderedScalar,                "CompareEqualUnorderedScalar",                  SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_CompareEqualUnorderedScalar,                "CompareEqualUnorderedScalar",                  SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_CompareGreaterThan,                         "CompareGreaterThan",                           SSE2,          6,              16,           2,     {INS_pcmpgtb,           INS_invalid,        INS_pcmpgtw,        INS_invalid,        INS_pcmpgtd,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmppd},             HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrderedScalar,            "CompareGreaterThanOrderedScalar",              SSE2,         -1,              16,           2,     {INS_pcmpgtb,           INS_invalid,        INS_pcmpgtw,        INS_invalid,        INS_pcmpgtd,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrderedScalar,            "CompareGreaterThanOrderedScalar",              SSE2,         -1,              16,           2,     {INS_pcmpgtb,           INS_invalid,        INS_pcmpgtw,        INS_invalid,        INS_pcmpgtd,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_CompareGreaterThanScalar,                   "CompareGreaterThanScalar",                     SSE2,          6,              16,           2,     {INS_pcmpgtb,           INS_invalid,        INS_pcmpgtw,        INS_invalid,        INS_pcmpgtd,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpsd},             HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2_CompareGreaterThanUnorderedScalar,          "CompareGreaterThanUnorderedScalar",            SSE2,         -1,              16,           2,     {INS_pcmpgtb,           INS_invalid,        INS_pcmpgtw,        INS_invalid,        INS_pcmpgtd,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_CompareGreaterThanUnorderedScalar,          "CompareGreaterThanUnorderedScalar",            SSE2,         -1,              16,           2,     {INS_pcmpgtb,           INS_invalid,        INS_pcmpgtw,        INS_invalid,        INS_pcmpgtd,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrEqual,                  "CompareGreaterThanOrEqual",                    SSE2,          5,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmppd},             HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrEqualOrderedScalar,     "CompareGreaterThanOrEqualOrderedScalar",       SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrEqualOrderedScalar,     "CompareGreaterThanOrEqualOrderedScalar",       SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrEqualScalar,            "CompareGreaterThanOrEqualScalar",              SSE2,          5,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpsd},             HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrEqualUnorderedScalar,   "CompareGreaterThanOrEqualUnorderedScalar",     SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrEqualUnorderedScalar,   "CompareGreaterThanOrEqualUnorderedScalar",     SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_CompareLessThan,                            "CompareLessThan",                              SSE2,          1,              16,           2,     {INS_pcmpgtb,           INS_invalid,        INS_pcmpgtw,        INS_invalid,        INS_pcmpgtd,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmppd},             HW_Category_Special,                HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE2_CompareLessThanOrderedScalar,               "CompareLessThanOrderedScalar",                 SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_CompareLessThanOrderedScalar,               "CompareLessThanOrderedScalar",                 SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_CompareLessThanScalar,                      "CompareLessThanScalar",                        SSE2,          1,              16,           2,     {INS_pcmpgtb,           INS_invalid,        INS_pcmpgtw,        INS_invalid,        INS_pcmpgtd,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpsd},             HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2_CompareLessThanUnorderedScalar,             "CompareLessThanUnorderedScalar",               SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_CompareLessThanUnorderedScalar,             "CompareLessThanUnorderedScalar",               SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_CompareLessThanOrEqual,                     "CompareLessThanOrEqual",                       SSE2,          2,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmppd},             HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE2_CompareLessThanOrEqualOrderedScalar,        "CompareLessThanOrEqualOrderedScalar",          SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_CompareLessThanOrEqualOrderedScalar,        "CompareLessThanOrEqualOrderedScalar",          SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_CompareLessThanOrEqualScalar,               "CompareLessThanOrEqualScalar",                 SSE2,          2,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpsd},             HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2_CompareLessThanOrEqualUnorderedScalar,      "CompareLessThanOrEqualUnorderedScalar",        SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_CompareLessThanOrEqualUnorderedScalar,      "CompareLessThanOrEqualUnorderedScalar",        SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_CompareNotEqual,                            "CompareNotEqual",                              SSE2,          4,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmppd},             HW_Category_SimpleSIMD,             HW_Flag_Commutative)
-HARDWARE_INTRINSIC(SSE2_CompareNotEqualOrderedScalar,               "CompareNotEqualOrderedScalar",                 SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_CompareNotEqualOrderedScalar,               "CompareNotEqualOrderedScalar",                 SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_comisd},            HW_Category_SIMDScalar,             HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_CompareNotEqualScalar,                      "CompareNotEqualScalar",                        SSE2,          4,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpsd},             HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2_CompareNotEqualUnorderedScalar,             "CompareNotEqualUnorderedScalar",               SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_CompareNotEqualUnorderedScalar,             "CompareNotEqualUnorderedScalar",               SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_ucomisd},           HW_Category_SIMDScalar,             HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_CompareNotGreaterThan,                      "CompareNotGreaterThan",                        SSE2,          2,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmppd},             HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_CompareNotGreaterThanScalar,                "CompareNotGreaterThanScalar",                  SSE2,          2,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpsd},             HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_CompareNotGreaterThanOrEqual,               "CompareNotGreaterThanOrEqual",                 SSE2,          1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmppd},             HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
@@ -249,9 +249,9 @@ HARDWARE_INTRINSIC(SSE3_HorizontalAdd,                              "HorizontalA
 HARDWARE_INTRINSIC(SSE3_HorizontalSubtract,                         "HorizontalSubtract",                           SSE3,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_hsubps,         INS_hsubpd},            HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE3_LoadAndDuplicateToVector128,                "LoadAndDuplicateToVector128",                  SSE3,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_lddqu,          INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movddup},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE3_LoadDquVector128,                           "LoadDquVector128",                             SSE3,         -1,              16,           1,     {INS_lddqu,             INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE3_MoveAndDuplicate,                           "MoveAndDuplicate",                             SSE3,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movddup},           HW_Category_SimpleSIMD,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE3_MoveHighAndDuplicate,                       "MoveHighAndDuplicate",                         SSE3,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movshdup,       INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE3_MoveLowAndDuplicate,                        "MoveLowAndDuplicate",                          SSE3,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movsldup,       INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE3_MoveAndDuplicate,                           "MoveAndDuplicate",                             SSE3,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movddup},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE3_MoveHighAndDuplicate,                       "MoveHighAndDuplicate",                         SSE3,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movshdup,       INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE3_MoveLowAndDuplicate,                        "MoveLowAndDuplicate",                          SSE3,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movsldup,       INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
 
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //                 Intrinsic ID                                     Function name                                   ISA         ival        SIMD size       NumArg                                                                                                     instructions                                                                                                     Category                            Flags
@@ -281,9 +281,9 @@ HARDWARE_INTRINSIC(SSE41_BlendVariable,                             "BlendVariab
 HARDWARE_INTRINSIC(SSE41_Ceiling,                                   "Ceiling",                                      SSE41,        10,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_roundps,        INS_roundpd},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE41_CeilingScalar,                             "CeilingScalar",                                SSE41,        10,              16,          -1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_roundss,        INS_roundsd},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE41_CompareEqual,                              "CompareEqual",                                 SSE41,        -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_pcmpeqq,        INS_pcmpeqq,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
-HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int16,                   "ConvertToVector128Int16",                      SSE41,        -1,              16,           1,     {INS_pmovsxbw,          INS_pmovzxbw,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int32,                   "ConvertToVector128Int32",                      SSE41,        -1,              16,           1,     {INS_pmovsxbd,          INS_pmovzxbd,       INS_pmovsxwd,       INS_pmovzxwd,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int64,                   "ConvertToVector128Int64",                      SSE41,        -1,              16,           1,     {INS_pmovsxbq,          INS_pmovzxbq,       INS_pmovsxwq,       INS_pmovzxwq,       INS_pmovsxdq,       INS_pmovzxdq,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int16,                   "ConvertToVector128Int16",                      SSE41,        -1,              16,           1,     {INS_pmovsxbw,          INS_pmovzxbw,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int32,                   "ConvertToVector128Int32",                      SSE41,        -1,              16,           1,     {INS_pmovsxbd,          INS_pmovzxbd,       INS_pmovsxwd,       INS_pmovzxwd,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int64,                   "ConvertToVector128Int64",                      SSE41,        -1,              16,           1,     {INS_pmovsxbq,          INS_pmovzxbq,       INS_pmovsxwq,       INS_pmovzxwq,       INS_pmovsxdq,       INS_pmovzxdq,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE41_DotProduct,                                "DotProduct",                                   SSE41,        -1,              16,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_dpps,           INS_dppd},              HW_Category_IMM,                    HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(SSE41_Extract,                                   "Extract",                                      SSE41,        -1,              16,           2,     {INS_pextrb,            INS_pextrb,         INS_invalid,        INS_invalid,        INS_pextrd,         INS_pextrd,         INS_pextrq,         INS_pextrq,         INS_extractps,      INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MultiIns|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE41_Floor,                                     "Floor",                                        SSE41,         9,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_roundps,        INS_roundpd},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
@@ -308,11 +308,11 @@ HARDWARE_INTRINSIC(SSE41_RoundToPositiveInfinityScalar,             "RoundToPosi
 HARDWARE_INTRINSIC(SSE41_RoundToZero,                               "RoundToZero",                                  SSE41,        11,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_roundps,        INS_roundpd},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE41_RoundToZeroScalar,                         "RoundToZeroScalar",                            SSE41,        11,              16,          -1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_roundss,        INS_roundsd},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE41_TestAllOnes,                               "TestAllOnes",                                  SSE41,        -1,              16,           1,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(SSE41_TestAllZeros,                              "TestAllZeros",                                 SSE41,        -1,              16,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(SSE41_TestC,                                     "TestC",                                        SSE41,        -1,              16,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(SSE41_TestMixOnesZeros,                          "TestMixOnesZeros",                             SSE41,        -1,              16,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(SSE41_TestNotZAndNotC,                           "TestNotZAndNotC",                              SSE41,        -1,              16,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(SSE41_TestZ,                                     "TestZ",                                        SSE41,        -1,              16,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(SSE41_TestAllZeros,                              "TestAllZeros",                                 SSE41,        -1,              16,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(SSE41_TestC,                                     "TestC",                                        SSE41,        -1,              16,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(SSE41_TestMixOnesZeros,                          "TestMixOnesZeros",                             SSE41,        -1,              16,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(SSE41_TestNotZAndNotC,                           "TestNotZAndNotC",                              SSE41,        -1,              16,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(SSE41_TestZ,                                     "TestZ",                                        SSE41,        -1,              16,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg)
 
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //                 Intrinsic ID                                     Function name                                   ISA         ival        SIMD size       NumArg                                                                                                     instructions                                                                                                     Category                            Flags
@@ -392,9 +392,9 @@ HARDWARE_INTRINSIC(AVX_Store,                                       "Store",
 HARDWARE_INTRINSIC(AVX_StoreAligned,                                "StoreAligned",                                 AVX,          -1,              32,           2,     {INS_movdqa,            INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movaps,         INS_movapd},            HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_StoreAlignedNonTemporal,                     "StoreAlignedNonTemporal",                      AVX,          -1,              32,           2,     {INS_movntdq,           INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntps,        INS_movntpd},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_Subtract,                                    "Subtract",                                     AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_subps,          INS_subpd},             HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX_TestC,                                       "TestC",                                        AVX,          -1,               0,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_vtestps,        INS_vtestpd},           HW_Category_SimpleSIMD,             HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(AVX_TestNotZAndNotC,                             "TestNotZAndNotC",                              AVX,          -1,               0,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_vtestps,        INS_vtestpd},           HW_Category_SimpleSIMD,             HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(AVX_TestZ,                                       "TestZ",                                        AVX,          -1,               0,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_vtestps,        INS_vtestpd},           HW_Category_SimpleSIMD,             HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX_TestC,                                       "TestC",                                        AVX,          -1,               0,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_vtestps,        INS_vtestpd},           HW_Category_SimpleSIMD,             HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX_TestNotZAndNotC,                             "TestNotZAndNotC",                              AVX,          -1,               0,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_vtestps,        INS_vtestpd},           HW_Category_SimpleSIMD,             HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX_TestZ,                                       "TestZ",                                        AVX,          -1,               0,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_vtestps,        INS_vtestpd},           HW_Category_SimpleSIMD,             HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX_UnpackHigh,                                  "UnpackHigh",                                   AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_unpckhps,       INS_unpckhpd},          HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_UnpackLow,                                   "UnpackLow",                                    AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_unpcklps,       INS_unpcklpd},          HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_Xor,                                         "Xor",                                          AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_xorps,          INS_xorpd},             HW_Category_SimpleSIMD,             HW_Flag_Commutative)
@@ -438,11 +438,11 @@ HARDWARE_INTRINSIC(AVX2_Or,                                         "Or",
 HARDWARE_INTRINSIC(AVX2_Permute2x128,                               "Permute2x128",                                 AVX2,         -1,              32,           3,     {INS_vperm2i128,        INS_vperm2i128,     INS_vperm2i128,     INS_vperm2i128,     INS_vperm2i128,     INS_vperm2i128,     INS_vperm2i128,     INS_vperm2i128,     INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_OneTypeGeneric|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2_ShiftLeftLogical,                           "ShiftLeftLogical",                             AVX2,         -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_psllw,          INS_psllw,          INS_pslld,          INS_pslld,          INS_psllq,          INS_psllq,          INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2_ShiftLeftLogical128BitLane,                 "ShiftLeftLogical128BitLane",                   AVX2,         -1,              32,           2,     {INS_pslldq,            INS_pslldq,         INS_pslldq,         INS_pslldq,         INS_pslldq,         INS_pslldq,         INS_pslldq,         INS_pslldq,         INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX2_ShiftLeftLogicalVariable,                   "ShiftLeftLogicalVariable",                     AVX2,         -1,               0,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_vpsllvd,        INS_vpsllvd,        INS_vpsllvq,        INS_vpsllvq,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_UnfixedSIMDSize|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX2_ShiftLeftLogicalVariable,                   "ShiftLeftLogicalVariable",                     AVX2,         -1,               0,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_vpsllvd,        INS_vpsllvd,        INS_vpsllvq,        INS_vpsllvq,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_UnfixedSIMDSize)
 HARDWARE_INTRINSIC(AVX2_ShiftRightArithmetic,                       "ShiftRightArithmetic",                         AVX2,         -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_psraw,          INS_invalid,        INS_psrad,          INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2_ShiftRightLogical,                          "ShiftRightLogical",                            AVX2,         -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_psrlw,          INS_psrlw,          INS_psrld,          INS_psrld,          INS_psrlq,          INS_psrlq,          INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2_ShiftRightLogical128BitLane,                "ShiftRightLogical128BitLane",                  AVX2,         -1,              32,           2,     {INS_psrldq,            INS_psrldq,         INS_psrldq,         INS_psrldq,         INS_psrldq,         INS_psrldq,         INS_psrldq,         INS_psrldq,         INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX2_ShiftRightLogicalVariable,                  "ShiftRightLogicalVariable",                    AVX2,         -1,               0,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_vpsrlvd,        INS_vpsrlvd,        INS_vpsrlvq,        INS_vpsrlvq,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_UnfixedSIMDSize|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX2_ShiftRightLogicalVariable,                  "ShiftRightLogicalVariable",                    AVX2,         -1,               0,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_vpsrlvd,        INS_vpsrlvd,        INS_vpsrlvq,        INS_vpsrlvq,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_UnfixedSIMDSize)
 HARDWARE_INTRINSIC(AVX2_Subtract,                                   "Subtract",                                     AVX2,         -1,              32,           2,     {INS_psubb,             INS_psubb,          INS_psubw,          INS_psubw,          INS_psubd,          INS_psubd,          INS_psubq,          INS_psubq,          INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_SubtractSaturate,                           "SubtractSaturate",                             AVX2,         -1,              32,           2,     {INS_psubsb,            INS_psubusb,        INS_psubsw,         INS_psubusw,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_UnpackHigh,                                 "UnpackHigh",                                   AVX2,         -1,              32,           2,     {INS_punpckhbw,         INS_punpckhbw,      INS_punpckhwd,      INS_punpckhwd,      INS_punpckhdq,      INS_punpckhdq,      INS_punpckhqdq,     INS_punpckhqdq,     INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
index fff2ec3..a0a4dfe 100644 (file)
@@ -2612,7 +2612,37 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
             {
                 case HW_Category_SimpleSIMD:
                 case HW_Category_SIMDScalar:
+                case HW_Category_Scalar:
                 {
+                    if (HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId))
+                    {
+                        switch (intrinsicId)
+                        {
+                            case NI_SSE_CompareLessThanOrderedScalar:
+                            case NI_SSE_CompareLessThanUnorderedScalar:
+                            case NI_SSE_CompareLessThanOrEqualOrderedScalar:
+                            case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
+                            case NI_SSE2_CompareLessThanOrderedScalar:
+                            case NI_SSE2_CompareLessThanUnorderedScalar:
+                            case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
+                            case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
+                            {
+                                // We need to swap the operands for CompareLessThan and CompareLessThanOrEqual
+                                node->gtOp1 = op2;
+                                node->gtOp2 = op1;
+                                op2         = op1;
+                                break;
+                            }
+
+                            default:
+                            {
+                                // TODO-XArch-CQ: The Compare*OrderedScalar and Compare*UnorderedScalar methods
+                                //                are commutative if you also invert the intrinsic.
+                                break;
+                            }
+                        }
+                    }
+
                     bool supportsRegOptional = false;
 
                     if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
@@ -2745,9 +2775,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 
                 default:
                 {
-                    // TODO-XArch-CQ: Assert that this is unreached after we have ensured the relevant node types are
-                    // handled.
-                    //                https://github.com/dotnet/coreclr/issues/16497
+                    unreached();
                     break;
                 }
             }