Merge SSE intrinsics into the table-driven framework
authorFei Peng <fei.peng@intel.com>
Fri, 19 Jan 2018 08:01:21 +0000 (00:01 -0800)
committerFei Peng <fei.peng@intel.com>
Fri, 19 Jan 2018 08:01:21 +0000 (00:01 -0800)
src/jit/compiler.h
src/jit/emitxarch.cpp
src/jit/emitxarch.h
src/jit/hwintrinsiccodegenxarch.cpp
src/jit/hwintrinsiclistxarch.h
src/jit/hwintrinsicxarch.cpp
src/jit/instrsxarch.h
src/jit/lowerxarch.cpp
src/jit/lsraxarch.cpp
src/jit/namedintrinsiclist.h

index ea1c6b2..f48a7b7 100644 (file)
@@ -3122,7 +3122,7 @@ protected:
     static int numArgsOfHWIntrinsic(NamedIntrinsic intrinsic);
     static instruction insOfHWIntrinsic(NamedIntrinsic intrinsic, var_types type);
     static HWIntrinsicCategory categoryOfHWIntrinsic(NamedIntrinsic intrinsic);
-    static HWIntrinsicFlag flagOfHWIntrinsic(NamedIntrinsic intrinsic);
+    static HWIntrinsicFlag flagsOfHWIntrinsic(NamedIntrinsic intrinsic);
     GenTree* getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass);
     GenTreeArgList* buildArgList(CORINFO_SIG_INFO* sig);
 #endif // _TARGET_XARCH_
index 0f6a9ff..73f28d9 100644 (file)
@@ -4228,6 +4228,7 @@ void emitter::emitIns_R_R_S_I(
     emitCurIGsize += sz;
 }
 
+#ifdef DEBUG
 static bool isAvxBlendv(instruction ins)
 {
     return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb;
@@ -4237,6 +4238,7 @@ static bool isSse41Blendv(instruction ins)
 {
     return ins == INS_blendvps || ins == INS_blendvpd || ins == INS_pblendvb;
 }
+#endif
 
 void emitter::emitIns_R_R_R_R(
     instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, regNumber reg3)
@@ -5216,23 +5218,23 @@ void emitter::emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg,
         {
             emitIns_R_R(INS_movaps, attr, reg, reg1);
         }
-        emitIns_R_A(ins, emitTypeSize(simdtype), reg, indir, IF_RRW_ARD);
+        emitIns_R_A(ins, attr, reg, indir, IF_RRW_ARD);
     }
 }
 
-void emitter::emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, regNumber reg1, regNumber base, var_types simdtype)
+void emitter::emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber base)
 {
     if (UseVEXEncoding())
     {
-        emitIns_R_R_AR(ins, emitTypeSize(simdtype), reg, reg1, base, 0);
+        emitIns_R_R_AR(ins, attr, reg, reg1, base, 0);
     }
     else
     {
         if (reg1 != reg)
         {
-            emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+            emitIns_R_R(INS_movaps, attr, reg, reg1);
         }
-        emitIns_R_AR(ins, emitTypeSize(simdtype), reg, base, 0);
+        emitIns_R_AR(ins, attr, reg, base, 0);
     }
 }
 
@@ -5325,70 +5327,70 @@ void emitter::emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg,
 }
 
 void emitter::emitIns_SIMD_R_R_A_I(
-    instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival, var_types simdtype)
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival)
 {
     if (UseVEXEncoding())
     {
-        emitIns_R_R_A_I(ins, emitTypeSize(simdtype), reg, reg1, indir, ival, IF_RWR_RRD_ARD_CNS);
+        emitIns_R_R_A_I(ins, attr, reg, reg1, indir, ival, IF_RWR_RRD_ARD_CNS);
     }
     else
     {
         if (reg1 != reg)
         {
-            emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+            emitIns_R_R(INS_movaps, attr, reg, reg1);
         }
-        emitIns_R_A_I(ins, emitTypeSize(simdtype), reg, indir, ival);
+        emitIns_R_A_I(ins, attr, reg, indir, ival);
     }
 }
 
 void emitter::emitIns_SIMD_R_R_C_I(
-    instruction ins, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival, var_types simdtype)
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival)
 {
     if (UseVEXEncoding())
     {
-        emitIns_R_R_C_I(ins, emitTypeSize(simdtype), reg, reg1, fldHnd, offs, ival);
+        emitIns_R_R_C_I(ins, attr, reg, reg1, fldHnd, offs, ival);
     }
     else
     {
         if (reg1 != reg)
         {
-            emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+            emitIns_R_R(INS_movaps, attr, reg, reg1);
         }
-        emitIns_R_C_I(ins, emitTypeSize(simdtype), reg, fldHnd, offs, ival);
+        emitIns_R_C_I(ins, attr, reg, fldHnd, offs, ival);
     }
 }
 
 void emitter::emitIns_SIMD_R_R_R_I(
-    instruction ins, regNumber reg, regNumber reg1, regNumber reg2, int ival, var_types simdtype)
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int ival)
 {
     if (UseVEXEncoding())
     {
-        emitIns_R_R_R_I(ins, emitTypeSize(simdtype), reg, reg1, reg2, ival);
+        emitIns_R_R_R_I(ins, attr, reg, reg1, reg2, ival);
     }
     else
     {
         if (reg1 != reg)
         {
-            emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+            emitIns_R_R(INS_movaps, attr, reg, reg1);
         }
-        emitIns_R_R_I(ins, emitTypeSize(simdtype), reg, reg2, ival);
+        emitIns_R_R_I(ins, attr, reg, reg2, ival);
     }
 }
 
 void emitter::emitIns_SIMD_R_R_S_I(
-    instruction ins, regNumber reg, regNumber reg1, int varx, int offs, int ival, var_types simdtype)
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs, int ival)
 {
     if (UseVEXEncoding())
     {
-        emitIns_R_R_S_I(ins, emitTypeSize(simdtype), reg, reg1, varx, offs, ival);
+        emitIns_R_R_S_I(ins, attr, reg, reg1, varx, offs, ival);
     }
     else
     {
         if (reg1 != reg)
         {
-            emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+            emitIns_R_R(INS_movaps, attr, reg, reg1);
         }
-        emitIns_R_S_I(ins, emitTypeSize(simdtype), reg, varx, offs, ival);
+        emitIns_R_S_I(ins, attr, reg, varx, offs, ival);
     }
 }
 #endif
index 0473447..f2e426d 100644 (file)
@@ -455,19 +455,12 @@ void emitIns_R_AX(instruction ins, emitAttr attr, regNumber ireg, regNumber reg,
 void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp);
 
 #if FEATURE_HW_INTRINSICS
-void emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, regNumber reg1, regNumber base, var_types simdtype);
-void emitIns_SIMD_R_R_A_I(
-    instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival, var_types simdtype);
-void emitIns_SIMD_R_R_C_I(instruction          ins,
-                          regNumber            reg,
-                          regNumber            reg1,
-                          CORINFO_FIELD_HANDLE fldHnd,
-                          int                  offs,
-                          int                  ival,
-                          var_types            simdtype);
-void emitIns_SIMD_R_R_R_I(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, int ival, var_types simdtype);
-void emitIns_SIMD_R_R_S_I(
-    instruction ins, regNumber reg, regNumber reg1, int varx, int offs, int ival, var_types simdtype);
+void emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber base);
+void emitIns_SIMD_R_R_A_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival);
+void emitIns_SIMD_R_R_C_I(
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival);
+void emitIns_SIMD_R_R_R_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int ival);
+void emitIns_SIMD_R_R_S_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs, int ival);
 void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir);
 void emitIns_SIMD_R_R_C(
     instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs);
index 6ea0de7..69b3cf5 100644 (file)
@@ -33,12 +33,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 // Return Value:
 //    returns true if this category can be table-driven in CodeGen
 //
-static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category)
+static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags)
 {
     // TODO - make more categories to the table-driven framework
-    const bool tableDrivenIntrinsic    = category == HW_Category_SimpleSIMD;
-    const bool nonTableDrivenIntrinsic = category == HW_Category_Special;
-    return tableDrivenIntrinsic && !nonTableDrivenIntrinsic;
+    // HW_Category_Helper and HW_Flag_MultiIns usually need manual codegen
+    const bool tableDrivenCategory =
+        category == HW_Category_SimpleSIMD || category == HW_Category_MemoryLoad || category == HW_Category_SIMDScalar;
+    const bool tableDrivenFlag = (flags & HW_Flag_MultiIns) == 0;
+    return tableDrivenCategory && tableDrivenFlag;
 }
 
 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
@@ -46,12 +48,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
     NamedIntrinsic      intrinsicID = node->gtHWIntrinsicId;
     InstructionSet      isa         = Compiler::isaOfHWIntrinsic(intrinsicID);
     HWIntrinsicCategory category    = Compiler::categoryOfHWIntrinsic(intrinsicID);
-    HWIntrinsicFlag     flag        = Compiler::flagOfHWIntrinsic(intrinsicID);
+    HWIntrinsicFlag     flags       = Compiler::flagsOfHWIntrinsic(intrinsicID);
+    int                 ival        = Compiler::ivalOfHWIntrinsic(intrinsicID);
     int                 numArgs     = Compiler::numArgsOfHWIntrinsic(intrinsicID);
 
-    assert((flag & HW_Flag_NoCodeGen) == 0);
+    assert((flags & HW_Flag_NoCodeGen) == 0);
 
-    if (genIsTableDrivenHWIntrinsic(category))
+    if (genIsTableDrivenHWIntrinsic(category, flags))
     {
         GenTree*  op1        = node->gtGetOp1();
         GenTree*  op2        = node->gtGetOp2();
@@ -74,10 +77,35 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
             case 1:
                 genConsumeOperands(node);
                 op1Reg = op1->gtRegNum;
-                emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
+                if (category == HW_Category_MemoryLoad)
+                {
+                    emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
+                }
+                else if (category == HW_Category_SIMDScalar && (flags & HW_Flag_CopyUpperBits) != 0)
+                {
+                    emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
+                }
+                else
+                {
+
+                    emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
+                }
                 break;
+
             case 2:
-                genHWIntrinsic_R_R_RM(node, ins);
+                genConsumeOperands(node);
+                if (ival != -1)
+                {
+                    genHWIntrinsic_R_R_RM_I(node, ins);
+                }
+                else if (category == HW_Category_MemoryLoad)
+                {
+                    emit->emitIns_SIMD_R_R_AR(ins, emitTypeSize(TYP_SIMD16), targetReg, op1->gtRegNum, op2->gtRegNum);
+                }
+                else
+                {
+                    genHWIntrinsic_R_R_RM(node, ins);
+                }
                 break;
             case 3:
             {
@@ -329,14 +357,14 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
 
                 case GT_CLS_VAR_ADDR:
                 {
-                    emit->emitIns_SIMD_R_R_C_I(ins, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0, ival,
-                                               targetType);
+                    emit->emitIns_SIMD_R_R_C_I(ins, emitTypeSize(targetType), targetReg, op1Reg,
+                                               memBase->gtClsVar.gtClsVarHnd, 0, ival);
                     return;
                 }
 
                 default:
                 {
-                    emit->emitIns_SIMD_R_R_A_I(ins, targetReg, op1Reg, memIndir, ival, targetType);
+                    emit->emitIns_SIMD_R_R_A_I(ins, emitTypeSize(targetType), targetReg, op1Reg, memIndir, ival);
                     return;
                 }
             }
@@ -374,11 +402,11 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
         assert(offset != (unsigned)-1);
 
-        emit->emitIns_SIMD_R_R_S_I(ins, targetReg, op1Reg, varNum, offset, ival, targetType);
+        emit->emitIns_SIMD_R_R_S_I(ins, emitTypeSize(targetType), targetReg, op1Reg, varNum, offset, ival);
     }
     else
     {
-        emit->emitIns_SIMD_R_R_R_I(ins, targetReg, op1Reg, op2->gtRegNum, ival, targetType);
+        emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(targetType), targetReg, op1Reg, op2->gtRegNum, ival);
     }
 }
 
@@ -392,7 +420,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
     regNumber      targetReg   = node->gtRegNum;
     var_types      targetType  = node->TypeGet();
     var_types      baseType    = node->gtSIMDBaseType;
-    instruction    ins         = INS_invalid;
+    instruction    ins         = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
 
     regNumber op1Reg = REG_NA;
     regNumber op2Reg = REG_NA;
@@ -408,28 +436,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
 
     switch (intrinsicID)
     {
-        case NI_SSE_Add:
-        case NI_SSE_AddScalar:
-        case NI_SSE_And:
-        case NI_SSE_AndNot:
         case NI_SSE_ConvertToVector128SingleScalar:
-        case NI_SSE_Divide:
-        case NI_SSE_DivideScalar:
-        case NI_SSE_Max:
-        case NI_SSE_MaxScalar:
-        case NI_SSE_Min:
-        case NI_SSE_MinScalar:
-        case NI_SSE_MoveHighToLow:
-        case NI_SSE_MoveLowToHigh:
-        case NI_SSE_MoveScalar:
-        case NI_SSE_Multiply:
-        case NI_SSE_MultiplyScalar:
-        case NI_SSE_Or:
-        case NI_SSE_Subtract:
-        case NI_SSE_SubtractScalar:
-        case NI_SSE_UnpackHigh:
-        case NI_SSE_UnpackLow:
-        case NI_SSE_Xor:
         {
             assert(node->TypeGet() == TYP_SIMD16);
             assert(node->gtSIMDBaseType == TYP_FLOAT);
@@ -439,51 +446,14 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
             genHWIntrinsic_R_R_RM(node, ins);
             break;
         }
-
-        case NI_SSE_CompareEqual:
-        case NI_SSE_CompareEqualScalar:
-        case NI_SSE_CompareGreaterThan:
-        case NI_SSE_CompareGreaterThanScalar:
-        case NI_SSE_CompareGreaterThanOrEqual:
-        case NI_SSE_CompareGreaterThanOrEqualScalar:
-        case NI_SSE_CompareLessThan:
-        case NI_SSE_CompareLessThanScalar:
-        case NI_SSE_CompareLessThanOrEqual:
-        case NI_SSE_CompareLessThanOrEqualScalar:
-        case NI_SSE_CompareNotEqual:
-        case NI_SSE_CompareNotEqualScalar:
-        case NI_SSE_CompareNotGreaterThan:
-        case NI_SSE_CompareNotGreaterThanScalar:
-        case NI_SSE_CompareNotGreaterThanOrEqual:
-        case NI_SSE_CompareNotGreaterThanOrEqualScalar:
-        case NI_SSE_CompareNotLessThan:
-        case NI_SSE_CompareNotLessThanScalar:
-        case NI_SSE_CompareNotLessThanOrEqual:
-        case NI_SSE_CompareNotLessThanOrEqualScalar:
-        case NI_SSE_CompareOrdered:
-        case NI_SSE_CompareOrderedScalar:
-        case NI_SSE_CompareUnordered:
-        case NI_SSE_CompareUnorderedScalar:
-        {
-            assert(node->TypeGet() == TYP_SIMD16);
-            assert(node->gtSIMDBaseType == TYP_FLOAT);
-            assert(Compiler::ivalOfHWIntrinsic(intrinsicID) != -1);
-
-            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
-            genHWIntrinsic_R_R_RM_I(node, ins);
-            break;
-        }
-
         case NI_SSE_CompareEqualOrderedScalar:
         case NI_SSE_CompareEqualUnorderedScalar:
         {
             assert(baseType == TYP_FLOAT);
-            op2Reg = op2->gtRegNum;
-
-            regNumber   tmpReg = node->GetSingleTempReg();
-            instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
+            op2Reg           = op2->gtRegNum;
+            regNumber tmpReg = node->GetSingleTempReg();
 
-            emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16);
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
             emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
@@ -498,8 +468,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
             assert(baseType == TYP_FLOAT);
             op2Reg = op2->gtRegNum;
 
-            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
-            emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16);
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
@@ -511,8 +480,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
             assert(baseType == TYP_FLOAT);
             op2Reg = op2->gtRegNum;
 
-            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
-            emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16);
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
@@ -524,8 +492,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
             assert(baseType == TYP_FLOAT);
             op2Reg = op2->gtRegNum;
 
-            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
-            emit->emitIns_SIMD_R_R(ins, op2Reg, op1Reg, TYP_SIMD16);
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
@@ -537,8 +504,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
             assert(baseType == TYP_FLOAT);
             op2Reg = op2->gtRegNum;
 
-            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
-            emit->emitIns_SIMD_R_R(ins, op2Reg, op1Reg, TYP_SIMD16);
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
             break;
@@ -550,10 +516,9 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
             assert(baseType == TYP_FLOAT);
             op2Reg = op2->gtRegNum;
 
-            regNumber   tmpReg = node->GetSingleTempReg();
-            instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
+            regNumber tmpReg = node->GetSingleTempReg();
 
-            emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16);
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
             emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
@@ -562,84 +527,23 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
             break;
         }
 
-        case NI_SSE_ConvertToInt32:
-        case NI_SSE_ConvertToInt32WithTruncation:
-        case NI_SSE_ConvertToInt64:
-        case NI_SSE_ConvertToInt64WithTruncation:
-        case NI_SSE_Reciprocal:
-        case NI_SSE_ReciprocalSqrt:
-        case NI_SSE_Sqrt:
-        {
-            assert(baseType == TYP_FLOAT);
-            assert(op2 == nullptr);
-
-            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
-            emit->emitIns_SIMD_R_R(ins, targetReg, op1Reg, TYP_SIMD16);
-            break;
-        }
-
         case NI_SSE_ConvertToSingle:
         case NI_SSE_StaticCast:
         {
             assert(op2 == nullptr);
             if (op1Reg != targetReg)
             {
-                instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
-                emit->emitIns_SIMD_R_R(ins, targetReg, op1Reg, TYP_SIMD16);
+                emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg);
             }
             break;
         }
 
-        case NI_SSE_LoadAlignedVector128:
-        case NI_SSE_LoadScalar:
-        case NI_SSE_LoadVector128:
-        {
-            assert(baseType == TYP_FLOAT);
-            assert(op2 == nullptr);
-
-            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
-            emit->emitIns_R_AR(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, 0);
-            break;
-        }
-
-        case NI_SSE_LoadHigh:
-        case NI_SSE_LoadLow:
-        {
-            assert(baseType == TYP_FLOAT);
-            op2Reg = op2->gtRegNum;
-
-            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
-            emit->emitIns_SIMD_R_R_AR(ins, targetReg, op1Reg, op2Reg, TYP_SIMD16);
-            break;
-        }
-
         case NI_SSE_MoveMask:
         {
             assert(baseType == TYP_FLOAT);
             assert(op2 == nullptr);
 
-            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
-            emit->emitIns_SIMD_R_R(ins, targetReg, op1Reg, TYP_INT);
-            break;
-        }
-
-        case NI_SSE_ReciprocalScalar:
-        case NI_SSE_ReciprocalSqrtScalar:
-        case NI_SSE_SqrtScalar:
-        {
-            assert(baseType == TYP_FLOAT);
-            assert(op2 == nullptr);
-
-            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
-            emit->emitIns_SIMD_R_R_R(ins, targetReg, op1Reg, op1Reg, TYP_SIMD16);
-            break;
-        }
-
-        case NI_SSE_SetAllVector128:
-        {
-            assert(baseType == TYP_FLOAT);
-            assert(op2 == nullptr);
-            emit->emitIns_SIMD_R_R_R_I(INS_shufps, targetReg, op1Reg, op1Reg, 0, TYP_SIMD16);
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
             break;
         }
 
@@ -651,12 +555,12 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
             if (op1Reg == targetReg)
             {
                 regNumber tmpReg = node->GetSingleTempReg();
-                emit->emitIns_SIMD_R_R(INS_movaps, tmpReg, op1Reg, TYP_SIMD16);
+                emit->emitIns_R_R(INS_movaps, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg);
                 op1Reg = tmpReg;
             }
 
-            emit->emitIns_SIMD_R_R_R(INS_xorps, targetReg, targetReg, targetReg, TYP_SIMD16);
-            emit->emitIns_SIMD_R_R_R(INS_movss, targetReg, targetReg, op1Reg, TYP_SIMD16);
+            emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
+            emit->emitIns_SIMD_R_R_R(INS_movss, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg);
             break;
         }
 
@@ -665,7 +569,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
             assert(baseType == TYP_FLOAT);
             assert(op1 == nullptr);
             assert(op2 == nullptr);
-            emit->emitIns_SIMD_R_R_R(INS_xorps, targetReg, targetReg, targetReg, TYP_SIMD16);
+            emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
             break;
         }
 
@@ -699,7 +603,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
             if (op3->IsCnsIntOrI())
             {
                 ssize_t ival = op3->AsIntConCommon()->IconValue();
-                emit->emitIns_SIMD_R_R_R_I(INS_shufps, targetReg, op1Reg, op2Reg, (int)ival, TYP_SIMD16);
+                emit->emitIns_SIMD_R_R_R_I(INS_shufps, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, (int)ival);
             }
             else
             {
@@ -749,7 +653,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
                 for (unsigned i = 0; i < jmpCount; i++)
                 {
                     genDefineTempLabel(jmpTable[i]);
-                    emit->emitIns_SIMD_R_R_R_I(INS_shufps, targetReg, op1Reg, op2Reg, i, TYP_SIMD16);
+                    emit->emitIns_SIMD_R_R_R_I(INS_shufps, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, i);
                     emit->emitIns_J(INS_jmp, switchTableEnd);
                 }
 
index f9ccf7f..ec9c9d7 100644 (file)
@@ -15,7 +15,7 @@
     1) Each hardware intrinsic has a unique Intrinsic ID with type of `enum NamedIntrinsic`
     2) All the overloads of an intrinsic in an ISA class share one Intrinsic ID
     3) The intrinsic that generates instructions with a fixed imm8 operand has a `ival` field with "not -1" value, e.g., Sse.CompareEqual(v1,v2) -> cmpps xmm0, xmm1, 0
-    4) SIMD intrinsics have a non-zero `SIMD size` field based-on that operate over `Vector128<T>` (16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_addps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_) or `Vector256<T>`
+    4) SIMD intrinsics have a non-zero `SIMD size` field based-on that operate over `Vector128<T>`(16) or `Vector256<T>`(32)
     5) Scalar intrinsics that operate over general purpose registers (e.g., Sse41.Crc32) have `SIMD size` with 0
     6) Each intrinsic has a `NumArg` for number of parameters, and some intrinsics that are overloaded on multiple parameter numbers have this field with -1
     7) Each intrinsic has 10 `instructions` fields that list the instructions should be generated based-on the base type
 //  SSE Intrinsics          
 HARDWARE_INTRINSIC(SSE_IsSupported,                                  "get_IsSupported",                                  SSE,        -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
 HARDWARE_INTRINSIC(SSE_Add,                                          "Add",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_addps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)                 
-HARDWARE_INTRINSIC(SSE_AddScalar,                                    "AddScalar",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_addss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_AddScalar,                                    "AddScalar",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_addss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE_And,                                          "And",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_andps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
-HARDWARE_INTRINSIC(SSE_AndNot,                                       "AndNot",                                           SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_andnps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Fixed)
-HARDWARE_INTRINSIC(SSE_CompareEqual,                                 "CompareEqual",                                     SSE,        0,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar,                    "CompareEqualOrderedScalar",                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareEqualScalar,                           "CompareEqualScalar",                               SSE,        0,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar,                  "CompareEqualUnorderedScalar",                      SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_AndNot,                                       "AndNot",                                           SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_andnps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareEqual,                                 "CompareEqual",                                     SSE,        0,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar,                    "CompareEqualOrderedScalar",                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareEqualScalar,                           "CompareEqualScalar",                               SSE,        0,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_Commutative|HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar,                  "CompareEqualUnorderedScalar",                      SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE_CompareGreaterThan,                           "CompareGreaterThan",                               SSE,        6,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar,              "CompareGreaterThanOrderedScalar",                  SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar,                     "CompareGreaterThanScalar",                         SSE,        6,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar,            "CompareGreaterThanUnorderedScalar",                SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar,              "CompareGreaterThanOrderedScalar",                  SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar,                     "CompareGreaterThanScalar",                         SSE,        6,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar,            "CompareGreaterThanUnorderedScalar",                SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqual,                    "CompareGreaterThanOrEqual",                        SSE,        5,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar,       "CompareGreaterThanOrEqualOrderedScalar",           SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar,              "CompareGreaterThanOrEqualScalar",                  SSE,        5,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar,     "CompareGreaterThanOrEqualUnorderedScalar",         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar,       "CompareGreaterThanOrEqualOrderedScalar",           SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar,              "CompareGreaterThanOrEqualScalar",                  SSE,        5,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar,     "CompareGreaterThanOrEqualUnorderedScalar",         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE_CompareLessThan,                              "CompareLessThan",                                  SSE,        1,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar,                 "CompareLessThanOrderedScalar",                     SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanScalar,                        "CompareLessThanScalar",                            SSE,        1,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar,               "CompareLessThanUnorderedScalar",                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar,                 "CompareLessThanOrderedScalar",                     SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareLessThanScalar,                        "CompareLessThanScalar",                            SSE,        1,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar,               "CompareLessThanUnorderedScalar",                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqual,                       "CompareLessThanOrEqual",                           SSE,        2,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar,          "CompareLessThanOrEqualOrderedScalar",              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar,                 "CompareLessThanOrEqualScalar",                     SSE,        2,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar,        "CompareLessThanOrEqualUnorderedScalar",            SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotEqual,                              "CompareNotEqual",                                  SSE,        4,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar,                 "CompareNotEqualOrderedScalar",                     SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar,                        "CompareNotEqualScalar",                            SSE,        4,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar,               "CompareNotEqualUnorderedScalar",                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar,          "CompareLessThanOrEqualOrderedScalar",              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar,                 "CompareLessThanOrEqualScalar",                     SSE,        2,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar,        "CompareLessThanOrEqualUnorderedScalar",            SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareNotEqual,                              "CompareNotEqual",                                  SSE,        4,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar,                 "CompareNotEqualOrderedScalar",                     SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar,                        "CompareNotEqualScalar",                            SSE,        4,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar,               "CompareNotEqualUnorderedScalar",                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE_CompareNotGreaterThan,                        "CompareNotGreaterThan",                            SSE,        2,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar,                  "CompareNotGreaterThanScalar",                      SSE,        2,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar,                  "CompareNotGreaterThanScalar",                      SSE,        2,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqual,                 "CompareNotGreaterThanOrEqual",                     SSE,        1,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar,           "CompareNotGreaterThanOrEqualScalar",               SSE,        1,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar,           "CompareNotGreaterThanOrEqualScalar",               SSE,        1,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_CompareNotLessThan,                           "CompareNotLessThan",                               SSE,        5,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar,                     "CompareNotLessThanScalar",                         SSE,        5,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar,                     "CompareNotLessThanScalar",                         SSE,        5,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqual,                    "CompareNotLessThanOrEqual",                        SSE,        6,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar,              "CompareNotLessThanOrEqualScalar",                  SSE,        6,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar,              "CompareNotLessThanOrEqualScalar",                  SSE,        6,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_CompareOrdered,                               "CompareOrdered",                                   SSE,        7,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareOrderedScalar,                         "CompareOrderedScalar",                             SSE,        7,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareOrderedScalar,                         "CompareOrderedScalar",                             SSE,        7,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_CompareUnordered,                             "CompareUnordered",                                 SSE,        3,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar,                       "CompareUnorderedScalar",                           SSE,        3,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ConvertToInt32,                               "ConvertToInt32",                                   SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtss2si,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ConvertToInt64,                               "ConvertToInt64",                                   SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtss2si,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ConvertToSingle,                              "ConvertToSingle",                                  SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar,               "ConvertToVector128SingleScalar",                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtsi2ss,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation,                 "ConvertToInt32WithTruncation",                     SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvttss2si, INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation,                 "ConvertToInt64WithTruncation",                     SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvttss2si, INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_Divide,                                       "Divide",                                           SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_divps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_DivideScalar,                                 "DivideScalar",                                     SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_divss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_LoadAlignedVector128,                         "LoadAlignedVector128",                             SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movaps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_LoadHigh,                                     "LoadHigh",                                         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movhps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_LoadLow,                                      "LoadLow",                                          SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movlps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_LoadScalar,                                   "LoadScalar",                                       SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_LoadVector128,                                "LoadVector128",                                    SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movups,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_Max,                                          "Max",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_maxps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MaxScalar,                                    "MaxScalar",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_maxss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_Min,                                          "Min",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_minps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MinScalar,                                    "MinScalar",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_minss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MoveHighToLow,                                "MoveHighToLow",                                    SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movhlps,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MoveLowToHigh,                                "MoveLowToHigh",                                    SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movlhps,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MoveMask,                                     "MoveMask",                                         SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movmskps,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MoveScalar,                                   "MoveScalar",                                       SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_Multiply,                                     "Multiply",                                         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mulps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MultiplyScalar,                               "MultiplyScalar",                                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mulss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar,                       "CompareUnorderedScalar",                           SSE,        3,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToInt32,                               "ConvertToInt32",                                   SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtss2si,  INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_ConvertToInt64,                               "ConvertToInt64",                                   SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtss2si,  INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_ConvertToSingle,                              "ConvertToSingle",                                  SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_Helper,                            HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar,               "ConvertToVector128SingleScalar",                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtsi2ss,  INS_invalid},           HW_Category_Special,                           HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation,                 "ConvertToInt32WithTruncation",                     SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvttss2si, INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation,                 "ConvertToInt64WithTruncation",                     SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvttss2si, INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_Divide,                                       "Divide",                                           SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_divps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_DivideScalar,                                 "DivideScalar",                                     SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_divss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_LoadAlignedVector128,                         "LoadAlignedVector128",                             SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movaps,    INS_invalid},           HW_Category_MemoryLoad,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadHigh,                                     "LoadHigh",                                         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movhps,    INS_invalid},           HW_Category_MemoryLoad,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadLow,                                      "LoadLow",                                          SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movlps,    INS_invalid},           HW_Category_MemoryLoad,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadScalar,                                   "LoadScalar",                                       SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_MemoryLoad,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadVector128,                                "LoadVector128",                                    SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movups,    INS_invalid},           HW_Category_MemoryLoad,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Max,                                          "Max",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_maxps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_MaxScalar,                                    "MaxScalar",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_maxss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_Min,                                          "Min",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_minps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_MinScalar,                                    "MinScalar",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_minss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_MoveHighToLow,                                "MoveHighToLow",                                    SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movhlps,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_MoveLowToHigh,                                "MoveLowToHigh",                                    SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movlhps,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_MoveMask,                                     "MoveMask",                                         SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movmskps,  INS_invalid},           HW_Category_Special,                           HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_MoveScalar,                                   "MoveScalar",                                       SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_Multiply,                                     "Multiply",                                         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mulps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_MultiplyScalar,                               "MultiplyScalar",                                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mulss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE_Or,                                           "Or",                                               SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_orps,      INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_Reciprocal,                                   "Reciprocal",                                       SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rcpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)                 
-HARDWARE_INTRINSIC(SSE_ReciprocalScalar,                             "ReciprocalScalar",                                 SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rcpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ReciprocalScalar,                             "ReciprocalScalar",                                 SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rcpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE_ReciprocalSqrt,                               "ReciprocalSqrt",                                   SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rsqrtps,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar,                         "ReciprocalSqrtScalar",                             SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rsqrtss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_SetAllVector128,                              "SetAllVector128",                                  SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_SetScalar,                                    "SetScalar",                                        SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_SetVector128,                                 "SetVector128",                                     SSE,        -1,           16,           4,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_SetZeroVector128,                             "SetZeroVector128",                                 SSE,        -1,           16,           0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_Shuffle,                                      "Shuffle",                                          SSE,        -1,           16,           3,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_shufps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar,                         "ReciprocalSqrtScalar",                             SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rsqrtss,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE_SetAllVector128,                              "SetAllVector128",                                  SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_Helper,                            HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(SSE_SetScalar,                                    "SetScalar",                                        SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_Helper,                            HW_Flag_MultiIns)
+HARDWARE_INTRINSIC(SSE_SetVector128,                                 "SetVector128",                                     SSE,        -1,           16,           4,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_Helper,                            HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(SSE_SetZeroVector128,                             "SetZeroVector128",                                 SSE,        -1,           16,           0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_xorps,     INS_invalid},           HW_Category_Helper,                            HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Shuffle,                                      "Shuffle",                                          SSE,        -1,           16,           3,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_shufps,    INS_invalid},           HW_Category_IMM,                               HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_Sqrt,                                         "Sqrt",                                             SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_sqrtps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_SqrtScalar,                                   "SqrtScalar",                                       SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_sqrtss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_StaticCast,                                   "StaticCast",                                       SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movaps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-//HARDWARE_INTRINSIC(SSE_Store,                                        "Store",                                            SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movups,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-//HARDWARE_INTRINSIC(SSE_StoreAligned,                                 "StoreAligned",                                     SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movaps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-//HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal,                      "StoreAlignedNonTemporal",                          SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movntps,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-//HARDWARE_INTRINSIC(SSE_StoreHigh,                                    "StoreHigh",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movhps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-//HARDWARE_INTRINSIC(SSE_StoreLow,                                     "StoreLow",                                         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movlps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-//HARDWARE_INTRINSIC(SSE_StoreScalar,                                  "StoreScalar",                                      SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SqrtScalar,                                   "SqrtScalar",                                       SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_sqrtss,    INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE_StaticCast,                                   "StaticCast",                                       SSE,        -1,           16,           1,           {INS_movaps,    INS_movaps,    INS_movaps,    INS_movaps,    INS_movaps,    INS_movaps,    INS_movaps,    INS_movaps,    INS_movaps,    INS_movaps},            HW_Category_Helper,                            HW_Flag_TwoTypeGeneric)
 HARDWARE_INTRINSIC(SSE_Subtract,                                     "Subtract",                                         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_subps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_SubtractScalar,                               "SubtractScalar",                                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_subss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SubtractScalar,                               "SubtractScalar",                                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_subss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_UnpackHigh,                                   "UnpackHigh",                                       SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_unpckhps,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_UnpackLow,                                    "UnpackLow",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_unpcklps,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_Xor,                                          "Xor",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_xorps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Xor,                                          "Xor",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_xorps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
 
 //  SSE2 Intrinsics 
 HARDWARE_INTRINSIC(SSE2_IsSupported,                                 "get_IsSupported",                                  SSE2,       -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
index a96952d..80091f1 100644 (file)
@@ -16,7 +16,7 @@ struct HWIntrinsicInfo
     int                 numArgs;
     instruction         ins[10];
     HWIntrinsicCategory category;
-    HWIntrinsicFlag     flag;
+    HWIntrinsicFlag     flags;
 };
 
 static const HWIntrinsicInfo hwIntrinsicInfoArray[] = {
@@ -192,7 +192,7 @@ int Compiler::ivalOfHWIntrinsic(NamedIntrinsic intrinsic)
 static unsigned simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig)
 {
     assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
-    assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag & HW_Flag_UnfixedSIMDSize) == 0);
+    assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flags & HW_Flag_UnfixedSIMDSize) == 0);
     return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize;
 }
 
@@ -248,19 +248,19 @@ HWIntrinsicCategory Compiler::categoryOfHWIntrinsic(NamedIntrinsic intrinsic)
 }
 
 //------------------------------------------------------------------------
-// HWIntrinsicFlag: get the flag of the given intrinsic
+// HWIntrinsicFlag: get the flags of the given intrinsic
 //
 // Arguments:
 //    intrinsic -- id of the intrinsic function.
 //
 // Return Value:
-//     the flag of the given intrinsic
+//     the flags of the given intrinsic
 //
-HWIntrinsicFlag Compiler::flagOfHWIntrinsic(NamedIntrinsic intrinsic)
+HWIntrinsicFlag Compiler::flagsOfHWIntrinsic(NamedIntrinsic intrinsic)
 {
     assert(intrinsic != NI_Illegal);
     assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
-    return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag;
+    return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flags;
 }
 
 //------------------------------------------------------------------------
@@ -290,7 +290,7 @@ GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE
         assert(varTypeIsArithmetic(argType));
         arg = impPopStack().val;
         assert(varTypeIsArithmetic(arg->TypeGet()));
-        assert(genTypeSize(argType) <= genTypeSize(arg->TypeGet()));
+        assert(genActualType(arg->gtType) == genActualType(argType));
     }
     return arg;
 }
@@ -436,12 +436,10 @@ GenTree* Compiler::impUnsupportedHWIntrinsic(unsigned              helper,
 // Return Value:
 //    returns true if this category can be table-driven in the importer
 //
-static bool impIsTableDrivenHWIntrinsic(HWIntrinsicCategory category)
+static bool impIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags)
 {
-    // TODO - make more categories to the table-driven framework
-    const bool tableDrivenIntrinsic    = category == HW_Category_SimpleSIMD;
-    const bool nonTableDrivenIntrinsic = category == HW_Category_Special;
-    return tableDrivenIntrinsic && !nonTableDrivenIntrinsic;
+    // HW_Flag_NoCodeGen implies this intrinsic should be manually morphed in the importer.
+    return category != HW_Category_Special && category != HW_Category_Scalar && (flags & HW_Flag_NoCodeGen) == 0;
 }
 
 //------------------------------------------------------------------------
@@ -462,15 +460,24 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic        intrinsic,
 {
     InstructionSet      isa      = isaOfHWIntrinsic(intrinsic);
     HWIntrinsicCategory category = categoryOfHWIntrinsic(intrinsic);
+    HWIntrinsicFlag     flags    = flagsOfHWIntrinsic(intrinsic);
     int                 numArgs  = sig->numArgs;
-    var_types           callType = JITtype2varType(sig->retType);
+    var_types           retType  = JITtype2varType(sig->retType);
+    var_types           baseType = TYP_UNKNOWN;
+    if (retType == TYP_STRUCT && featureSIMD)
+    {
+        unsigned int sizeBytes;
+        baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes);
+        retType  = getSIMDTypeForSize(sizeBytes);
+        assert(sizeBytes != 0 && baseType != TYP_UNKNOWN);
+    }
 
     // This intrinsic is supported if
     // - the ISA is available on the underlying hardware (compSupports returns true)
     // - the compiler supports this hardware intrinsics (compSupportsHWIntrinsic returns true)
     // - intrinsics do not require 64-bit registers (r64) on 32-bit platforms (isTypeSupportedForIntrinsic returns
     // true)
-    bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa) && isTypeSupportedForIntrinsic(callType);
+    bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa) && isTypeSupportedForIntrinsic(retType);
 
     if (category == HW_Category_IsSupportedProperty)
     {
@@ -481,22 +488,59 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic        intrinsic,
     {
         return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
     }
+    else if (category == HW_Category_IMM)
+    {
+        GenTree* lastOp = impStackTop().val;
+        if (!lastOp->IsCnsIntOrI() && !mustExpand)
+        {
+            // When the imm-argument is not a constant and we are not being forced to expand, we need to
+            // return nullptr so a GT_CALL to the intrinsic method is emitted instead. The
+            // intrinsic method is recursive and will be forced to expand, at which point
+            // we emit some less efficient fallback code.
+            return nullptr;
+        }
+    }
+
+    if ((flags & HW_Flag_Generic) != 0)
+    {
+        assert(baseType != TYP_UNKNOWN);
+        // When the type argument is not a numeric type (and we are not being forced to expand), we need to
+        // return nullptr so a GT_CALL to the intrinsic method is emitted that will throw NotSupportedException
+        if (!varTypeIsArithmetic(baseType))
+        {
+            assert(!mustExpand);
+            return nullptr;
+        }
+
+        if ((flags & HW_Flag_TwoTypeGeneric) != 0)
+        {
+            // StaticCast<T, U> has two type parameters.
+            assert(!mustExpand);
+            assert(numArgs == 1);
+            var_types srcType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args));
+            assert(srcType != TYP_UNKNOWN);
+            if (!varTypeIsArithmetic(srcType))
+            {
+                return nullptr;
+            }
+        }
+    }
 
     // table-driven importer of simple intrinsics
-    if (impIsTableDrivenHWIntrinsic(category))
+    if (impIsTableDrivenHWIntrinsic(category, flags))
     {
-        unsigned int sizeBytes;
-        var_types    baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes);
-        assert(baseType != TYP_UNKNOWN && sizeBytes != 0);
-        var_types               retType  = getSIMDTypeForSize(sizeBytes);
+        if (!varTypeIsSIMD(retType))
+        {
+            baseType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args));
+            assert(baseType != TYP_UNKNOWN);
+        }
+
         unsigned                simdSize = simdSizeOfHWIntrinsic(intrinsic, sig);
         CORINFO_ARG_LIST_HANDLE argList  = sig->args;
         CORINFO_CLASS_HANDLE    argClass;
         var_types               argType = TYP_UNKNOWN;
 
-        assert(numArgs > 0);
-        assert(retType != TYP_UNDEF);
-        assert(retType == TYP_SIMD16 || retType == TYP_SIMD32);
+        assert(numArgs >= 0);
         assert(insOfHWIntrinsic(intrinsic, baseType) != INS_invalid);
         assert(simdSize == 32 || simdSize == 16);
 
@@ -506,10 +550,12 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic        intrinsic,
 
         switch (numArgs)
         {
+            case 0:
+                retNode = gtNewSimdHWIntrinsicNode(retType, intrinsic, baseType, simdSize);
+                break;
             case 1:
                 argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
                 op1     = getArgForHWIntrinsic(argType, argClass);
-
                 retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
                 break;
             case 2:
@@ -537,8 +583,7 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic        intrinsic,
                 argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
                 op1     = getArgForHWIntrinsic(argType, argClass);
 
-                op1     = gtNewArgList(op1, op2, op3);
-                retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
+                retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, intrinsic, baseType, simdSize);
                 break;
             }
             default:
@@ -653,11 +698,13 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic        intrinsic,
                                    CORINFO_SIG_INFO*     sig,
                                    bool                  mustExpand)
 {
-    GenTree* retNode = nullptr;
-    GenTree* op1     = nullptr;
-    GenTree* op2     = nullptr;
-    GenTree* op3     = nullptr;
-    GenTree* op4     = nullptr;
+    GenTree* retNode  = nullptr;
+    GenTree* op1      = nullptr;
+    GenTree* op2      = nullptr;
+    GenTree* op3      = nullptr;
+    GenTree* op4      = nullptr;
+    int      simdSize = simdSizeOfHWIntrinsic(intrinsic, sig);
+    assert(simdSize == 16);
 
     switch (intrinsic)
     {
@@ -671,112 +718,14 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic        intrinsic,
             op2 = impPopStack().val;
             op1 = impPopStack().val;
 
-            GenTree* left    = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op4, op3, NI_SSE_UnpackLow, TYP_FLOAT, 16);
-            GenTree* right   = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, op1, NI_SSE_UnpackLow, TYP_FLOAT, 16);
+            GenTree* left    = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op4, op3, NI_SSE_UnpackLow, TYP_FLOAT, simdSize);
+            GenTree* right   = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, op1, NI_SSE_UnpackLow, TYP_FLOAT, simdSize);
             GenTree* control = gtNewIconNode(68, TYP_UBYTE);
 
-            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, left, right, control, NI_SSE_Shuffle, TYP_FLOAT, 16);
+            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, left, right, control, NI_SSE_Shuffle, TYP_FLOAT, simdSize);
             break;
         }
 
-        case NI_SSE_Shuffle:
-        {
-            assert(sig->numArgs == 3);
-            assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
-
-            op3 = impStackTop().val;
-
-            if (op3->IsCnsIntOrI() || mustExpand)
-            {
-                impPopStack(); // Pop the value we peeked at
-                op2     = impSIMDPopStack(TYP_SIMD16);
-                op1     = impSIMDPopStack(TYP_SIMD16);
-                retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, op3, intrinsic, TYP_FLOAT, 16);
-            }
-            else
-            {
-                // When op3 is not a constant and we are not being forced to expand, we need to
-                // return nullptr so a GT_CALL to the intrinsic method is emitted instead. The
-                // intrinsic method is recursive and will be forced to expand, at which point
-                // we emit some less efficient fallback code.
-
-                return nullptr;
-            }
-            break;
-        }
-
-        case NI_SSE_Add:
-        case NI_SSE_AddScalar:
-        case NI_SSE_And:
-        case NI_SSE_AndNot:
-        case NI_SSE_CompareEqual:
-        case NI_SSE_CompareEqualScalar:
-        case NI_SSE_CompareGreaterThan:
-        case NI_SSE_CompareGreaterThanScalar:
-        case NI_SSE_CompareGreaterThanOrEqual:
-        case NI_SSE_CompareGreaterThanOrEqualScalar:
-        case NI_SSE_CompareLessThan:
-        case NI_SSE_CompareLessThanScalar:
-        case NI_SSE_CompareLessThanOrEqual:
-        case NI_SSE_CompareLessThanOrEqualScalar:
-        case NI_SSE_CompareNotEqual:
-        case NI_SSE_CompareNotEqualScalar:
-        case NI_SSE_CompareNotGreaterThan:
-        case NI_SSE_CompareNotGreaterThanScalar:
-        case NI_SSE_CompareNotGreaterThanOrEqual:
-        case NI_SSE_CompareNotGreaterThanOrEqualScalar:
-        case NI_SSE_CompareNotLessThan:
-        case NI_SSE_CompareNotLessThanScalar:
-        case NI_SSE_CompareNotLessThanOrEqual:
-        case NI_SSE_CompareNotLessThanOrEqualScalar:
-        case NI_SSE_CompareOrdered:
-        case NI_SSE_CompareOrderedScalar:
-        case NI_SSE_CompareUnordered:
-        case NI_SSE_CompareUnorderedScalar:
-        case NI_SSE_Divide:
-        case NI_SSE_DivideScalar:
-        case NI_SSE_Max:
-        case NI_SSE_MaxScalar:
-        case NI_SSE_Min:
-        case NI_SSE_MinScalar:
-        case NI_SSE_MoveHighToLow:
-        case NI_SSE_MoveLowToHigh:
-        case NI_SSE_MoveScalar:
-        case NI_SSE_Multiply:
-        case NI_SSE_MultiplyScalar:
-        case NI_SSE_Or:
-        case NI_SSE_Subtract:
-        case NI_SSE_SubtractScalar:
-        case NI_SSE_UnpackHigh:
-        case NI_SSE_UnpackLow:
-        case NI_SSE_Xor:
-            assert(sig->numArgs == 2);
-            assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
-            op2     = impSIMDPopStack(TYP_SIMD16);
-            op1     = impSIMDPopStack(TYP_SIMD16);
-            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, 16);
-            break;
-
-        case NI_SSE_CompareEqualOrderedScalar:
-        case NI_SSE_CompareEqualUnorderedScalar:
-        case NI_SSE_CompareGreaterThanOrderedScalar:
-        case NI_SSE_CompareGreaterThanUnorderedScalar:
-        case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
-        case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
-        case NI_SSE_CompareLessThanOrderedScalar:
-        case NI_SSE_CompareLessThanUnorderedScalar:
-        case NI_SSE_CompareLessThanOrEqualOrderedScalar:
-        case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
-        case NI_SSE_CompareNotEqualOrderedScalar:
-        case NI_SSE_CompareNotEqualUnorderedScalar:
-            assert(sig->numArgs == 2);
-            assert(JITtype2varType(sig->retType) == TYP_BOOL);
-            assert(getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)) == TYP_FLOAT);
-            op2     = impSIMDPopStack(TYP_SIMD16);
-            op1     = impSIMDPopStack(TYP_SIMD16);
-            retNode = gtNewSimdHWIntrinsicNode(TYP_BOOL, op1, op2, intrinsic, TYP_FLOAT, 16);
-            break;
-
         case NI_SSE_ConvertToVector128SingleScalar:
         {
             assert(sig->numArgs == 2);
@@ -797,18 +746,7 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic        intrinsic,
 
             op2     = impPopStack().val;
             op1     = impSIMDPopStack(TYP_SIMD16);
-            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, 16);
-            break;
-        }
-
-        case NI_SSE_LoadHigh:
-        case NI_SSE_LoadLow:
-        {
-            assert(sig->numArgs == 2);
-            assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
-            op2     = impPopStack().val;
-            op1     = impSIMDPopStack(TYP_SIMD16);
-            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, 16);
+            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, simdSize);
             break;
         }
 
@@ -817,77 +755,15 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic        intrinsic,
             assert(JITtype2varType(sig->retType) == TYP_INT);
             assert(getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)) == TYP_FLOAT);
             op1     = impSIMDPopStack(TYP_SIMD16);
-            retNode = gtNewSimdHWIntrinsicNode(TYP_INT, op1, intrinsic, TYP_FLOAT, 16);
-            break;
-
-        case NI_SSE_StaticCast:
-        {
-            assert(sig->numArgs == 1);
-            var_types tgtType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
-            var_types srcType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args));
-
-            if (varTypeIsArithmetic(tgtType) && varTypeIsArithmetic(srcType))
-            {
-                op1     = impSIMDPopStack(TYP_SIMD16);
-                retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, tgtType, 16);
-            }
-            else
-            {
-                return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
-            }
+            retNode = gtNewSimdHWIntrinsicNode(TYP_INT, op1, intrinsic, TYP_FLOAT, simdSize);
             break;
-        }
 
-        case NI_SSE_LoadAlignedVector128:
-        case NI_SSE_LoadScalar:
-        case NI_SSE_LoadVector128:
         case NI_SSE_SetAllVector128:
-        case NI_SSE_SetScalar:
             assert(sig->numArgs == 1);
             assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
             op1     = impPopStack().val;
-            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, TYP_FLOAT, 16);
-            break;
-
-        case NI_SSE_Reciprocal:
-        case NI_SSE_ReciprocalScalar:
-        case NI_SSE_ReciprocalSqrt:
-        case NI_SSE_ReciprocalSqrtScalar:
-        case NI_SSE_Sqrt:
-        case NI_SSE_SqrtScalar:
-            assert(sig->numArgs == 1);
-            assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
-            op1     = impSIMDPopStack(TYP_SIMD16);
-            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, TYP_FLOAT, 16);
-            break;
-
-        case NI_SSE_ConvertToInt32:
-        case NI_SSE_ConvertToInt32WithTruncation:
-        case NI_SSE_ConvertToInt64:
-        case NI_SSE_ConvertToInt64WithTruncation:
-        case NI_SSE_ConvertToSingle:
-        {
-            assert(sig->numArgs == 1);
-            assert(getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)) == TYP_FLOAT);
-            var_types callType = JITtype2varType(sig->retType);
-
-#ifdef _TARGET_X86_
-            if (varTypeIsLong(callType))
-            {
-                assert(intrinsic == NI_SSE_ConvertToInt64 || intrinsic == NI_SSE_ConvertToInt64WithTruncation);
-                return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
-            }
-#endif // _TARGET_X86_
-
-            op1     = impSIMDPopStack(TYP_SIMD16);
-            retNode = gtNewSimdHWIntrinsicNode(callType, op1, intrinsic, TYP_FLOAT, 16);
-            break;
-        }
-
-        case NI_SSE_SetZeroVector128:
-            assert(sig->numArgs == 0);
-            assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
-            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, intrinsic, TYP_FLOAT, 16);
+            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtCloneExpr(op1), gtNewIconNode(0), NI_SSE_Shuffle,
+                                               TYP_FLOAT, simdSize);
             break;
 
         default:
index f48a6ce..1b85332 100644 (file)
@@ -252,7 +252,6 @@ INST3( andnpd, "andnpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x55))    /
 INST3( orps,   "orps",   0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x56))    // Or packed singles
 INST3( orpd,   "orpd",   0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x56))    // Or packed doubles
 INST3( haddpd, "haddpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x7C))    // Horizontal add packed doubles
-INST3( rcpps,  "rcpps",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53))    // Reciprocals of Packed Singles
 
 // SSE 2 approx arith
 INST3( rcpps,   "rcpps",   0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53))    // Reciprocal of packed singles
index 4169b4a..a046704 100644 (file)
@@ -2305,14 +2305,17 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode)
 void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 {
     NamedIntrinsic      intrinsicID = node->gtHWIntrinsicId;
-    HWIntrinsicCategory category    = comp->categoryOfHWIntrinsic(intrinsicID);
-    int                 numArgs     = comp->numArgsOfHWIntrinsic(intrinsicID);
+    HWIntrinsicCategory category    = Compiler::categoryOfHWIntrinsic(intrinsicID);
+    HWIntrinsicFlag     flags       = Compiler::flagsOfHWIntrinsic(intrinsicID);
+    int                 numArgs     = Compiler::numArgsOfHWIntrinsic(intrinsicID);
     GenTree*            op1         = node->gtGetOp1();
     GenTree*            op2         = node->gtGetOp2();
 
     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
     // TODO-XArch-CQ: Non-VEX encoded instructions require memory ops to be aligned
-    if (category == HW_Category_SimpleSIMD && numArgs == 2 && comp->canUseVexEncoding())
+
+    if (comp->canUseVexEncoding() && numArgs == 2 && (flags & HW_Flag_NoContainment) == 0 &&
+        category == HW_Category_SimpleSIMD)
     {
         if (IsContainableMemoryOp(op2))
         {
@@ -2325,7 +2328,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
         }
     }
 
-    if (NamedIntrinsic == NI_SSE_Shuffle)
+    // TODO - change to all IMM intrinsics
+    if (intrinsicID == NI_SSE_Shuffle)
     {
         assert(op1->OperIsList());
         GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current();
index f56a36a..fbadd56 100644 (file)
@@ -2564,7 +2564,6 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree,
 
                 info->internalIntCount = 2;
                 info->setInternalCandidates(this, allRegs(TYP_INT));
-                break;
             }
             break;
         }
@@ -2577,7 +2576,6 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree,
             break;
 
         case NI_SSE41_BlendVariable:
-        {
             if (!compiler->canUseVexEncoding())
             {
                 // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
@@ -2589,7 +2587,6 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree,
                 info->hasDelayFreeSrc = true;
             }
             break;
-        }
 
 #ifdef _TARGET_X86_
         case NI_SSE42_Crc32:
index 8d5aac2..6387f60 100644 (file)
@@ -39,18 +39,33 @@ enum HWIntrinsicFlag : unsigned int
     // Generic
     // - must throw NotSupportException if the type argument is not numeric type
     HW_Flag_Generic = 0x4,
+    // Two-type Generic
+    // - the intrinsic has two type parameters
+    HW_Flag_TwoTypeGeneric = 0xC,
 
     // NoCodeGen
     // - should be transformed in the compiler front-end, cannot reach CodeGen
-    HW_Flag_NoCodeGen = 0x8,
+    HW_Flag_NoCodeGen = 0x10,
 
     // Unfixed SIMD-size
     // - overloaded on multiple vector sizes (SIMD size in the table is unreliable)
-    HW_Flag_UnfixedSIMDSize = 0x10,
+    HW_Flag_UnfixedSIMDSize = 0x20,
 
     // Complex overload
     // - the codegen of overloads cannot be determined by intrinsicID and base type
-    HW_Flag_ComplexOverloads = 0x20,
+    HW_Flag_ComplexOverloads = 0x40,
+
+    // Multi-instruction
+    // - that one intrinsic can generate multiple instructions
+    HW_Flag_MultiIns = 0x80,
+
+    // NoContainment
+    // the intrinsic cannot be contained
+    HW_Flag_NoContainment = 0x100,
+
+    // Copy Upper bits
+    // some SIMD scalar intrinsics need the semantics of copying upper bits from the source operand
+    HW_Flag_CopyUpperBits = 0x200,
 };
 
 inline HWIntrinsicFlag operator|(HWIntrinsicFlag c1, HWIntrinsicFlag c2)
@@ -63,7 +78,6 @@ enum HWIntrinsicCategory : unsigned int
     // Simple SIMD intrinsics
     // - take Vector128/256<T> parameters
     // - return a Vector128/256<T>
-    // - generate single instruction
     // - the codegen of overloads can be determined by intrinsicID and base type of returned vector
     HW_Category_SimpleSIMD,
 
@@ -79,6 +93,10 @@ enum HWIntrinsicCategory : unsigned int
     // - operate over general purpose registers, like crc32, lzcnt, popcnt, etc.
     HW_Category_Scalar,
 
+    // SIMD scalar
+    // - operate over vector registers(XMM), but just compute on the first element
+    HW_Category_SIMDScalar,
+
     // Memory access intrinsics
     // - e.g., Avx.Load, Avx.Store, Sse.LoadAligned
     HW_Category_MemoryLoad,