table drive Intel hardware intrinsic
author Fei Peng <fei.peng@intel.com>
Wed, 17 Jan 2018 02:35:38 +0000 (18:35 -0800)
committer Fei Peng <fei.peng@intel.com>
Thu, 18 Jan 2018 21:37:57 +0000 (13:37 -0800)
22 files changed:
src/jit/compiler.h
src/jit/emit.h
src/jit/emitfmtsxarch.h
src/jit/emitxarch.cpp
src/jit/emitxarch.h
src/jit/gentree.cpp
src/jit/hwintrinsiccodegenxarch.cpp
src/jit/hwintrinsiclistxarch.h
src/jit/hwintrinsicxarch.cpp
src/jit/instrsxarch.h
src/jit/lowerxarch.cpp
src/jit/lsraxarch.cpp
src/jit/namedintrinsiclist.h
tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply.cs [new file with mode: 0644]
tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_r.csproj [new file with mode: 0644]
tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_ro.csproj [new file with mode: 0644]
tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply.cs [new file with mode: 0644]
tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_r.csproj [new file with mode: 0644]
tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_ro.csproj [new file with mode: 0644]
tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply.cs [new file with mode: 0644]
tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_r.csproj [new file with mode: 0644]
tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_ro.csproj [new file with mode: 0644]

index 7f2f3b8..ea1c6b2 100644 (file)
@@ -3042,11 +3042,11 @@ protected:
     NamedIntrinsic lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method);
 
 #if FEATURE_HW_INTRINSICS
-    InstructionSet lookupHWIntrinsicISA(const char* className);
-    NamedIntrinsic lookupHWIntrinsic(const char* methodName, InstructionSet isa);
-    InstructionSet isaOfHWIntrinsic(NamedIntrinsic intrinsic);
-    bool isIntrinsicAnIsSupportedPropertyGetter(NamedIntrinsic intrinsic);
-    bool isFullyImplmentedISAClass(InstructionSet isa);
+    static InstructionSet lookupHWIntrinsicISA(const char* className);
+    static NamedIntrinsic lookupHWIntrinsic(const char* methodName, InstructionSet isa);
+    static InstructionSet isaOfHWIntrinsic(NamedIntrinsic intrinsic);
+    static bool isIntrinsicAnIsSupportedPropertyGetter(NamedIntrinsic intrinsic);
+    static bool isFullyImplmentedISAClass(InstructionSet isa);
 #ifdef _TARGET_XARCH_
     GenTree* impUnsupportedHWIntrinsic(unsigned              helper,
                                        CORINFO_METHOD_HANDLE method,
@@ -3119,7 +3119,12 @@ protected:
     bool compSupportsHWIntrinsic(InstructionSet isa);
     bool isScalarISA(InstructionSet isa);
     static int ivalOfHWIntrinsic(NamedIntrinsic intrinsic);
+    static int numArgsOfHWIntrinsic(NamedIntrinsic intrinsic);
     static instruction insOfHWIntrinsic(NamedIntrinsic intrinsic, var_types type);
+    static HWIntrinsicCategory categoryOfHWIntrinsic(NamedIntrinsic intrinsic);
+    static HWIntrinsicFlag flagOfHWIntrinsic(NamedIntrinsic intrinsic);
+    GenTree* getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass);
+    GenTreeArgList* buildArgList(CORINFO_SIG_INFO* sig);
 #endif // _TARGET_XARCH_
 #endif // FEATURE_HW_INTRINSICS
     GenTreePtr impArrayAccessIntrinsic(CORINFO_CLASS_HANDLE clsHnd,
index a602cfc..fd4ccbd 100644 (file)
@@ -938,6 +938,7 @@ protected:
             struct
             {
                 regNumber _idReg3 : REGNUM_BITS;
+                regNumber _idReg4 : REGNUM_BITS;
             };
 #endif // defined(_TARGET_XARCH_)
 
@@ -1119,6 +1120,19 @@ protected:
             idAddr()->_idReg3 = reg;
             assert(reg == idAddr()->_idReg3);
         }
+        // Returns the fourth register operand, read from the _idReg4 field reached through idAddr().
+        // Asserts that this descriptor is neither a "tiny" nor a "small" descriptor.
+        regNumber idReg4() const
+        {
+            assert(!idIsTiny());
+            assert(!idIsSmallDsc());
+            return idAddr()->_idReg4;
+        }
+        // Stores 'reg' as the fourth register operand in the _idReg4 field reached through idAddr().
+        // Asserts that this descriptor is neither a "tiny" nor a "small" descriptor.
+        void idReg4(regNumber reg)
+        {
+            assert(!idIsTiny());
+            assert(!idIsSmallDsc());
+            idAddr()->_idReg4 = reg;
+            // Read the field back to check that 'reg' was stored without being truncated.
+            assert(reg == idAddr()->_idReg4);
+        }
 #endif // defined(_TARGET_XARCH_)
 #ifdef _TARGET_ARMARCH_
         insOpts idInsOpt() const
index 953103a..b7ab38f 100644 (file)
@@ -110,6 +110,8 @@ IF_DEF(RRW_RRW_CNS, IS_R1_RW|IS_R2_RW,          SCNS)     // r/w    reg , r/w  r
 
 IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE)     // write  reg , read reg2 , read reg3
 IF_DEF(RWR_RRD_RRD_CNS, IS_R1_WR|IS_R2_RD|IS_R3_RD, SCNS) // write  reg , read reg2 , read reg3, const
+
+IF_DEF(RWR_RRD_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD|IS_R4_RD, NONE)     // write  reg , read reg2 , read reg3 , read reg4
 //----------------------------------------------------------------------------
 // The following formats are used for direct addresses (e.g. static data members)
 //----------------------------------------------------------------------------
index f55226f..0f6a9ff 100644 (file)
@@ -146,6 +146,7 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
         case INS_pminub:
         case INS_pminud:
         case INS_pminuw:
+        case INS_pmuldq:
         case INS_pmulld:
         case INS_pmullw:
         case INS_pmuludq:
@@ -4227,6 +4228,43 @@ void emitter::emitIns_R_R_S_I(
     emitCurIGsize += sz;
 }
 
+// Returns true if 'ins' is one of INS_vblendvps, INS_vblendvpd or INS_vpblendvb.
+static bool isAvxBlendv(instruction ins)
+{
+    return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb;
+}
+
+// Returns true if 'ins' is one of INS_blendvps, INS_blendvpd or INS_pblendvb.
+static bool isSse41Blendv(instruction ins)
+{
+    return ins == INS_blendvps || ins == INS_blendvpd || ins == INS_pblendvb;
+}
+
+//------------------------------------------------------------------------
+// emitIns_R_R_R_R: Add an instruction with a target register and three source registers.
+//    Only the AVX blendv instructions accepted by isAvxBlendv are supported, and VEX
+//    encoding must be in use (both are asserted).
+//
+// Arguments:
+//    ins       - the instruction to emit; must satisfy isAvxBlendv
+//    attr      - the emit attribute passed to emitNewInstrCns
+//    targetReg - recorded as register 1 of the descriptor
+//    reg1      - recorded as register 2 of the descriptor
+//    reg2      - recorded as register 3 of the descriptor
+//    reg3      - recorded as register 4 of the descriptor and also encoded into the
+//                descriptor's constant as (reg3 - XMMBASE) << 4
+//
+void emitter::emitIns_R_R_R_R(
+    instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, regNumber reg3)
+{
+    assert(isAvxBlendv(ins));
+    assert(UseVEXEncoding());
+    // Currently the VEX prefix is only used in its three-byte form.
+    // size = vex + opcode + ModR/M + 1-byte-cns(Reg) = 3 + 1 + 1 + 1 = 6
+    // TODO-XArch-CQ: We should create function which can calculate all kinds of AVX instructions size in future
+    UNATIVE_OFFSET sz = 6;
+
+    // AVX/AVX2 supports 4-reg format for vblendvps/vblendvpd/vpblendvb,
+    // which encodes the fourth register into imm8[7:4]
+    int ival = (reg3 - XMMBASE) << 4; // convert reg3 to ival
+
+    instrDesc* id = emitNewInstrCns(attr, ival);
+    id->idIns(ins);
+    id->idInsFmt(IF_RWR_RRD_RRD_RRD);
+    id->idReg1(targetReg);
+    id->idReg2(reg1);
+    id->idReg3(reg2);
+    id->idReg4(reg3);
+
+    id->idCodeSize(sz);
+    dispIns(id);
+    emitCurIGsize += sz;
+}
+
 /*****************************************************************************
  *
  *  Add an instruction with a register + static member operands.
@@ -5166,23 +5204,17 @@ void emitter::emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNu
 }
 
 #if FEATURE_HW_INTRINSICS
-void emitter::emitIns_SIMD_R_R(instruction ins, regNumber reg, regNumber reg1, var_types simdtype)
-{
-    emitIns_R_R(ins, emitTypeSize(simdtype), reg, reg1);
-}
-
-void emitter::emitIns_SIMD_R_R_A(
-    instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, var_types simdtype)
+void emitter::emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir)
 {
     if (UseVEXEncoding())
     {
-        emitIns_R_R_A(ins, emitTypeSize(simdtype), reg, reg1, indir, IF_RWR_RRD_ARD);
+        emitIns_R_R_A(ins, attr, reg, reg1, indir, IF_RWR_RRD_ARD);
     }
     else
     {
         if (reg1 != reg)
         {
-            emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+            emitIns_R_R(INS_movaps, attr, reg, reg1);
         }
         emitIns_R_A(ins, emitTypeSize(simdtype), reg, indir, IF_RRW_ARD);
     }
@@ -5205,51 +5237,90 @@ void emitter::emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, regNumber reg1
 }
 
+// emitIns_SIMD_R_R_C: emit 'ins' with target 'reg', first source 'reg1' and a second source
+// given by the field handle 'fldHnd' plus 'offs'. 'attr' is passed as the size attribute of
+// every instruction emitted here.
+//
+// With VEX encoding a single emitIns_R_R_C is issued. Otherwise 'reg1' is first copied into
+// 'reg' with movaps (skipped when they are the same register) and 'ins' is emitted on 'reg'
+// through emitIns_R_C.
 void emitter::emitIns_SIMD_R_R_C(
-    instruction ins, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, var_types simdtype)
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs)
 {
     if (UseVEXEncoding())
     {
-        emitIns_R_R_C(ins, emitTypeSize(simdtype), reg, reg1, fldHnd, offs);
+        emitIns_R_R_C(ins, attr, reg, reg1, fldHnd, offs);
     }
     else
     {
         if (reg1 != reg)
         {
-            emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+            emitIns_R_R(INS_movaps, attr, reg, reg1);
         }
-        emitIns_R_C(ins, emitTypeSize(simdtype), reg, fldHnd, offs);
+        emitIns_R_C(ins, attr, reg, fldHnd, offs);
     }
 }
 
-void emitter::emitIns_SIMD_R_R_R(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, var_types simdtype)
+// emitIns_SIMD_R_R_R: emit 'ins' with target 'reg' and sources 'reg1' and 'reg2'. 'attr' is
+// passed as the size attribute of every instruction emitted here.
+//
+// With VEX encoding a single three-register emitIns_R_R_R is issued. Otherwise 'reg1' is first
+// copied into 'reg' with movaps (skipped when they are the same register) and 'ins' is emitted
+// as a two-register instruction on 'reg' and 'reg2'.
+void emitter::emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2)
 {
     if (UseVEXEncoding())
     {
-        emitIns_R_R_R(ins, emitTypeSize(simdtype), reg, reg1, reg2);
+        emitIns_R_R_R(ins, attr, reg, reg1, reg2);
     }
     else
     {
         if (reg1 != reg)
         {
-            emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+            emitIns_R_R(INS_movaps, attr, reg, reg1);
         }
-        emitIns_R_R(ins, emitTypeSize(simdtype), reg, reg2);
+        emitIns_R_R(ins, attr, reg, reg2);
     }
 }
 
-void emitter::emitIns_SIMD_R_R_S(instruction ins, regNumber reg, regNumber reg1, int varx, int offs, var_types simdtype)
+// emitIns_SIMD_R_R_R_R: emit a blendv-style instruction with target 'reg' and sources 'reg1',
+// 'reg2' and 'reg3'. 'ins' must satisfy isAvxBlendv or isSse41Blendv; 'attr' is passed as the
+// size attribute of every instruction emitted here.
+//
+// With VEX encoding, an SSE4.1 blendv instruction is first mapped to its AVX counterpart and
+// a single four-register emitIns_R_R_R_R is issued. Otherwise 'reg3' is copied into XMM0 and
+// 'reg1' into 'reg' (each copy skipped when the registers already match) and 'ins' is emitted
+// as a two-register instruction on 'reg' and 'reg2'.
+void emitter::emitIns_SIMD_R_R_R_R(
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3)
 {
+    assert(isAvxBlendv(ins) || isSse41Blendv(ins));
     if (UseVEXEncoding())
     {
-        emitIns_R_R_S(ins, emitTypeSize(simdtype), reg, reg1, varx, offs);
+        // convert SSE encoding of SSE4.1 instructions to VEX encoding
+        switch (ins)
+        {
+            case INS_blendvps:
+                ins = INS_vblendvps;
+                break;
+            case INS_blendvpd:
+                ins = INS_vblendvpd;
+                break;
+            case INS_pblendvb:
+                ins = INS_vpblendvb;
+                break;
+            default:
+                break;
+        }
+        emitIns_R_R_R_R(ins, attr, reg, reg1, reg2, reg3);
     }
     else
     {
+        assert(isSse41Blendv(ins));
+        // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
+        // NOTE(review): if reg1 or reg2 is XMM0 while reg3 is not, the copy below overwrites it
+        // before it is read; presumably register allocation rules this out - confirm.
+        if (reg3 != REG_XMM0)
+        {
+            emitIns_R_R(INS_movaps, attr, REG_XMM0, reg3);
+        }
         if (reg1 != reg)
         {
-            emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+            emitIns_R_R(INS_movaps, attr, reg, reg1);
+        }
+        emitIns_R_R(ins, attr, reg, reg2);
+    }
+}
+
+// emitIns_SIMD_R_R_S: emit 'ins' with target 'reg', first source 'reg1' and a second source
+// identified by 'varx' and 'offs'. 'attr' is passed as the size attribute of every instruction
+// emitted here.
+//
+// With VEX encoding a single emitIns_R_R_S is issued. Otherwise 'reg1' is first copied into
+// 'reg' with movaps (skipped when they are the same register) and 'ins' is emitted on 'reg'
+// through emitIns_R_S.
+void emitter::emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs)
+{
+    if (UseVEXEncoding())
+    {
+        emitIns_R_R_S(ins, attr, reg, reg1, varx, offs);
+    }
+    else
+    {
+        if (reg1 != reg)
+        {
+            emitIns_R_R(INS_movaps, attr, reg, reg1);
         }
-        emitIns_R_S(ins, emitTypeSize(simdtype), reg, varx, offs);
+        emitIns_R_S(ins, attr, reg, varx, offs);
     }
 }
 
@@ -7653,6 +7724,14 @@ void emitter::emitDispIns(
             val = emitGetInsSC(id);
             goto PRINT_CONSTANT;
             break;
+        case IF_RWR_RRD_RRD_RRD:
+            assert(IsAVXOnlyInstruction(ins));
+            assert(UseVEXEncoding());
+            printf("%s, ", emitRegName(id->idReg1(), attr));
+            printf("%s, ", emitRegName(id->idReg2(), attr));
+            printf("%s, ", emitRegName(id->idReg3(), attr));
+            printf("%s", emitRegName(id->idReg4(), attr));
+            break;
         case IF_RRW_RRW_CNS:
             printf("%s,", emitRegName(id->idReg1(), attr));
             printf(" %s", emitRegName(id->idReg2(), attr));
@@ -10304,7 +10383,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id)
 
     instruction ins = id->idIns();
     assert(IsAVXInstruction(ins));
-    assert(IsThreeOperandAVXInstruction(ins));
+    assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins));
     regNumber targetReg = id->idReg1();
     regNumber src1      = id->idReg2();
     regNumber src2      = id->idReg3();
@@ -11570,6 +11649,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             sz  = emitSizeOfInsDsc(id);
             break;
         case IF_RWR_RRD_RRD_CNS:
+        case IF_RWR_RRD_RRD_RRD:
             dst = emitOutputRRR(dst, id);
             sz  = emitSizeOfInsDsc(id);
             dst += emitOutputByte(dst, emitGetInsSC(id));
index 9c176bc..0473447 100644 (file)
@@ -398,6 +398,8 @@ void emitIns_R_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber r
 
 void emitIns_R_R_S_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, int ival);
 
+void emitIns_R_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3, regNumber reg4);
+
 void emitIns_S(instruction ins, emitAttr attr, int varx, int offs);
 
 void emitIns_S_R(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs);
@@ -453,13 +455,7 @@ void emitIns_R_AX(instruction ins, emitAttr attr, regNumber ireg, regNumber reg,
 void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp);
 
 #if FEATURE_HW_INTRINSICS
-void emitIns_SIMD_R_R(instruction ins, regNumber reg, regNumber reg1, var_types simdtype);
-void emitIns_SIMD_R_R_A(instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, var_types simdtype);
 void emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, regNumber reg1, regNumber base, var_types simdtype);
-void emitIns_SIMD_R_R_C(
-    instruction ins, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, var_types simdtype);
-void emitIns_SIMD_R_R_R(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, var_types simdtype);
-void emitIns_SIMD_R_R_S(instruction ins, regNumber reg, regNumber reg1, int varx, int offs, var_types simdtype);
 void emitIns_SIMD_R_R_A_I(
     instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival, var_types simdtype);
 void emitIns_SIMD_R_R_C_I(instruction          ins,
@@ -472,6 +468,13 @@ void emitIns_SIMD_R_R_C_I(instruction          ins,
 void emitIns_SIMD_R_R_R_I(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, int ival, var_types simdtype);
 void emitIns_SIMD_R_R_S_I(
     instruction ins, regNumber reg, regNumber reg1, int varx, int offs, int ival, var_types simdtype);
+void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir);
+void emitIns_SIMD_R_R_C(
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs);
+void emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs);
+void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2);
+void emitIns_SIMD_R_R_R_R(
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3);
 #endif
 
 #if FEATURE_STACK_FP_X87
index 9fd4acf..5d1fdc2 100644 (file)
@@ -17914,20 +17914,20 @@ GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types      type,
 }
 
+// One-operand overload: passes 'op1' to SetOpLclRelatedToSIMDIntrinsic, then allocates a
+// GT_HWIntrinsic node built from 'type', 'op1', 'hwIntrinsicID', 'baseType' and 'simdSize'.
 GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(
-    var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned size)
+    var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned simdSize)
 {
     SetOpLclRelatedToSIMDIntrinsic(op1);
 
-    return new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(type, op1, hwIntrinsicID, baseType, size);
+    return new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(type, op1, hwIntrinsicID, baseType, simdSize);
 }
 
+// Two-operand overload: passes 'op1' and 'op2' to SetOpLclRelatedToSIMDIntrinsic, then allocates
+// a GT_HWIntrinsic node built from 'type', 'op1', 'op2', 'hwIntrinsicID', 'baseType' and 'simdSize'.
 GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(
-    var_types type, GenTree* op1, GenTree* op2, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned size)
+    var_types type, GenTree* op1, GenTree* op2, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned simdSize)
 {
     SetOpLclRelatedToSIMDIntrinsic(op1);
     SetOpLclRelatedToSIMDIntrinsic(op2);
 
-    return new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(type, op1, op2, hwIntrinsicID, baseType, size);
+    return new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(type, op1, op2, hwIntrinsicID, baseType, simdSize);
 }
 
 GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types      type,
index ed61c95..6ea0de7 100644 (file)
@@ -24,10 +24,94 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #include "gcinfo.h"
 #include "gcinfoencoder.h"
 
+//------------------------------------------------------------------------
+// genIsTableDrivenHWIntrinsic: Decide from its category whether a HW intrinsic
+//    is handled by the table-driven path in CodeGen
+//
+// Arguments:
+//    category - category of a HW intrinsic
+//
+// Return Value:
+//    returns true if this category can be table-driven in CodeGen
+//
+// Notes:
+//    Only HW_Category_SimpleSIMD is accepted at present. The HW_Category_Special
+//    exclusion cannot change the result while that is the only accepted category
+//    (assuming the two enumerators are distinct).
+//
+static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category)
+{
+    // TODO - move more categories to the table-driven framework
+    const bool tableDrivenIntrinsic    = category == HW_Category_SimpleSIMD;
+    const bool nonTableDrivenIntrinsic = category == HW_Category_Special;
+    return tableDrivenIntrinsic && !nonTableDrivenIntrinsic;
+}
+
 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
 {
-    NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
-    InstructionSet isa         = compiler->isaOfHWIntrinsic(intrinsicID);
+    NamedIntrinsic      intrinsicID = node->gtHWIntrinsicId;
+    InstructionSet      isa         = Compiler::isaOfHWIntrinsic(intrinsicID);
+    HWIntrinsicCategory category    = Compiler::categoryOfHWIntrinsic(intrinsicID);
+    HWIntrinsicFlag     flag        = Compiler::flagOfHWIntrinsic(intrinsicID);
+    int                 numArgs     = Compiler::numArgsOfHWIntrinsic(intrinsicID);
+
+    assert((flag & HW_Flag_NoCodeGen) == 0);
+
+    if (genIsTableDrivenHWIntrinsic(category))
+    {
+        GenTree*  op1        = node->gtGetOp1();
+        GenTree*  op2        = node->gtGetOp2();
+        regNumber targetReg  = node->gtRegNum;
+        var_types targetType = node->TypeGet();
+        var_types baseType   = node->gtSIMDBaseType;
+
+        regNumber op1Reg = REG_NA;
+        regNumber op2Reg = REG_NA;
+        emitter*  emit   = getEmitter();
+
+        assert(numArgs >= 0);
+        instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+        assert(ins != INS_invalid);
+        emitAttr simdSize = (emitAttr)(node->gtSIMDSize);
+        assert(simdSize != 0);
+
+        switch (numArgs)
+        {
+            case 1:
+                genConsumeOperands(node);
+                op1Reg = op1->gtRegNum;
+                emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
+                break;
+            case 2:
+                genHWIntrinsic_R_R_RM(node, ins);
+                break;
+            case 3:
+            {
+                assert(op1->OperIsList());
+                assert(op1->gtGetOp2()->OperIsList());
+                assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
+
+                GenTreeArgList* argList = op1->AsArgList();
+                op1                     = argList->Current();
+                genConsumeRegs(op1);
+                op1Reg = op1->gtRegNum;
+
+                argList = argList->Rest();
+                op2     = argList->Current();
+                genConsumeRegs(op2);
+                op2Reg = op2->gtRegNum;
+
+                argList      = argList->Rest();
+                GenTree* op3 = argList->Current();
+                genConsumeRegs(op3);
+                regNumber op3Reg = op3->gtRegNum;
+
+                emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg);
+                break;
+            }
+
+            default:
+                unreached();
+                break;
+        }
+        genProduceReg(node);
+        return;
+    }
+
     switch (isa)
     {
         case InstructionSet_SSE:
@@ -87,6 +171,7 @@ void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
     regNumber targetReg  = node->gtRegNum;
     GenTree*  op1        = node->gtGetOp1();
     GenTree*  op2        = node->gtGetOp2();
+    emitAttr  simdSize   = (emitAttr)(node->gtSIMDSize);
     emitter*  emit       = getEmitter();
 
     // TODO-XArch-CQ: Commutative operations can have op1 be contained
@@ -136,13 +221,13 @@ void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
 
                 case GT_CLS_VAR_ADDR:
                 {
-                    emit->emitIns_SIMD_R_R_C(ins, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0, targetType);
+                    emit->emitIns_SIMD_R_R_C(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
                     return;
                 }
 
                 default:
                 {
-                    emit->emitIns_SIMD_R_R_A(ins, targetReg, op1Reg, memIndir, targetType);
+                    emit->emitIns_SIMD_R_R_A(ins, simdSize, targetReg, op1Reg, memIndir);
                     return;
                 }
             }
@@ -180,11 +265,11 @@ void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
         assert(offset != (unsigned)-1);
 
-        emit->emitIns_SIMD_R_R_S(ins, targetReg, op1Reg, varNum, offset, targetType);
+        emit->emitIns_SIMD_R_R_S(ins, simdSize, targetReg, op1Reg, varNum, offset);
     }
     else
     {
-        emit->emitIns_SIMD_R_R_R(ins, targetReg, op1Reg, op2->gtRegNum, targetType);
+        emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum);
     }
 }
 
@@ -691,40 +776,6 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
 
     switch (intrinsicID)
     {
-        case NI_SSE2_Add:
-        {
-            assert(node->TypeGet() == TYP_SIMD16);
-
-            switch (baseType)
-            {
-                case TYP_DOUBLE:
-                    ins = INS_addpd;
-                    break;
-                case TYP_INT:
-                case TYP_UINT:
-                    ins = INS_paddd;
-                    break;
-                case TYP_LONG:
-                case TYP_ULONG:
-                    ins = INS_paddq;
-                    break;
-                case TYP_BYTE:
-                case TYP_UBYTE:
-                    ins = INS_paddb;
-                    break;
-                case TYP_SHORT:
-                case TYP_USHORT:
-                    ins = INS_paddw;
-                    break;
-                default:
-                    unreached();
-                    break;
-            }
-
-            genHWIntrinsic_R_R_RM(node, ins);
-            break;
-        }
-
         default:
             unreached();
             break;
@@ -800,27 +851,6 @@ void CodeGen::genAVXIntrinsic(GenTreeHWIntrinsic* node)
 
     switch (intrinsicID)
     {
-        case NI_AVX_Add:
-        {
-            assert(node->TypeGet() == TYP_SIMD32);
-
-            switch (baseType)
-            {
-                case TYP_DOUBLE:
-                    ins = INS_addpd;
-                    break;
-                case TYP_FLOAT:
-                    ins = INS_addps;
-                    break;
-                default:
-                    unreached();
-                    break;
-            }
-
-            genHWIntrinsic_R_R_RM(node, ins);
-            break;
-        }
-
         default:
             unreached();
             break;
@@ -839,37 +869,6 @@ void CodeGen::genAVX2Intrinsic(GenTreeHWIntrinsic* node)
 
     switch (intrinsicID)
     {
-        case NI_AVX2_Add:
-        {
-            assert(node->TypeGet() == TYP_SIMD32);
-
-            switch (baseType)
-            {
-                case TYP_INT:
-                case TYP_UINT:
-                    ins = INS_paddd;
-                    break;
-                case TYP_LONG:
-                case TYP_ULONG:
-                    ins = INS_paddq;
-                    break;
-                case TYP_BYTE:
-                case TYP_UBYTE:
-                    ins = INS_paddb;
-                    break;
-                case TYP_SHORT:
-                case TYP_USHORT:
-                    ins = INS_paddw;
-                    break;
-                default:
-                    unreached();
-                    break;
-            }
-
-            genHWIntrinsic_R_R_RM(node, ins);
-            break;
-        }
-
         default:
             unreached();
             break;
index cd5e59d..f9ccf7f 100644 (file)
 // clang-format off
 
 #if FEATURE_HW_INTRINSICS
-//                 Intrinsic ID                                     Function name                                   ISA
-//  SSE Intrinsics
-HARDWARE_INTRINSIC(SSE_IsSupported,                                 "get_IsSupported",                              SSE)
-HARDWARE_INTRINSIC(SSE_Add,                                         "Add",                                          SSE)
-HARDWARE_INTRINSIC(SSE_AddScalar,                                   "AddScalar",                                    SSE)
-HARDWARE_INTRINSIC(SSE_And,                                         "And",                                          SSE)
-HARDWARE_INTRINSIC(SSE_AndNot,                                      "AndNot",                                       SSE)
-HARDWARE_INTRINSIC(SSE_CompareEqual,                                "CompareEqual",                                 SSE)
-HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar,                   "CompareEqualOrderedScalar",                    SSE)
-HARDWARE_INTRINSIC(SSE_CompareEqualScalar,                          "CompareEqualScalar",                           SSE)
-HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar,                 "CompareEqualUnorderedScalar",                  SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThan,                          "CompareGreaterThan",                           SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar,             "CompareGreaterThanOrderedScalar",              SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar,                    "CompareGreaterThanScalar",                     SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar,           "CompareGreaterThanUnorderedScalar",            SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqual,                   "CompareGreaterThanOrEqual",                    SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar,      "CompareGreaterThanOrEqualOrderedScalar",       SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar,             "CompareGreaterThanOrEqualScalar",              SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar,    "CompareGreaterThanOrEqualUnorderedScalar",     SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThan,                             "CompareLessThan",                              SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar,                "CompareLessThanOrderedScalar",                 SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanScalar,                       "CompareLessThanScalar",                        SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar,              "CompareLessThanUnorderedScalar",               SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqual,                      "CompareLessThanOrEqual",                       SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar,         "CompareLessThanOrEqualOrderedScalar",          SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar,                "CompareLessThanOrEqualScalar",                 SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar,       "CompareLessThanOrEqualUnorderedScalar",        SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotEqual,                             "CompareNotEqual",                              SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar,                "CompareNotEqualOrderedScalar",                 SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar,                       "CompareNotEqualScalar",                        SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar,              "CompareNotEqualUnorderedScalar",               SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotGreaterThan,                       "CompareNotGreaterThan",                        SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar,                 "CompareNotGreaterThanScalar",                  SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqual,                "CompareNotGreaterThanOrEqual",                 SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar,          "CompareNotGreaterThanOrEqualScalar",           SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotLessThan,                          "CompareNotLessThan",                           SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar,                    "CompareNotLessThanScalar",                     SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqual,                   "CompareNotLessThanOrEqual",                    SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar,             "CompareNotLessThanOrEqualScalar",              SSE)
-HARDWARE_INTRINSIC(SSE_CompareOrdered,                              "CompareOrdered",                               SSE)
-HARDWARE_INTRINSIC(SSE_CompareOrderedScalar,                        "CompareOrderedScalar",                         SSE)
-HARDWARE_INTRINSIC(SSE_CompareUnordered,                            "CompareUnordered",                             SSE)
-HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar,                      "CompareUnorderedScalar",                       SSE)
-HARDWARE_INTRINSIC(SSE_ConvertToInt32,                              "ConvertToInt32",                               SSE)
-HARDWARE_INTRINSIC(SSE_ConvertToInt64,                              "ConvertToInt64",                               SSE)
-HARDWARE_INTRINSIC(SSE_ConvertToSingle,                             "ConvertToSingle",                              SSE)
-HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar,              "ConvertToVector128SingleScalar",               SSE)
-HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation,                "ConvertToInt32WithTruncation",                 SSE)
-HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation,                "ConvertToInt64WithTruncation",                 SSE)
-HARDWARE_INTRINSIC(SSE_Divide,                                      "Divide",                                       SSE)
-HARDWARE_INTRINSIC(SSE_DivideScalar,                                "DivideScalar",                                 SSE)
-HARDWARE_INTRINSIC(SSE_LoadAlignedVector128,                        "LoadAlignedVector128",                         SSE)
-HARDWARE_INTRINSIC(SSE_LoadHigh,                                    "LoadHigh",                                     SSE)
-HARDWARE_INTRINSIC(SSE_LoadLow,                                     "LoadLow",                                      SSE)
-HARDWARE_INTRINSIC(SSE_LoadScalar,                                  "LoadScalar",                                   SSE)
-HARDWARE_INTRINSIC(SSE_LoadVector128,                               "LoadVector128",                                SSE)
-HARDWARE_INTRINSIC(SSE_Max,                                         "Max",                                          SSE)
-HARDWARE_INTRINSIC(SSE_MaxScalar,                                   "MaxScalar",                                    SSE)
-HARDWARE_INTRINSIC(SSE_Min,                                         "Min",                                          SSE)
-HARDWARE_INTRINSIC(SSE_MinScalar,                                   "MinScalar",                                    SSE)
-HARDWARE_INTRINSIC(SSE_MoveHighToLow,                               "MoveHighToLow",                                SSE)
-HARDWARE_INTRINSIC(SSE_MoveLowToHigh,                               "MoveLowToHigh",                                SSE)
-HARDWARE_INTRINSIC(SSE_MoveMask,                                    "MoveMask",                                     SSE)
-HARDWARE_INTRINSIC(SSE_MoveScalar,                                  "MoveScalar",                                   SSE)
-HARDWARE_INTRINSIC(SSE_Multiply,                                    "Multiply",                                     SSE)
-HARDWARE_INTRINSIC(SSE_MultiplyScalar,                              "MultiplyScalar",                               SSE)
-HARDWARE_INTRINSIC(SSE_Or,                                          "Or",                                           SSE)
-HARDWARE_INTRINSIC(SSE_Reciprocal,                                  "Reciprocal",                                   SSE)
-HARDWARE_INTRINSIC(SSE_ReciprocalScalar,                            "ReciprocalScalar",                             SSE)
-HARDWARE_INTRINSIC(SSE_ReciprocalSqrt,                              "ReciprocalSqrt",                               SSE)
-HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar,                        "ReciprocalSqrtScalar",                         SSE)
-HARDWARE_INTRINSIC(SSE_SetAllVector128,                             "SetAllVector128",                              SSE)
-HARDWARE_INTRINSIC(SSE_SetScalar,                                   "SetScalar",                                    SSE)
-HARDWARE_INTRINSIC(SSE_SetVector128,                                "SetVector128",                                 SSE)
-HARDWARE_INTRINSIC(SSE_SetZeroVector128,                            "SetZeroVector128",                             SSE)
-HARDWARE_INTRINSIC(SSE_Shuffle,                                     "Shuffle",                                      SSE)
-HARDWARE_INTRINSIC(SSE_Sqrt,                                        "Sqrt",                                         SSE)
-HARDWARE_INTRINSIC(SSE_SqrtScalar,                                  "SqrtScalar",                                   SSE)
-HARDWARE_INTRINSIC(SSE_StaticCast,                                  "StaticCast",                                   SSE)
-HARDWARE_INTRINSIC(SSE_Store,                                       "Store",                                        SSE)
-HARDWARE_INTRINSIC(SSE_StoreAligned,                                "StoreAligned",                                 SSE)
-HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal,                     "StoreAlignedNonTemporal",                      SSE)
-HARDWARE_INTRINSIC(SSE_StoreHigh,                                   "StoreHigh",                                    SSE)
-HARDWARE_INTRINSIC(SSE_StoreLow,                                    "StoreLow",                                     SSE)
-HARDWARE_INTRINSIC(SSE_StoreScalar,                                 "StoreScalar",                                  SSE)
-HARDWARE_INTRINSIC(SSE_Subtract,                                    "Subtract",                                     SSE)
-HARDWARE_INTRINSIC(SSE_SubtractScalar,                              "SubtractScalar",                               SSE)
-HARDWARE_INTRINSIC(SSE_UnpackHigh,                                  "UnpackHigh",                                   SSE)
-HARDWARE_INTRINSIC(SSE_UnpackLow,                                   "UnpackLow",                                    SSE)
-HARDWARE_INTRINSIC(SSE_Xor,                                         "Xor",                                          SSE)
-
-//  SSE2 Intrinsics
-HARDWARE_INTRINSIC(SSE2_IsSupported,                                "get_IsSupported",                              SSE2)
-HARDWARE_INTRINSIC(SSE2_Add,                                        "Add",                                          SSE2)
-
-//  SSE3 Intrinsics
-HARDWARE_INTRINSIC(SSE3_IsSupported,                                "get_IsSupported",                              SSE3)
-
-//  SSSE3 Intrinsics
-HARDWARE_INTRINSIC(SSSE3_IsSupported,                               "get_IsSupported",                              SSSE3)
-
-//  SSE41 Intrinsics
-HARDWARE_INTRINSIC(SSE41_IsSupported,                               "get_IsSupported",                              SSE41)
-
-//  SSE42 Intrinsics
-HARDWARE_INTRINSIC(SSE42_IsSupported,                               "get_IsSupported",                              SSE42)
-HARDWARE_INTRINSIC(SSE42_Crc32,                                     "Crc32",                                        SSE42)
-
-//  AVX Intrinsics
-HARDWARE_INTRINSIC(AVX_IsSupported,                                 "get_IsSupported",                              AVX)
-HARDWARE_INTRINSIC(AVX_Add,                                         "Add",                                          AVX)
-
-//  AVX2 Intrinsics
-HARDWARE_INTRINSIC(AVX2_IsSupported,                                "get_IsSupported",                              AVX2)
-HARDWARE_INTRINSIC(AVX2_Add,                                        "Add",                                          AVX2)
-
-//  AES Intrinsics
-HARDWARE_INTRINSIC(AES_IsSupported,                                 "get_IsSupported",                              AES)
-
-//  BMI1 Intrinsics
-HARDWARE_INTRINSIC(BMI1_IsSupported,                                "get_IsSupported",                              BMI1)
-
-//  BMI2 Intrinsics
-HARDWARE_INTRINSIC(BMI2_IsSupported,                                "get_IsSupported",                              BMI2)
-
-//  FMA Intrinsics
-HARDWARE_INTRINSIC(FMA_IsSupported,                                 "get_IsSupported",                              FMA)
-
-//  LZCNT Intrinsics
-HARDWARE_INTRINSIC(LZCNT_IsSupported,                               "get_IsSupported",                              LZCNT)
-HARDWARE_INTRINSIC(LZCNT_LeadingZeroCount,                          "LeadingZeroCount",                             LZCNT)
+/* Note
+    1) Each hardware intrinsic has a unique Intrinsic ID with type of `enum NamedIntrinsic`
+    2) All the overloads of an intrinsic in an ISA class share one Intrinsic ID
+    3) An intrinsic that generates an instruction with a fixed imm8 operand has an `ival` field whose value is not -1, e.g., Sse.CompareEqual(v1,v2) -> cmpps xmm0, xmm1, 0
+    4) SIMD intrinsics have a non-zero `SIMD size` field, set according to whether they operate over `Vector128<T>` (16) or `Vector256<T>` (32)
+    5) Scalar intrinsics that operate over general purpose registers (e.g., Sse42.Crc32) have a `SIMD size` of 0
+    6) Each intrinsic has a `NumArg` field for its number of parameters; intrinsics whose overloads take different numbers of parameters set this field to -1
+    7) Each intrinsic has 10 `instructions` fields that list the instruction that should be generated for each base type
+    8) Each intrinsic has one category with type of `enum HWIntrinsicCategory`, please see the definition of HWIntrinsicCategory for details
+    9) Each intrinsic has one or more flags with type of `enum HWIntrinsicFlag`
+*/
+// ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
+//                  Intrinsic ID                                      Function name                                      ISA        ival       SIMD size       NumArg                                                                   instructions                                                                                              Category                                         Flags
+//                                                                                                                                                                            {TYP_BYTE,      TYP_UBYTE,     TYP_SHORT,     TYP_USHORT,    TYP_INT,       TYP_UINT,      TYP_LONG,      TYP_ULONG,     TYP_FLOAT,     TYP_DOUBLE}
+// ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
+//  SSE Intrinsics          
+//  IsSupported is a property getter: SIMD size 0, no arguments and no instruction for any base type.
+HARDWARE_INTRINSIC(SSE_IsSupported,                                  "get_IsSupported",                                  SSE,        -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+//  Packed single-precision add: only the TYP_FLOAT instruction slot is populated (addps).
+HARDWARE_INTRINSIC(SSE_Add,                                          "Add",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_addps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)                 
+HARDWARE_INTRINSIC(SSE_AddScalar,                                    "AddScalar",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_addss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+//  Bitwise AND over the whole vector (andps); operand order does not matter, hence the commutative flag.
+HARDWARE_INTRINSIC(SSE_And,                                          "And",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_andps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_AndNot,                                       "AndNot",                                           SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_andnps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Fixed)
+//  Compare family. Rows that use cmpps/cmpss carry the comparison predicate in `ival` (emitted as the imm8 operand):
+//      0 = EQ, 1 = LT, 2 = LE, 3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE, 7 = ORD
+//  Rows that use comiss/ucomiss take no immediate, so their `ival` is -1.
+//  NOTE(review): GreaterThan/GreaterThanOrEqual are encoded as NLE(6)/NLT(5) and NotGreaterThan/NotGreaterThanOrEqual
+//  as LE(2)/LT(1); these only match the named relation when neither operand is NaN - confirm this is intended.
+HARDWARE_INTRINSIC(SSE_CompareEqual,                                 "CompareEqual",                                     SSE,        0,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar,                    "CompareEqualOrderedScalar",                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareEqualScalar,                           "CompareEqualScalar",                               SSE,        0,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar,                  "CompareEqualUnorderedScalar",                      SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThan,                           "CompareGreaterThan",                               SSE,        6,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar,              "CompareGreaterThanOrderedScalar",                  SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar,                     "CompareGreaterThanScalar",                         SSE,        6,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar,            "CompareGreaterThanUnorderedScalar",                SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqual,                    "CompareGreaterThanOrEqual",                        SSE,        5,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar,       "CompareGreaterThanOrEqualOrderedScalar",           SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar,              "CompareGreaterThanOrEqualScalar",                  SSE,        5,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar,     "CompareGreaterThanOrEqualUnorderedScalar",         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThan,                              "CompareLessThan",                                  SSE,        1,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar,                 "CompareLessThanOrderedScalar",                     SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanScalar,                        "CompareLessThanScalar",                            SSE,        1,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar,               "CompareLessThanUnorderedScalar",                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqual,                       "CompareLessThanOrEqual",                           SSE,        2,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar,          "CompareLessThanOrEqualOrderedScalar",              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar,                 "CompareLessThanOrEqualScalar",                     SSE,        2,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar,        "CompareLessThanOrEqualUnorderedScalar",            SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotEqual,                              "CompareNotEqual",                                  SSE,        4,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar,                 "CompareNotEqualOrderedScalar",                     SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comiss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar,                        "CompareNotEqualScalar",                            SSE,        4,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar,               "CompareNotEqualUnorderedScalar",                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomiss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotGreaterThan,                        "CompareNotGreaterThan",                            SSE,        2,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar,                  "CompareNotGreaterThanScalar",                      SSE,        2,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqual,                 "CompareNotGreaterThanOrEqual",                     SSE,        1,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar,           "CompareNotGreaterThanOrEqualScalar",               SSE,        1,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotLessThan,                           "CompareNotLessThan",                               SSE,        5,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar,                     "CompareNotLessThanScalar",                         SSE,        5,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqual,                    "CompareNotLessThanOrEqual",                        SSE,        6,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar,              "CompareNotLessThanOrEqualScalar",                  SSE,        6,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareOrdered,                               "CompareOrdered",                                   SSE,        7,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareOrderedScalar,                         "CompareOrderedScalar",                             SSE,        7,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareUnordered,                             "CompareUnordered",                                 SSE,        3,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar,                       "CompareUnorderedScalar",                           SSE,        3,            16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+//  Scalar conversions. The Int32 and Int64 variants list the same instruction (cvtss2si / cvttss2si).
+//  NOTE(review): presumably the 32- vs 64-bit operand size is selected elsewhere - verify against codegen.
+HARDWARE_INTRINSIC(SSE_ConvertToInt32,                               "ConvertToInt32",                                   SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtss2si,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToInt64,                               "ConvertToInt64",                                   SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtss2si,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToSingle,                              "ConvertToSingle",                                  SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar,               "ConvertToVector128SingleScalar",                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtsi2ss,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation,                 "ConvertToInt32WithTruncation",                     SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvttss2si, INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation,                 "ConvertToInt64WithTruncation",                     SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvttss2si, INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+//  Division in packed (divps) and scalar (divss) form; operand order matters, so neither entry is flagged commutative.
+HARDWARE_INTRINSIC(SSE_Divide,                                       "Divide",                                           SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_divps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_DivideScalar,                                 "DivideScalar",                                     SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_divss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+//  Loads. LoadHigh/LoadLow take two arguments (NumArg = 2); the other loads take one.
+HARDWARE_INTRINSIC(SSE_LoadAlignedVector128,                         "LoadAlignedVector128",                             SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movaps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadHigh,                                     "LoadHigh",                                         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movhps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadLow,                                      "LoadLow",                                          SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movlps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadScalar,                                   "LoadScalar",                                       SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadVector128,                                "LoadVector128",                                    SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movups,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+//  Max/Min in packed (maxps/minps) and scalar (maxss/minss) form; none of them is flagged commutative.
+//  NOTE(review): presumably because these instructions return the second operand for NaN inputs, which makes an
+//  operand swap observable - confirm.
+HARDWARE_INTRINSIC(SSE_Max,                                          "Max",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_maxps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MaxScalar,                                    "MaxScalar",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_maxss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Min,                                          "Min",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_minps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MinScalar,                                    "MinScalar",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_minss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+//  Register moves and mask extraction; MoveMask is the only single-argument entry in this group.
+HARDWARE_INTRINSIC(SSE_MoveHighToLow,                                "MoveHighToLow",                                    SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movhlps,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MoveLowToHigh,                                "MoveLowToHigh",                                    SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movlhps,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MoveMask,                                     "MoveMask",                                         SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movmskps,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MoveScalar,                                   "MoveScalar",                                       SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Multiply,                                     "Multiply",                                         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mulps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MultiplyScalar,                               "MultiplyScalar",                                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mulss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Or,                                           "Or",                                               SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_orps,      INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Reciprocal,                                   "Reciprocal",                                       SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rcpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)                 
+HARDWARE_INTRINSIC(SSE_ReciprocalScalar,                             "ReciprocalScalar",                                 SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rcpss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ReciprocalSqrt,                               "ReciprocalSqrt",                                   SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rsqrtps,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar,                         "ReciprocalSqrtScalar",                             SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rsqrtss,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SetAllVector128,                              "SetAllVector128",                                  SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SetScalar,                                    "SetScalar",                                        SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SetVector128,                                 "SetVector128",                                     SSE,        -1,           16,           4,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SetZeroVector128,                             "SetZeroVector128",                                 SSE,        -1,           16,           0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Shuffle,                                      "Shuffle",                                          SSE,        -1,           16,           3,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_shufps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Sqrt,                                         "Sqrt",                                             SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_sqrtps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SqrtScalar,                                   "SqrtScalar",                                       SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_sqrtss,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_StaticCast,                                   "StaticCast",                                       SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movaps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+//HARDWARE_INTRINSIC(SSE_Store,                                        "Store",                                            SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movups,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+//HARDWARE_INTRINSIC(SSE_StoreAligned,                                 "StoreAligned",                                     SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movaps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+//HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal,                      "StoreAlignedNonTemporal",                          SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movntps,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+//HARDWARE_INTRINSIC(SSE_StoreHigh,                                    "StoreHigh",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movhps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+//HARDWARE_INTRINSIC(SSE_StoreLow,                                     "StoreLow",                                         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movlps,    INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+//HARDWARE_INTRINSIC(SSE_StoreScalar,                                  "StoreScalar",                                      SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Subtract,                                     "Subtract",                                         SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_subps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SubtractScalar,                               "SubtractScalar",                                   SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_subss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_UnpackHigh,                                   "UnpackHigh",                                       SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_unpckhps,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_UnpackLow,                                    "UnpackLow",                                        SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_unpcklps,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Xor,                                          "Xor",                                              SSE,        -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_xorps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+
+//  SSE2 Intrinsics 
+HARDWARE_INTRINSIC(SSE2_IsSupported,                                 "get_IsSupported",                                  SSE2,       -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+HARDWARE_INTRINSIC(SSE2_Add,                                         "Add",                                              SSE2,       -1,           16,           2,           {INS_paddb,     INS_paddb,     INS_paddw,     INS_paddw,     INS_paddd,     INS_paddd,     INS_paddq,     INS_paddq,     INS_invalid,   INS_addpd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)                 
+
+//  SSE3 Intrinsics 
+HARDWARE_INTRINSIC(SSE3_IsSupported,                                 "get_IsSupported",                                  SSE3,       -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+
+//  SSSE3 Intrinsics 
+HARDWARE_INTRINSIC(SSSE3_IsSupported,                                "get_IsSupported",                                  SSSE3,      -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+
+//  SSE41 Intrinsics 
+HARDWARE_INTRINSIC(SSE41_IsSupported,                                "get_IsSupported",                                  SSE41,      -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+HARDWARE_INTRINSIC(SSE41_Multiply,                                   "Multiply",                                         SSE41,      -1,           16,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_pmuldq,    INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)                 
+HARDWARE_INTRINSIC(SSE41_BlendVariable,                              "BlendVariable",                                    SSE41,      -1,           16,           3,           {INS_pblendvb,  INS_pblendvb,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_blendvps,  INS_blendvpd},          HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)                 
+
+//  SSE42 Intrinsics 
+HARDWARE_INTRINSIC(SSE42_IsSupported,                                "get_IsSupported",                                  SSE42,      -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+HARDWARE_INTRINSIC(SSE42_Crc32,                                      "Crc32",                                            SSE42,      -1,           0,            2,           {INS_invalid,   INS_crc32,     INS_invalid,   INS_crc32,     INS_invalid,   INS_crc32,     INS_invalid,   INS_crc32,     INS_invalid,   INS_invalid},           HW_Category_Scalar,                            HW_Flag_NoFlag)                 
+
+//  AVX Intrinsics 
+HARDWARE_INTRINSIC(AVX_IsSupported,                                  "get_IsSupported",                                  AVX,        -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+HARDWARE_INTRINSIC(AVX_Add,                                          "Add",                                              AVX,        -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_addps,     INS_addpd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)                 
+HARDWARE_INTRINSIC(AVX_Multiply,                                     "Multiply",                                         AVX,        -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mulps,     INS_mulpd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)                 
+HARDWARE_INTRINSIC(AVX_Reciprocal,                                   "Reciprocal",                                       AVX,        -1,           32,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rcpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)                 
+HARDWARE_INTRINSIC(AVX_BlendVariable,                                "BlendVariable",                                    AVX,        -1,           32,           3,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_vblendvps, INS_vblendvpd},         HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)                 
+
+//  AVX2 Intrinsics 
+HARDWARE_INTRINSIC(AVX2_IsSupported,                                 "get_IsSupported",                                  AVX2,       -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+HARDWARE_INTRINSIC(AVX2_Add,                                         "Add",                                              AVX2,       -1,           32,           2,           {INS_paddb,     INS_paddb,     INS_paddw,     INS_paddw,     INS_paddd,     INS_paddd,     INS_paddq,     INS_paddq,     INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)                 
+HARDWARE_INTRINSIC(AVX2_Multiply,                                    "Multiply",                                         AVX2,       -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_pmuldq,    INS_pmuludq,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)                 
+HARDWARE_INTRINSIC(AVX2_BlendVariable,                               "BlendVariable",                                    AVX2,       -1,           32,           3,           {INS_vpblendvb, INS_vpblendvb, INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)                 
+
+//  AES Intrinsics 
+HARDWARE_INTRINSIC(AES_IsSupported,                                  "get_IsSupported",                                  AES,        -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+
+//  BMI1 Intrinsics 
+HARDWARE_INTRINSIC(BMI1_IsSupported,                                 "get_IsSupported",                                  BMI1,       -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+
+//  BMI2 Intrinsics 
+HARDWARE_INTRINSIC(BMI2_IsSupported,                                 "get_IsSupported",                                  BMI2,       -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+
+//  FMA Intrinsics 
+HARDWARE_INTRINSIC(FMA_IsSupported,                                  "get_IsSupported",                                  FMA,        -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+
+//  LZCNT Intrinsics 
+HARDWARE_INTRINSIC(LZCNT_IsSupported,                                "get_IsSupported",                                  LZCNT,      -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+HARDWARE_INTRINSIC(LZCNT_LeadingZeroCount,                           "LeadingZeroCount",                                 LZCNT,      -1,           0,            1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_lzcnt,     INS_invalid,   INS_lzcnt,     INS_invalid,   INS_invalid},           HW_Category_Scalar,                            HW_Flag_NoFlag)                 
 
 //  PCLMULQDQ Intrinsics
-HARDWARE_INTRINSIC(PCLMULQDQ_IsSupported,                           "get_IsSupported",                              PCLMULQDQ)
+HARDWARE_INTRINSIC(PCLMULQDQ_IsSupported,                            "get_IsSupported",                                  PCLMULQDQ,  -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
 
 //  POPCNT Intrinsics
-HARDWARE_INTRINSIC(POPCNT_IsSupported,                              "get_IsSupported",                              POPCNT)
-HARDWARE_INTRINSIC(POPCNT_PopCount,                                 "PopCount",                                     POPCNT)
-#endif // FEATURE_HW_INTRINSICS
+HARDWARE_INTRINSIC(POPCNT_IsSupported,                               "get_IsSupported",                                  POPCNT,     -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)                 
+HARDWARE_INTRINSIC(POPCNT_PopCount,                                  "PopCount",                                         POPCNT,     -1,           0,            1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_popcnt,    INS_invalid,   INS_popcnt,    INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_Scalar,                            HW_Flag_NoFlag)                 
+#endif // FEATURE_HW_INTRINSICS
 
 #undef HARDWARE_INTRINSIC
 
index 5016cc0..a96952d 100644 (file)
@@ -8,13 +8,20 @@
 
 struct HWIntrinsicInfo
 {
-    NamedIntrinsic intrinsicID;
-    const char*    intrinsicName;
-    InstructionSet isa;
-}
+    NamedIntrinsic      intrinsicID;
+    const char*         intrinsicName;
+    InstructionSet      isa;
+    int                 ival;     // implicit imm8 value, or -1 (see ivalOfHWIntrinsic)
+    unsigned            simdSize; // SIMD size returned by simdSizeOfHWIntrinsic
+    int                 numArgs;  // number of arguments (see numArgsOfHWIntrinsic)
+    instruction         ins[10];  // per-type instructions, indexed by (type - TYP_BYTE)
+    HWIntrinsicCategory category;
+    HWIntrinsicFlag     flag;
+};
 
-static const hwIntrinsicInfoArray[] = {
-#define HARDWARE_INTRINSIC(id, name, isa) {NI_##id, name, InstructionSet_##isa},
+static const HWIntrinsicInfo hwIntrinsicInfoArray[] = { // indexed by (intrinsic - NI_HW_INTRINSIC_START - 1)
+#define HARDWARE_INTRINSIC(id, name, isa, ival, size, numarg, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, category, flag) \
+    {NI_##id, name, InstructionSet_##isa, ival, size, numarg, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, category, flag},
 #include "hwintrinsiclistxarch.h"
 };
 
@@ -153,64 +160,56 @@ InstructionSet Compiler::isaOfHWIntrinsic(NamedIntrinsic intrinsic)
 }
 
 //------------------------------------------------------------------------
-// ivalOfHWIntrinsic: get the imm8 value of the given intrinsic
+// ivalOfHWIntrinsic: get the imm8 value of this intrinsic from the hwIntrinsicInfoArray table
 //
 // Arguments:
 //    intrinsic -- id of the intrinsic function.
 //
 // Return Value:
-//     the imm8 value of the intrinsic, -1 for non-IMM intrinsics
+//     The imm8 value that is implicit for this intrinsic, or -1 for intrinsics that do not take an immediate, or for
+//     which the immediate is an explicit argument.
 //
 int Compiler::ivalOfHWIntrinsic(NamedIntrinsic intrinsic)
 {
     assert(intrinsic != NI_Illegal);
     assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
+    return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].ival; // id START + 1 is row 0
+}
 
-    switch (intrinsic)
-    {
-        case NI_SSE_CompareEqual:
-        case NI_SSE_CompareEqualScalar:
-            return 0;
-
-        case NI_SSE_CompareLessThan:
-        case NI_SSE_CompareLessThanScalar:
-        case NI_SSE_CompareNotGreaterThanOrEqual:
-        case NI_SSE_CompareNotGreaterThanOrEqualScalar:
-            return 1;
-
-        case NI_SSE_CompareLessThanOrEqual:
-        case NI_SSE_CompareLessThanOrEqualScalar:
-        case NI_SSE_CompareNotGreaterThan:
-        case NI_SSE_CompareNotGreaterThanScalar:
-            return 2;
-
-        case NI_SSE_CompareUnordered:
-        case NI_SSE_CompareUnorderedScalar:
-            return 3;
-
-        case NI_SSE_CompareNotEqual:
-        case NI_SSE_CompareNotEqualScalar:
-            return 4;
-
-        case NI_SSE_CompareGreaterThanOrEqual:
-        case NI_SSE_CompareGreaterThanOrEqualScalar:
-        case NI_SSE_CompareNotLessThan:
-        case NI_SSE_CompareNotLessThanScalar:
-            return 5;
-
-        case NI_SSE_CompareGreaterThan:
-        case NI_SSE_CompareGreaterThanScalar:
-        case NI_SSE_CompareNotLessThanOrEqual:
-        case NI_SSE_CompareNotLessThanOrEqualScalar:
-            return 6;
-
-        case NI_SSE_CompareOrdered:
-        case NI_SSE_CompareOrderedScalar:
-            return 7;
+//------------------------------------------------------------------------
+// simdSizeOfHWIntrinsic: get the SIMD size of this intrinsic
+//
+// Arguments:
+//    intrinsic -- id of the intrinsic function.
+//    sig       -- signature of the intrinsic method; currently unused (see the TODO below).
+//
+// Return Value:
+//     the SIMD size of this intrinsic
+//         - from the hwIntrinsicInfoArray table if intrinsic has NO HW_Flag_UnfixedSIMDSize
+//         - TODO-XArch-NYI - from the signature if intrinsic has HW_Flag_UnfixedSIMDSize
+//
+// Note - importer only; after importation (i.e., codegen) the SIMD size is available from GenTreeHWIntrinsic IR.
+static unsigned simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig)
+{
+    assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
+    assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag & HW_Flag_UnfixedSIMDSize) == 0);
+    return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize;
+}
 
-        default:
-            return -1;
-    }
+//------------------------------------------------------------------------
+// numArgsOfHWIntrinsic: get the number of arguments of this intrinsic from the hwIntrinsicInfoArray table
+//
+// Arguments:
+//    intrinsic -- id of the intrinsic function.
+//
+// Return Value:
+//     the numArgs value recorded for this intrinsic in the table
+//
+int Compiler::numArgsOfHWIntrinsic(NamedIntrinsic intrinsic)
+{
+    assert(intrinsic != NI_Illegal);
+    assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
+    return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].numArgs;
 }
 
 //------------------------------------------------------------------------
@@ -228,200 +227,72 @@ instruction Compiler::insOfHWIntrinsic(NamedIntrinsic intrinsic, var_types type)
 {
     assert(intrinsic != NI_Illegal);
     assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
+    assert(type >= TYP_BYTE && type <= TYP_DOUBLE); // bounds the ins[type - TYP_BYTE] lookup below
+    return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].ins[type - TYP_BYTE];
+}
 
-    switch (intrinsic)
-    {
-        case NI_SSE_Add:
-            return INS_addps;
-
-        case NI_SSE_AddScalar:
-            return INS_addss;
-
-        case NI_SSE_And:
-            return INS_andps;
-
-        case NI_SSE_AndNot:
-            return INS_andnps;
-
-        case NI_SSE_CompareEqual:
-        case NI_SSE_CompareGreaterThan:
-        case NI_SSE_CompareGreaterThanOrEqual:
-        case NI_SSE_CompareLessThan:
-        case NI_SSE_CompareLessThanOrEqual:
-        case NI_SSE_CompareNotEqual:
-        case NI_SSE_CompareNotGreaterThan:
-        case NI_SSE_CompareNotGreaterThanOrEqual:
-        case NI_SSE_CompareNotLessThan:
-        case NI_SSE_CompareNotLessThanOrEqual:
-        case NI_SSE_CompareOrdered:
-        case NI_SSE_CompareUnordered:
-            return INS_cmpps;
-
-        case NI_SSE_CompareEqualScalar:
-        case NI_SSE_CompareGreaterThanScalar:
-        case NI_SSE_CompareGreaterThanOrEqualScalar:
-        case NI_SSE_CompareLessThanScalar:
-        case NI_SSE_CompareLessThanOrEqualScalar:
-        case NI_SSE_CompareNotEqualScalar:
-        case NI_SSE_CompareNotGreaterThanScalar:
-        case NI_SSE_CompareNotGreaterThanOrEqualScalar:
-        case NI_SSE_CompareNotLessThanScalar:
-        case NI_SSE_CompareNotLessThanOrEqualScalar:
-        case NI_SSE_CompareOrderedScalar:
-        case NI_SSE_CompareUnorderedScalar:
-            return INS_cmpss;
-
-        case NI_SSE_CompareEqualOrderedScalar:
-        case NI_SSE_CompareGreaterThanOrderedScalar:
-        case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
-        case NI_SSE_CompareLessThanOrderedScalar:
-        case NI_SSE_CompareLessThanOrEqualOrderedScalar:
-        case NI_SSE_CompareNotEqualOrderedScalar:
-            return INS_comiss;
-
-        case NI_SSE_CompareEqualUnorderedScalar:
-        case NI_SSE_CompareGreaterThanUnorderedScalar:
-        case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
-        case NI_SSE_CompareLessThanUnorderedScalar:
-        case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
-        case NI_SSE_CompareNotEqualUnorderedScalar:
-            return INS_ucomiss;
-
-        case NI_SSE_ConvertToInt32:
-        case NI_SSE_ConvertToInt64:
-            return INS_cvtss2si;
-
-        case NI_SSE_ConvertToInt32WithTruncation:
-        case NI_SSE_ConvertToInt64WithTruncation:
-            return INS_cvttss2si;
-
-        case NI_SSE_ConvertToSingle:
-        case NI_SSE_LoadScalar:
-        case NI_SSE_MoveScalar:
-            return INS_movss;
-
-        case NI_SSE_ConvertToVector128SingleScalar:
-            return INS_cvtsi2ss;
-
-        case NI_SSE_Divide:
-            return INS_divps;
-
-        case NI_SSE_DivideScalar:
-            return INS_divss;
-
-        case NI_SSE_LoadAlignedVector128:
-        case NI_SSE_StaticCast:
-            return INS_movaps;
-
-        case NI_SSE_LoadHigh:
-            return INS_movhps;
-
-        case NI_SSE_LoadLow:
-            return INS_movlps;
-
-        case NI_SSE_LoadVector128:
-            return INS_movups;
-
-        case NI_SSE_Max:
-            return INS_maxps;
-
-        case NI_SSE_MaxScalar:
-            return INS_maxss;
-
-        case NI_SSE_Min:
-            return INS_minps;
-
-        case NI_SSE_MinScalar:
-            return INS_minss;
-
-        case NI_SSE_MoveHighToLow:
-            return INS_movhlps;
-
-        case NI_SSE_MoveLowToHigh:
-            return INS_movlhps;
-
-        case NI_SSE_MoveMask:
-            return INS_movmskps;
-
-        case NI_SSE_Multiply:
-            return INS_mulps;
-
-        case NI_SSE_MultiplyScalar:
-            return INS_mulss;
-
-        case NI_SSE_Or:
-            return INS_orps;
-
-        case NI_SSE_Reciprocal:
-            return INS_rcpps;
-
-        case NI_SSE_ReciprocalScalar:
-            return INS_rcpss;
-
-        case NI_SSE_ReciprocalSqrt:
-            return INS_rsqrtps;
-
-        case NI_SSE_ReciprocalSqrtScalar:
-            return INS_rsqrtss;
-
-        case NI_SSE_Sqrt:
-            return INS_sqrtps;
-
-        case NI_SSE_SqrtScalar:
-            return INS_sqrtss;
-
-        case NI_SSE_Subtract:
-            return INS_subps;
-
-        case NI_SSE_SubtractScalar:
-            return INS_subss;
-
-        case NI_SSE_UnpackHigh:
-            return INS_unpckhps;
-
-        case NI_SSE_UnpackLow:
-            return INS_unpcklps;
-
-        case NI_SSE_Xor:
-            return INS_xorps;
-
-        default:
-            return INS_invalid;
-    }
+//------------------------------------------------------------------------
+// categoryOfHWIntrinsic: get the category of the given intrinsic
+//
+// Arguments:
+//    intrinsic -- id of the intrinsic function.
+//
+// Return Value:
+//     the category of the given intrinsic
+//
+HWIntrinsicCategory Compiler::categoryOfHWIntrinsic(NamedIntrinsic intrinsic)
+{
+    assert(intrinsic != NI_Illegal);
+    assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
+    return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].category;
 }
 
 //------------------------------------------------------------------------
-// isIntrinsicAnIsSupportedPropertyGetter: return true if the intrinsic is "get_IsSupported"
+// flagOfHWIntrinsic: get the flag of the given intrinsic
 //
 // Arguments:
 //    intrinsic -- id of the intrinsic function.
 //
 // Return Value:
-//    true if the intrinsic is "get_IsSupported"
-//    Sometimes we need to specially treat "get_IsSupported"
-bool Compiler::isIntrinsicAnIsSupportedPropertyGetter(NamedIntrinsic intrinsic)
+//     the flag of the given intrinsic
+//
+HWIntrinsicFlag Compiler::flagOfHWIntrinsic(NamedIntrinsic intrinsic)
 {
-    switch (intrinsic)
+    assert(intrinsic != NI_Illegal);
+    assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
+    return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag;
+}
+
+//------------------------------------------------------------------------
+// getArgForHWIntrinsic: get the argument from the stack and match the signature
+//
+// Arguments:
+//    argType   -- the required type of argument
+//    argClass  -- the class handle of argType
+//
+// Return Value:
+//     the argument popped from the stack, checked against the signature type
+//
+GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass)
+{
+    GenTree* arg = nullptr;
+    if (argType == TYP_STRUCT)
     {
-        case NI_SSE_IsSupported:
-        case NI_SSE2_IsSupported:
-        case NI_SSE3_IsSupported:
-        case NI_SSSE3_IsSupported:
-        case NI_SSE41_IsSupported:
-        case NI_SSE42_IsSupported:
-        case NI_AVX_IsSupported:
-        case NI_AVX2_IsSupported:
-        case NI_AES_IsSupported:
-        case NI_BMI1_IsSupported:
-        case NI_BMI2_IsSupported:
-        case NI_FMA_IsSupported:
-        case NI_LZCNT_IsSupported:
-        case NI_PCLMULQDQ_IsSupported:
-        case NI_POPCNT_IsSupported:
-            return true;
-        default:
-            return false;
+        unsigned int argSizeBytes;
+        var_types    base = getBaseTypeAndSizeOfSIMDType(argClass, &argSizeBytes);
+        argType           = getSIMDTypeForSize(argSizeBytes);
+        assert(argType == TYP_SIMD32 || argType == TYP_SIMD16);
+        arg = impSIMDPopStack(argType);
+        assert(arg->TypeGet() == TYP_SIMD16 || arg->TypeGet() == TYP_SIMD32);
+    }
+    else
+    {
+        assert(varTypeIsArithmetic(argType));
+        arg = impPopStack().val;
+        assert(varTypeIsArithmetic(arg->TypeGet()));
+        assert(genTypeSize(argType) <= genTypeSize(arg->TypeGet()));
     }
+    return arg;
 }
 
 //------------------------------------------------------------------------
@@ -503,6 +374,15 @@ bool Compiler::compSupportsHWIntrinsic(InstructionSet isa)
                                                     isFullyImplmentedISAClass(isa));
 }
 
+static bool isTypeSupportedForIntrinsic(var_types type)
+{
+#ifdef _TARGET_X86_
+    return !varTypeIsLong(type);
+#else
+    return true;
+#endif
+}
+
 //------------------------------------------------------------------------
 // impUnsupportedHWIntrinsic: returns a node for an unsupported HWIntrinsic
 //
@@ -548,8 +428,24 @@ GenTree* Compiler::impUnsupportedHWIntrinsic(unsigned              helper,
 }
 
 //------------------------------------------------------------------------
+// impIsTableDrivenHWIntrinsic: check whether a category is imported by the table-driven path
+//
+// Arguments:
+//    category - category of a HW intrinsic
+//
+// Return Value:
+//    returns true if this category can be table-driven in the importer
+//
+static bool impIsTableDrivenHWIntrinsic(HWIntrinsicCategory category)
+{
+    // TODO - move more categories into the table-driven framework
+    const bool tableDrivenIntrinsic    = category == HW_Category_SimpleSIMD;
+    const bool nonTableDrivenIntrinsic = category == HW_Category_Special;
+    return tableDrivenIntrinsic && !nonTableDrivenIntrinsic;
+}
+
+//------------------------------------------------------------------------
 // impX86HWIntrinsic: dispatch hardware intrinsics to their own implementation
-// function
 //
 // Arguments:
 //    intrinsic -- id of the intrinsic function.
@@ -564,22 +460,94 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic        intrinsic,
                                      CORINFO_SIG_INFO*     sig,
                                      bool                  mustExpand)
 {
-    InstructionSet isa = isaOfHWIntrinsic(intrinsic);
+    InstructionSet      isa      = isaOfHWIntrinsic(intrinsic);
+    HWIntrinsicCategory category = categoryOfHWIntrinsic(intrinsic);
+    int                 numArgs  = sig->numArgs;
+    var_types           callType = JITtype2varType(sig->retType);
 
     // This intrinsic is supported if
     // - the ISA is available on the underlying hardware (compSupports returns true)
     // - the compiler supports this hardware intrinsics (compSupportsHWIntrinsic returns true)
-    bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa);
+    // - intrinsics do not require 64-bit registers (r64) on 32-bit platforms (isTypeSupportedForIntrinsic returns
+    // true)
+    bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa) && isTypeSupportedForIntrinsic(callType);
 
-    if (isIntrinsicAnIsSupportedPropertyGetter(intrinsic))
+    if (category == HW_Category_IsSupportedProperty)
     {
         return gtNewIconNode(issupported);
     }
+    // - calls to unsupported intrinsics must throw PlatformNotSupportedException
     else if (!issupported)
     {
         return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
     }
 
+    // table-driven importer of simple intrinsics
+    if (impIsTableDrivenHWIntrinsic(category))
+    {
+        unsigned int sizeBytes;
+        var_types    baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes);
+        assert(baseType != TYP_UNKNOWN && sizeBytes != 0);
+        var_types               retType  = getSIMDTypeForSize(sizeBytes);
+        unsigned                simdSize = simdSizeOfHWIntrinsic(intrinsic, sig);
+        CORINFO_ARG_LIST_HANDLE argList  = sig->args;
+        CORINFO_CLASS_HANDLE    argClass;
+        var_types               argType = TYP_UNKNOWN;
+
+        assert(numArgs > 0);
+        assert(retType != TYP_UNDEF);
+        assert(retType == TYP_SIMD16 || retType == TYP_SIMD32);
+        assert(insOfHWIntrinsic(intrinsic, baseType) != INS_invalid);
+        assert(simdSize == 32 || simdSize == 16);
+
+        GenTree* retNode = nullptr;
+        GenTree* op1     = nullptr;
+        GenTree* op2     = nullptr;
+
+        switch (numArgs)
+        {
+            case 1:
+                argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
+                op1     = getArgForHWIntrinsic(argType, argClass);
+
+                retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
+                break;
+            case 2:
+                argType = JITtype2varType(
+                    strip(info.compCompHnd->getArgType(sig, info.compCompHnd->getArgNext(argList), &argClass)));
+                op2 = getArgForHWIntrinsic(argType, argClass);
+
+                argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
+                op1     = getArgForHWIntrinsic(argType, argClass);
+
+                retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, baseType, simdSize);
+                break;
+
+            case 3:
+            {
+                CORINFO_ARG_LIST_HANDLE arg2 = info.compCompHnd->getArgNext(argList);
+                CORINFO_ARG_LIST_HANDLE arg3 = info.compCompHnd->getArgNext(arg2);
+
+                argType      = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg3, &argClass)));
+                GenTree* op3 = getArgForHWIntrinsic(argType, argClass);
+
+                argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg2, &argClass)));
+                op2     = getArgForHWIntrinsic(argType, argClass);
+
+                argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
+                op1     = getArgForHWIntrinsic(argType, argClass);
+
+                op1     = gtNewArgList(op1, op2, op3);
+                retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
+                break;
+            }
+            default:
+                unreached();
+        }
+        return retNode;
+    }
+
+    // other intrinsics need special importation
     switch (isa)
     {
         case InstructionSet_SSE:
@@ -940,14 +908,6 @@ GenTree* Compiler::impSSE2Intrinsic(NamedIntrinsic        intrinsic,
     var_types baseType = TYP_UNKNOWN;
     switch (intrinsic)
     {
-        case NI_SSE2_Add:
-            assert(sig->numArgs == 2);
-            op2      = impSIMDPopStack(TYP_SIMD16);
-            op1      = impSIMDPopStack(TYP_SIMD16);
-            baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
-            retNode  = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, NI_SSE2_Add, baseType, 16);
-            break;
-
         default:
             JITDUMP("Not implemented hardware intrinsic");
             break;
@@ -989,26 +949,17 @@ GenTree* Compiler::impSSE42Intrinsic(NamedIntrinsic        intrinsic,
     GenTree*  op2      = nullptr;
     var_types callType = JITtype2varType(sig->retType);
 
-    CORINFO_ARG_LIST_HANDLE argLst = sig->args;
+    CORINFO_ARG_LIST_HANDLE argList = sig->args;
     CORINFO_CLASS_HANDLE    argClass;
     CorInfoType             corType;
     switch (intrinsic)
     {
         case NI_SSE42_Crc32:
             assert(sig->numArgs == 2);
-
-#ifdef _TARGET_X86_
-            if (varTypeIsLong(callType))
-            {
-                return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
-            }
-#endif
-
-            op2 = impPopStack().val;
-            op1 = impPopStack().val;
-
-            argLst  = info.compCompHnd->getArgNext(argLst);                        // the second argument
-            corType = strip(info.compCompHnd->getArgType(sig, argLst, &argClass)); // type of the second argument
+            op2     = impPopStack().val;
+            op1     = impPopStack().val;
+            argList = info.compCompHnd->getArgNext(argList);                        // the second argument
+            corType = strip(info.compCompHnd->getArgType(sig, argList, &argClass)); // type of the second argument
 
             retNode = gtNewScalarHWIntrinsicNode(callType, op1, op2, NI_SSE42_Crc32);
 
@@ -1035,14 +986,6 @@ GenTree* Compiler::impAVXIntrinsic(NamedIntrinsic        intrinsic,
     var_types baseType = TYP_UNKNOWN;
     switch (intrinsic)
     {
-        case NI_AVX_Add:
-            assert(sig->numArgs == 2);
-            op2      = impSIMDPopStack(TYP_SIMD32);
-            op1      = impSIMDPopStack(TYP_SIMD32);
-            baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
-            retNode  = gtNewSimdHWIntrinsicNode(TYP_SIMD32, op1, op2, NI_AVX_Add, baseType, 32);
-            break;
-
         default:
             JITDUMP("Not implemented hardware intrinsic");
             break;
@@ -1061,14 +1004,6 @@ GenTree* Compiler::impAVX2Intrinsic(NamedIntrinsic        intrinsic,
     var_types baseType = TYP_UNKNOWN;
     switch (intrinsic)
     {
-        case NI_AVX2_Add:
-            assert(sig->numArgs == 2);
-            op2      = impSIMDPopStack(TYP_SIMD32);
-            op1      = impSIMDPopStack(TYP_SIMD32);
-            baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
-            retNode  = gtNewSimdHWIntrinsicNode(TYP_SIMD32, op1, op2, NI_AVX2_Add, baseType, 32);
-            break;
-
         default:
             JITDUMP("Not implemented hardware intrinsic");
             break;
@@ -1115,14 +1050,6 @@ GenTree* Compiler::impLZCNTIntrinsic(NamedIntrinsic        intrinsic,
 {
     assert(sig->numArgs == 1);
     var_types callType = JITtype2varType(sig->retType);
-
-#ifdef _TARGET_X86_
-    if (varTypeIsLong(callType))
-    {
-        return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
-    }
-#endif
-
     return gtNewScalarHWIntrinsicNode(callType, impPopStack().val, NI_LZCNT_LeadingZeroCount);
 }
 
@@ -1141,14 +1068,6 @@ GenTree* Compiler::impPOPCNTIntrinsic(NamedIntrinsic        intrinsic,
 {
     assert(sig->numArgs == 1);
     var_types callType = JITtype2varType(sig->retType);
-
-#ifdef _TARGET_X86_
-    if (varTypeIsLong(callType))
-    {
-        return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
-    }
-#endif
-
     return gtNewScalarHWIntrinsicNode(callType, impPopStack().val, NI_POPCNT_PopCount);
 }
 
index afb84a5..f48a6ce 100644 (file)
@@ -252,6 +252,7 @@ INST3( andnpd, "andnpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x55))    /
 INST3( orps,   "orps",   0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x56))    // Or packed singles
 INST3( orpd,   "orpd",   0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x56))    // Or packed doubles
 INST3( haddpd, "haddpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x7C))    // Horizontal add packed doubles
+INST3( rcpps,  "rcpps",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53))    // Reciprocals of Packed Singles
 
 // SSE 2 approx arith
 INST3( rcpps,   "rcpps",   0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53))    // Reciprocal of packed singles
@@ -381,6 +382,11 @@ INST3( roundps,      "roundps"     , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SS
 INST3( roundss,      "roundss"     , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x0A))   // Round scalar single precision floating-point values
 INST3( roundpd,      "roundpd"     , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x09))   // Round packed double precision floating-point values
 INST3( roundsd,      "roundsd"     , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x0B))   // Round scalar double precision floating-point values
+INST3( pmuldq,       "pmuldq"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x28))   // packed multiply 32-bit signed integers and store 64-bit result
+INST3( blendvps,     "blendvps"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x14))   // Variable Blend Packed Singles
+INST3( blendvpd,     "blendvpd"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x15))   // Variable Blend Packed Doubles
+INST3( pblendvb,     "pblendvb"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x10))   // Variable Blend Packed Bytes
+
 INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
 
 INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
@@ -398,6 +404,9 @@ INST3( vinserti128,  "inserti128"  , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SS
 INST3( vzeroupper,   "zeroupper"   , 0, IUM_WR, 0, 0, 0xC577F8,     BAD_CODE, BAD_CODE)      // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix)
 INST3( vperm2i128,   "perm2i128"   , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x46))   // Permute 128-bit halves of input register
 INST3( vpermq,       "permq"       , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x00))   // Permute 64-bit of input register
+INST3( vblendvps,    "blendvps"   , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x4A))   // Variable Blend Packed Singles
+INST3( vblendvpd,    "blendvpd"   , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x4B))   // Variable Blend Packed Doubles
+INST3( vpblendvb,    "pblendvb"   , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x4C))   // Variable Blend Packed Bytes
 INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
 
 // Scalar instructions in SSE4.2
index a6f8073..4169b4a 100644 (file)
@@ -2304,96 +2304,36 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode)
 //
 void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 {
-    NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
-    GenTree*       op1         = node->gtGetOp1();
-    GenTree*       op2         = node->gtGetOp2();
-
-    switch (node->gtHWIntrinsicId)
-    {
-        case NI_SSE_Add:
-        case NI_SSE_AddScalar:
-        case NI_SSE_And:
-        case NI_SSE_AndNot:
-        case NI_SSE_CompareEqual:
-        case NI_SSE_CompareEqualScalar:
-        case NI_SSE_CompareGreaterThan:
-        case NI_SSE_CompareGreaterThanScalar:
-        case NI_SSE_CompareGreaterThanOrEqual:
-        case NI_SSE_CompareGreaterThanOrEqualScalar:
-        case NI_SSE_CompareLessThan:
-        case NI_SSE_CompareLessThanScalar:
-        case NI_SSE_CompareLessThanOrEqual:
-        case NI_SSE_CompareLessThanOrEqualScalar:
-        case NI_SSE_CompareNotEqual:
-        case NI_SSE_CompareNotEqualScalar:
-        case NI_SSE_CompareNotGreaterThan:
-        case NI_SSE_CompareNotGreaterThanScalar:
-        case NI_SSE_CompareNotGreaterThanOrEqual:
-        case NI_SSE_CompareNotGreaterThanOrEqualScalar:
-        case NI_SSE_CompareNotLessThan:
-        case NI_SSE_CompareNotLessThanScalar:
-        case NI_SSE_CompareNotLessThanOrEqual:
-        case NI_SSE_CompareNotLessThanOrEqualScalar:
-        case NI_SSE_CompareOrdered:
-        case NI_SSE_CompareOrderedScalar:
-        case NI_SSE_CompareUnordered:
-        case NI_SSE_CompareUnorderedScalar:
-        case NI_SSE_ConvertToVector128SingleScalar:
-        case NI_SSE_Divide:
-        case NI_SSE_DivideScalar:
-        case NI_SSE_Max:
-        case NI_SSE_MaxScalar:
-        case NI_SSE_Min:
-        case NI_SSE_MinScalar:
-        case NI_SSE_Multiply:
-        case NI_SSE_MultiplyScalar:
-        case NI_SSE_Or:
-        case NI_SSE_Subtract:
-        case NI_SSE_SubtractScalar:
-        case NI_SSE_UnpackHigh:
-        case NI_SSE_UnpackLow:
-        case NI_SSE_Xor:
-        case NI_SSE2_Add:
-            if (!comp->getEmitter()->UseVEXEncoding())
-            {
-                // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
-                // TODO-XArch-CQ: Non-VEX encoded instructions require memory ops to be aligned
-                break;
-            }
-            __fallthrough;
+    NamedIntrinsic      intrinsicID = node->gtHWIntrinsicId;
+    HWIntrinsicCategory category    = comp->categoryOfHWIntrinsic(intrinsicID);
+    int                 numArgs     = comp->numArgsOfHWIntrinsic(intrinsicID);
+    GenTree*            op1         = node->gtGetOp1();
+    GenTree*            op2         = node->gtGetOp2();
 
-        case NI_AVX_Add:
-        case NI_AVX2_Add:
+    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
+    // TODO-XArch-CQ: Non-VEX encoded instructions require memory ops to be aligned
+    if (category == HW_Category_SimpleSIMD && numArgs == 2 && comp->canUseVexEncoding())
+    {
+        if (IsContainableMemoryOp(op2))
         {
-            assert(comp->getEmitter()->UseVEXEncoding());
-
-            if (IsContainableMemoryOp(op2))
-            {
-                MakeSrcContained(node, op2);
-            }
-            else
-            {
-                // TODO-XArch-CQ: Commutative operations can have op1 be contained
-                op2->SetRegOptional();
-            }
-            break;
+            MakeSrcContained(node, op2);
         }
-
-        case NI_SSE_Shuffle:
+        else
         {
-            assert(op1->OperIsList());
-            GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current();
-
-            if (op3->IsCnsIntOrI())
-            {
-                MakeSrcContained(node, op3);
-            }
-            break;
+            // TODO-XArch-CQ: Commutative operations can have op1 be contained
+            op2->SetRegOptional();
         }
+    }
 
-        default:
-            assert((intrinsicID > NI_HW_INTRINSIC_START) && (intrinsicID < NI_HW_INTRINSIC_END));
-            break;
+    if (intrinsicID == NI_SSE_Shuffle)
+    {
+        assert(op1->OperIsList());
+        GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current();
+
+        if (op3->IsCnsIntOrI())
+        {
+            MakeSrcContained(node, op3);
+        }
     }
 }
 #endif // FEATURE_HW_INTRINSICS
index 5e8924f..f56a36a 100644 (file)
@@ -2504,13 +2504,11 @@ void LinearScan::TreeNodeInfoInitSIMD(GenTreeSIMD* simdTree, TreeNodeInfo* info)
 void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, TreeNodeInfo* info)
 {
     NamedIntrinsic intrinsicID = intrinsicTree->gtHWIntrinsicId;
-    InstructionSet isa         = compiler->isaOfHWIntrinsic(intrinsicID);
-
+    InstructionSet isa         = Compiler::isaOfHWIntrinsic(intrinsicID);
     if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2)
     {
         SetContainsAVXFlags(true, 32);
     }
-
     GenTree* op1   = intrinsicTree->gtOp.gtOp1;
     GenTree* op2   = intrinsicTree->gtOp.gtOp2;
     info->srcCount = 0;
@@ -2519,15 +2517,10 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree,
     {
         if (op1->OperIsList())
         {
-            int srcCount = 0;
-
             for (GenTreeArgList* list = op1->AsArgList(); list != nullptr; list = list->Rest())
             {
-                GenTree* listItem = list->Current();
-                srcCount += GetOperandInfo(listItem);
+                info->srcCount += GetOperandInfo(list->Current());
             }
-
-            info->srcCount += srcCount;
         }
         else
         {
@@ -2583,6 +2576,21 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree,
             useList.Last()->info.isTgtPref = true;
             break;
 
+        case NI_SSE41_BlendVariable:
+        {
+            if (!compiler->canUseVexEncoding())
+            {
+                // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
+                LocationInfoListNode* op2Info = useList.Begin()->Next();
+                LocationInfoListNode* op3Info = op2Info->Next();
+                op2Info->info.isDelayFree     = true;
+                op3Info->info.isDelayFree     = true;
+                op3Info->info.setSrcCandidates(this, RBM_XMM0);
+                info->hasDelayFreeSrc = true;
+            }
+            break;
+        }
+
 #ifdef _TARGET_X86_
         case NI_SSE42_Crc32:
         {
index 1144df6..8d5aac2 100644 (file)
@@ -16,10 +16,83 @@ enum NamedIntrinsic : unsigned int
     NI_System_Collections_Generic_EqualityComparer_get_Default = 4,
 #if FEATURE_HW_INTRINSICS
     NI_HW_INTRINSIC_START,
-#define HARDWARE_INTRINSIC(id, name, isa) NI_##id,
+#define HARDWARE_INTRINSIC(id, name, isa, ival, size, numarg, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, category, flag) \
+    NI_##id,
 #include "hwintrinsiclistxarch.h"
     NI_HW_INTRINSIC_END
 #endif
 };
 
+#if FEATURE_HW_INTRINSICS && defined(_TARGET_XARCH_)
+enum HWIntrinsicFlag : unsigned int
+{
+    HW_Flag_NoFlag = 0,
+
+    // Commutative
+    // - if a binary-op intrinsic is commutative (e.g., Add, Multiply), its op1 can be contained
+    HW_Flag_Commutative = 0x1,
+
+    // Full range IMM intrinsic
+    // - the immediate value is valid on the full range of imm8 (0-255)
+    HW_Flag_FullRangeIMM = 0x2,
+
+    // Generic
+    // - must throw NotSupportedException if the type argument is not a numeric type
+    HW_Flag_Generic = 0x4,
+
+    // NoCodeGen
+    // - should be transformed in the compiler front-end, cannot reach CodeGen
+    HW_Flag_NoCodeGen = 0x8,
+
+    // Unfixed SIMD-size
+    // - overloaded on multiple vector sizes (SIMD size in the table is unreliable)
+    HW_Flag_UnfixedSIMDSize = 0x10,
+
+    // Complex overload
+    // - the codegen of overloads cannot be determined by intrinsicID and base type
+    HW_Flag_ComplexOverloads = 0x20,
+};
+
+inline HWIntrinsicFlag operator|(HWIntrinsicFlag c1, HWIntrinsicFlag c2)
+{
+    return static_cast<HWIntrinsicFlag>(static_cast<unsigned>(c1) | static_cast<unsigned>(c2));
+}
+
+enum HWIntrinsicCategory : unsigned int
+{
+    // Simple SIMD intrinsics
+    // - take Vector128/256<T> parameters
+    // - return a Vector128/256<T>
+    // - generate a single instruction
+    // - the codegen of overloads can be determined by intrinsicID and base type of returned vector
+    HW_Category_SimpleSIMD,
+
+    // IsSupported Property
+    // - each ISA class has an "IsSupported" property
+    HW_Category_IsSupportedProperty,
+
+    // IMM intrinsics
+    // - some SIMD intrinsics require an immediate value (i.e. imm8) to generate the instruction
+    HW_Category_IMM,
+
+    // Scalar intrinsics
+    // - operate over general purpose registers, like crc32, lzcnt, popcnt, etc.
+    HW_Category_Scalar,
+
+    // Memory access intrinsics
+    // - e.g., Avx.Load, Avx.Store, Sse.LoadAligned
+    HW_Category_MemoryLoad,
+    HW_Category_MemoryStore,
+
+    // Helper intrinsics
+    // - do not directly correspond to an instruction, such as Avx.SetAllVector256
+    HW_Category_Helper,
+
+    // Special intrinsics
+    // - have to be addressed specially
+    HW_Category_Special
+};
+
+#endif // FEATURE_HW_INTRINSICS && defined(_TARGET_XARCH_)
+
 #endif // _NAMEDINTRINSICLIST_H_
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply.cs b/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply.cs
new file mode 100644 (file)
index 0000000..2c7bf21
--- /dev/null
@@ -0,0 +1,108 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
+
+namespace IntelHardwareIntrinsicTest
+{
+    class Program
+    {
+        const int Pass = 100;
+        const int Fail = 0;
+
+        static unsafe int Main(string[] args)
+        {
+            int testResult = Pass;
+
+            if (Avx.IsSupported)
+            {
+                using (TestTable<float, float, float> floatTable = new TestTable<float, float, float>(new float[8] { 1, -5, 100, 0, 1, -5, 100, 0 }, new float[8] { 22, -1, -50, 0, 22, -1, -50, 0 }, new float[8]))
+                using (TestTable<double, double, double> doubleTable = new TestTable<double, double, double>(new double[4] { 1, -5, 100, 0 }, new double[4] { 22, -1, -50, 0 }, new double[4]))
+                {
+                    var vf1 = Unsafe.Read<Vector256<float>>(floatTable.inArray1Ptr);
+                    var vf2 = Unsafe.Read<Vector256<float>>(floatTable.inArray2Ptr);
+                    var vf3 = Avx.Multiply(vf1, vf2);
+                    Unsafe.Write(floatTable.outArrayPtr, vf3);
+
+                    var vd1 = Unsafe.Read<Vector256<double>>(doubleTable.inArray1Ptr);
+                    var vd2 = Unsafe.Read<Vector256<double>>(doubleTable.inArray2Ptr);
+                    var vd3 = Avx.Multiply(vd1, vd2);
+                    Unsafe.Write(doubleTable.outArrayPtr, vd3);
+
+                    if (!floatTable.CheckResult((x, y, z) => x * y == z))
+                    {
+                        Console.WriteLine("AVX Multiply failed on float:");
+                        foreach (var item in floatTable.outArray)
+                        {
+                            Console.Write(item + ", ");
+                        }
+                        Console.WriteLine();
+                        testResult = Fail;
+                    }
+
+                    if (!doubleTable.CheckResult((x, y, z) => x * y == z))
+                    {
+                        Console.WriteLine("AVX Multiply failed on double:");
+                        foreach (var item in doubleTable.outArray)
+                        {
+                            Console.Write(item + ", ");
+                        }
+                        Console.WriteLine();
+                        testResult = Fail;
+                    }
+                }
+            }
+            return testResult;
+        }
+
+        public unsafe struct TestTable<T1, T2, T3> : IDisposable where T1 : struct where T2 : struct where T3 : struct
+        {
+            public T1[] inArray1;
+            public T2[] inArray2;
+            public T3[] outArray;
+
+            public void* inArray1Ptr => inHandle1.AddrOfPinnedObject().ToPointer();
+            public void* inArray2Ptr => inHandle2.AddrOfPinnedObject().ToPointer();
+            public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer();
+
+            GCHandle inHandle1;
+            GCHandle inHandle2;
+            GCHandle outHandle;
+            public TestTable(T1[] a, T2[] b, T3[] c)
+            {
+                this.inArray1 = a;
+                this.inArray2 = b;
+                this.outArray = c;
+
+                inHandle1 = GCHandle.Alloc(inArray1, GCHandleType.Pinned);
+                inHandle2 = GCHandle.Alloc(inArray2, GCHandleType.Pinned);
+                outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned);
+            }
+            public bool CheckResult(Func<T1, T2, T3, bool> check)
+            {
+                for (int i = 0; i < inArray1.Length; i++)
+                {
+                    if (!check(inArray1[i], inArray2[i], outArray[i]))
+                    {
+                        return false;
+                    }
+                }
+                return true;
+            }
+
+            public void Dispose()
+            {
+                inHandle1.Free();
+                inHandle2.Free();
+                outHandle.Free();
+            }
+        }
+
+    }
+}
\ No newline at end of file
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_r.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_r.csproj
new file mode 100644 (file)
index 0000000..7c151fe
--- /dev/null
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{95DFC527-4DC1-495E-97D7-E94EE1F7140D}</ProjectGuid>
+    <OutputType>Exe</OutputType>
+    <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+    <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <!-- Default configurations to help VS understand the configurations -->
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
+  <ItemGroup>
+    <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+      <Visible>False</Visible>
+    </CodeAnalysisDependentAssemblyPaths>
+  </ItemGroup>
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize></Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="Multiply.cs" />
+  </ItemGroup>
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+  <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>
\ No newline at end of file
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_ro.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_ro.csproj
new file mode 100644 (file)
index 0000000..b6fbea2
--- /dev/null
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{95DFC527-4DC1-495E-97D7-E94EE1F7140D}</ProjectGuid>
+    <OutputType>Exe</OutputType>
+    <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+    <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <!-- Default configurations to help VS understand the configurations -->
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
+  <ItemGroup>
+    <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+      <Visible>False</Visible>
+    </CodeAnalysisDependentAssemblyPaths>
+  </ItemGroup>
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize>true</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="Multiply.cs" />
+  </ItemGroup>
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+  <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>
\ No newline at end of file
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply.cs b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply.cs
new file mode 100644 (file)
index 0000000..4f3b6af
--- /dev/null
@@ -0,0 +1,116 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
+
+namespace IntelHardwareIntrinsicTest
+{
+    class Program
+    {
+        const int Pass = 100;
+        const int Fail = 0;
+
+        // Exercises Avx2.Multiply on Vector256<int> and Vector256<uint> operands and
+        // validates the 64-bit elements it writes to the output arrays.
+        //
+        // Returns Pass when every checked element matches, or when Avx2.IsSupported is
+        // false (nothing is run in that case); returns Fail, after printing the whole
+        // output array, at the first mismatch.
+        static unsafe int Main(string[] args)
+        {
+            int testResult = Pass;
+
+            if (Avx2.IsSupported)
+            {
+                using (TestTable<int, int, long> intTable = new TestTable<int, int, long>(new int[8] { 1, -5, 100, 0, 1, -5, 100, 0 }, new int[8] { 22, -1, -50, 0, 22, -1, -50, 0 }, new long[4]))
+                using (TestTable<uint, uint, ulong> uintTable = new TestTable<uint, uint, ulong>(new uint[8] { 1, 5, 100, 0, 1, 5, 100, 0 }, new uint[8] { 22, 1, 50, 0, 22, 1, 50, 0 }, new ulong[4]))
+                {
+                    // Signed operands: load both inputs, multiply, store the result.
+                    var vi1 = Unsafe.Read<Vector256<int>>(intTable.inArray1Ptr);
+                    var vi2 = Unsafe.Read<Vector256<int>>(intTable.inArray2Ptr);
+                    var vi3 = Avx2.Multiply(vi1, vi2);
+                    Unsafe.Write(intTable.outArrayPtr, vi3);
+
+                    // Unsigned operands: same sequence on the uint table.
+                    var vui1 = Unsafe.Read<Vector256<uint>>(uintTable.inArray1Ptr);
+                    var vui2 = Unsafe.Read<Vector256<uint>>(uintTable.inArray2Ptr);
+                    var vui3 = Avx2.Multiply(vui1, vui2);
+                    Unsafe.Write(uintTable.outArrayPtr, vui3);
+
+                    // Output element i is checked against the product of the input
+                    // elements at index i * 2. The operands are widened to 64 bits before
+                    // multiplying so the expected value cannot wrap at 32 bits.
+                    for (int i = 0; i < intTable.outArray.Length; i++)
+                    {
+                        long expected = (long)intTable.inArray1[i * 2] * intTable.inArray2[i * 2];
+                        if (expected != intTable.outArray[i])
+                        {
+                            Console.WriteLine("AVX2 Multiply failed on int:");
+                            foreach (var item in intTable.outArray)
+                            {
+                                Console.Write(item + ", ");
+                            }
+                            Console.WriteLine();
+                            return Fail;
+                        }
+                    }
+
+                    // Same check for the unsigned case, widened to ulong.
+                    for (int i = 0; i < uintTable.outArray.Length; i++)
+                    {
+                        ulong expected = (ulong)uintTable.inArray1[i * 2] * uintTable.inArray2[i * 2];
+                        if (expected != uintTable.outArray[i])
+                        {
+                            Console.WriteLine("AVX2 Multiply failed on uint:");
+                            foreach (var item in uintTable.outArray)
+                            {
+                                Console.Write(item + ", ");
+                            }
+                            Console.WriteLine();
+                            return Fail;
+                        }
+                    }
+                }
+            }
+
+            return testResult;
+        }
+
+        // Test fixture holding two input arrays and one output array. All three arrays
+        // are pinned with GCHandleType.Pinned for the lifetime of the table so that the
+        // *Ptr properties can expose the addresses of their data as raw pointers.
+        // Dispose releases the pinning handles.
+        public unsafe struct TestTable<T1, T2, T3> : IDisposable where T1 : struct where T2 : struct where T3 : struct
+        {
+            public T1[] inArray1;
+            public T2[] inArray2;
+            public T3[] outArray;
+
+            // Addresses reported by the pinning handles; they require the handles to
+            // still be allocated, i.e. they must not be read after Dispose.
+            public void* inArray1Ptr => inHandle1.AddrOfPinnedObject().ToPointer();
+            public void* inArray2Ptr => inHandle2.AddrOfPinnedObject().ToPointer();
+            public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer();
+
+            GCHandle inHandle1;
+            GCHandle inHandle2;
+            GCHandle outHandle;
+
+            // Stores the three arrays (no copy is made) and pins each of them.
+            public TestTable(T1[] a, T2[] b, T3[] c)
+            {
+                this.inArray1 = a;
+                this.inArray2 = b;
+                this.outArray = c;
+
+                inHandle1 = GCHandle.Alloc(inArray1, GCHandleType.Pinned);
+                inHandle2 = GCHandle.Alloc(inArray2, GCHandleType.Pinned);
+                outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned);
+            }
+
+            // Invokes check(inArray1[i], inArray2[i], outArray[i]) for every index i of
+            // inArray1 and returns false at the first element for which check fails,
+            // true otherwise. All three arrays are indexed with the same i, so inArray2
+            // and outArray must be at least as long as inArray1.
+            public bool CheckResult(Func<T1, T2, T3, bool> check)
+            {
+                for (int i = 0; i < inArray1.Length; i++)
+                {
+                    if (!check(inArray1[i], inArray2[i], outArray[i]))
+                    {
+                        return false;
+                    }
+                }
+                return true;
+            }
+
+            // Releases every pinning handle that is still allocated. GCHandle.Free throws
+            // InvalidOperationException for a handle that was never allocated or has
+            // already been freed, so each handle is checked first; this keeps Dispose
+            // safe on a default-initialized table and when called again on the same
+            // instance.
+            public void Dispose()
+            {
+                if (inHandle1.IsAllocated)
+                {
+                    inHandle1.Free();
+                }
+                if (inHandle2.IsAllocated)
+                {
+                    inHandle2.Free();
+                }
+                if (outHandle.IsAllocated)
+                {
+                    outHandle.Free();
+                }
+            }
+        }
+
+    }
+}
\ No newline at end of file
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_r.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_r.csproj
new file mode 100644 (file)
index 0000000..7c151fe
--- /dev/null
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{95DFC527-4DC1-495E-97D7-E94EE1F7140D}</ProjectGuid>
+    <OutputType>Exe</OutputType>
+    <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+    <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <!-- Default configurations to help VS understand the configurations -->
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
+  <ItemGroup>
+    <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+      <Visible>False</Visible>
+    </CodeAnalysisDependentAssemblyPaths>
+  </ItemGroup>
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize></Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="Multiply.cs" />
+  </ItemGroup>
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+  <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>
\ No newline at end of file
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_ro.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_ro.csproj
new file mode 100644 (file)
index 0000000..b6fbea2
--- /dev/null
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{95DFC527-4DC1-495E-97D7-E94EE1F7140D}</ProjectGuid>
+    <OutputType>Exe</OutputType>
+    <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+    <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <!-- Default configurations to help VS understand the configurations -->
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
+  <ItemGroup>
+    <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+      <Visible>False</Visible>
+    </CodeAnalysisDependentAssemblyPaths>
+  </ItemGroup>
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize>true</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="Multiply.cs" />
+  </ItemGroup>
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+  <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>
\ No newline at end of file
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply.cs b/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply.cs
new file mode 100644 (file)
index 0000000..09feca9
--- /dev/null
@@ -0,0 +1,95 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
+
+namespace IntelHardwareIntrinsicTest
+{
+    class Program
+    {
+        const int Pass = 100;
+        const int Fail = 0;
+
+        // Exercises Sse41.Multiply on Vector128<int> operands and validates the 64-bit
+        // elements it writes to the output array.
+        //
+        // Returns Pass when every checked element matches, or when Sse41.IsSupported is
+        // false (nothing is run in that case); returns Fail, after printing the whole
+        // output array, at the first mismatch.
+        static unsafe int Main(string[] args)
+        {
+            int testResult = Pass;
+
+            if (Sse41.IsSupported)
+            {
+                using (TestTable<int, int, long> intTable = new TestTable<int, int, long>(new int[4] { 1, -5, 100, 0}, new int[4] { 22, -1, -50, 0}, new long[2]))
+                {
+                    // Load both inputs, multiply, store the result.
+                    var vi1 = Unsafe.Read<Vector128<int>>(intTable.inArray1Ptr);
+                    var vi2 = Unsafe.Read<Vector128<int>>(intTable.inArray2Ptr);
+                    var vi3 = Sse41.Multiply(vi1, vi2);
+                    Unsafe.Write(intTable.outArrayPtr, vi3);
+
+                    // Output element i is checked against the product of the input
+                    // elements at index i * 2. The operands are widened to 64 bits before
+                    // multiplying so the expected value cannot wrap at 32 bits.
+                    for (int i = 0; i < intTable.outArray.Length; i++)
+                    {
+                        long expected = (long)intTable.inArray1[i * 2] * intTable.inArray2[i * 2];
+                        if (expected != intTable.outArray[i])
+                        {
+                            Console.WriteLine("SSE4.1 Multiply failed on int:");
+                            foreach (var item in intTable.outArray)
+                            {
+                                Console.Write(item + ", ");
+                            }
+                            Console.WriteLine();
+                            return Fail;
+                        }
+                    }
+                }
+            }
+            return testResult;
+        }
+
+        // Test fixture holding two input arrays and one output array. All three arrays
+        // are pinned with GCHandleType.Pinned for the lifetime of the table so that the
+        // *Ptr properties can expose the addresses of their data as raw pointers.
+        // Dispose releases the pinning handles.
+        public unsafe struct TestTable<T1, T2, T3> : IDisposable where T1 : struct where T2 : struct where T3 : struct
+        {
+            public T1[] inArray1;
+            public T2[] inArray2;
+            public T3[] outArray;
+
+            // Addresses reported by the pinning handles; they require the handles to
+            // still be allocated, i.e. they must not be read after Dispose.
+            public void* inArray1Ptr => inHandle1.AddrOfPinnedObject().ToPointer();
+            public void* inArray2Ptr => inHandle2.AddrOfPinnedObject().ToPointer();
+            public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer();
+
+            GCHandle inHandle1;
+            GCHandle inHandle2;
+            GCHandle outHandle;
+
+            // Stores the three arrays (no copy is made) and pins each of them.
+            public TestTable(T1[] a, T2[] b, T3[] c)
+            {
+                this.inArray1 = a;
+                this.inArray2 = b;
+                this.outArray = c;
+
+                inHandle1 = GCHandle.Alloc(inArray1, GCHandleType.Pinned);
+                inHandle2 = GCHandle.Alloc(inArray2, GCHandleType.Pinned);
+                outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned);
+            }
+
+            // Invokes check(inArray1[i], inArray2[i], outArray[i]) for every index i of
+            // inArray1 and returns false at the first element for which check fails,
+            // true otherwise. All three arrays are indexed with the same i, so inArray2
+            // and outArray must be at least as long as inArray1.
+            public bool CheckResult(Func<T1, T2, T3, bool> check)
+            {
+                for (int i = 0; i < inArray1.Length; i++)
+                {
+                    if (!check(inArray1[i], inArray2[i], outArray[i]))
+                    {
+                        return false;
+                    }
+                }
+                return true;
+            }
+
+            // Releases every pinning handle that is still allocated. GCHandle.Free throws
+            // InvalidOperationException for a handle that was never allocated or has
+            // already been freed, so each handle is checked first; this keeps Dispose
+            // safe on a default-initialized table and when called again on the same
+            // instance.
+            public void Dispose()
+            {
+                if (inHandle1.IsAllocated)
+                {
+                    inHandle1.Free();
+                }
+                if (inHandle2.IsAllocated)
+                {
+                    inHandle2.Free();
+                }
+                if (outHandle.IsAllocated)
+                {
+                    outHandle.Free();
+                }
+            }
+        }
+
+    }
+}
\ No newline at end of file
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_r.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_r.csproj
new file mode 100644 (file)
index 0000000..7c151fe
--- /dev/null
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{95DFC527-4DC1-495E-97D7-E94EE1F7140D}</ProjectGuid>
+    <OutputType>Exe</OutputType>
+    <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+    <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <!-- Default configurations to help VS understand the configurations -->
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
+  <ItemGroup>
+    <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+      <Visible>False</Visible>
+    </CodeAnalysisDependentAssemblyPaths>
+  </ItemGroup>
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize></Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="Multiply.cs" />
+  </ItemGroup>
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+  <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>
\ No newline at end of file
diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_ro.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_ro.csproj
new file mode 100644 (file)
index 0000000..b6fbea2
--- /dev/null
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{95DFC527-4DC1-495E-97D7-E94EE1F7140D}</ProjectGuid>
+    <OutputType>Exe</OutputType>
+    <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+    <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <!-- Default configurations to help VS understand the configurations -->
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
+  <ItemGroup>
+    <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+      <Visible>False</Visible>
+    </CodeAnalysisDependentAssemblyPaths>
+  </ItemGroup>
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize>true</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="Multiply.cs" />
+  </ItemGroup>
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+  <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>
\ No newline at end of file