NamedIntrinsic lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method);
#if FEATURE_HW_INTRINSICS
- InstructionSet lookupHWIntrinsicISA(const char* className);
- NamedIntrinsic lookupHWIntrinsic(const char* methodName, InstructionSet isa);
- InstructionSet isaOfHWIntrinsic(NamedIntrinsic intrinsic);
- bool isIntrinsicAnIsSupportedPropertyGetter(NamedIntrinsic intrinsic);
- bool isFullyImplmentedISAClass(InstructionSet isa);
+ static InstructionSet lookupHWIntrinsicISA(const char* className);
+ static NamedIntrinsic lookupHWIntrinsic(const char* methodName, InstructionSet isa);
+ static InstructionSet isaOfHWIntrinsic(NamedIntrinsic intrinsic);
+ static bool isIntrinsicAnIsSupportedPropertyGetter(NamedIntrinsic intrinsic);
+ static bool isFullyImplmentedISAClass(InstructionSet isa);
#ifdef _TARGET_XARCH_
GenTree* impUnsupportedHWIntrinsic(unsigned helper,
CORINFO_METHOD_HANDLE method,
bool compSupportsHWIntrinsic(InstructionSet isa);
bool isScalarISA(InstructionSet isa);
static int ivalOfHWIntrinsic(NamedIntrinsic intrinsic);
+ static int numArgsOfHWIntrinsic(NamedIntrinsic intrinsic);
static instruction insOfHWIntrinsic(NamedIntrinsic intrinsic, var_types type);
+ static HWIntrinsicCategory categoryOfHWIntrinsic(NamedIntrinsic intrinsic);
+ static HWIntrinsicFlag flagOfHWIntrinsic(NamedIntrinsic intrinsic);
+ GenTree* getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass);
+ GenTreeArgList* buildArgList(CORINFO_SIG_INFO* sig);
#endif // _TARGET_XARCH_
#endif // FEATURE_HW_INTRINSICS
GenTreePtr impArrayAccessIntrinsic(CORINFO_CLASS_HANDLE clsHnd,
struct
{
regNumber _idReg3 : REGNUM_BITS;
+ regNumber _idReg4 : REGNUM_BITS;
};
#endif // defined(_TARGET_XARCH_)
idAddr()->_idReg3 = reg;
assert(reg == idAddr()->_idReg3);
}
// Getter for the fourth register operand of this instruction descriptor.
// The value is read through idAddr(), so the descriptor must be neither a
// "tiny" nor a "small" descriptor; both conditions are asserted.
+ regNumber idReg4() const
+ {
+ assert(!idIsTiny());
+ assert(!idIsSmallDsc());
+ return idAddr()->_idReg4;
+ }
// Setter for the fourth register operand of this instruction descriptor.
// Asserts that the descriptor is neither "tiny" nor "small", stores 'reg'
// through idAddr(), and then asserts that the stored value reads back equal
// to 'reg' (the field is a REGNUM_BITS-wide bitfield).
+ void idReg4(regNumber reg)
+ {
+ assert(!idIsTiny());
+ assert(!idIsSmallDsc());
+ idAddr()->_idReg4 = reg;
+ assert(reg == idAddr()->_idReg4);
+ }
#endif // defined(_TARGET_XARCH_)
#ifdef _TARGET_ARMARCH_
insOpts idInsOpt() const
IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg , read reg2 , read reg3
IF_DEF(RWR_RRD_RRD_CNS, IS_R1_WR|IS_R2_RD|IS_R3_RD, SCNS) // write reg , read reg2 , read reg3, const
+
+IF_DEF(RWR_RRD_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD|IS_R4_RD, NONE) // write reg , read reg2 , read reg3 , read reg4
//----------------------------------------------------------------------------
// The following formats are used for direct addresses (e.g. static data members)
//----------------------------------------------------------------------------
case INS_pminub:
case INS_pminud:
case INS_pminuw:
+ case INS_pmuldq:
case INS_pmulld:
case INS_pmullw:
case INS_pmuludq:
emitCurIGsize += sz;
}
+// Returns true when 'ins' is one of the AVX/AVX2 variable-blend instructions:
+// vblendvps, vblendvpd or vpblendvb.
+static bool isAvxBlendv(instruction ins)
+{
+    switch (ins)
+    {
+        case INS_vblendvps:
+        case INS_vblendvpd:
+        case INS_vpblendvb:
+            return true;
+
+        default:
+            return false;
+    }
+}
+
+// Returns true when 'ins' is one of the SSE4.1 variable-blend instructions:
+// blendvps, blendvpd or pblendvb.
+static bool isSse41Blendv(instruction ins)
+{
+    switch (ins)
+    {
+        case INS_blendvps:
+        case INS_blendvpd:
+        case INS_pblendvb:
+            return true;
+
+        default:
+            return false;
+    }
+}
+
//------------------------------------------------------------------------
// emitIns_R_R_R_R: Add a four-register AVX blendv instruction.
//
// Arguments:
//    ins       - the instruction; must satisfy isAvxBlendv (asserted)
//    attr      - emit attribute passed to emitNewInstrCns
//    targetReg - recorded as idReg1 of the descriptor
//    reg1      - recorded as idReg2
//    reg2      - recorded as idReg3
//    reg3      - recorded as idReg4, and also encoded as (reg3 - XMMBASE) << 4
//                into the descriptor's constant
//
// Notes:
//    VEX encoding must be in use (asserted). The descriptor uses the
//    IF_RWR_RRD_RRD_RRD format and a fixed code size of 6 bytes, which is
//    added to emitCurIGsize.
//
+void emitter::emitIns_R_R_R_R(
+ instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, regNumber reg3)
+{
+ assert(isAvxBlendv(ins));
+ assert(UseVEXEncoding());
+ // Currently vex prefix only use three bytes mode.
+ // size = vex + opcode + ModR/M + 1-byte-cns(Reg) = 3 + 1 + 1 + 1 = 6
+ // TODO-XArch-CQ: We should create function which can calculate all kinds of AVX instructions size in future
+ UNATIVE_OFFSET sz = 6;
+
+ // AVX/AVX2 supports 4-reg format for vblendvps/vblendvpd/vpblendvb,
+ // which encodes the fourth register into imm8[7:4]
+ int ival = (reg3 - XMMBASE) << 4; // convert reg3 to ival
+
+ instrDesc* id = emitNewInstrCns(attr, ival);
+ id->idIns(ins);
+ id->idInsFmt(IF_RWR_RRD_RRD_RRD);
+ id->idReg1(targetReg);
+ id->idReg2(reg1);
+ id->idReg3(reg2);
+ id->idReg4(reg3);
+
+ id->idCodeSize(sz);
+ dispIns(id);
+ emitCurIGsize += sz;
+}
+
/*****************************************************************************
*
* Add an instruction with a register + static member operands.
}
#if FEATURE_HW_INTRINSICS
-void emitter::emitIns_SIMD_R_R(instruction ins, regNumber reg, regNumber reg1, var_types simdtype)
-{
- emitIns_R_R(ins, emitTypeSize(simdtype), reg, reg1);
-}
-
-void emitter::emitIns_SIMD_R_R_A(
- instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, var_types simdtype)
+//------------------------------------------------------------------------
+// emitIns_SIMD_R_R_A: Emit "reg = ins(reg1, [indir])" for a SIMD instruction.
+//
+// Arguments:
+//    ins   - the instruction to emit
+//    attr  - emit attribute used for every instruction emitted here
+//    reg   - destination register
+//    reg1  - first source register
+//    indir - indirection supplying the memory operand
+//
+// Notes:
+//    When UseVEXEncoding() is true the three-operand form is emitted directly.
+//    Otherwise reg1 is first copied into reg with movaps (if they differ) and
+//    the two-operand read-modify-write form is emitted on reg.
+//    The final emitIns_R_A call must use 'attr': this signature no longer has
+//    a 'simdtype' parameter.
+//
+void emitter::emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir)
{
if (UseVEXEncoding())
{
- emitIns_R_R_A(ins, emitTypeSize(simdtype), reg, reg1, indir, IF_RWR_RRD_ARD);
+ emitIns_R_R_A(ins, attr, reg, reg1, indir, IF_RWR_RRD_ARD);
}
else
{
if (reg1 != reg)
{
- emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
}
- emitIns_R_A(ins, emitTypeSize(simdtype), reg, indir, IF_RRW_ARD);
+ emitIns_R_A(ins, attr, reg, indir, IF_RRW_ARD);
}
}
//------------------------------------------------------------------------
// emitIns_SIMD_R_R_C: Emit "reg = ins(reg1, [fldHnd + offs])" for a SIMD
//    instruction whose second source is a static field.
//
// Notes:
//    When UseVEXEncoding() is true the three-operand form is emitted directly.
//    Otherwise reg1 is copied into reg with movaps (if they differ) and the
//    two-operand form is emitted on reg. 'attr' is passed to every emit call.
//
void emitter::emitIns_SIMD_R_R_C(
- instruction ins, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, var_types simdtype)
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs)
{
if (UseVEXEncoding())
{
- emitIns_R_R_C(ins, emitTypeSize(simdtype), reg, reg1, fldHnd, offs);
+ emitIns_R_R_C(ins, attr, reg, reg1, fldHnd, offs);
}
else
{
if (reg1 != reg)
{
- emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
}
- emitIns_R_C(ins, emitTypeSize(simdtype), reg, fldHnd, offs);
+ emitIns_R_C(ins, attr, reg, fldHnd, offs);
}
}
-void emitter::emitIns_SIMD_R_R_R(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, var_types simdtype)
//------------------------------------------------------------------------
// emitIns_SIMD_R_R_R: Emit "reg = ins(reg1, reg2)" for a SIMD instruction.
//
// Notes:
//    When UseVEXEncoding() is true the three-operand form is emitted directly.
//    Otherwise reg1 is copied into reg with movaps (if they differ) and the
//    two-operand form "ins reg, reg2" is emitted. 'attr' is passed to every
//    emit call.
//
+void emitter::emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2)
{
if (UseVEXEncoding())
{
- emitIns_R_R_R(ins, emitTypeSize(simdtype), reg, reg1, reg2);
+ emitIns_R_R_R(ins, attr, reg, reg1, reg2);
}
else
{
if (reg1 != reg)
{
- emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
}
- emitIns_R_R(ins, emitTypeSize(simdtype), reg, reg2);
+ emitIns_R_R(ins, attr, reg, reg2);
}
}
-void emitter::emitIns_SIMD_R_R_S(instruction ins, regNumber reg, regNumber reg1, int varx, int offs, var_types simdtype)
//------------------------------------------------------------------------
// emitIns_SIMD_R_R_R_R: Emit a variable-blend instruction with three register
//    sources: "reg = ins(reg1, reg2, reg3)".
//
// Arguments:
//    ins  - an AVX or SSE4.1 blendv instruction (asserted)
//    attr - emit attribute passed to every emit call
//    reg  - destination register
//    reg1 - first source register
//    reg2 - second source register
//    reg3 - third source register (the mask)
//
// Notes:
//    With VEX encoding, an SSE4.1 opcode is mapped to its AVX counterpart and
//    the four-register form is emitted. Without VEX, the mask is moved into
//    XMM0, reg1 is copied into reg (if they differ), and "ins reg, reg2" is
//    emitted.
//
+void emitter::emitIns_SIMD_R_R_R_R(
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3)
{
+ assert(isAvxBlendv(ins) || isSse41Blendv(ins));
if (UseVEXEncoding())
{
- emitIns_R_R_S(ins, emitTypeSize(simdtype), reg, reg1, varx, offs);
+ // convert SSE encoding of SSE4.1 instructions to VEX encoding
+ switch (ins)
+ {
+ case INS_blendvps:
+ ins = INS_vblendvps;
+ break;
+ case INS_blendvpd:
+ ins = INS_vblendvpd;
+ break;
+ case INS_pblendvb:
+ ins = INS_vpblendvb;
+ break;
+ default:
+ break;
+ }
+ emitIns_R_R_R_R(ins, attr, reg, reg1, reg2, reg3);
}
else
{
+ assert(isSse41Blendv(ins));
+ // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
// NOTE(review): the move into XMM0 happens before reg1/reg2 are read and
// before reg is written, so this sequence presumably relies on the caller
// never placing reg, reg1 or reg2 in XMM0 when reg3 is not XMM0 - confirm
// against the register allocator.
+ if (reg3 != REG_XMM0)
+ {
+ emitIns_R_R(INS_movaps, attr, REG_XMM0, reg3);
+ }
if (reg1 != reg)
{
- emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
+ }
+ emitIns_R_R(ins, attr, reg, reg2);
+ }
+}
+
//------------------------------------------------------------------------
// emitIns_SIMD_R_R_S: Emit "reg = ins(reg1, [varx + offs])" for a SIMD
//    instruction whose second source is a stack location.
//
// Notes:
//    When UseVEXEncoding() is true the three-operand form is emitted directly.
//    Otherwise reg1 is copied into reg with movaps (if they differ) and the
//    two-operand form is emitted on reg. 'attr' is passed to every emit call.
//
+void emitter::emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs)
+{
+ if (UseVEXEncoding())
+ {
+ emitIns_R_R_S(ins, attr, reg, reg1, varx, offs);
+ }
+ else
+ {
+ if (reg1 != reg)
+ {
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
}
- emitIns_R_S(ins, emitTypeSize(simdtype), reg, varx, offs);
+ emitIns_R_S(ins, attr, reg, varx, offs);
}
}
val = emitGetInsSC(id);
goto PRINT_CONSTANT;
break;
+ case IF_RWR_RRD_RRD_RRD:
+ assert(IsAVXOnlyInstruction(ins));
+ assert(UseVEXEncoding());
+ printf("%s, ", emitRegName(id->idReg1(), attr));
+ printf("%s, ", emitRegName(id->idReg2(), attr));
+ printf("%s, ", emitRegName(id->idReg3(), attr));
+ printf("%s", emitRegName(id->idReg4(), attr));
+ break;
case IF_RRW_RRW_CNS:
printf("%s,", emitRegName(id->idReg1(), attr));
printf(" %s", emitRegName(id->idReg2(), attr));
instruction ins = id->idIns();
assert(IsAVXInstruction(ins));
- assert(IsThreeOperandAVXInstruction(ins));
+ assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins));
regNumber targetReg = id->idReg1();
regNumber src1 = id->idReg2();
regNumber src2 = id->idReg3();
sz = emitSizeOfInsDsc(id);
break;
case IF_RWR_RRD_RRD_CNS:
+ case IF_RWR_RRD_RRD_RRD:
dst = emitOutputRRR(dst, id);
sz = emitSizeOfInsDsc(id);
dst += emitOutputByte(dst, emitGetInsSC(id));
void emitIns_R_R_S_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, int ival);
+void emitIns_R_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3, regNumber reg4);
+
void emitIns_S(instruction ins, emitAttr attr, int varx, int offs);
void emitIns_S_R(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs);
void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp);
#if FEATURE_HW_INTRINSICS
-void emitIns_SIMD_R_R(instruction ins, regNumber reg, regNumber reg1, var_types simdtype);
-void emitIns_SIMD_R_R_A(instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, var_types simdtype);
void emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, regNumber reg1, regNumber base, var_types simdtype);
-void emitIns_SIMD_R_R_C(
- instruction ins, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, var_types simdtype);
-void emitIns_SIMD_R_R_R(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, var_types simdtype);
-void emitIns_SIMD_R_R_S(instruction ins, regNumber reg, regNumber reg1, int varx, int offs, var_types simdtype);
void emitIns_SIMD_R_R_A_I(
instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival, var_types simdtype);
void emitIns_SIMD_R_R_C_I(instruction ins,
void emitIns_SIMD_R_R_R_I(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, int ival, var_types simdtype);
void emitIns_SIMD_R_R_S_I(
instruction ins, regNumber reg, regNumber reg1, int varx, int offs, int ival, var_types simdtype);
+void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir);
+void emitIns_SIMD_R_R_C(
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs);
+void emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs);
+void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2);
+void emitIns_SIMD_R_R_R_R(
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3);
#endif
#if FEATURE_STACK_FP_X87
}
// Creates a GT_HWIntrinsic node with a single operand.
// op1 is first passed to SetOpLclRelatedToSIMDIntrinsic; the node is then
// allocated from this compiler instance with the given type, intrinsic id,
// base type and SIMD size.
GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(
- var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned size)
+ var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned simdSize)
{
SetOpLclRelatedToSIMDIntrinsic(op1);
- return new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(type, op1, hwIntrinsicID, baseType, size);
+ return new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(type, op1, hwIntrinsicID, baseType, simdSize);
}
// Creates a GT_HWIntrinsic node with two operands.
// op1 and op2 are each passed to SetOpLclRelatedToSIMDIntrinsic; the node is
// then allocated from this compiler instance with the given type, intrinsic
// id, base type and SIMD size.
GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(
- var_types type, GenTree* op1, GenTree* op2, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned size)
+ var_types type, GenTree* op1, GenTree* op2, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned simdSize)
{
SetOpLclRelatedToSIMDIntrinsic(op1);
SetOpLclRelatedToSIMDIntrinsic(op2);
- return new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(type, op1, op2, hwIntrinsicID, baseType, size);
+ return new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(type, op1, op2, hwIntrinsicID, baseType, simdSize);
}
GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types type,
#include "gcinfo.h"
#include "gcinfoencoder.h"
+//------------------------------------------------------------------------
+// genIsTableDrivenHWIntrinsic: Decide whether an intrinsic category is handled
+//    by the table-driven path of genHWIntrinsic.
+//
+// Arguments:
+//    category - category of a HW intrinsic
+//
+// Return Value:
+//    true only when the category is HW_Category_SimpleSIMD;
+//    HW_Category_Special is always rejected.
+//
+static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category)
+{
+    // TODO - move more categories into the table-driven framework
+    if (category == HW_Category_Special)
+    {
+        return false;
+    }
+
+    return category == HW_Category_SimpleSIMD;
+}
+
void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
- NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
- InstructionSet isa = compiler->isaOfHWIntrinsic(intrinsicID);
+ NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
+ InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID);
+ HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID);
+ HWIntrinsicFlag flag = Compiler::flagOfHWIntrinsic(intrinsicID);
+ int numArgs = Compiler::numArgsOfHWIntrinsic(intrinsicID);
+
+ assert((flag & HW_Flag_NoCodeGen) == 0);
+
+ if (genIsTableDrivenHWIntrinsic(category))
+ {
+ GenTree* op1 = node->gtGetOp1();
+ GenTree* op2 = node->gtGetOp2();
+ regNumber targetReg = node->gtRegNum;
+ var_types targetType = node->TypeGet();
+ var_types baseType = node->gtSIMDBaseType;
+
+ regNumber op1Reg = REG_NA;
+ regNumber op2Reg = REG_NA;
+ emitter* emit = getEmitter();
+
+ assert(numArgs >= 0);
+ instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+ assert(ins != INS_invalid);
+ emitAttr simdSize = (emitAttr)(node->gtSIMDSize);
+ assert(simdSize != 0);
+
+ switch (numArgs)
+ {
+ case 1:
+ genConsumeOperands(node);
+ op1Reg = op1->gtRegNum;
+ emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
+ break;
+ case 2:
+ genHWIntrinsic_R_R_RM(node, ins);
+ break;
+ case 3:
+ {
+ assert(op1->OperIsList());
+ assert(op1->gtGetOp2()->OperIsList());
+ assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
+
+ GenTreeArgList* argList = op1->AsArgList();
+ op1 = argList->Current();
+ genConsumeRegs(op1);
+ op1Reg = op1->gtRegNum;
+
+ argList = argList->Rest();
+ op2 = argList->Current();
+ genConsumeRegs(op2);
+ op2Reg = op2->gtRegNum;
+
+ argList = argList->Rest();
+ GenTree* op3 = argList->Current();
+ genConsumeRegs(op3);
+ regNumber op3Reg = op3->gtRegNum;
+
+ emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg);
+ break;
+ }
+
+ default:
+ unreached();
+ break;
+ }
+ genProduceReg(node);
+ return;
+ }
+
switch (isa)
{
case InstructionSet_SSE:
regNumber targetReg = node->gtRegNum;
GenTree* op1 = node->gtGetOp1();
GenTree* op2 = node->gtGetOp2();
+ emitAttr simdSize = (emitAttr)(node->gtSIMDSize);
emitter* emit = getEmitter();
// TODO-XArch-CQ: Commutative operations can have op1 be contained
case GT_CLS_VAR_ADDR:
{
- emit->emitIns_SIMD_R_R_C(ins, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0, targetType);
+ emit->emitIns_SIMD_R_R_C(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
return;
}
default:
{
- emit->emitIns_SIMD_R_R_A(ins, targetReg, op1Reg, memIndir, targetType);
+ emit->emitIns_SIMD_R_R_A(ins, simdSize, targetReg, op1Reg, memIndir);
return;
}
}
assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
assert(offset != (unsigned)-1);
- emit->emitIns_SIMD_R_R_S(ins, targetReg, op1Reg, varNum, offset, targetType);
+ emit->emitIns_SIMD_R_R_S(ins, simdSize, targetReg, op1Reg, varNum, offset);
}
else
{
- emit->emitIns_SIMD_R_R_R(ins, targetReg, op1Reg, op2->gtRegNum, targetType);
+ emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum);
}
}
switch (intrinsicID)
{
- case NI_SSE2_Add:
- {
- assert(node->TypeGet() == TYP_SIMD16);
-
- switch (baseType)
- {
- case TYP_DOUBLE:
- ins = INS_addpd;
- break;
- case TYP_INT:
- case TYP_UINT:
- ins = INS_paddd;
- break;
- case TYP_LONG:
- case TYP_ULONG:
- ins = INS_paddq;
- break;
- case TYP_BYTE:
- case TYP_UBYTE:
- ins = INS_paddb;
- break;
- case TYP_SHORT:
- case TYP_USHORT:
- ins = INS_paddw;
- break;
- default:
- unreached();
- break;
- }
-
- genHWIntrinsic_R_R_RM(node, ins);
- break;
- }
-
default:
unreached();
break;
switch (intrinsicID)
{
- case NI_AVX_Add:
- {
- assert(node->TypeGet() == TYP_SIMD32);
-
- switch (baseType)
- {
- case TYP_DOUBLE:
- ins = INS_addpd;
- break;
- case TYP_FLOAT:
- ins = INS_addps;
- break;
- default:
- unreached();
- break;
- }
-
- genHWIntrinsic_R_R_RM(node, ins);
- break;
- }
-
default:
unreached();
break;
switch (intrinsicID)
{
- case NI_AVX2_Add:
- {
- assert(node->TypeGet() == TYP_SIMD32);
-
- switch (baseType)
- {
- case TYP_INT:
- case TYP_UINT:
- ins = INS_paddd;
- break;
- case TYP_LONG:
- case TYP_ULONG:
- ins = INS_paddq;
- break;
- case TYP_BYTE:
- case TYP_UBYTE:
- ins = INS_paddb;
- break;
- case TYP_SHORT:
- case TYP_USHORT:
- ins = INS_paddw;
- break;
- default:
- unreached();
- break;
- }
-
- genHWIntrinsic_R_R_RM(node, ins);
- break;
- }
-
default:
unreached();
break;
// clang-format off
#if FEATURE_HW_INTRINSICS
-// Intrinsic ID Function name ISA
-// SSE Intrinsics
-HARDWARE_INTRINSIC(SSE_IsSupported, "get_IsSupported", SSE)
-HARDWARE_INTRINSIC(SSE_Add, "Add", SSE)
-HARDWARE_INTRINSIC(SSE_AddScalar, "AddScalar", SSE)
-HARDWARE_INTRINSIC(SSE_And, "And", SSE)
-HARDWARE_INTRINSIC(SSE_AndNot, "AndNot", SSE)
-HARDWARE_INTRINSIC(SSE_CompareEqual, "CompareEqual", SSE)
-HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar, "CompareEqualOrderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareEqualScalar, "CompareEqualScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar, "CompareEqualUnorderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThan, "CompareGreaterThan", SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar, "CompareGreaterThanOrderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar, "CompareGreaterThanScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar, "CompareGreaterThanUnorderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqual, "CompareGreaterThanOrEqual", SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar, "CompareGreaterThanOrEqualOrderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar, "CompareGreaterThanOrEqualScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar, "CompareGreaterThanOrEqualUnorderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThan, "CompareLessThan", SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar, "CompareLessThanOrderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanScalar, "CompareLessThanScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar, "CompareLessThanUnorderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqual, "CompareLessThanOrEqual", SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar, "CompareLessThanOrEqualOrderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar, "CompareLessThanOrEqualScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar, "CompareLessThanOrEqualUnorderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotEqual, "CompareNotEqual", SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar, "CompareNotEqualOrderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar, "CompareNotEqualScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar, "CompareNotEqualUnorderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotGreaterThan, "CompareNotGreaterThan", SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar, "CompareNotGreaterThanScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqual, "CompareNotGreaterThanOrEqual", SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar, "CompareNotGreaterThanOrEqualScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotLessThan, "CompareNotLessThan", SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar, "CompareNotLessThanScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqual, "CompareNotLessThanOrEqual", SSE)
-HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar, "CompareNotLessThanOrEqualScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareOrdered, "CompareOrdered", SSE)
-HARDWARE_INTRINSIC(SSE_CompareOrderedScalar, "CompareOrderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_CompareUnordered, "CompareUnordered", SSE)
-HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar, "CompareUnorderedScalar", SSE)
-HARDWARE_INTRINSIC(SSE_ConvertToInt32, "ConvertToInt32", SSE)
-HARDWARE_INTRINSIC(SSE_ConvertToInt64, "ConvertToInt64", SSE)
-HARDWARE_INTRINSIC(SSE_ConvertToSingle, "ConvertToSingle", SSE)
-HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar, "ConvertToVector128SingleScalar", SSE)
-HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation, "ConvertToInt32WithTruncation", SSE)
-HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation, "ConvertToInt64WithTruncation", SSE)
-HARDWARE_INTRINSIC(SSE_Divide, "Divide", SSE)
-HARDWARE_INTRINSIC(SSE_DivideScalar, "DivideScalar", SSE)
-HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE)
-HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE)
-HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE)
-HARDWARE_INTRINSIC(SSE_LoadScalar, "LoadScalar", SSE)
-HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE)
-HARDWARE_INTRINSIC(SSE_Max, "Max", SSE)
-HARDWARE_INTRINSIC(SSE_MaxScalar, "MaxScalar", SSE)
-HARDWARE_INTRINSIC(SSE_Min, "Min", SSE)
-HARDWARE_INTRINSIC(SSE_MinScalar, "MinScalar", SSE)
-HARDWARE_INTRINSIC(SSE_MoveHighToLow, "MoveHighToLow", SSE)
-HARDWARE_INTRINSIC(SSE_MoveLowToHigh, "MoveLowToHigh", SSE)
-HARDWARE_INTRINSIC(SSE_MoveMask, "MoveMask", SSE)
-HARDWARE_INTRINSIC(SSE_MoveScalar, "MoveScalar", SSE)
-HARDWARE_INTRINSIC(SSE_Multiply, "Multiply", SSE)
-HARDWARE_INTRINSIC(SSE_MultiplyScalar, "MultiplyScalar", SSE)
-HARDWARE_INTRINSIC(SSE_Or, "Or", SSE)
-HARDWARE_INTRINSIC(SSE_Reciprocal, "Reciprocal", SSE)
-HARDWARE_INTRINSIC(SSE_ReciprocalScalar, "ReciprocalScalar", SSE)
-HARDWARE_INTRINSIC(SSE_ReciprocalSqrt, "ReciprocalSqrt", SSE)
-HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar, "ReciprocalSqrtScalar", SSE)
-HARDWARE_INTRINSIC(SSE_SetAllVector128, "SetAllVector128", SSE)
-HARDWARE_INTRINSIC(SSE_SetScalar, "SetScalar", SSE)
-HARDWARE_INTRINSIC(SSE_SetVector128, "SetVector128", SSE)
-HARDWARE_INTRINSIC(SSE_SetZeroVector128, "SetZeroVector128", SSE)
-HARDWARE_INTRINSIC(SSE_Shuffle, "Shuffle", SSE)
-HARDWARE_INTRINSIC(SSE_Sqrt, "Sqrt", SSE)
-HARDWARE_INTRINSIC(SSE_SqrtScalar, "SqrtScalar", SSE)
-HARDWARE_INTRINSIC(SSE_StaticCast, "StaticCast", SSE)
-HARDWARE_INTRINSIC(SSE_Store, "Store", SSE)
-HARDWARE_INTRINSIC(SSE_StoreAligned, "StoreAligned", SSE)
-HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE)
-HARDWARE_INTRINSIC(SSE_StoreHigh, "StoreHigh", SSE)
-HARDWARE_INTRINSIC(SSE_StoreLow, "StoreLow", SSE)
-HARDWARE_INTRINSIC(SSE_StoreScalar, "StoreScalar", SSE)
-HARDWARE_INTRINSIC(SSE_Subtract, "Subtract", SSE)
-HARDWARE_INTRINSIC(SSE_SubtractScalar, "SubtractScalar", SSE)
-HARDWARE_INTRINSIC(SSE_UnpackHigh, "UnpackHigh", SSE)
-HARDWARE_INTRINSIC(SSE_UnpackLow, "UnpackLow", SSE)
-HARDWARE_INTRINSIC(SSE_Xor, "Xor", SSE)
-
-// SSE2 Intrinsics
-HARDWARE_INTRINSIC(SSE2_IsSupported, "get_IsSupported", SSE2)
-HARDWARE_INTRINSIC(SSE2_Add, "Add", SSE2)
-
-// SSE3 Intrinsics
-HARDWARE_INTRINSIC(SSE3_IsSupported, "get_IsSupported", SSE3)
-
-// SSSE3 Intrinsics
-HARDWARE_INTRINSIC(SSSE3_IsSupported, "get_IsSupported", SSSE3)
-
-// SSE41 Intrinsics
-HARDWARE_INTRINSIC(SSE41_IsSupported, "get_IsSupported", SSE41)
-
-// SSE42 Intrinsics
-HARDWARE_INTRINSIC(SSE42_IsSupported, "get_IsSupported", SSE42)
-HARDWARE_INTRINSIC(SSE42_Crc32, "Crc32", SSE42)
-
-// AVX Intrinsics
-HARDWARE_INTRINSIC(AVX_IsSupported, "get_IsSupported", AVX)
-HARDWARE_INTRINSIC(AVX_Add, "Add", AVX)
-
-// AVX2 Intrinsics
-HARDWARE_INTRINSIC(AVX2_IsSupported, "get_IsSupported", AVX2)
-HARDWARE_INTRINSIC(AVX2_Add, "Add", AVX2)
-
-// AES Intrinsics
-HARDWARE_INTRINSIC(AES_IsSupported, "get_IsSupported", AES)
-
-// BMI1 Intrinsics
-HARDWARE_INTRINSIC(BMI1_IsSupported, "get_IsSupported", BMI1)
-
-// BMI2 Intrinsics
-HARDWARE_INTRINSIC(BMI2_IsSupported, "get_IsSupported", BMI2)
-
-// FMA Intrinsics
-HARDWARE_INTRINSIC(FMA_IsSupported, "get_IsSupported", FMA)
-
-// LZCNT Intrinsics
-HARDWARE_INTRINSIC(LZCNT_IsSupported, "get_IsSupported", LZCNT)
-HARDWARE_INTRINSIC(LZCNT_LeadingZeroCount, "LeadingZeroCount", LZCNT)
+/* Note
+ 1) Each hardware intrinsic has a unique Intrinsic ID with type of `enum NamedIntrinsic`
+ 2) All the overloads of an intrinsic in an ISA class share one Intrinsic ID
+ 3) The intrinsic that generates instructions with a fixed imm8 operand has a `ival` field with "not -1" value, e.g., Sse.CompareEqual(v1,v2) -> cmpps xmm0, xmm1, 0
+ 4) SIMD intrinsics have a non-zero `SIMD size` field giving the width of the vector they operate over: 16 for `Vector128<T>` or 32 for `Vector256<T>`
+ 5) Scalar intrinsics that operate over general purpose registers (e.g., Sse41.Crc32) have `SIMD size` with 0
+ 6) Each intrinsic has a `NumArg` for number of parameters, and some intrinsics that are overloaded on multiple parameter numbers have this field with -1
+ 7) Each intrinsic has 10 `instructions` fields that list the instruction to generate for each base type
+ 8) Each intrinsic has one category with type of `enum HWIntrinsicCategory`, please see the definition of HWIntrinsicCategory for details
+ 9) Each intrinsic has one or more flags with type of `enum HWIntrinsicFlag`
+*/
+// ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
+// Intrinsic ID Function name ISA ival SIMD size NumArg instructions Category Flags
+// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
+// ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
+// SSE Intrinsics
+HARDWARE_INTRINSIC(SSE_IsSupported, "get_IsSupported", SSE, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Add, "Add", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_AddScalar, "AddScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_And, "And", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_AndNot, "AndNot", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Fixed)
+HARDWARE_INTRINSIC(SSE_CompareEqual, "CompareEqual", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar, "CompareEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareEqualScalar, "CompareEqualScalar", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar, "CompareEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThan, "CompareGreaterThan", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar, "CompareGreaterThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar, "CompareGreaterThanScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar, "CompareGreaterThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqual, "CompareGreaterThanOrEqual", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar, "CompareGreaterThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar, "CompareGreaterThanOrEqualScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar, "CompareGreaterThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThan, "CompareLessThan", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar, "CompareLessThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanScalar, "CompareLessThanScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar, "CompareLessThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqual, "CompareLessThanOrEqual", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar, "CompareLessThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar, "CompareLessThanOrEqualScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar, "CompareLessThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotEqual, "CompareNotEqual", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar, "CompareNotEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar, "CompareNotEqualScalar", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar, "CompareNotEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotGreaterThan, "CompareNotGreaterThan", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar, "CompareNotGreaterThanScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqual, "CompareNotGreaterThanOrEqual", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar, "CompareNotGreaterThanOrEqualScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotLessThan, "CompareNotLessThan", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar, "CompareNotLessThanScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqual, "CompareNotLessThanOrEqual", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar, "CompareNotLessThanOrEqualScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareOrdered, "CompareOrdered", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareOrderedScalar, "CompareOrderedScalar", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareUnordered, "CompareUnordered", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar, "CompareUnorderedScalar", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToInt32, "ConvertToInt32", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToInt64, "ConvertToInt64", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToSingle, "ConvertToSingle", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar, "ConvertToVector128SingleScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation, "ConvertToInt32WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation, "ConvertToInt64WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Divide, "Divide", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_DivideScalar, "DivideScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadScalar, "LoadScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Max, "Max", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MaxScalar, "MaxScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Min, "Min", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MinScalar, "MinScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MoveHighToLow, "MoveHighToLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MoveLowToHigh, "MoveLowToHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MoveMask, "MoveMask", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MoveScalar, "MoveScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Multiply, "Multiply", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_MultiplyScalar, "MultiplyScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Or, "Or", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Reciprocal, "Reciprocal", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ReciprocalScalar, "ReciprocalScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ReciprocalSqrt, "ReciprocalSqrt", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar, "ReciprocalSqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SetAllVector128, "SetAllVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SetScalar, "SetScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SetVector128, "SetVector128", SSE, -1, 16, 4, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SetZeroVector128, "SetZeroVector128", SSE, -1, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Shuffle, "Shuffle", SSE, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Sqrt, "Sqrt", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SqrtScalar, "SqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_StaticCast, "StaticCast", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+//HARDWARE_INTRINSIC(SSE_Store, "Store", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+//HARDWARE_INTRINSIC(SSE_StoreAligned, "StoreAligned", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+//HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+//HARDWARE_INTRINSIC(SSE_StoreHigh, "StoreHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+//HARDWARE_INTRINSIC(SSE_StoreLow, "StoreLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+//HARDWARE_INTRINSIC(SSE_StoreScalar, "StoreScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Subtract, "Subtract", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SubtractScalar, "SubtractScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_UnpackHigh, "UnpackHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_UnpackLow, "UnpackLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Xor, "Xor", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+
+// SSE2 Intrinsics
+HARDWARE_INTRINSIC(SSE2_IsSupported, "get_IsSupported", SSE2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_Add, "Add", SSE2, -1, 16, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+
+// SSE3 Intrinsics
+HARDWARE_INTRINSIC(SSE3_IsSupported, "get_IsSupported", SSE3, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+
+// SSSE3 Intrinsics
+HARDWARE_INTRINSIC(SSSE3_IsSupported, "get_IsSupported", SSSE3, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+
+// SSE41 Intrinsics
+HARDWARE_INTRINSIC(SSE41_IsSupported, "get_IsSupported", SSE41, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_Multiply, "Multiply", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE41_BlendVariable, "BlendVariable", SSE41, -1, 16, 3, {INS_pblendvb, INS_pblendvb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+
+// SSE42 Intrinsics
+HARDWARE_INTRINSIC(SSE42_IsSupported, "get_IsSupported", SSE42, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE42_Crc32, "Crc32", SSE42, -1, 0, 2, {INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFlag)
+
+// AVX Intrinsics
+HARDWARE_INTRINSIC(AVX_IsSupported, "get_IsSupported", AVX, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX_Add, "Add", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX_Multiply, "Multiply", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX_Reciprocal, "Reciprocal", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX_BlendVariable, "BlendVariable", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+
+// AVX2 Intrinsics
+HARDWARE_INTRINSIC(AVX2_IsSupported, "get_IsSupported", AVX2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2_Add, "Add", AVX2, -1, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX2_Multiply, "Multiply", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX2_BlendVariable, "BlendVariable", AVX2, -1, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+
+// AES Intrinsics
+HARDWARE_INTRINSIC(AES_IsSupported, "get_IsSupported", AES, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+
+// BMI1 Intrinsics
+HARDWARE_INTRINSIC(BMI1_IsSupported, "get_IsSupported", BMI1, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+
+// BMI2 Intrinsics
+HARDWARE_INTRINSIC(BMI2_IsSupported, "get_IsSupported", BMI2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+
+// FMA Intrinsics
+HARDWARE_INTRINSIC(FMA_IsSupported, "get_IsSupported", FMA, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+
+// LZCNT Intrinsics
+HARDWARE_INTRINSIC(LZCNT_IsSupported, "get_IsSupported", LZCNT, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(LZCNT_LeadingZeroCount, "LeadingZeroCount", LZCNT, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_lzcnt, INS_invalid, INS_lzcnt, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFlag)
// PCLMULQDQ Intrinsics
-HARDWARE_INTRINSIC(PCLMULQDQ_IsSupported, "get_IsSupported", PCLMULQDQ)
+HARDWARE_INTRINSIC(PCLMULQDQ_IsSupported, "get_IsSupported", PCLMULQDQ, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
// POPCNT Intrinsics
-HARDWARE_INTRINSIC(POPCNT_IsSupported, "get_IsSupported", POPCNT)
-HARDWARE_INTRINSIC(POPCNT_PopCount, "PopCount", POPCNT)
-#endif // FEATURE_HW_INTRINSICS
+HARDWARE_INTRINSIC(POPCNT_IsSupported, "get_IsSupported", POPCNT, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(POPCNT_PopCount, "PopCount", POPCNT, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_popcnt, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFlag)
+#endif // FEATURE_HW_INTRINSICS
#undef HARDWARE_INTRINSIC
struct HWIntrinsicInfo
{
- NamedIntrinsic intrinsicID;
- const char* intrinsicName;
- InstructionSet isa;
-}
+ NamedIntrinsic intrinsicID; // NI_* id of the intrinsic
+ const char* intrinsicName; // method name string, e.g. "get_IsSupported"
+ InstructionSet isa; // InstructionSet_* the intrinsic belongs to
+ int ival; // implicit imm8 value, or -1 when the intrinsic has none
+ unsigned simdSize; // SIMD size column of the table (presumably bytes: 16/32 for vector rows, 0 otherwise)
+ int numArgs; // number of arguments
+ instruction ins[10]; // instruction per base type, indexed by (type - TYP_BYTE)
+ HWIntrinsicCategory category; // HW_Category_* of the intrinsic
+ HWIntrinsicFlag flag; // HW_Flag_* of the intrinsic
+};
-static const hwIntrinsicInfoArray[] = {
-#define HARDWARE_INTRINSIC(id, name, isa) {NI_##id, name, InstructionSet_##isa},
+static const HWIntrinsicInfo hwIntrinsicInfoArray[] = { // one element per HARDWARE_INTRINSIC row of the included list
+#define HARDWARE_INTRINSIC(id, name, isa, ival, size, numarg, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, category, flag) \
+ {NI_##id, name, InstructionSet_##isa, ival, size, numarg, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, category, flag}, // t1..t10 carry the braced instruction list of a row, which the preprocessor splits at its commas
#include "hwintrinsiclistxarch.h"
};
}
//------------------------------------------------------------------------
-// ivalOfHWIntrinsic: get the imm8 value of the given intrinsic
+// ivalOfHWIntrinsic: get the imm8 value of this intrinsic from the hwIntrinsicInfoArray table
//
// Arguments:
// intrinsic -- id of the intrinsic function.
//
// Return Value:
-// the imm8 value of the intrinsic, -1 for non-IMM intrinsics
+// The imm8 value that is implicit for this intrinsic, or -1 for intrinsics that do not take an immediate, or for
+// which the immediate is an explicit argument.
//
int Compiler::ivalOfHWIntrinsic(NamedIntrinsic intrinsic)
{
assert(intrinsic != NI_Illegal);
assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
+ return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].ival; // valid ids start right after NI_HW_INTRINSIC_START, so the smallest one maps to index 0
+}
- switch (intrinsic)
- {
- case NI_SSE_CompareEqual:
- case NI_SSE_CompareEqualScalar:
- return 0;
-
- case NI_SSE_CompareLessThan:
- case NI_SSE_CompareLessThanScalar:
- case NI_SSE_CompareNotGreaterThanOrEqual:
- case NI_SSE_CompareNotGreaterThanOrEqualScalar:
- return 1;
-
- case NI_SSE_CompareLessThanOrEqual:
- case NI_SSE_CompareLessThanOrEqualScalar:
- case NI_SSE_CompareNotGreaterThan:
- case NI_SSE_CompareNotGreaterThanScalar:
- return 2;
-
- case NI_SSE_CompareUnordered:
- case NI_SSE_CompareUnorderedScalar:
- return 3;
-
- case NI_SSE_CompareNotEqual:
- case NI_SSE_CompareNotEqualScalar:
- return 4;
-
- case NI_SSE_CompareGreaterThanOrEqual:
- case NI_SSE_CompareGreaterThanOrEqualScalar:
- case NI_SSE_CompareNotLessThan:
- case NI_SSE_CompareNotLessThanScalar:
- return 5;
-
- case NI_SSE_CompareGreaterThan:
- case NI_SSE_CompareGreaterThanScalar:
- case NI_SSE_CompareNotLessThanOrEqual:
- case NI_SSE_CompareNotLessThanOrEqualScalar:
- return 6;
-
- case NI_SSE_CompareOrdered:
- case NI_SSE_CompareOrderedScalar:
- return 7;
+//------------------------------------------------------------------------
+// simdSizeOfHWIntrinsic: get the SIMD size of this intrinsic
+//
+// Arguments:
+// intrinsic -- id of the intrinsic function.
+//
+// Return Value:
+// the SIMD size of this intrinsic
+// - from the hwIntrinsicInfoArray table if intrinsic has NO HW_Flag_UnfixedSIMDSize
+// - TODO-XArch-NYI - from the signature (the sig parameter, not read yet) if intrinsic has HW_Flag_UnfixedSIMDSize
+//
+// Note - this function is only used by the importer;
+// after importation (i.e., in codegen) the SIMD size can be read from the GenTreeHWIntrinsic IR instead.
+static unsigned simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig)
+{
+ assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
+ assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag & HW_Flag_UnfixedSIMDSize) == 0); // the unfixed-size case is not implemented yet
+ return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize;
+}
- default:
- return -1;
- }
+//------------------------------------------------------------------------
+// numArgsOfHWIntrinsic: get the number of arguments of this intrinsic from the hwIntrinsicInfoArray table
+//
+// Arguments:
+// intrinsic -- id of the intrinsic function.
+//
+// Return Value:
+// the numArgs value recorded for this intrinsic in the table
+//
+int Compiler::numArgsOfHWIntrinsic(NamedIntrinsic intrinsic)
+{
+ assert(intrinsic != NI_Illegal);
+ assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
+ return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].numArgs; // valid ids start right after NI_HW_INTRINSIC_START, so the smallest one maps to index 0
}
//------------------------------------------------------------------------
{
assert(intrinsic != NI_Illegal);
assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
+ assert(type >= TYP_BYTE && type <= TYP_DOUBLE);
+ return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].ins[type - TYP_BYTE];
+}
- switch (intrinsic)
- {
- case NI_SSE_Add:
- return INS_addps;
-
- case NI_SSE_AddScalar:
- return INS_addss;
-
- case NI_SSE_And:
- return INS_andps;
-
- case NI_SSE_AndNot:
- return INS_andnps;
-
- case NI_SSE_CompareEqual:
- case NI_SSE_CompareGreaterThan:
- case NI_SSE_CompareGreaterThanOrEqual:
- case NI_SSE_CompareLessThan:
- case NI_SSE_CompareLessThanOrEqual:
- case NI_SSE_CompareNotEqual:
- case NI_SSE_CompareNotGreaterThan:
- case NI_SSE_CompareNotGreaterThanOrEqual:
- case NI_SSE_CompareNotLessThan:
- case NI_SSE_CompareNotLessThanOrEqual:
- case NI_SSE_CompareOrdered:
- case NI_SSE_CompareUnordered:
- return INS_cmpps;
-
- case NI_SSE_CompareEqualScalar:
- case NI_SSE_CompareGreaterThanScalar:
- case NI_SSE_CompareGreaterThanOrEqualScalar:
- case NI_SSE_CompareLessThanScalar:
- case NI_SSE_CompareLessThanOrEqualScalar:
- case NI_SSE_CompareNotEqualScalar:
- case NI_SSE_CompareNotGreaterThanScalar:
- case NI_SSE_CompareNotGreaterThanOrEqualScalar:
- case NI_SSE_CompareNotLessThanScalar:
- case NI_SSE_CompareNotLessThanOrEqualScalar:
- case NI_SSE_CompareOrderedScalar:
- case NI_SSE_CompareUnorderedScalar:
- return INS_cmpss;
-
- case NI_SSE_CompareEqualOrderedScalar:
- case NI_SSE_CompareGreaterThanOrderedScalar:
- case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
- case NI_SSE_CompareLessThanOrderedScalar:
- case NI_SSE_CompareLessThanOrEqualOrderedScalar:
- case NI_SSE_CompareNotEqualOrderedScalar:
- return INS_comiss;
-
- case NI_SSE_CompareEqualUnorderedScalar:
- case NI_SSE_CompareGreaterThanUnorderedScalar:
- case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
- case NI_SSE_CompareLessThanUnorderedScalar:
- case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
- case NI_SSE_CompareNotEqualUnorderedScalar:
- return INS_ucomiss;
-
- case NI_SSE_ConvertToInt32:
- case NI_SSE_ConvertToInt64:
- return INS_cvtss2si;
-
- case NI_SSE_ConvertToInt32WithTruncation:
- case NI_SSE_ConvertToInt64WithTruncation:
- return INS_cvttss2si;
-
- case NI_SSE_ConvertToSingle:
- case NI_SSE_LoadScalar:
- case NI_SSE_MoveScalar:
- return INS_movss;
-
- case NI_SSE_ConvertToVector128SingleScalar:
- return INS_cvtsi2ss;
-
- case NI_SSE_Divide:
- return INS_divps;
-
- case NI_SSE_DivideScalar:
- return INS_divss;
-
- case NI_SSE_LoadAlignedVector128:
- case NI_SSE_StaticCast:
- return INS_movaps;
-
- case NI_SSE_LoadHigh:
- return INS_movhps;
-
- case NI_SSE_LoadLow:
- return INS_movlps;
-
- case NI_SSE_LoadVector128:
- return INS_movups;
-
- case NI_SSE_Max:
- return INS_maxps;
-
- case NI_SSE_MaxScalar:
- return INS_maxss;
-
- case NI_SSE_Min:
- return INS_minps;
-
- case NI_SSE_MinScalar:
- return INS_minss;
-
- case NI_SSE_MoveHighToLow:
- return INS_movhlps;
-
- case NI_SSE_MoveLowToHigh:
- return INS_movlhps;
-
- case NI_SSE_MoveMask:
- return INS_movmskps;
-
- case NI_SSE_Multiply:
- return INS_mulps;
-
- case NI_SSE_MultiplyScalar:
- return INS_mulss;
-
- case NI_SSE_Or:
- return INS_orps;
-
- case NI_SSE_Reciprocal:
- return INS_rcpps;
-
- case NI_SSE_ReciprocalScalar:
- return INS_rcpss;
-
- case NI_SSE_ReciprocalSqrt:
- return INS_rsqrtps;
-
- case NI_SSE_ReciprocalSqrtScalar:
- return INS_rsqrtss;
-
- case NI_SSE_Sqrt:
- return INS_sqrtps;
-
- case NI_SSE_SqrtScalar:
- return INS_sqrtss;
-
- case NI_SSE_Subtract:
- return INS_subps;
-
- case NI_SSE_SubtractScalar:
- return INS_subss;
-
- case NI_SSE_UnpackHigh:
- return INS_unpckhps;
-
- case NI_SSE_UnpackLow:
- return INS_unpcklps;
-
- case NI_SSE_Xor:
- return INS_xorps;
-
- default:
- return INS_invalid;
- }
+//------------------------------------------------------------------------
+// categoryOfHWIntrinsic: get the category of the given intrinsic from the hwIntrinsicInfoArray table
+//
+// Arguments:
+// intrinsic -- id of the intrinsic function.
+//
+// Return Value:
+// the HWIntrinsicCategory recorded for the given intrinsic in the table
+//
+HWIntrinsicCategory Compiler::categoryOfHWIntrinsic(NamedIntrinsic intrinsic)
+{
+ assert(intrinsic != NI_Illegal);
+ assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
+ return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].category; // valid ids start right after NI_HW_INTRINSIC_START, so the smallest one maps to index 0
}
//------------------------------------------------------------------------
-// isIntrinsicAnIsSupportedPropertyGetter: return true if the intrinsic is "get_IsSupported"
+// flagOfHWIntrinsic: get the flag of the given intrinsic
//
// Arguments:
// intrinsic -- id of the intrinsic function.
//
// Return Value:
-// true if the intrinsic is "get_IsSupported"
-// Sometimes we need to specially treat "get_IsSupported"
-bool Compiler::isIntrinsicAnIsSupportedPropertyGetter(NamedIntrinsic intrinsic)
+// the flag of the given intrinsic
+//
+HWIntrinsicFlag Compiler::flagOfHWIntrinsic(NamedIntrinsic intrinsic)
{
- switch (intrinsic)
+ assert(intrinsic != NI_Illegal);
+ assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
+ return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag;
+}
+
+//------------------------------------------------------------------------
+// getArgForHWIntrinsic: get the argument from the stack and match the signature
+//
+// Arguments:
+// argType -- the required type of argument
+// argClass -- the class handle of argType
+//
+// Return Value:
+// the node popped from the top of the importer stack for this argument
+//
+// Notes:
+// Each call pops exactly one entry from the stack.
+//
+GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass)
+{
+ GenTree* arg = nullptr;
+ if (argType == TYP_STRUCT)
{
- case NI_SSE_IsSupported:
- case NI_SSE2_IsSupported:
- case NI_SSE3_IsSupported:
- case NI_SSSE3_IsSupported:
- case NI_SSE41_IsSupported:
- case NI_SSE42_IsSupported:
- case NI_AVX_IsSupported:
- case NI_AVX2_IsSupported:
- case NI_AES_IsSupported:
- case NI_BMI1_IsSupported:
- case NI_BMI2_IsSupported:
- case NI_FMA_IsSupported:
- case NI_LZCNT_IsSupported:
- case NI_PCLMULQDQ_IsSupported:
- case NI_POPCNT_IsSupported:
- return true;
- default:
- return false;
+ // Only the size of the struct is needed here; the base type returned by the
+ // query is intentionally ignored.
+ unsigned int argSizeBytes = 0;
+ getBaseTypeAndSizeOfSIMDType(argClass, &argSizeBytes);
+ argType = getSIMDTypeForSize(argSizeBytes);
+ assert(argType == TYP_SIMD32 || argType == TYP_SIMD16);
+ arg = impSIMDPopStack(argType);
+ assert(arg->TypeGet() == TYP_SIMD16 || arg->TypeGet() == TYP_SIMD32);
+ }
+ else
+ {
+ assert(varTypeIsArithmetic(argType));
+ arg = impPopStack().val;
+ assert(varTypeIsArithmetic(arg->TypeGet()));
+ // The value on the stack must be at least as wide as the type the signature asks for.
+ assert(genTypeSize(argType) <= genTypeSize(arg->TypeGet()));
}
+ return arg;
}
//------------------------------------------------------------------------
isFullyImplmentedISAClass(isa));
}
+//------------------------------------------------------------------------
+// isTypeSupportedForIntrinsic: check whether a type can be used by a HW intrinsic
+// on the current target
+//
+// Arguments:
+// type -- the type to check
+//
+// Return Value:
+// false when _TARGET_X86_ is defined and varTypeIsLong(type) is true;
+// true in every other case
+//
+static bool isTypeSupportedForIntrinsic(var_types type)
+{
+#ifdef _TARGET_X86_
+ return !varTypeIsLong(type);
+#else
+ return true;
+#endif
+}
+
//------------------------------------------------------------------------
// impUnsupportedHWIntrinsic: returns a node for an unsupported HWIntrinsic
//
}
//------------------------------------------------------------------------
+// impIsTableDrivenHWIntrinsic: check whether intrinsics of a category are imported
+// by the table-driven path
+//
+// Arguments:
+// category - category of a HW intrinsic
+//
+// Return Value:
+// returns true if this category can be table-driven in the importer
+// (currently only HW_Category_SimpleSIMD)
+//
+static bool impIsTableDrivenHWIntrinsic(HWIntrinsicCategory category)
+{
+ // TODO - move more categories to the table-driven framework
+ const bool tableDrivenIntrinsic = category == HW_Category_SimpleSIMD;
+ const bool nonTableDrivenIntrinsic = category == HW_Category_Special;
+ return tableDrivenIntrinsic && !nonTableDrivenIntrinsic;
+}
+
+//------------------------------------------------------------------------
// impX86HWIntrinsic: dispatch hardware intrinsics to their own implementation
-// function
//
// Arguments:
// intrinsic -- id of the intrinsic function.
CORINFO_SIG_INFO* sig,
bool mustExpand)
{
- InstructionSet isa = isaOfHWIntrinsic(intrinsic);
+ InstructionSet isa = isaOfHWIntrinsic(intrinsic);
+ HWIntrinsicCategory category = categoryOfHWIntrinsic(intrinsic);
+ int numArgs = sig->numArgs;
+ var_types callType = JITtype2varType(sig->retType);
// This intrinsic is supported if
// - the ISA is available on the underlying hardware (compSupports returns true)
// - the compiler supports this hardware intrinsics (compSupportsHWIntrinsic returns true)
- bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa);
+ // - intrinsics do not require 64-bit registers (r64) on 32-bit platforms (isTypeSupportedForIntrinsic returns
+ // true)
+ bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa) && isTypeSupportedForIntrinsic(callType);
- if (isIntrinsicAnIsSupportedPropertyGetter(intrinsic))
+ if (category == HW_Category_IsSupportedProperty)
{
return gtNewIconNode(issupported);
}
+ // - calling an unsupported intrinsic must throw PlatformNotSupportedException
else if (!issupported)
{
return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
}
+ // table-driven importer of simple intrinsics
+ if (impIsTableDrivenHWIntrinsic(category))
+ {
+ unsigned int sizeBytes;
+ var_types baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes);
+ assert(baseType != TYP_UNKNOWN && sizeBytes != 0);
+ var_types retType = getSIMDTypeForSize(sizeBytes);
+ unsigned simdSize = simdSizeOfHWIntrinsic(intrinsic, sig);
+ CORINFO_ARG_LIST_HANDLE argList = sig->args;
+ CORINFO_CLASS_HANDLE argClass;
+ var_types argType = TYP_UNKNOWN;
+
+ assert(numArgs > 0);
+ assert(retType != TYP_UNDEF);
+ assert(retType == TYP_SIMD16 || retType == TYP_SIMD32);
+ assert(insOfHWIntrinsic(intrinsic, baseType) != INS_invalid);
+ assert(simdSize == 32 || simdSize == 16);
+
+ GenTree* retNode = nullptr;
+ GenTree* op1 = nullptr;
+ GenTree* op2 = nullptr;
+
+ switch (numArgs)
+ {
+ case 1:
+ argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
+ op1 = getArgForHWIntrinsic(argType, argClass);
+
+ retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
+ break;
+ case 2:
+ argType = JITtype2varType(
+ strip(info.compCompHnd->getArgType(sig, info.compCompHnd->getArgNext(argList), &argClass)));
+ op2 = getArgForHWIntrinsic(argType, argClass);
+
+ argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
+ op1 = getArgForHWIntrinsic(argType, argClass);
+
+ retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, baseType, simdSize);
+ break;
+
+ case 3:
+ {
+ CORINFO_ARG_LIST_HANDLE arg2 = info.compCompHnd->getArgNext(argList);
+ CORINFO_ARG_LIST_HANDLE arg3 = info.compCompHnd->getArgNext(arg2);
+
+ argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg3, &argClass)));
+ GenTree* op3 = getArgForHWIntrinsic(argType, argClass);
+
+ argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg2, &argClass)));
+ op2 = getArgForHWIntrinsic(argType, argClass);
+
+ argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
+ op1 = getArgForHWIntrinsic(argType, argClass);
+
+ op1 = gtNewArgList(op1, op2, op3);
+ retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
+ break;
+ }
+ default:
+ unreached();
+ }
+ return retNode;
+ }
+
+ // other intrinsics need special importation
switch (isa)
{
case InstructionSet_SSE:
var_types baseType = TYP_UNKNOWN;
switch (intrinsic)
{
- case NI_SSE2_Add:
- assert(sig->numArgs == 2);
- op2 = impSIMDPopStack(TYP_SIMD16);
- op1 = impSIMDPopStack(TYP_SIMD16);
- baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, NI_SSE2_Add, baseType, 16);
- break;
-
default:
JITDUMP("Not implemented hardware intrinsic");
break;
GenTree* op2 = nullptr;
var_types callType = JITtype2varType(sig->retType);
- CORINFO_ARG_LIST_HANDLE argLst = sig->args;
+ CORINFO_ARG_LIST_HANDLE argList = sig->args;
CORINFO_CLASS_HANDLE argClass;
CorInfoType corType;
switch (intrinsic)
{
case NI_SSE42_Crc32:
assert(sig->numArgs == 2);
-
-#ifdef _TARGET_X86_
- if (varTypeIsLong(callType))
- {
- return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
- }
-#endif
-
- op2 = impPopStack().val;
- op1 = impPopStack().val;
-
- argLst = info.compCompHnd->getArgNext(argLst); // the second argument
- corType = strip(info.compCompHnd->getArgType(sig, argLst, &argClass)); // type of the second argument
+ op2 = impPopStack().val;
+ op1 = impPopStack().val;
+ argList = info.compCompHnd->getArgNext(argList); // the second argument
+ corType = strip(info.compCompHnd->getArgType(sig, argList, &argClass)); // type of the second argument
retNode = gtNewScalarHWIntrinsicNode(callType, op1, op2, NI_SSE42_Crc32);
var_types baseType = TYP_UNKNOWN;
switch (intrinsic)
{
- case NI_AVX_Add:
- assert(sig->numArgs == 2);
- op2 = impSIMDPopStack(TYP_SIMD32);
- op1 = impSIMDPopStack(TYP_SIMD32);
- baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD32, op1, op2, NI_AVX_Add, baseType, 32);
- break;
-
default:
JITDUMP("Not implemented hardware intrinsic");
break;
var_types baseType = TYP_UNKNOWN;
switch (intrinsic)
{
- case NI_AVX2_Add:
- assert(sig->numArgs == 2);
- op2 = impSIMDPopStack(TYP_SIMD32);
- op1 = impSIMDPopStack(TYP_SIMD32);
- baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD32, op1, op2, NI_AVX2_Add, baseType, 32);
- break;
-
default:
JITDUMP("Not implemented hardware intrinsic");
break;
{
assert(sig->numArgs == 1);
var_types callType = JITtype2varType(sig->retType);
-
-#ifdef _TARGET_X86_
- if (varTypeIsLong(callType))
- {
- return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
- }
-#endif
-
return gtNewScalarHWIntrinsicNode(callType, impPopStack().val, NI_LZCNT_LeadingZeroCount);
}
{
assert(sig->numArgs == 1);
var_types callType = JITtype2varType(sig->retType);
-
-#ifdef _TARGET_X86_
- if (varTypeIsLong(callType))
- {
- return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
- }
-#endif
-
return gtNewScalarHWIntrinsicNode(callType, impPopStack().val, NI_POPCNT_PopCount);
}
INST3( orps, "orps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x56)) // Or packed singles
INST3( orpd, "orpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x56)) // Or packed doubles
INST3( haddpd, "haddpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x7C)) // Horizontal add packed doubles
// SSE 2 approx arith
INST3( rcpps, "rcpps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53)) // Reciprocal of packed singles
INST3( roundss, "roundss" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0A)) // Round scalar single precision floating-point values
INST3( roundpd, "roundpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x09)) // Round packed double precision floating-point values
INST3( roundsd, "roundsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0B)) // Round scalar double precision floating-point values
+INST3( pmuldq, "pmuldq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x28)) // packed multiply 32-bit signed integers and store 64-bit result
+INST3( blendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x14)) // Variable Blend Packed Singles
+INST3( blendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x15)) // Variable Blend Packed Doubles
+INST3( pblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x10)) // Variable Blend Packed Bytes
+
INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
INST3( vzeroupper, "zeroupper" , 0, IUM_WR, 0, 0, 0xC577F8, BAD_CODE, BAD_CODE) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix)
INST3( vperm2i128, "perm2i128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x46)) // Permute 128-bit halves of input register
INST3( vpermq, "permq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x00)) // Permute 64-bit of input register
+INST3( vblendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4A)) // Variable Blend Packed Singles
+INST3( vblendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4B)) // Variable Blend Packed Doubles
+INST3( vpblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4C)) // Variable Blend Packed Bytes
INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
// Scalar instructions in SSE4.2
//
void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
{
- NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
- GenTree* op1 = node->gtGetOp1();
- GenTree* op2 = node->gtGetOp2();
-
- switch (node->gtHWIntrinsicId)
- {
- case NI_SSE_Add:
- case NI_SSE_AddScalar:
- case NI_SSE_And:
- case NI_SSE_AndNot:
- case NI_SSE_CompareEqual:
- case NI_SSE_CompareEqualScalar:
- case NI_SSE_CompareGreaterThan:
- case NI_SSE_CompareGreaterThanScalar:
- case NI_SSE_CompareGreaterThanOrEqual:
- case NI_SSE_CompareGreaterThanOrEqualScalar:
- case NI_SSE_CompareLessThan:
- case NI_SSE_CompareLessThanScalar:
- case NI_SSE_CompareLessThanOrEqual:
- case NI_SSE_CompareLessThanOrEqualScalar:
- case NI_SSE_CompareNotEqual:
- case NI_SSE_CompareNotEqualScalar:
- case NI_SSE_CompareNotGreaterThan:
- case NI_SSE_CompareNotGreaterThanScalar:
- case NI_SSE_CompareNotGreaterThanOrEqual:
- case NI_SSE_CompareNotGreaterThanOrEqualScalar:
- case NI_SSE_CompareNotLessThan:
- case NI_SSE_CompareNotLessThanScalar:
- case NI_SSE_CompareNotLessThanOrEqual:
- case NI_SSE_CompareNotLessThanOrEqualScalar:
- case NI_SSE_CompareOrdered:
- case NI_SSE_CompareOrderedScalar:
- case NI_SSE_CompareUnordered:
- case NI_SSE_CompareUnorderedScalar:
- case NI_SSE_ConvertToVector128SingleScalar:
- case NI_SSE_Divide:
- case NI_SSE_DivideScalar:
- case NI_SSE_Max:
- case NI_SSE_MaxScalar:
- case NI_SSE_Min:
- case NI_SSE_MinScalar:
- case NI_SSE_Multiply:
- case NI_SSE_MultiplyScalar:
- case NI_SSE_Or:
- case NI_SSE_Subtract:
- case NI_SSE_SubtractScalar:
- case NI_SSE_UnpackHigh:
- case NI_SSE_UnpackLow:
- case NI_SSE_Xor:
- case NI_SSE2_Add:
- if (!comp->getEmitter()->UseVEXEncoding())
- {
- // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
- // TODO-XArch-CQ: Non-VEX encoded instructions require memory ops to be aligned
- break;
- }
- __fallthrough;
+ NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
+ HWIntrinsicCategory category = comp->categoryOfHWIntrinsic(intrinsicID);
+ int numArgs = comp->numArgsOfHWIntrinsic(intrinsicID);
+ GenTree* op1 = node->gtGetOp1();
+ GenTree* op2 = node->gtGetOp2();
- case NI_AVX_Add:
- case NI_AVX2_Add:
+ // Binary SimpleSIMD intrinsics: op2 may be contained (or marked reg-optional),
+ // but only when VEX encoding can be used.
+ // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
+ // TODO-XArch-CQ: Non-VEX encoded instructions require memory ops to be aligned
+ if (category == HW_Category_SimpleSIMD && numArgs == 2 && comp->canUseVexEncoding())
+ {
+ if (IsContainableMemoryOp(op2))
{
- assert(comp->getEmitter()->UseVEXEncoding());
-
- if (IsContainableMemoryOp(op2))
- {
- MakeSrcContained(node, op2);
- }
- else
- {
- // TODO-XArch-CQ: Commutative operations can have op1 be contained
- op2->SetRegOptional();
- }
- break;
+ MakeSrcContained(node, op2);
}
-
- case NI_SSE_Shuffle:
+ else
{
- assert(op1->OperIsList());
- GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current();
-
- if (op3->IsCnsIntOrI())
- {
- MakeSrcContained(node, op3);
- }
- break;
+ // TODO-XArch-CQ: Commutative operations can have op1 be contained
+ op2->SetRegOptional();
}
+ }
- default:
- assert((intrinsicID > NI_HW_INTRINSIC_START) && (intrinsicID < NI_HW_INTRINSIC_END));
- break;
+ // Shuffle passes its operands as an argument list; contain the third operand
+ // when it is an integer constant.
+ if (intrinsicID == NI_SSE_Shuffle)
+ {
+ assert(op1->OperIsList());
+ GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current();
+
+ if (op3->IsCnsIntOrI())
+ {
+ MakeSrcContained(node, op3);
+ }
}
}
#endif // FEATURE_HW_INTRINSICS
void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, TreeNodeInfo* info)
{
NamedIntrinsic intrinsicID = intrinsicTree->gtHWIntrinsicId;
- InstructionSet isa = compiler->isaOfHWIntrinsic(intrinsicID);
-
+ InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID);
if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2)
{
SetContainsAVXFlags(true, 32);
}
-
GenTree* op1 = intrinsicTree->gtOp.gtOp1;
GenTree* op2 = intrinsicTree->gtOp.gtOp2;
info->srcCount = 0;
{
if (op1->OperIsList())
{
- int srcCount = 0;
-
for (GenTreeArgList* list = op1->AsArgList(); list != nullptr; list = list->Rest())
{
- GenTree* listItem = list->Current();
- srcCount += GetOperandInfo(listItem);
+ info->srcCount += GetOperandInfo(list->Current());
}
-
- info->srcCount += srcCount;
}
else
{
useList.Last()->info.isTgtPref = true;
break;
+ case NI_SSE41_BlendVariable:
+ {
+ if (!compiler->canUseVexEncoding())
+ {
+ // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
+ LocationInfoListNode* op2Info = useList.Begin()->Next();
+ LocationInfoListNode* op3Info = op2Info->Next();
+ op2Info->info.isDelayFree = true;
+ op3Info->info.isDelayFree = true;
+ op3Info->info.setSrcCandidates(this, RBM_XMM0);
+ info->hasDelayFreeSrc = true;
+ }
+ break;
+ }
+
#ifdef _TARGET_X86_
case NI_SSE42_Crc32:
{
NI_System_Collections_Generic_EqualityComparer_get_Default = 4,
#if FEATURE_HW_INTRINSICS
NI_HW_INTRINSIC_START,
-#define HARDWARE_INTRINSIC(id, name, isa) NI_##id,
+#define HARDWARE_INTRINSIC(id, name, isa, ival, size, numarg, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, category, flag) \
+ NI_##id,
#include "hwintrinsiclistxarch.h"
NI_HW_INTRINSIC_END
#endif
};
+#if FEATURE_HW_INTRINSICS && defined(_TARGET_XARCH_)
+// Per-intrinsic property flags. Every value other than HW_Flag_NoFlag is a distinct
+// bit, so flags can be combined.
+enum HWIntrinsicFlag : unsigned int
+{
+ HW_Flag_NoFlag = 0,
+
+ // Commutative
+ // - if a binary-op intrinsic is commutative (e.g., Add, Multiply), its op1 can be contained
+ HW_Flag_Commutative = 0x1,
+
+ // Full range IMM intrinsic
+ // - the immediate value is valid on the full range of imm8 (0-255)
+ HW_Flag_FullRangeIMM = 0x2,
+
+ // Generic
+ // - must throw NotSupportedException if the type argument is not a numeric type
+ HW_Flag_Generic = 0x4,
+
+ // NoCodeGen
+ // - should be transformed in the compiler front-end, cannot reach CodeGen
+ HW_Flag_NoCodeGen = 0x8,
+
+ // Unfixed SIMD-size
+ // - overloaded on multiple vector sizes (SIMD size in the table is unreliable)
+ HW_Flag_UnfixedSIMDSize = 0x10,
+
+ // Complex overload
+ // - the codegen of overloads cannot be determined by intrinsicID and base type
+ HW_Flag_ComplexOverloads = 0x20,
+};
+
+// Combines two HWIntrinsicFlag values with a bitwise OR on their unsigned
+// representation and returns the result as a HWIntrinsicFlag.
+inline HWIntrinsicFlag operator|(HWIntrinsicFlag c1, HWIntrinsicFlag c2)
+{
+ return static_cast<HWIntrinsicFlag>(static_cast<unsigned>(c1) | static_cast<unsigned>(c2));
+}
+
+// Classification of hardware intrinsics by how they are handled.
+enum HWIntrinsicCategory : unsigned int
+{
+ // Simple SIMD intrinsics
+ // - take Vector128/256<T> parameters
+ // - return a Vector128/256<T>
+ // - generate single instruction
+ // - the codegen of overloads can be determined by intrinsicID and base type of returned vector
+ HW_Category_SimpleSIMD,
+
+ // IsSupported Property
+ // - each ISA class has an "IsSupported" property
+ HW_Category_IsSupportedProperty,
+
+ // IMM intrinsics
+ // - some SIMD intrinsics require an immediate value (i.e. imm8) to generate the instruction
+ HW_Category_IMM,
+
+ // Scalar intrinsics
+ // - operate over general purpose registers, like crc32, lzcnt, popcnt, etc.
+ HW_Category_Scalar,
+
+ // Memory access intrinsics
+ // - e.g., Avx.Load, Avx.Store, Sse.LoadAligned
+ HW_Category_MemoryLoad,
+ HW_Category_MemoryStore,
+
+ // Helper intrinsics
+ // - do not directly correspond to an instruction, such as Avx.SetAllVector256
+ HW_Category_Helper,
+
+ // Special intrinsics
+ // - have to be addressed specially
+ HW_Category_Special
+};
+
+#endif // FEATURE_HW_INTRINSICS && defined(_TARGET_XARCH_)
+
#endif // _NAMEDINTRINSICLIST_H_
--- /dev/null
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
+
+namespace IntelHardwareIntrinsicTest
+{
+ // Exercises Avx.Multiply on Vector256<float> and Vector256<double> and checks every
+ // output element against the scalar product of the matching inputs.
+ class Program
+ {
+ // Values returned from Main to report the outcome.
+ const int Pass = 100;
+ const int Fail = 0;
+
+ // Returns Pass when Avx is not supported or all checks succeed, Fail otherwise.
+ static unsafe int Main(string[] args)
+ {
+ int testResult = Pass;
+
+ if (Avx.IsSupported)
+ {
+ using (TestTable<float, float, float> floatTable = new TestTable<float, float, float>(new float[8] { 1, -5, 100, 0, 1, -5, 100, 0 }, new float[8] { 22, -1, -50, 0, 22, -1, -50, 0 }, new float[8]))
+ using (TestTable<double, double, double> doubleTable = new TestTable<double, double, double>(new double[4] { 1, -5, 100, 0 }, new double[4] { 22, -1, -50, 0 }, new double[4]))
+ {
+ // Read both inputs as vectors, multiply, and write the product to the output array.
+ var vf1 = Unsafe.Read<Vector256<float>>(floatTable.inArray1Ptr);
+ var vf2 = Unsafe.Read<Vector256<float>>(floatTable.inArray2Ptr);
+ var vf3 = Avx.Multiply(vf1, vf2);
+ Unsafe.Write(floatTable.outArrayPtr, vf3);
+
+ var vd1 = Unsafe.Read<Vector256<double>>(doubleTable.inArray1Ptr);
+ var vd2 = Unsafe.Read<Vector256<double>>(doubleTable.inArray2Ptr);
+ var vd3 = Avx.Multiply(vd1, vd2);
+ Unsafe.Write(doubleTable.outArrayPtr, vd3);
+
+ // On a mismatch, dump the whole output array and record the failure.
+ if (!floatTable.CheckResult((x, y, z) => x * y == z))
+ {
+ Console.WriteLine("AVX Multiply failed on float:");
+ foreach (var item in floatTable.outArray)
+ {
+ Console.Write(item + ", ");
+ }
+ Console.WriteLine();
+ testResult = Fail;
+ }
+
+ if (!doubleTable.CheckResult((x, y, z) => x * y == z))
+ {
+ Console.WriteLine("AVX Multiply failed on double:");
+ foreach (var item in doubleTable.outArray)
+ {
+ Console.Write(item + ", ");
+ }
+ Console.WriteLine();
+ testResult = Fail;
+ }
+ }
+ }
+ return testResult;
+ }
+
+ // Holds two input arrays and one output array, pinned through GCHandle so that raw
+ // pointers to their contents can be handed out. Dispose frees the three handles.
+ public unsafe struct TestTable<T1, T2, T3> : IDisposable where T1 : struct where T2 : struct where T3 : struct
+ {
+ public T1[] inArray1;
+ public T2[] inArray2;
+ public T3[] outArray;
+
+ // Addresses of the pinned arrays.
+ public void* inArray1Ptr => inHandle1.AddrOfPinnedObject().ToPointer();
+ public void* inArray2Ptr => inHandle2.AddrOfPinnedObject().ToPointer();
+ public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer();
+
+ GCHandle inHandle1;
+ GCHandle inHandle2;
+ GCHandle outHandle;
+ // Stores the arrays and pins each of them.
+ public TestTable(T1[] a, T2[] b, T3[] c)
+ {
+ this.inArray1 = a;
+ this.inArray2 = b;
+ this.outArray = c;
+
+ inHandle1 = GCHandle.Alloc(inArray1, GCHandleType.Pinned);
+ inHandle2 = GCHandle.Alloc(inArray2, GCHandleType.Pinned);
+ outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned);
+ }
+ // Applies check to the elements at each index of the three arrays and returns false
+ // at the first index where it fails. Iterates over inArray1.Length entries, so
+ // inArray2 and outArray are indexed over that same range.
+ public bool CheckResult(Func<T1, T2, T3, bool> check)
+ {
+ for (int i = 0; i < inArray1.Length; i++)
+ {
+ if (!check(inArray1[i], inArray2[i], outArray[i]))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Releases the three pinned handles.
+ public void Dispose()
+ {
+ inHandle1.Free();
+ inHandle2.Free();
+ outHandle.Free();
+ }
+ }
+
+ }
+}
\ No newline at end of file
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+ <PropertyGroup>
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <SchemaVersion>2.0</SchemaVersion>
+ <ProjectGuid>{95DFC527-4DC1-495E-97D7-E94EE1F7140D}</ProjectGuid>
+ <OutputType>Exe</OutputType>
+ <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+ <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+ <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+ </PropertyGroup>
+ <!-- Default configurations to help VS understand the configurations -->
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
+ <ItemGroup>
+ <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+ <Visible>False</Visible>
+ </CodeAnalysisDependentAssemblyPaths>
+ </ItemGroup>
+ <PropertyGroup>
+ <DebugType>None</DebugType>
+ <Optimize></Optimize>
+ </PropertyGroup>
+ <ItemGroup>
+ <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+ </ItemGroup>
+ <ItemGroup>
+ <Compile Include="Multiply.cs" />
+ </ItemGroup>
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+ <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+ <PropertyGroup>
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <SchemaVersion>2.0</SchemaVersion>
+ <ProjectGuid>{95DFC527-4DC1-495E-97D7-E94EE1F7140D}</ProjectGuid>
+ <OutputType>Exe</OutputType>
+ <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+ <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+ <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+ </PropertyGroup>
+ <!-- Default configurations to help VS understand the configurations -->
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
+ <ItemGroup>
+ <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+ <Visible>False</Visible>
+ </CodeAnalysisDependentAssemblyPaths>
+ </ItemGroup>
+ <PropertyGroup>
+ <DebugType>None</DebugType>
+ <Optimize>true</Optimize>
+ </PropertyGroup>
+ <ItemGroup>
+ <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+ </ItemGroup>
+ <ItemGroup>
+ <Compile Include="Multiply.cs" />
+ </ItemGroup>
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+ <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
+
+namespace IntelHardwareIntrinsicTest
+{
+ // Exercises Avx2.Multiply on Vector256<int> and Vector256<uint>. Each 64-bit output
+ // element i is compared against the product of the input elements at index i * 2.
+ class Program
+ {
+ // Values returned from Main to report the outcome.
+ const int Pass = 100;
+ const int Fail = 0;
+
+ // Returns Pass when Avx2 is not supported or all checks succeed, Fail otherwise.
+ static unsafe int Main(string[] args)
+ {
+ int testResult = Pass;
+
+ if (Avx2.IsSupported)
+ {
+ using (TestTable<int, int, long> intTable = new TestTable<int, int, long>(new int[8] { 1, -5, 100, 0, 1, -5, 100, 0 }, new int[8] { 22, -1, -50, 0, 22, -1, -50, 0 }, new long[4]))
+ using (TestTable<uint, uint, ulong> uintTable = new TestTable<uint, uint, ulong>(new uint[8] { 1, 5, 100, 0, 1, 5, 100, 0 }, new uint[8] { 22, 1, 50, 0, 22, 1, 50, 0 }, new ulong[4]))
+ {
+
+ // Read both inputs as vectors, multiply, and write the product to the output array.
+ var vi1 = Unsafe.Read<Vector256<int>>(intTable.inArray1Ptr);
+ var vi2 = Unsafe.Read<Vector256<int>>(intTable.inArray2Ptr);
+ var vi3 = Avx2.Multiply(vi1, vi2);
+ Unsafe.Write(intTable.outArrayPtr, vi3);
+
+ var vui1 = Unsafe.Read<Vector256<uint>>(uintTable.inArray1Ptr);
+ var vui2 = Unsafe.Read<Vector256<uint>>(uintTable.inArray2Ptr);
+ var vui3 = Avx2.Multiply(vui1, vui2);
+ Unsafe.Write(uintTable.outArrayPtr, vui3);
+
+ for (int i = 0; i < intTable.outArray.Length; i++)
+ {
+ // Widen before multiplying so the expected value is a full 64-bit product
+ // rather than a 32-bit product that may have wrapped.
+ if ((long)intTable.inArray1[i * 2] * intTable.inArray2[i * 2] != intTable.outArray[i])
+ {
+ Console.WriteLine("AVX2 Multiply failed on int:");
+ foreach (var item in intTable.outArray)
+ {
+ Console.Write(item + ", ");
+ }
+ Console.WriteLine();
+ return Fail;
+ }
+ }
+
+ for (int i = 0; i < uintTable.outArray.Length; i++)
+ {
+ // Same widening for the unsigned case.
+ if ((ulong)uintTable.inArray1[i * 2] * uintTable.inArray2[i * 2] != uintTable.outArray[i])
+ {
+ Console.WriteLine("AVX2 Multiply failed on uint:");
+ foreach (var item in uintTable.outArray)
+ {
+ Console.Write(item + ", ");
+ }
+ Console.WriteLine();
+ return Fail;
+ }
+ }
+ }
+ }
+
+ return testResult;
+ }
+
+ // Holds two input arrays and one output array, pinned through GCHandle so that raw
+ // pointers to their contents can be handed out. Dispose frees the three handles.
+ public unsafe struct TestTable<T1, T2, T3> : IDisposable where T1 : struct where T2 : struct where T3 : struct
+ {
+ public T1[] inArray1;
+ public T2[] inArray2;
+ public T3[] outArray;
+
+ // Addresses of the pinned arrays.
+ public void* inArray1Ptr => inHandle1.AddrOfPinnedObject().ToPointer();
+ public void* inArray2Ptr => inHandle2.AddrOfPinnedObject().ToPointer();
+ public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer();
+
+ GCHandle inHandle1;
+ GCHandle inHandle2;
+ GCHandle outHandle;
+ // Stores the arrays and pins each of them.
+ public TestTable(T1[] a, T2[] b, T3[] c)
+ {
+ this.inArray1 = a;
+ this.inArray2 = b;
+ this.outArray = c;
+
+ inHandle1 = GCHandle.Alloc(inArray1, GCHandleType.Pinned);
+ inHandle2 = GCHandle.Alloc(inArray2, GCHandleType.Pinned);
+ outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned);
+ }
+ // Applies check to the elements at each index of the three arrays and returns false
+ // at the first index where it fails. Iterates over inArray1.Length entries, so it is
+ // only usable when outArray is at least that long; Main in this file does not call it
+ // because its output arrays have half as many elements as the inputs.
+ public bool CheckResult(Func<T1, T2, T3, bool> check)
+ {
+ for (int i = 0; i < inArray1.Length; i++)
+ {
+ if (!check(inArray1[i], inArray2[i], outArray[i]))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Releases the three pinned handles.
+ public void Dispose()
+ {
+ inHandle1.Free();
+ inHandle2.Free();
+ outHandle.Free();
+ }
+ }
+
+ }
+}
\ No newline at end of file
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+ <PropertyGroup>
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <SchemaVersion>2.0</SchemaVersion>
+ <ProjectGuid>{95DFC527-4DC1-495E-97D7-E94EE1F7140D}</ProjectGuid>
+ <OutputType>Exe</OutputType>
+ <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+ <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+ <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+ </PropertyGroup>
+ <!-- Default configurations to help VS understand the configurations -->
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
+ <ItemGroup>
+ <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+ <Visible>False</Visible>
+ </CodeAnalysisDependentAssemblyPaths>
+ </ItemGroup>
+ <PropertyGroup>
+ <DebugType>None</DebugType>
+ <Optimize></Optimize>
+ </PropertyGroup>
+ <ItemGroup>
+ <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+ </ItemGroup>
+ <ItemGroup>
+ <Compile Include="Multiply.cs" />
+ </ItemGroup>
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+ <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>
+<!-- MSBuild project that compiles Multiply.cs into an executable with Optimize set to true. -->
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <!-- Shared properties: dir.props is located by searching upward from this project's directory. -->
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+ <PropertyGroup>
+ <!-- Configuration and Platform fall back to Debug / AnyCPU only when they are not already set. -->
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <SchemaVersion>2.0</SchemaVersion>
+ <ProjectGuid>{95DFC527-4DC1-495E-97D7-E94EE1F7140D}</ProjectGuid>
+ <OutputType>Exe</OutputType>
+ <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+ <!-- SolutionDir defaults to two directories up when it is empty or reported as *Undefined*. -->
+ <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+ <!-- Allows the compiled sources to contain unsafe code. -->
+ <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+ </PropertyGroup>
+ <!-- Default configurations to help VS understand the configurations -->
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
+ <ItemGroup>
+ <!-- Only added when the VS100COMNTOOLS environment variable is set; hidden from the project tree. -->
+ <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+ <Visible>False</Visible>
+ </CodeAnalysisDependentAssemblyPaths>
+ </ItemGroup>
+ <PropertyGroup>
+ <!-- This variant requests no debug information and turns compiler optimizations on. -->
+ <DebugType>None</DebugType>
+ <Optimize>true</Optimize>
+ </PropertyGroup>
+ <ItemGroup>
+ <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+ </ItemGroup>
+ <ItemGroup>
+ <!-- The single source file built by this project. -->
+ <Compile Include="Multiply.cs" />
+ </ItemGroup>
+ <!-- Shared targets: dir.targets is located the same way as dir.props above. -->
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+ <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
+
+namespace IntelHardwareIntrinsicTest
+{
+    /// <summary>
+    /// Functional test for <c>Sse41.Multiply</c> on vectors of 32-bit signed integers.
+    /// </summary>
+    class Program
+    {
+        // Process exit codes returned from Main.
+        const int Pass = 100;
+        const int Fail = 0;
+
+        /// <summary>
+        /// Runs the Multiply check when SSE4.1 is supported; when it is not, no check is
+        /// performed and the test reports <c>Pass</c>.
+        /// </summary>
+        /// <param name="args">Command-line arguments (unused).</param>
+        /// <returns><c>Pass</c> on success or when skipped, <c>Fail</c> on the first mismatch.</returns>
+        static unsafe int Main(string[] args)
+        {
+            int testResult = Pass;
+
+            if (Sse41.IsSupported)
+            {
+                using (TestTable<int, int, long> intTable = new TestTable<int, int, long>(new int[4] { 1, -5, 100, 0}, new int[4] { 22, -1, -50, 0}, new long[2]))
+                {
+                    var vi1 = Unsafe.Read<Vector128<int>>(intTable.inArray1Ptr);
+                    var vi2 = Unsafe.Read<Vector128<int>>(intTable.inArray2Ptr);
+                    var vi3 = Sse41.Multiply(vi1, vi2);
+                    Unsafe.Write(intTable.outArrayPtr, vi3);
+
+                    for (int i = 0; i < intTable.outArray.Length; i++)
+                    {
+                        // Output element i is checked against the inputs at index i * 2.
+                        // The expected product is computed in 64-bit arithmetic: the result
+                        // elements are longs, and an int * int product would wrap on overflow
+                        // before being compared.
+                        long expected = (long)intTable.inArray1[i * 2] * intTable.inArray2[i * 2];
+                        if (expected != intTable.outArray[i])
+                        {
+                            Console.WriteLine("SSE4.1 Multiply failed on int:");
+                            foreach (var item in intTable.outArray)
+                            {
+                                Console.Write(item + ", ");
+                            }
+                            Console.WriteLine();
+                            return Fail;
+                        }
+                    }
+                }
+            }
+            return testResult;
+        }
+
+        /// <summary>
+        /// Holds two input arrays and one output array, pins each of them with a
+        /// <c>GCHandle</c>, and exposes raw pointers to the pinned data. The handles are
+        /// released by <see cref="Dispose"/>.
+        /// </summary>
+        public unsafe struct TestTable<T1, T2, T3> : IDisposable where T1 : struct where T2 : struct where T3 : struct
+        {
+            public T1[] inArray1;
+            public T2[] inArray2;
+            public T3[] outArray;
+
+            // Addresses of the pinned arrays; only meaningful while the handles are allocated.
+            public void* inArray1Ptr => inHandle1.AddrOfPinnedObject().ToPointer();
+            public void* inArray2Ptr => inHandle2.AddrOfPinnedObject().ToPointer();
+            public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer();
+
+            GCHandle inHandle1;
+            GCHandle inHandle2;
+            GCHandle outHandle;
+
+            /// <summary>Stores the three arrays and pins each one.</summary>
+            /// <param name="a">First input array.</param>
+            /// <param name="b">Second input array.</param>
+            /// <param name="c">Output array.</param>
+            public TestTable(T1[] a, T2[] b, T3[] c)
+            {
+                this.inArray1 = a;
+                this.inArray2 = b;
+                this.outArray = c;
+
+                inHandle1 = GCHandle.Alloc(inArray1, GCHandleType.Pinned);
+                inHandle2 = GCHandle.Alloc(inArray2, GCHandleType.Pinned);
+                outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned);
+            }
+
+            /// <summary>
+            /// Applies <paramref name="check"/> to the elements at each index of the three
+            /// arrays and stops at the first failure. All three arrays are indexed up to
+            /// <c>inArray1.Length</c>, so <c>inArray2</c> and <c>outArray</c> must be at least
+            /// that long.
+            /// </summary>
+            /// <param name="check">Predicate receiving (inArray1[i], inArray2[i], outArray[i]).</param>
+            /// <returns><c>true</c> if the predicate holds for every index; otherwise <c>false</c>.</returns>
+            public bool CheckResult(Func<T1, T2, T3, bool> check)
+            {
+                for (int i = 0; i < inArray1.Length; i++)
+                {
+                    if (!check(inArray1[i], inArray2[i], outArray[i]))
+                    {
+                        return false;
+                    }
+                }
+                return true;
+            }
+
+            /// <summary>
+            /// Releases the pinning handles. Each handle is freed only if it is currently
+            /// allocated, so disposing a default-initialized or already-disposed table does
+            /// not throw.
+            /// </summary>
+            public void Dispose()
+            {
+                if (inHandle1.IsAllocated)
+                {
+                    inHandle1.Free();
+                }
+                if (inHandle2.IsAllocated)
+                {
+                    inHandle2.Free();
+                }
+                if (outHandle.IsAllocated)
+                {
+                    outHandle.Free();
+                }
+            }
+        }
+
+    }
+}
\ No newline at end of file
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>
+<!-- MSBuild project that compiles Multiply.cs into an executable; Optimize is left empty in this variant. -->
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <!-- Shared properties: dir.props is located by searching upward from this project's directory. -->
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+ <PropertyGroup>
+ <!-- Configuration and Platform fall back to Debug / AnyCPU only when they are not already set. -->
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <SchemaVersion>2.0</SchemaVersion>
+ <ProjectGuid>{95DFC527-4DC1-495E-97D7-E94EE1F7140D}</ProjectGuid>
+ <OutputType>Exe</OutputType>
+ <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+ <!-- SolutionDir defaults to two directories up when it is empty or reported as *Undefined*. -->
+ <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+ <!-- Allows the compiled sources to contain unsafe code. -->
+ <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+ </PropertyGroup>
+ <!-- Default configurations to help VS understand the configurations -->
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
+ <ItemGroup>
+ <!-- Only added when the VS100COMNTOOLS environment variable is set; hidden from the project tree. -->
+ <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+ <Visible>False</Visible>
+ </CodeAnalysisDependentAssemblyPaths>
+ </ItemGroup>
+ <PropertyGroup>
+ <!-- No debug information is requested. Optimize is set to an empty value here; -->
+ <!-- NOTE(review): presumably this is the non-optimized counterpart of a sibling project that sets Optimize to true - confirm. -->
+ <DebugType>None</DebugType>
+ <Optimize></Optimize>
+ </PropertyGroup>
+ <ItemGroup>
+ <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+ </ItemGroup>
+ <ItemGroup>
+ <!-- The single source file built by this project. -->
+ <Compile Include="Multiply.cs" />
+ </ItemGroup>
+ <!-- Shared targets: dir.targets is located the same way as dir.props above. -->
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+ <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>
+<!-- MSBuild project that compiles Multiply.cs into an executable with Optimize set to true. -->
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <!-- Shared properties: dir.props is located by searching upward from this project's directory. -->
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+ <PropertyGroup>
+ <!-- Configuration and Platform fall back to Debug / AnyCPU only when they are not already set. -->
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <SchemaVersion>2.0</SchemaVersion>
+ <ProjectGuid>{95DFC527-4DC1-495E-97D7-E94EE1F7140D}</ProjectGuid>
+ <OutputType>Exe</OutputType>
+ <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+ <!-- SolutionDir defaults to two directories up when it is empty or reported as *Undefined*. -->
+ <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+ <!-- Allows the compiled sources to contain unsafe code. -->
+ <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+ </PropertyGroup>
+ <!-- Default configurations to help VS understand the configurations -->
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
+ <ItemGroup>
+ <!-- Only added when the VS100COMNTOOLS environment variable is set; hidden from the project tree. -->
+ <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+ <Visible>False</Visible>
+ </CodeAnalysisDependentAssemblyPaths>
+ </ItemGroup>
+ <PropertyGroup>
+ <!-- This variant requests no debug information and turns compiler optimizations on. -->
+ <DebugType>None</DebugType>
+ <Optimize>true</Optimize>
+ </PropertyGroup>
+ <ItemGroup>
+ <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+ </ItemGroup>
+ <ItemGroup>
+ <!-- The single source file built by this project. -->
+ <Compile Include="Multiply.cs" />
+ </ItemGroup>
+ <!-- Shared targets: dir.targets is located the same way as dir.props above. -->
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+ <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>
\ No newline at end of file