void genSSE2Intrinsic(GenTreeHWIntrinsic* node);
void genSSE41Intrinsic(GenTreeHWIntrinsic* node);
void genSSE42Intrinsic(GenTreeHWIntrinsic* node);
-void genAVXIntrinsic(GenTreeHWIntrinsic* node);
-void genAVX2Intrinsic(GenTreeHWIntrinsic* node);
+void genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node);
void genAESIntrinsic(GenTreeHWIntrinsic* node);
void genBMI1Intrinsic(GenTreeHWIntrinsic* node);
void genBMI2Intrinsic(GenTreeHWIntrinsic* node);
CORINFO_METHOD_HANDLE method,
CORINFO_SIG_INFO* sig,
bool mustExpand);
- GenTree* impAVXIntrinsic(NamedIntrinsic intrinsic,
- CORINFO_METHOD_HANDLE method,
- CORINFO_SIG_INFO* sig,
- bool mustExpand);
- GenTree* impAVX2Intrinsic(NamedIntrinsic intrinsic,
- CORINFO_METHOD_HANDLE method,
- CORINFO_SIG_INFO* sig,
- bool mustExpand);
+ GenTree* impAvxOrAvx2Intrinsic(NamedIntrinsic intrinsic,
+ CORINFO_METHOD_HANDLE method,
+ CORINFO_SIG_INFO* sig,
+ bool mustExpand);
GenTree* impAESIntrinsic(NamedIntrinsic intrinsic,
CORINFO_METHOD_HANDLE method,
CORINFO_SIG_INFO* sig,
genSSE42Intrinsic(node);
break;
case InstructionSet_AVX:
- genAVXIntrinsic(node);
- break;
case InstructionSet_AVX2:
- genAVX2Intrinsic(node);
+ genAvxOrAvx2Intrinsic(node);
break;
case InstructionSet_AES:
genAESIntrinsic(node);
}
//------------------------------------------------------------------------
-// genAVXIntrinsic: Generates the code for an AVX hardware intrinsic node
+// genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
//
// Arguments:
// node - The hardware intrinsic node
//
-void CodeGen::genAVXIntrinsic(GenTreeHWIntrinsic* node)
+void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
{
NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
var_types baseType = node->gtSIMDBaseType;
emitAttr attr = EA_ATTR(node->gtSIMDSize);
var_types targetType = node->TypeGet();
instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+ int numArgs = Compiler::numArgsOfHWIntrinsic(node);
GenTree* op1 = node->gtGetOp1();
GenTree* op2 = node->gtGetOp2();
+ regNumber op1Reg = REG_NA;
+ regNumber op2Reg = REG_NA;
regNumber targetReg = node->gtRegNum;
emitter* emit = getEmitter();
- genConsumeOperands(node);
+ if ((op1 != nullptr) && !op1->OperIsList())
+ {
+ genConsumeOperands(node);
+ }
switch (intrinsicID)
{
break;
}
- default:
- unreached();
- break;
- }
+ case NI_AVX_ExtractVector128:
+ case NI_AVX_InsertVector128:
+ case NI_AVX2_ExtractVector128:
+ case NI_AVX2_InsertVector128:
+ {
+ GenTree* lastOp = nullptr;
+ if (numArgs == 2)
+ {
+                assert((intrinsicID == NI_AVX_ExtractVector128) || (intrinsicID == NI_AVX2_ExtractVector128));
+ op1Reg = op1->gtRegNum;
+ op2Reg = op2->gtRegNum;
+ lastOp = op2;
+ }
+ else
+ {
+ assert(numArgs == 3);
+ assert(op1->OperIsList());
+ assert(op1->gtGetOp2()->OperIsList());
+ assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
- genProduceReg(node);
-}
+ GenTreeArgList* argList = op1->AsArgList();
+ op1 = argList->Current();
+ genConsumeRegs(op1);
+ op1Reg = op1->gtRegNum;
-//------------------------------------------------------------------------
-// genAVX2Intrinsic: Generates the code for an AVX2 hardware intrinsic node
-//
-// Arguments:
-// node - The hardware intrinsic node
-//
-void CodeGen::genAVX2Intrinsic(GenTreeHWIntrinsic* node)
-{
- NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
- var_types baseType = node->gtSIMDBaseType;
- instruction ins = INS_invalid;
+ argList = argList->Rest();
+ op2 = argList->Current();
+ genConsumeRegs(op2);
+ op2Reg = op2->gtRegNum;
- genConsumeOperands(node);
+ argList = argList->Rest();
+ lastOp = argList->Current();
+ genConsumeRegs(lastOp);
+ }
+
+ regNumber op3Reg = lastOp->gtRegNum;
+
+ auto emitSwCase = [&](unsigned i) {
+                // TODO-XARCH-Bug: the emitter cannot work with imm8 >= 128,
+ // so clear the 8th bit that is not used by the instructions
+ i &= 0x7FU;
+ if (numArgs == 3)
+ {
+ if (intrinsicID == NI_AVX_ExtractVector128 || intrinsicID == NI_AVX2_ExtractVector128)
+ {
+ emit->emitIns_R_AR_I(ins, attr, op2Reg, op1Reg, 0, (int)i);
+ }
+ else if (op2->TypeGet() == TYP_I_IMPL)
+ {
+ emit->emitIns_SIMD_R_R_AR_I(ins, attr, targetReg, op1Reg, op2Reg, (int)i);
+ }
+ else
+ {
+ assert(op2->TypeGet() == TYP_SIMD16);
+ emit->emitIns_SIMD_R_R_R_I(ins, attr, targetReg, op1Reg, op2Reg, (int)i);
+ }
+ }
+ else
+ {
+ assert(numArgs == 2);
+ assert(intrinsicID == NI_AVX_ExtractVector128 || intrinsicID == NI_AVX2_ExtractVector128);
+ emit->emitIns_SIMD_R_R_I(ins, attr, targetReg, op1Reg, (int)i);
+ }
+ };
+
+ if (lastOp->IsCnsIntOrI())
+ {
+ ssize_t ival = lastOp->AsIntCon()->IconValue();
+ emitSwCase((unsigned)ival);
+ }
+ else
+ {
+ // We emit a fallback case for the scenario when the imm-op is not a constant. This should
+ // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
+ // can also occur if the consumer calls it directly and just doesn't pass a constant value.
+ regNumber baseReg = node->ExtractTempReg();
+ regNumber offsReg = node->GetSingleTempReg();
+ genHWIntrinsicJumpTableFallback(intrinsicID, op3Reg, baseReg, offsReg, emitSwCase);
+ }
+ break;
+ }
- switch (intrinsicID)
- {
default:
unreached();
break;
HARDWARE_INTRINSIC(AVX_DuplicateEvenIndexed, "DuplicateEvenIndexed", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_DuplicateOddIndexed, "DuplicateOddIndexed", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_ExtendToVector256, "ExtendToVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_Helper, HW_Flag_OneTypeGeneric|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_ExtractVector128, "ExtractVector128", AVX, -1, 32, -1, {INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128, INS_vextractf128},HW_Category_IMM, HW_Flag_OneTypeGeneric|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX_Floor, "Floor", AVX, 9, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_GetLowerHalf, "GetLowerHalf", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_Helper, HW_Flag_OneTypeGeneric|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_HorizontalAdd, "HorizontalAdd", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_HorizontalSubtract, "HorizontalSubtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX_InsertVector128, "InsertVector128", AVX, -1, 32, 3, {INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128, INS_vinsertf128},HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_OneTypeGeneric|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX_LoadAlignedVector256, "LoadAlignedVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_LoadDquVector256, "LoadDquVector256", AVX, -1, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_LoadVector256, "LoadVector256", AVX, -1, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector256, "BroadcastScalarToVector256", AVX2, -1, 32, 1, {INS_vpbroadcastb,INS_vpbroadcastb,INS_vpbroadcastw,INS_vpbroadcastw,INS_vpbroadcastd,INS_vpbroadcastd,INS_vpbroadcastq,INS_vpbroadcastq,INS_vbroadcastss,INS_vbroadcastsd}, HW_Category_SimpleSIMD, HW_Flag_OneTypeGeneric)
HARDWARE_INTRINSIC(AVX2_CompareEqual, "CompareEqual", AVX2, -1, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX2_CompareGreaterThan, "CompareGreaterThan", AVX2, -1, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2_ExtractVector128, "ExtractVector128", AVX2, -1, 32, -1, {INS_vextracti128,INS_vextracti128,INS_vextracti128,INS_vextracti128,INS_vextracti128,INS_vextracti128,INS_vextracti128,INS_vextracti128,INS_invalid, INS_invalid},HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX2_HorizontalAdd, "HorizontalAdd", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_invalid, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_HorizontalAddSaturate, "HorizontalAddSaturate", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_HorizontalSubtract, "HorizontalSubtract", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_HorizontalSubtractSaturate, "HorizontalSubtractSaturate", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2_InsertVector128, "InsertVector128", AVX2, -1, 32, 3, {INS_vinserti128,INS_vinserti128,INS_vinserti128,INS_vinserti128,INS_vinserti128,INS_vinserti128,INS_vinserti128,INS_vinserti128,INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX2_LoadAlignedVector256NonTemporal, "LoadAlignedVector256NonTemporal", AVX2, -1, 32, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX2_Multiply, "Multiply", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX2_Or, "Or", AVX2, -1, 32, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
static bool impIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags)
{
// HW_Flag_NoCodeGen implies this intrinsic should be manually morphed in the importer.
- return category != HW_Category_Special && category != HW_Category_Scalar && (flags & HW_Flag_NoCodeGen) == 0;
+ return category != HW_Category_Special && category != HW_Category_Scalar &&
+ ((flags & (HW_Flag_NoCodeGen | HW_Flag_SpecialImport)) == 0);
}
//------------------------------------------------------------------------
assert(baseType != TYP_UNKNOWN);
}
- if ((flags & (HW_Flag_OneTypeGeneric | HW_Flag_TwoTypeGeneric)) != 0)
+ if (((flags & (HW_Flag_OneTypeGeneric | HW_Flag_TwoTypeGeneric)) != 0) && ((flags & HW_Flag_SpecialImport) == 0))
{
if (!varTypeIsArithmetic(baseType))
{
case InstructionSet_SSE42:
return impSSE42Intrinsic(intrinsic, method, sig, mustExpand);
case InstructionSet_AVX:
- return impAVXIntrinsic(intrinsic, method, sig, mustExpand);
case InstructionSet_AVX2:
- return impAVX2Intrinsic(intrinsic, method, sig, mustExpand);
+ return impAvxOrAvx2Intrinsic(intrinsic, method, sig, mustExpand);
case InstructionSet_AES:
return impAESIntrinsic(intrinsic, method, sig, mustExpand);
return retNode;
}
-GenTree* Compiler::impAVXIntrinsic(NamedIntrinsic intrinsic,
- CORINFO_METHOD_HANDLE method,
- CORINFO_SIG_INFO* sig,
- bool mustExpand)
+GenTree* Compiler::impAvxOrAvx2Intrinsic(NamedIntrinsic intrinsic,
+ CORINFO_METHOD_HANDLE method,
+ CORINFO_SIG_INFO* sig,
+ bool mustExpand)
{
GenTree* retNode = nullptr;
GenTree* op1 = nullptr;
GenTree* op2 = nullptr;
var_types baseType = TYP_UNKNOWN;
- switch (intrinsic)
- {
- default:
- JITDUMP("Not implemented hardware intrinsic");
- break;
- }
- return retNode;
-}
+ int simdSize = simdSizeOfHWIntrinsic(intrinsic, sig);
-GenTree* Compiler::impAVX2Intrinsic(NamedIntrinsic intrinsic,
- CORINFO_METHOD_HANDLE method,
- CORINFO_SIG_INFO* sig,
- bool mustExpand)
-{
- GenTree* retNode = nullptr;
- GenTree* op1 = nullptr;
- GenTree* op2 = nullptr;
- var_types baseType = TYP_UNKNOWN;
switch (intrinsic)
{
+ case NI_AVX_ExtractVector128:
+ case NI_AVX2_ExtractVector128:
+ {
+ GenTree* lastOp = impPopStack().val;
+ assert(lastOp->IsCnsIntOrI() || mustExpand);
+ GenTree* vectorOp = impSIMDPopStack(TYP_SIMD32);
+ if (sig->numArgs == 2)
+ {
+ baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
+ if (!varTypeIsArithmetic(baseType))
+ {
+ retNode = impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_TYPE_NOT_SUPPORTED, method, sig, mustExpand);
+ }
+ else
+ {
+ retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, lastOp, intrinsic, baseType, 32);
+ }
+ }
+ else
+ {
+ assert(sig->numArgs == 3);
+ op1 = impPopStack().val;
+ CORINFO_ARG_LIST_HANDLE secondArg = info.compCompHnd->getArgNext(sig->args);
+ CORINFO_CLASS_HANDLE secondArgClass = info.compCompHnd->getArgClass(sig, secondArg);
+ baseType = getBaseTypeOfSIMDType(secondArgClass);
+ retNode = gtNewSimdHWIntrinsicNode(TYP_VOID, op1, vectorOp, lastOp, intrinsic, baseType, 32);
+ }
+ break;
+ }
default:
JITDUMP("Not implemented hardware intrinsic");
break;
INST3( vpbroadcastw, "pbroadcastw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x79)) // Broadcast int16 value from reg/memory to entire ymm register
INST3( vpbroadcastd, "pbroadcastd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x58)) // Broadcast int32 value from reg/memory to entire ymm register
INST3( vpbroadcastq, "pbroadcastq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x59)) // Broadcast int64 value from reg/memory to entire ymm register
-INST3( vextractf128, "extractf128" , 0, IUM_WR, 0, 0, SSE3A(0x19), BAD_CODE, BAD_CODE) // Extract 128-bit packed floating point values
-INST3( vextracti128, "extracti128" , 0, IUM_WR, 0, 0, SSE3A(0x39), BAD_CODE, BAD_CODE) // Extract 128-bit packed integer values
+INST3( vextractf128, "extractf128" , 0, IUM_WR, 0, 0, SSE3A(0x19), BAD_CODE, SSE3A(0x19)) // Extract 128-bit packed floating point values
+INST3( vextracti128, "extracti128" , 0, IUM_WR, 0, 0, SSE3A(0x39), BAD_CODE, SSE3A(0x39)) // Extract 128-bit packed integer values
INST3( vinsertf128, "insertf128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x18)) // Insert 128-bit packed floating point values
INST3( vinserti128, "inserti128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x38)) // Insert 128-bit packed integer values
INST3( vzeroupper, "zeroupper" , 0, IUM_WR, 0, 0, 0xC577F8, BAD_CODE, BAD_CODE) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix)
// Select base type using the second argument type
HW_Flag_BaseTypeFromSecondArg = 0x10000,
- // Specail codegen
+ // Special codegen
// the intrinsics need special rules in CodeGen,
- // but can be table-driven in the front-end
+ // but may be table-driven in the front-end
HW_Flag_SpecialCodeGen = 0x20000,
// No Read/Modify/Write Semantics
// the intrinsic doesn't have read/modify/write semantics in two/three-operand form.
HW_Flag_NoRMWSemantics = 0x40000,
+
+ // Special import
+ // the intrinsics need special rules in importer,
+ // but may be table-driven in the back-end
+ HW_Flag_SpecialImport = 0x80000,
};
inline HWIntrinsicFlag operator|(HWIntrinsicFlag c1, HWIntrinsicFlag c2)