From 0f6597f60f580f64b29b00bec7f71d2fb0d57d9d Mon Sep 17 00:00:00 2001 From: Fei Peng Date: Fri, 31 Aug 2018 14:15:45 -0700 Subject: [PATCH] Implement AVX2 Gather intrinsic in JIT --- src/jit/emitfmtsxarch.h | 1 + src/jit/emitxarch.cpp | 97 +++++++++++++++++++++++++- src/jit/emitxarch.h | 9 +++ src/jit/gentree.cpp | 44 +++--------- src/jit/gentree.h | 5 +- src/jit/hwintrinsiccodegenxarch.cpp | 114 ++++++++++++++++++++++++++++++ src/jit/hwintrinsiclistxarch.h | 4 ++ src/jit/hwintrinsicxarch.cpp | 134 ++++++++++++++++++++++++++++++++++-- src/jit/hwintrinsicxarch.h | 2 + src/jit/instrsxarch.h | 8 +++ src/jit/lowerxarch.cpp | 7 ++ src/jit/lsraxarch.cpp | 52 +++++++++++++- src/jit/namedintrinsiclist.h | 2 +- 13 files changed, 429 insertions(+), 50 deletions(-) diff --git a/src/jit/emitfmtsxarch.h b/src/jit/emitfmtsxarch.h index b070b3d..371023b 100644 --- a/src/jit/emitfmtsxarch.h +++ b/src/jit/emitfmtsxarch.h @@ -188,6 +188,7 @@ IF_DEF(RRW_ARD_CNS, IS_AM_RD|IS_R1_RW, AMD_CNS) // r/w reg , read [ IF_DEF(RWR_RRD_ARD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD ) // write reg , read reg2, read [adr] IF_DEF(RWR_ARD_CNS, IS_AM_RD|IS_R1_WR, AMD_CNS) // write reg , read [adr], const +IF_DEF(RWR_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD) // write reg , read [adr], read reg2 IF_DEF(RWR_RRD_ARD_CNS, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD_CNS) // write reg , read reg2, read [adr], const IF_DEF(RWR_RRD_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, AMD_CNS) // write reg , read reg2, read [adr], read reg3 diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index 5be69db..3dff716 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -334,6 +334,10 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr) case INS_vfnmsub213sd: case INS_vfnmsub231sd: case INS_vpmaskmovq: + case INS_vpgatherdq: + case INS_vpgatherqq: + case INS_vgatherdpd: + case INS_vgatherqpd: return true; default: break; @@ -2901,8 +2905,8 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* 
dst, G if (dst->isContained() || (dst->isLclField() && (dst->gtRegNum == REG_NA)) || dst->isUsedFromSpillTemp()) { // dst can only be a modrm - assert(dst->isUsedFromMemory() || (dst->gtRegNum == REG_NA) || - instrIs3opImul(ins)); // dst on 3opImul isn't really the dst + // dst on 3opImul isn't really the dst + assert(dst->isUsedFromMemory() || (dst->gtRegNum == REG_NA) || instrIs3opImul(ins)); assert(!src->isUsedFromMemory()); memOp = dst; @@ -4123,6 +4127,74 @@ void emitter::emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, reg emitCurIGsize += sz; } +//------------------------------------------------------------------------ +// IsAVX2GatherInstruction: return true if the instruction is AVX2 Gather +// +// Arguments: +// ins - the instruction to check +// Return Value: +// true if the instruction is AVX2 Gather +// +bool IsAVX2GatherInstruction(instruction ins) +{ + switch (ins) + { + case INS_vpgatherdd: + case INS_vpgatherdq: + case INS_vpgatherqd: + case INS_vpgatherqq: + case INS_vgatherdps: + case INS_vgatherdpd: + case INS_vgatherqps: + case INS_vgatherqpd: + return true; + default: + return false; + } +} + +//------------------------------------------------------------------------ +// emitIns_R_AR_R: Emits an AVX2 Gather instructions +// +// Arguments: +// ins - the instruction to emit +// attr - the instruction operand size +// reg1 - the destination and first source operand +// reg2 - the mask operand (encoded in VEX.vvvv) +// base - the base register of address to load +// index - the index register of VSIB +// scale - the scale number of VSIB +// offs - the offset added to the memory address from base +// +void emitter::emitIns_R_AR_R(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + regNumber base, + regNumber index, + int scale, + int offs) +{ + assert(IsAVX2GatherInstruction(ins)); + + instrDesc* id = emitNewInstrAmd(attr, offs); + + id->idIns(ins); + id->idReg1(reg1); + id->idReg2(reg2); + + 
id->idInsFmt(IF_RWR_ARD_RRD); + id->idAddr()->iiaAddrMode.amBaseReg = base; + id->idAddr()->iiaAddrMode.amIndxReg = index; + id->idAddr()->iiaAddrMode.amScale = emitEncodeSize((emitAttr)scale); + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + void emitter::emitIns_R_R_C( instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs) { @@ -8341,6 +8413,17 @@ void emitter::emitDispIns( emitDispAddrMode(id); break; + case IF_RWR_ARD_RRD: + if (ins == INS_vpgatherqd || ins == INS_vgatherqps) + { + attr = EA_16BYTE; + } + sstr = codeGen->genSizeStr(EA_ATTR(4)); + printf("%s, %s", emitRegName(id->idReg1(), attr), sstr); + emitDispAddrMode(id); + printf(", %s", emitRegName(id->idReg2(), attr)); + break; + case IF_RWR_RRD_ARD_CNS: { printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), sstr); @@ -9223,6 +9306,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) switch (id->idInsFmt()) { case IF_RWR_RRD_ARD: + case IF_RWR_ARD_RRD: case IF_RWR_RRD_ARD_CNS: case IF_RWR_RRD_ARD_RRD: { @@ -12884,6 +12968,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) break; } + case IF_RWR_ARD_RRD: + { + assert(IsAVX2GatherInstruction(ins)); + code = insCodeRM(ins); + dst = emitOutputAM(dst, id, code); + sz = emitSizeOfInsDsc(id); + break; + } + case IF_RWR_RRD_ARD_CNS: case IF_RWR_RRD_ARD_RRD: { diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h index 3ec962f..965309d3 100644 --- a/src/jit/emitxarch.h +++ b/src/jit/emitxarch.h @@ -335,6 +335,15 @@ void emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg void emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs); +void emitIns_R_AR_R(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + regNumber base, + regNumber index, + int scale, + int 
offs); + void emitIns_R_R_C( instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs); diff --git a/src/jit/gentree.cpp b/src/jit/gentree.cpp index 620087a..94152ea 100644 --- a/src/jit/gentree.cpp +++ b/src/jit/gentree.cpp @@ -17501,13 +17501,18 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad() { // Some AVX instructions here also have MemoryLoad sematics - // Do we have 3 operands? - if (HWIntrinsicInfo::lookupNumArgs(this) != 3) + // Do we have less than 3 operands? + if (HWIntrinsicInfo::lookupNumArgs(this) < 3) { return false; } - else // We have 3 operands/args + else // We have 3 or more operands/args { + if (HWIntrinsicInfo::isAVX2GatherIntrinsic(gtHWIntrinsicId)) + { + return true; + } + GenTreeArgList* argList = gtOp.gtOp1->AsArgList(); if ((gtHWIntrinsicId == NI_AVX_InsertVector128 || gtHWIntrinsicId == NI_AVX2_InsertVector128) && @@ -17558,38 +17563,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryStore() bool GenTreeHWIntrinsic::OperIsMemoryLoadOrStore() { #ifdef _TARGET_XARCH_ - // Some xarch instructions have MemoryLoad sematics - HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(gtHWIntrinsicId); - if ((category == HW_Category_MemoryLoad) || (category == HW_Category_MemoryStore)) - { - return true; - } - else if (category == HW_Category_IMM) - { - // Some AVX instructions here also have MemoryLoad or MemoryStore sematics - - // Do we have 3 operands? - if (HWIntrinsicInfo::lookupNumArgs(this) != 3) - { - return false; - } - else // We have 3 operands/args - { - GenTreeArgList* argList = gtOp.gtOp1->AsArgList(); - - if ((gtHWIntrinsicId == NI_AVX_InsertVector128 || gtHWIntrinsicId == NI_AVX2_InsertVector128) && - (argList->Rest()->Current()->TypeGet() == TYP_I_IMPL)) // Is the type of the second arg TYP_I_IMPL? 
- { - // This is Avx/Avx2.InsertVector128 - return true; - } - else if ((gtHWIntrinsicId == NI_AVX_ExtractVector128 || gtHWIntrinsicId == NI_AVX2_ExtractVector128)) - { - // This is Avx/Avx2.ExtractVector128 - return true; - } - } - } + return OperIsMemoryLoad() || OperIsMemoryStore(); #endif // _TARGET_XARCH_ return false; } diff --git a/src/jit/gentree.h b/src/jit/gentree.h index 006812b..ef86c5d 100644 --- a/src/jit/gentree.h +++ b/src/jit/gentree.h @@ -479,8 +479,8 @@ public: // happening. void CopyCosts(const GenTree* const tree) { - INDEBUG(gtCostsInitialized = - tree->gtCostsInitialized;) // If the 'tree' costs aren't initialized, we'll hit an assert below. + // If the 'tree' costs aren't initialized, we'll hit an assert below. + INDEBUG(gtCostsInitialized = tree->gtCostsInitialized;) _gtCostEx = tree->gtCostEx; _gtCostSz = tree->gtCostSz; } @@ -4115,6 +4115,7 @@ struct GenTreeSIMD : public GenTreeJitIntrinsic struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic { NamedIntrinsic gtHWIntrinsicId; + var_types gtIndexBaseType; // for AVX2 Gather* intrinsics GenTreeHWIntrinsic(var_types type, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned size) : GenTreeJitIntrinsic(GT_HWIntrinsic, type, nullptr, nullptr, baseType, size), gtHWIntrinsicId(hwIntrinsicID) diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp index 0a9dfb3..f3dec92 100644 --- a/src/jit/hwintrinsiccodegenxarch.cpp +++ b/src/jit/hwintrinsiccodegenxarch.cpp @@ -1184,6 +1184,9 @@ void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsi HWIntrinsicSwitchCaseBody emitSwCase) { assert(nonConstImmReg != REG_NA); + // AVX2 Gather intrinsics use managed non-const fallback since they have discrete imm8 value range + // that does not work with the current compiler generated jump-table fallback + assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic)); emitter* emit = getEmitter(); const unsigned maxByte = 
(unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1; @@ -2008,6 +2011,117 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node) break; } + case NI_AVX2_GatherVector128: + case NI_AVX2_GatherVector256: + case NI_AVX2_GatherMaskVector128: + case NI_AVX2_GatherMaskVector256: + { + GenTreeArgList* list = op1->AsArgList(); + op1 = list->Current(); + op1Reg = op1->gtRegNum; + genConsumeRegs(op1); + + list = list->Rest(); + op2 = list->Current(); + op2Reg = op2->gtRegNum; + genConsumeRegs(op2); + + list = list->Rest(); + GenTree* op3 = list->Current(); + genConsumeRegs(op3); + + list = list->Rest(); + GenTree* op4 = nullptr; + GenTree* lastOp = nullptr; + GenTree* indexOp = nullptr; + + regNumber op3Reg = REG_NA; + regNumber op4Reg = REG_NA; + regNumber addrBaseReg = REG_NA; + regNumber addrIndexReg = REG_NA; + regNumber maskReg = node->ExtractTempReg(RBM_ALLFLOAT); + + if (numArgs == 5) + { + assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256); + op4 = list->Current(); + list = list->Rest(); + lastOp = list->Current(); + op3Reg = op3->gtRegNum; + op4Reg = op4->gtRegNum; + genConsumeRegs(op4); + addrBaseReg = op2Reg; + addrIndexReg = op3Reg; + indexOp = op3; + + // copy op4Reg into the tmp mask register, + // the mask register will be cleared by gather instructions + emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg); + + if (targetReg != op1Reg) + { + // copy source vector to the target register for masking merge + emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + } + else + { + assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256); + addrBaseReg = op1Reg; + addrIndexReg = op2Reg; + indexOp = op2; + lastOp = op3; + + // generate all-one mask vector + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg); + } + + bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32); + + // 
hwintrinsiclistxarch.h uses Dword index instructions in default + if (varTypeIsLong(node->gtIndexBaseType)) + { + switch (ins) + { + case INS_vpgatherdd: + ins = INS_vpgatherqd; + if (isVector128GatherWithVector256Index) + { + // YMM index in address mode + attr = emitTypeSize(TYP_SIMD32); + } + break; + case INS_vpgatherdq: + ins = INS_vpgatherqq; + break; + case INS_vgatherdps: + ins = INS_vgatherqps; + if (isVector128GatherWithVector256Index) + { + // YMM index in address mode + attr = emitTypeSize(TYP_SIMD32); + } + break; + case INS_vgatherdpd: + ins = INS_vgatherqpd; + break; + default: + unreached(); + } + } + + assert(lastOp->IsCnsIntOrI()); + ssize_t ival = lastOp->AsIntCon()->IconValue(); + assert((ival >= 0) && (ival <= 255)); + + assert(targetReg != maskReg); + assert(targetReg != addrIndexReg); + assert(maskReg != addrIndexReg); + emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0); + + break; + } + case NI_AVX_GetLowerHalf: { assert(op2 == nullptr); diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index ac59fc8..41ee9ef 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -430,6 +430,10 @@ HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int32, "ConvertToVe HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt32, "ConvertToVector256UInt32", AVX2, -1, 32, 1, {INS_invalid, INS_pmovzxbd, INS_invalid, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int64, "ConvertToVector256Int64", AVX2, -1, 32, 1, {INS_pmovsxbq, INS_invalid, INS_pmovsxwq, INS_invalid, INS_pmovsxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt64, "ConvertToVector256UInt64", AVX2, -1, 32, 1, {INS_invalid, INS_pmovzxbq, INS_invalid, 
INS_pmovzxwq, INS_invalid, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(AVX2_GatherVector128, "GatherVector128", AVX2, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_SpecialCodeGen|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX2_GatherVector256, "GatherVector256", AVX2, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_SpecialCodeGen|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX2_GatherMaskVector128, "GatherMaskVector128", AVX2, -1, 16, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX2_GatherMaskVector256, "GatherMaskVector256", AVX2, -1, 32, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment) HARDWARE_INTRINSIC(AVX2_HorizontalAdd, "HorizontalAdd", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_invalid, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2_HorizontalAddSaturate, "HorizontalAddSaturate", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2_HorizontalSubtract, "HorizontalSubtract", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, 
INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp index 33363b5..8f53c3a 100644 --- a/src/jit/hwintrinsicxarch.cpp +++ b/src/jit/hwintrinsicxarch.cpp @@ -232,7 +232,6 @@ int HWIntrinsicInfo::lookupNumArgs(const GenTreeHWIntrinsic* node) if (op1->OperIsList()) { -#if DEBUG GenTreeArgList* list = op1->AsArgList(); numArgs = 0; @@ -242,10 +241,7 @@ int HWIntrinsicInfo::lookupNumArgs(const GenTreeHWIntrinsic* node) list = list->Rest(); } while (list != nullptr); - assert(numArgs == 3); -#endif - - return 3; + return numArgs; } GenTree* op2 = node->gtGetOp2(); @@ -303,6 +299,17 @@ GenTree* HWIntrinsicInfo::lookupLastOp(const GenTreeHWIntrinsic* node) return node->gtGetOp1()->AsArgList()->Rest()->Rest()->Current(); } + case 5: + { + assert(node->gtGetOp1() != nullptr); + assert(node->gtGetOp1()->OperIsList()); + assert(node->gtGetOp2() == nullptr); + assert(node->gtGetOp1()->AsArgList()->Rest()->Rest()->Rest()->Rest()->Current() != nullptr); + assert(node->gtGetOp1()->AsArgList()->Rest()->Rest()->Rest()->Rest()->Rest() == nullptr); + + return node->gtGetOp1()->AsArgList()->Rest()->Rest()->Rest()->Rest()->Current(); + } + default: { unreached(); @@ -362,6 +369,12 @@ int HWIntrinsicInfo::lookupImmUpperBound(NamedIntrinsic id) return 31; // enum FloatComparisonMode has 32 values } + case NI_AVX2_GatherVector128: + case NI_AVX2_GatherVector256: + case NI_AVX2_GatherMaskVector128: + case NI_AVX2_GatherMaskVector256: + return 8; + default: { assert(HWIntrinsicInfo::HasFullRangeImm(id)); @@ -371,6 +384,53 @@ int HWIntrinsicInfo::lookupImmUpperBound(NamedIntrinsic id) } //------------------------------------------------------------------------ +// isInImmRange: Check if ival is valid for the intrinsic +// +// Arguments: +// id -- The NamedIntrinsic associated with the HWIntrinsic to lookup +// ival -- the imm value to be 
checked +// +// Return Value: +// true if ival is valid for the intrinsic +// +bool HWIntrinsicInfo::isInImmRange(NamedIntrinsic id, int ival) +{ + assert(HWIntrinsicInfo::lookupCategory(id) == HW_Category_IMM); + + if (isAVX2GatherIntrinsic(id)) + { + return ival == 1 || ival == 2 || ival == 4 || ival == 8; + } + else + { + return ival <= lookupImmUpperBound(id) && ival >= 0; + } +} + +//------------------------------------------------------------------------ +// isAVX2GatherIntrinsic: Check if the intrinsic is AVX Gather* +// +// Arguments: +// id -- The NamedIntrinsic associated with the HWIntrinsic to lookup +// +// Return Value: +// true if id is AVX Gather* intrinsic +// +bool HWIntrinsicInfo::isAVX2GatherIntrinsic(NamedIntrinsic id) +{ + switch (id) + { + case NI_AVX2_GatherVector128: + case NI_AVX2_GatherVector256: + case NI_AVX2_GatherMaskVector128: + case NI_AVX2_GatherMaskVector256: + return true; + default: + return false; + } +} + +//------------------------------------------------------------------------ // isFullyImplementedIsa: Gets a value that indicates whether the InstructionSet is fully implemented // // Arguments: @@ -532,7 +592,10 @@ GenTree* Compiler::addRangeCheckIfNeeded(NamedIntrinsic intrinsic, GenTree* last assert(lastOp != nullptr); // Full-range imm-intrinsics do not need the range-check // because the imm-parameter of the intrinsic method is a byte. 
- if (mustExpand && !HWIntrinsicInfo::HasFullRangeImm(intrinsic) && HWIntrinsicInfo::isImmOp(intrinsic, lastOp)) + // AVX2 Gather intrinsics do not need the range-check + // because their imm-parameters have discrete valid values that are handled by managed code + if (mustExpand && !HWIntrinsicInfo::HasFullRangeImm(intrinsic) && HWIntrinsicInfo::isImmOp(intrinsic, lastOp) && + !HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic)) { assert(!lastOp->IsCnsIntOrI()); GenTree* upperBoundNode = @@ -683,7 +746,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, if (!HWIntrinsicInfo::HasFullRangeImm(intrinsic)) { if (!mustExpand && lastOp->IsCnsIntOrI() && - lastOp->AsIntCon()->IconValue() > HWIntrinsicInfo::lookupImmUpperBound(intrinsic)) + !HWIntrinsicInfo::isInImmRange(intrinsic, (int)lastOp->AsIntCon()->IconValue())) { return nullptr; } @@ -808,13 +871,26 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg2, &argClass))); op2 = getArgForHWIntrinsic(argType, argClass); + var_types op2Type; + if (intrinsic == NI_AVX2_GatherVector128 || intrinsic == NI_AVX2_GatherVector256) + { + assert(varTypeIsSIMD(op2->TypeGet())); + op2Type = getBaseTypeOfSIMDType(argClass); + } argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); op1 = getArgForHWIntrinsic(argType, argClass); retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, intrinsic, baseType, simdSize); + + if (intrinsic == NI_AVX2_GatherVector128 || intrinsic == NI_AVX2_GatherVector256) + { + assert(varTypeIsSIMD(op2->TypeGet())); + retNode->AsHWIntrinsic()->gtIndexBaseType = op2Type; + } break; } + default: unreached(); } @@ -1276,6 +1352,50 @@ GenTree* Compiler::impAvxOrAvx2Intrinsic(NamedIntrinsic intrinsic, } break; } + + case NI_AVX2_GatherMaskVector128: + case NI_AVX2_GatherMaskVector256: + { + CORINFO_ARG_LIST_HANDLE argList = sig->args; + CORINFO_CLASS_HANDLE argClass; + var_types argType 
= TYP_UNKNOWN; + unsigned int sizeBytes; + baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes); + var_types retType = getSIMDTypeForSize(sizeBytes); + + assert(sig->numArgs == 5); + CORINFO_ARG_LIST_HANDLE arg2 = info.compCompHnd->getArgNext(argList); + CORINFO_ARG_LIST_HANDLE arg3 = info.compCompHnd->getArgNext(arg2); + CORINFO_ARG_LIST_HANDLE arg4 = info.compCompHnd->getArgNext(arg3); + CORINFO_ARG_LIST_HANDLE arg5 = info.compCompHnd->getArgNext(arg4); + + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg5, &argClass))); + GenTree* op5 = getArgForHWIntrinsic(argType, argClass); + SetOpLclRelatedToSIMDIntrinsic(op5); + + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg4, &argClass))); + GenTree* op4 = getArgForHWIntrinsic(argType, argClass); + SetOpLclRelatedToSIMDIntrinsic(op4); + + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg3, &argClass))); + var_types indexbaseType = getBaseTypeOfSIMDType(argClass); + GenTree* op3 = getArgForHWIntrinsic(argType, argClass); + SetOpLclRelatedToSIMDIntrinsic(op3); + + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg2, &argClass))); + op2 = getArgForHWIntrinsic(argType, argClass); + SetOpLclRelatedToSIMDIntrinsic(op2); + + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); + op1 = getArgForHWIntrinsic(argType, argClass); + SetOpLclRelatedToSIMDIntrinsic(op1); + + GenTree* opList = new (this, GT_LIST) GenTreeArgList(op1, gtNewArgList(op2, op3, op4, op5)); + retNode = new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(retType, opList, intrinsic, baseType, simdSize); + retNode->AsHWIntrinsic()->gtIndexBaseType = indexbaseType; + break; + } + default: JITDUMP("Not implemented hardware intrinsic"); break; diff --git a/src/jit/hwintrinsicxarch.h b/src/jit/hwintrinsicxarch.h index 61a09a0..d74709a 100644 --- a/src/jit/hwintrinsicxarch.h +++ b/src/jit/hwintrinsicxarch.h @@ -149,6 +149,8 @@ struct 
HWIntrinsicInfo static bool isImmOp(NamedIntrinsic id, const GenTree* op); static int lookupImmUpperBound(NamedIntrinsic id); + static bool isInImmRange(NamedIntrinsic id, int ival); + static bool isAVX2GatherIntrinsic(NamedIntrinsic id); static bool isFullyImplementedIsa(InstructionSet isa); static bool isScalarIsa(InstructionSet isa); diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index f64edad..f205ed8 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -503,6 +503,14 @@ INST3(vmaskmovps, "maskmovps", IUM_WR, SSE38(0x2E), BAD_CODE, INST3(vmaskmovpd, "maskmovpd", IUM_WR, SSE38(0x2F), BAD_CODE, SSE38(0x2D), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Double-Precision Floating-Point Loads and Stores INST3(vpmaskmovd, "pmaskmovd", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Dword Loads and Stores INST3(vpmaskmovq, "pmaskmovq", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Qword Loads and Stores +INST3(vpgatherdd, "pgatherdd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Dword +INST3(vpgatherqd, "pgatherqd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Qword +INST3(vpgatherdq, "pgatherdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword with Signed Dword Indices +INST3(vpgatherqq, "pgatherqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Qword with Signed Dword Indices +INST3(vgatherdps, "gatherdps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Dword Indices +INST3(vgatherqps, "gatherqps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), 
INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Qword Indices +INST3(vgatherdpd, "gatherdpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Dword Indices +INST3(vgatherqpd, "gatherqpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Qword Indices INST3(FIRST_FMA_INSTRUCTION, "FIRST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) // id nm um mr mi rm flags diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index c72d0a2..c748da0 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -2580,6 +2580,13 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (!HWIntrinsicInfo::SupportsContainment(intrinsicId)) { + // AVX2 gather are not containable and always have constant IMM argument + if (HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsicId)) + { + GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node); + assert(lastOp != nullptr); + MakeSrcContained(node, lastOp); + } // Exit early if containment isn't supported return; } diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index 8f012c0..c64ea11 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -533,6 +533,7 @@ int LinearScan::BuildNode(GenTree* tree) #ifdef FEATURE_HW_INTRINSICS case GT_HW_INTRINSIC_CHK: #endif // FEATURE_HW_INTRINSICS + // Consumes arrLen & index - has no result srcCount = 2; assert(dstCount == 0); @@ -2311,7 +2312,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) if (op1->OperIsList()) { assert(op2 == nullptr); - assert(numArgs == 3); + assert(numArgs >= 3); GenTreeArgList* argList = op1->AsArgList(); @@ -2321,10 +2322,16 @@ op2 = argList->Current(); argList = argList->Rest(); - op3 = argList->Current(); + op3 = argList->Current(); + + while 
(argList->Rest() != nullptr) + { + argList = argList->Rest(); + } + + lastOp = argList->Current(); argList = argList->Rest(); - lastOp = op3; assert(argList == nullptr); } else if (op2 != nullptr) @@ -2590,6 +2597,45 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) { assert(numArgs == 1); srcCount += BuildDelayFreeUses(op1); + + buildUses = false; + break; + } + + case NI_AVX2_GatherVector128: + case NI_AVX2_GatherVector256: + { + assert(numArgs == 3); + // Any pair of the index, mask, or destination registers should be different + srcCount += BuildOperandUses(op1); + srcCount += BuildDelayFreeUses(op2); + + // get a tmp register for mask that will be cleared by gather instructions + buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); + setInternalRegsDelayFree = true; + + buildUses = false; + break; + } + + case NI_AVX2_GatherMaskVector128: + case NI_AVX2_GatherMaskVector256: + { + assert(numArgs == 5); + // Any pair of the index, mask, or destination registers should be different + srcCount += BuildOperandUses(op1); + srcCount += BuildOperandUses(op2); + srcCount += BuildDelayFreeUses(op3); + + assert(intrinsicTree->gtGetOp1()->OperIsList()); + GenTreeArgList* argList = intrinsicTree->gtGetOp1()->AsArgList(); + GenTree* op4 = argList->Rest()->Rest()->Rest()->Current(); + srcCount += BuildDelayFreeUses(op4); + + // get a tmp register for mask that will be cleared by gather instructions + buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); + setInternalRegsDelayFree = true; + buildUses = false; break; } diff --git a/src/jit/namedintrinsiclist.h b/src/jit/namedintrinsiclist.h index 314579c..187ef3f 100644 --- a/src/jit/namedintrinsiclist.h +++ b/src/jit/namedintrinsiclist.h @@ -7,7 +7,7 @@ // Named jit intrinsics -enum NamedIntrinsic : unsigned int +enum NamedIntrinsic : unsigned short { NI_Illegal = 0, NI_System_Enum_HasFlag = 1, -- 2.7.4