Implement AVX2 Gather intrinsic in JIT
author Fei Peng <fei.peng@intel.com>
Fri, 31 Aug 2018 21:15:45 +0000 (14:15 -0700)
committer Tanner Gooding <tagoo@outlook.com>
Wed, 5 Sep 2018 19:56:39 +0000 (12:56 -0700)
13 files changed:
src/jit/emitfmtsxarch.h
src/jit/emitxarch.cpp
src/jit/emitxarch.h
src/jit/gentree.cpp
src/jit/gentree.h
src/jit/hwintrinsiccodegenxarch.cpp
src/jit/hwintrinsiclistxarch.h
src/jit/hwintrinsicxarch.cpp
src/jit/hwintrinsicxarch.h
src/jit/instrsxarch.h
src/jit/lowerxarch.cpp
src/jit/lsraxarch.cpp
src/jit/namedintrinsiclist.h

index b070b3d..371023b 100644 (file)
@@ -188,6 +188,7 @@ IF_DEF(RRW_ARD_CNS, IS_AM_RD|IS_R1_RW,          AMD_CNS)  // r/w    reg , read [
 
 IF_DEF(RWR_RRD_ARD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD )     // write  reg , read  reg2, read [adr]
 IF_DEF(RWR_ARD_CNS, IS_AM_RD|IS_R1_WR,          AMD_CNS)  // write  reg , read [adr], const
+IF_DEF(RWR_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD)      // write  reg , read [adr], read reg2
 IF_DEF(RWR_RRD_ARD_CNS, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD_CNS) // write  reg , read  reg2, read [adr], const
 IF_DEF(RWR_RRD_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, AMD_CNS) // write  reg , read  reg2, read [adr], read reg3
 
index 5be69db..3dff716 100644 (file)
@@ -334,6 +334,10 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr)
         case INS_vfnmsub213sd:
         case INS_vfnmsub231sd:
         case INS_vpmaskmovq:
+        case INS_vpgatherdq:
+        case INS_vpgatherqq:
+        case INS_vgatherdpd:
+        case INS_vgatherqpd:
             return true;
         default:
             break;
@@ -2901,8 +2905,8 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G
     if (dst->isContained() || (dst->isLclField() && (dst->gtRegNum == REG_NA)) || dst->isUsedFromSpillTemp())
     {
         // dst can only be a modrm
-        assert(dst->isUsedFromMemory() || (dst->gtRegNum == REG_NA) ||
-               instrIs3opImul(ins)); // dst on 3opImul isn't really the dst
+        // dst on 3opImul isn't really the dst
+        assert(dst->isUsedFromMemory() || (dst->gtRegNum == REG_NA) || instrIs3opImul(ins));
         assert(!src->isUsedFromMemory());
 
         memOp = dst;
@@ -4123,6 +4127,74 @@ void emitter::emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, reg
     emitCurIGsize += sz;
 }
 
+//------------------------------------------------------------------------
+// IsAVX2GatherInstruction: Checks whether the instruction is one of the AVX2 gather instructions
+//
+// Arguments:
+//    ins - the instruction to check
+// Return Value:
+//    true if ins is one of the eight vpgather*/vgather* instructions listed below, false otherwise
+//
+bool IsAVX2GatherInstruction(instruction ins)
+{
+    switch (ins)
+    {
+        case INS_vpgatherdd:
+        case INS_vpgatherdq:
+        case INS_vpgatherqd:
+        case INS_vpgatherqq:
+        case INS_vgatherdps:
+        case INS_vgatherdpd:
+        case INS_vgatherqps:
+        case INS_vgatherqpd:
+            return true;
+        default:
+            return false;
+    }
+}
+
+//------------------------------------------------------------------------
+// emitIns_R_AR_R: Emits an AVX2 Gather instruction using the IF_RWR_ARD_RRD format
+//
+// Arguments:
+//    ins - the instruction to emit (must satisfy IsAVX2GatherInstruction)
+//    attr - the instruction operand size
+//    reg1 - the destination and first source operand
+//    reg2 - the mask operand (encoded in VEX.vvvv)
+//    base - the base register of the address to load from
+//    index - the index register of the VSIB addressing mode
+//    scale - the scale factor of the VSIB addressing mode (stored via emitEncodeSize)
+//    offs - the offset added to the memory address from base
+//
+void emitter::emitIns_R_AR_R(instruction ins,
+                             emitAttr    attr,
+                             regNumber   reg1,
+                             regNumber   reg2,
+                             regNumber   base,
+                             regNumber   index,
+                             int         scale,
+                             int         offs)
+{
+    assert(IsAVX2GatherInstruction(ins));
+
+    instrDesc* id = emitNewInstrAmd(attr, offs);
+
+    id->idIns(ins);
+    id->idReg1(reg1);
+    id->idReg2(reg2);
+
+    id->idInsFmt(IF_RWR_ARD_RRD);
+    id->idAddr()->iiaAddrMode.amBaseReg = base;
+    id->idAddr()->iiaAddrMode.amIndxReg = index;
+    id->idAddr()->iiaAddrMode.amScale   = emitEncodeSize((emitAttr)scale);
+
+    UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins));
+    id->idCodeSize(sz);
+
+    dispIns(id);
+    emitCurIGsize += sz;
+}
+
 void emitter::emitIns_R_R_C(
     instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs)
 {
@@ -8341,6 +8413,17 @@ void emitter::emitDispIns(
             emitDispAddrMode(id);
             break;
 
+        case IF_RWR_ARD_RRD:
+            if (ins == INS_vpgatherqd || ins == INS_vgatherqps)
+            {
+                attr = EA_16BYTE;
+            }
+            sstr = codeGen->genSizeStr(EA_ATTR(4));
+            printf("%s, %s", emitRegName(id->idReg1(), attr), sstr);
+            emitDispAddrMode(id);
+            printf(", %s", emitRegName(id->idReg2(), attr));
+            break;
+
         case IF_RWR_RRD_ARD_CNS:
         {
             printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), sstr);
@@ -9223,6 +9306,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
             switch (id->idInsFmt())
             {
                 case IF_RWR_RRD_ARD:
+                case IF_RWR_ARD_RRD:
                 case IF_RWR_RRD_ARD_CNS:
                 case IF_RWR_RRD_ARD_RRD:
                 {
@@ -12884,6 +12968,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             break;
         }
 
+        case IF_RWR_ARD_RRD:
+        {
+            assert(IsAVX2GatherInstruction(ins));
+            code = insCodeRM(ins);
+            dst  = emitOutputAM(dst, id, code);
+            sz   = emitSizeOfInsDsc(id);
+            break;
+        }
+
         case IF_RWR_RRD_ARD_CNS:
         case IF_RWR_RRD_ARD_RRD:
         {
index 3ec962f..965309d 100644 (file)
@@ -335,6 +335,15 @@ void emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg
 
 void emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs);
 
+void emitIns_R_AR_R(instruction ins,
+                    emitAttr    attr,
+                    regNumber   reg1,
+                    regNumber   reg2,
+                    regNumber   base,
+                    regNumber   index,
+                    int         scale,
+                    int         offs);
+
 void emitIns_R_R_C(
     instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs);
 
index 620087a..94152ea 100644 (file)
@@ -17501,13 +17501,18 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad()
     {
         // Some AVX instructions here also have MemoryLoad sematics
 
-        // Do we have 3 operands?
-        if (HWIntrinsicInfo::lookupNumArgs(this) != 3)
+        // Do we have less than 3 operands?
+        if (HWIntrinsicInfo::lookupNumArgs(this) < 3)
         {
             return false;
         }
-        else // We have 3 operands/args
+        else // We have 3 or more operands/args
         {
+            if (HWIntrinsicInfo::isAVX2GatherIntrinsic(gtHWIntrinsicId))
+            {
+                return true;
+            }
+
             GenTreeArgList* argList = gtOp.gtOp1->AsArgList();
 
             if ((gtHWIntrinsicId == NI_AVX_InsertVector128 || gtHWIntrinsicId == NI_AVX2_InsertVector128) &&
@@ -17558,38 +17563,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryStore()
 bool GenTreeHWIntrinsic::OperIsMemoryLoadOrStore()
 {
 #ifdef _TARGET_XARCH_
-    // Some xarch instructions have MemoryLoad sematics
-    HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(gtHWIntrinsicId);
-    if ((category == HW_Category_MemoryLoad) || (category == HW_Category_MemoryStore))
-    {
-        return true;
-    }
-    else if (category == HW_Category_IMM)
-    {
-        // Some AVX instructions here also have MemoryLoad or MemoryStore sematics
-
-        // Do we have 3 operands?
-        if (HWIntrinsicInfo::lookupNumArgs(this) != 3)
-        {
-            return false;
-        }
-        else // We have 3 operands/args
-        {
-            GenTreeArgList* argList = gtOp.gtOp1->AsArgList();
-
-            if ((gtHWIntrinsicId == NI_AVX_InsertVector128 || gtHWIntrinsicId == NI_AVX2_InsertVector128) &&
-                (argList->Rest()->Current()->TypeGet() == TYP_I_IMPL)) // Is the type of the second arg TYP_I_IMPL?
-            {
-                // This is Avx/Avx2.InsertVector128
-                return true;
-            }
-            else if ((gtHWIntrinsicId == NI_AVX_ExtractVector128 || gtHWIntrinsicId == NI_AVX2_ExtractVector128))
-            {
-                // This is Avx/Avx2.ExtractVector128
-                return true;
-            }
-        }
-    }
+    return OperIsMemoryLoad() || OperIsMemoryStore();
 #endif // _TARGET_XARCH_
     return false;
 }
index 006812b..ef86c5d 100644 (file)
@@ -479,8 +479,8 @@ public:
     // happening.
     void CopyCosts(const GenTree* const tree)
     {
-        INDEBUG(gtCostsInitialized =
-                    tree->gtCostsInitialized;) // If the 'tree' costs aren't initialized, we'll hit an assert below.
+        // If the 'tree' costs aren't initialized, we'll hit an assert below.
+        INDEBUG(gtCostsInitialized = tree->gtCostsInitialized;)
         _gtCostEx = tree->gtCostEx;
         _gtCostSz = tree->gtCostSz;
     }
@@ -4115,6 +4115,7 @@ struct GenTreeSIMD : public GenTreeJitIntrinsic
 struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic
 {
     NamedIntrinsic gtHWIntrinsicId;
+    var_types      gtIndexBaseType; // for AVX2 Gather* intrinsics
 
     GenTreeHWIntrinsic(var_types type, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned size)
         : GenTreeJitIntrinsic(GT_HWIntrinsic, type, nullptr, nullptr, baseType, size), gtHWIntrinsicId(hwIntrinsicID)
index 0a9dfb3..f3dec92 100644 (file)
@@ -1184,6 +1184,9 @@ void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsi
                                               HWIntrinsicSwitchCaseBody emitSwCase)
 {
     assert(nonConstImmReg != REG_NA);
+    // AVX2 Gather intrinsics use managed non-const fallback since they have a discrete imm8 value range
+    // that does not work with the current compiler generated jump-table fallback
+    assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
     emitter* emit = getEmitter();
 
     const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
@@ -2008,6 +2011,117 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
             break;
         }
 
+        case NI_AVX2_GatherVector128:
+        case NI_AVX2_GatherVector256:
+        case NI_AVX2_GatherMaskVector128:
+        case NI_AVX2_GatherMaskVector256:
+        {
+            GenTreeArgList* list = op1->AsArgList();
+            op1                  = list->Current();
+            op1Reg               = op1->gtRegNum;
+            genConsumeRegs(op1);
+
+            list   = list->Rest();
+            op2    = list->Current();
+            op2Reg = op2->gtRegNum;
+            genConsumeRegs(op2);
+
+            list         = list->Rest();
+            GenTree* op3 = list->Current();
+            genConsumeRegs(op3);
+
+            list             = list->Rest();
+            GenTree* op4     = nullptr;
+            GenTree* lastOp  = nullptr;
+            GenTree* indexOp = nullptr;
+
+            regNumber op3Reg       = REG_NA;
+            regNumber op4Reg       = REG_NA;
+            regNumber addrBaseReg  = REG_NA;
+            regNumber addrIndexReg = REG_NA;
+            regNumber maskReg      = node->ExtractTempReg(RBM_ALLFLOAT);
+
+            if (numArgs == 5)
+            {
+                assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
+                op4    = list->Current();
+                list   = list->Rest();
+                lastOp = list->Current();
+                op3Reg = op3->gtRegNum;
+                op4Reg = op4->gtRegNum;
+                genConsumeRegs(op4);
+                addrBaseReg  = op2Reg;
+                addrIndexReg = op3Reg;
+                indexOp      = op3;
+
+                // copy op4Reg into the tmp mask register,
+                // the mask register will be cleared by gather instructions
+                emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);
+
+                if (targetReg != op1Reg)
+                {
+                    // copy source vector to the target register for masking merge
+                    emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
+                }
+            }
+            else
+            {
+                assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
+                addrBaseReg  = op1Reg;
+                addrIndexReg = op2Reg;
+                indexOp      = op2;
+                lastOp       = op3;
+
+                // generate all-one mask vector
+                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
+            }
+
+            bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);
+
+            // hwintrinsiclistxarch.h uses Dword index instructions by default
+            if (varTypeIsLong(node->gtIndexBaseType))
+            {
+                switch (ins)
+                {
+                    case INS_vpgatherdd:
+                        ins = INS_vpgatherqd;
+                        if (isVector128GatherWithVector256Index)
+                        {
+                            // YMM index in address mode
+                            attr = emitTypeSize(TYP_SIMD32);
+                        }
+                        break;
+                    case INS_vpgatherdq:
+                        ins = INS_vpgatherqq;
+                        break;
+                    case INS_vgatherdps:
+                        ins = INS_vgatherqps;
+                        if (isVector128GatherWithVector256Index)
+                        {
+                            // YMM index in address mode
+                            attr = emitTypeSize(TYP_SIMD32);
+                        }
+                        break;
+                    case INS_vgatherdpd:
+                        ins = INS_vgatherqpd;
+                        break;
+                    default:
+                        unreached();
+                }
+            }
+
+            assert(lastOp->IsCnsIntOrI());
+            ssize_t ival = lastOp->AsIntCon()->IconValue();
+            assert((ival >= 0) && (ival <= 255));
+
+            assert(targetReg != maskReg);
+            assert(targetReg != addrIndexReg);
+            assert(maskReg != addrIndexReg);
+            emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);
+
+            break;
+        }
+
         case NI_AVX_GetLowerHalf:
         {
             assert(op2 == nullptr);
index ac59fc8..41ee9ef 100644 (file)
@@ -430,6 +430,10 @@ HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int32,                    "ConvertToVe
 HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt32,                   "ConvertToVector256UInt32",                     AVX2,         -1,              32,           1,     {INS_invalid,           INS_pmovzxbd,       INS_invalid,        INS_pmovzxwd,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int64,                    "ConvertToVector256Int64",                      AVX2,         -1,              32,           1,     {INS_pmovsxbq,          INS_invalid,        INS_pmovsxwq,       INS_invalid,        INS_pmovsxdq,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt64,                   "ConvertToVector256UInt64",                     AVX2,         -1,              32,           1,     {INS_invalid,           INS_pmovzxbq,       INS_invalid,        INS_pmovzxwq,       INS_invalid,        INS_pmovzxdq,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX2_GatherVector128,                            "GatherVector128",                              AVX2,         -1,              16,            3,     {INS_invalid,           INS_invalid,        INS_invalid,       INS_invalid,        INS_vpgatherdd,     INS_vpgatherdd,     INS_vpgatherdq,     INS_vpgatherdq,     INS_vgatherdps,     INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX2_GatherVector256,                            "GatherVector256",                              AVX2,         -1,              32,            3,     {INS_invalid,           INS_invalid,        INS_invalid,       INS_invalid,        INS_vpgatherdd,     INS_vpgatherdd,     INS_vpgatherdq,     INS_vpgatherdq,     INS_vgatherdps,     INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX2_GatherMaskVector128,                        "GatherMaskVector128",                          AVX2,         -1,              16,            5,     {INS_invalid,           INS_invalid,        INS_invalid,       INS_invalid,        INS_vpgatherdd,     INS_vpgatherdd,     INS_vpgatherdq,     INS_vpgatherdq,     INS_vgatherdps,     INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX2_GatherMaskVector256,                        "GatherMaskVector256",                          AVX2,         -1,              32,            5,     {INS_invalid,           INS_invalid,        INS_invalid,       INS_invalid,        INS_vpgatherdd,     INS_vpgatherdd,     INS_vpgatherdq,     INS_vpgatherdq,     INS_vgatherdps,     INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(AVX2_HorizontalAdd,                              "HorizontalAdd",                                AVX2,         -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_phaddw,         INS_invalid,        INS_phaddd,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_HorizontalAddSaturate,                      "HorizontalAddSaturate",                        AVX2,         -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_phaddsw,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_HorizontalSubtract,                         "HorizontalSubtract",                           AVX2,         -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_phsubw,         INS_invalid,        INS_phsubd,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
index 33363b5..8f53c3a 100644 (file)
@@ -232,7 +232,6 @@ int HWIntrinsicInfo::lookupNumArgs(const GenTreeHWIntrinsic* node)
 
     if (op1->OperIsList())
     {
-#if DEBUG
         GenTreeArgList* list = op1->AsArgList();
         numArgs              = 0;
 
@@ -242,10 +241,7 @@ int HWIntrinsicInfo::lookupNumArgs(const GenTreeHWIntrinsic* node)
             list = list->Rest();
         } while (list != nullptr);
 
-        assert(numArgs == 3);
-#endif
-
-        return 3;
+        return numArgs;
     }
 
     GenTree* op2 = node->gtGetOp2();
@@ -303,6 +299,17 @@ GenTree* HWIntrinsicInfo::lookupLastOp(const GenTreeHWIntrinsic* node)
             return node->gtGetOp1()->AsArgList()->Rest()->Rest()->Current();
         }
 
+        case 5:
+        {
+            assert(node->gtGetOp1() != nullptr);
+            assert(node->gtGetOp1()->OperIsList());
+            assert(node->gtGetOp2() == nullptr);
+            assert(node->gtGetOp1()->AsArgList()->Rest()->Rest()->Rest()->Rest()->Current() != nullptr);
+            assert(node->gtGetOp1()->AsArgList()->Rest()->Rest()->Rest()->Rest()->Rest() == nullptr);
+
+            return node->gtGetOp1()->AsArgList()->Rest()->Rest()->Rest()->Rest()->Current();
+        }
+
         default:
         {
             unreached();
@@ -362,6 +369,12 @@ int HWIntrinsicInfo::lookupImmUpperBound(NamedIntrinsic id)
             return 31; // enum FloatComparisonMode has 32 values
         }
 
+        case NI_AVX2_GatherVector128:
+        case NI_AVX2_GatherVector256:
+        case NI_AVX2_GatherMaskVector128:
+        case NI_AVX2_GatherMaskVector256:
+            return 8;
+
         default:
         {
             assert(HWIntrinsicInfo::HasFullRangeImm(id));
@@ -371,6 +384,53 @@ int HWIntrinsicInfo::lookupImmUpperBound(NamedIntrinsic id)
 }
 
 //------------------------------------------------------------------------
+// isInImmRange: Check if ival is a valid immediate value for the intrinsic
+//
+// Arguments:
+//    id   -- The NamedIntrinsic associated with the HWIntrinsic to lookup
+//    ival -- the imm value to be checked
+//
+// Return Value:
+//     true if ival is 1, 2, 4 or 8 for AVX2 Gather* intrinsics, or within [0, lookupImmUpperBound(id)] otherwise
+//
+bool HWIntrinsicInfo::isInImmRange(NamedIntrinsic id, int ival)
+{
+    assert(HWIntrinsicInfo::lookupCategory(id) == HW_Category_IMM);
+
+    if (isAVX2GatherIntrinsic(id))
+    {
+        return ival == 1 || ival == 2 || ival == 4 || ival == 8;
+    }
+    else
+    {
+        return ival <= lookupImmUpperBound(id) && ival >= 0;
+    }
+}
+
+//------------------------------------------------------------------------
+// isAVX2GatherIntrinsic: Check if the intrinsic is AVX2 Gather*
+//
+// Arguments:
+//    id   -- The NamedIntrinsic associated with the HWIntrinsic to lookup
+//
+// Return Value:
+//     true if id is one of the AVX2 GatherVector*/GatherMaskVector* intrinsics
+//
+bool HWIntrinsicInfo::isAVX2GatherIntrinsic(NamedIntrinsic id)
+{
+    switch (id)
+    {
+        case NI_AVX2_GatherVector128:
+        case NI_AVX2_GatherVector256:
+        case NI_AVX2_GatherMaskVector128:
+        case NI_AVX2_GatherMaskVector256:
+            return true;
+        default:
+            return false;
+    }
+}
+
+//------------------------------------------------------------------------
 // isFullyImplementedIsa: Gets a value that indicates whether the InstructionSet is fully implemented
 //
 // Arguments:
@@ -532,7 +592,10 @@ GenTree* Compiler::addRangeCheckIfNeeded(NamedIntrinsic intrinsic, GenTree* last
     assert(lastOp != nullptr);
     // Full-range imm-intrinsics do not need the range-check
     // because the imm-parameter of the intrinsic method is a byte.
-    if (mustExpand && !HWIntrinsicInfo::HasFullRangeImm(intrinsic) && HWIntrinsicInfo::isImmOp(intrinsic, lastOp))
+    // AVX2 Gather intrinsics do not need the range-check
+    // because their imm-parameter has discrete valid values that are handled by managed code
+    if (mustExpand && !HWIntrinsicInfo::HasFullRangeImm(intrinsic) && HWIntrinsicInfo::isImmOp(intrinsic, lastOp) &&
+        !HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic))
     {
         assert(!lastOp->IsCnsIntOrI());
         GenTree* upperBoundNode =
@@ -683,7 +746,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic        intrinsic,
         if (!HWIntrinsicInfo::HasFullRangeImm(intrinsic))
         {
             if (!mustExpand && lastOp->IsCnsIntOrI() &&
-                lastOp->AsIntCon()->IconValue() > HWIntrinsicInfo::lookupImmUpperBound(intrinsic))
+                !HWIntrinsicInfo::isInImmRange(intrinsic, (int)lastOp->AsIntCon()->IconValue()))
             {
                 return nullptr;
             }
@@ -808,13 +871,26 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic        intrinsic,
 
                 argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg2, &argClass)));
                 op2     = getArgForHWIntrinsic(argType, argClass);
+                var_types op2Type;
+                if (intrinsic == NI_AVX2_GatherVector128 || intrinsic == NI_AVX2_GatherVector256)
+                {
+                    assert(varTypeIsSIMD(op2->TypeGet()));
+                    op2Type = getBaseTypeOfSIMDType(argClass);
+                }
 
                 argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
                 op1     = getArgForHWIntrinsic(argType, argClass);
 
                 retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, intrinsic, baseType, simdSize);
+
+                if (intrinsic == NI_AVX2_GatherVector128 || intrinsic == NI_AVX2_GatherVector256)
+                {
+                    assert(varTypeIsSIMD(op2->TypeGet()));
+                    retNode->AsHWIntrinsic()->gtIndexBaseType = op2Type;
+                }
                 break;
             }
+
             default:
                 unreached();
         }
@@ -1276,6 +1352,50 @@ GenTree* Compiler::impAvxOrAvx2Intrinsic(NamedIntrinsic        intrinsic,
             }
             break;
         }
+
+        case NI_AVX2_GatherMaskVector128:
+        case NI_AVX2_GatherMaskVector256:
+        {
+            CORINFO_ARG_LIST_HANDLE argList = sig->args;
+            CORINFO_CLASS_HANDLE    argClass;
+            var_types               argType = TYP_UNKNOWN;
+            unsigned int            sizeBytes;
+            baseType          = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes);
+            var_types retType = getSIMDTypeForSize(sizeBytes);
+
+            assert(sig->numArgs == 5);
+            CORINFO_ARG_LIST_HANDLE arg2 = info.compCompHnd->getArgNext(argList);
+            CORINFO_ARG_LIST_HANDLE arg3 = info.compCompHnd->getArgNext(arg2);
+            CORINFO_ARG_LIST_HANDLE arg4 = info.compCompHnd->getArgNext(arg3);
+            CORINFO_ARG_LIST_HANDLE arg5 = info.compCompHnd->getArgNext(arg4);
+
+            argType      = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg5, &argClass)));
+            GenTree* op5 = getArgForHWIntrinsic(argType, argClass);
+            SetOpLclRelatedToSIMDIntrinsic(op5);
+
+            argType      = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg4, &argClass)));
+            GenTree* op4 = getArgForHWIntrinsic(argType, argClass);
+            SetOpLclRelatedToSIMDIntrinsic(op4);
+
+            argType                 = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg3, &argClass)));
+            var_types indexbaseType = getBaseTypeOfSIMDType(argClass);
+            GenTree*  op3           = getArgForHWIntrinsic(argType, argClass);
+            SetOpLclRelatedToSIMDIntrinsic(op3);
+
+            argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg2, &argClass)));
+            op2     = getArgForHWIntrinsic(argType, argClass);
+            SetOpLclRelatedToSIMDIntrinsic(op2);
+
+            argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
+            op1     = getArgForHWIntrinsic(argType, argClass);
+            SetOpLclRelatedToSIMDIntrinsic(op1);
+
+            GenTree* opList = new (this, GT_LIST) GenTreeArgList(op1, gtNewArgList(op2, op3, op4, op5));
+            retNode = new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(retType, opList, intrinsic, baseType, simdSize);
+            retNode->AsHWIntrinsic()->gtIndexBaseType = indexbaseType;
+            break;
+        }
+
         default:
             JITDUMP("Not implemented hardware intrinsic");
             break;
index 61a09a0..d74709a 100644 (file)
@@ -149,6 +149,8 @@ struct HWIntrinsicInfo
     static bool isImmOp(NamedIntrinsic id, const GenTree* op);
 
     static int lookupImmUpperBound(NamedIntrinsic id);
+    static bool isInImmRange(NamedIntrinsic id, int ival);
+    static bool isAVX2GatherIntrinsic(NamedIntrinsic id);
 
     static bool isFullyImplementedIsa(InstructionSet isa);
     static bool isScalarIsa(InstructionSet isa);
index f64edad..f205ed8 100644 (file)
@@ -503,6 +503,14 @@ INST3(vmaskmovps,       "maskmovps",        IUM_WR, SSE38(0x2E),  BAD_CODE,
 INST3(vmaskmovpd,       "maskmovpd",        IUM_WR, SSE38(0x2F),  BAD_CODE,     SSE38(0x2D),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Conditional SIMD Packed Double-Precision Floating-Point Loads and Stores
 INST3(vpmaskmovd,       "pmaskmovd",        IUM_WR, SSE38(0x8E),  BAD_CODE,     SSE38(0x8C),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Conditional SIMD Integer Packed Dword Loads and Stores
 INST3(vpmaskmovq,       "pmaskmovq",        IUM_WR, SSE38(0x8E),  BAD_CODE,     SSE38(0x8C),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Conditional SIMD Integer Packed Qword Loads and Stores
+INST3(vpgatherdd,       "pgatherdd",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x90),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed Dword Values Using Signed Dword Indices
+INST3(vpgatherqd,       "pgatherqd",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x91),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed Dword Values Using Signed Qword Indices
+INST3(vpgatherdq,       "pgatherdq",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x90),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed Qword Values Using Signed Dword Indices
+INST3(vpgatherqq,       "pgatherqq",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x91),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed Qword Values Using Signed Qword Indices
+INST3(vgatherdps,       "gatherdps",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x92),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed SP FP values Using Signed Dword Indices
+INST3(vgatherqps,       "gatherqps",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x93),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed SP FP values Using Signed Qword Indices
+INST3(vgatherdpd,       "gatherdpd",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x92),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed DP FP Values Using Signed Dword Indices
+INST3(vgatherqpd,       "gatherqpd",        IUM_WR, BAD_CODE,     BAD_CODE,     SSE38(0x93),                             INS_Flags_IsDstDstSrcAVXInstruction)    // Gather Packed DP FP Values Using Signed Qword Indices
 
 INST3(FIRST_FMA_INSTRUCTION, "FIRST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
 //    id                nm                  um      mr            mi            rm                                       flags
index c72d0a2..c748da0 100644 (file)
@@ -2580,6 +2580,13 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 
     if (!HWIntrinsicInfo::SupportsContainment(intrinsicId))
     {
+        // AVX2 gather intrinsics are not containable and always have a constant IMM argument
+        if (HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsicId))
+        {
+            GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node);
+            assert(lastOp != nullptr);
+            MakeSrcContained(node, lastOp);
+        }
         // Exit early if containment isn't supported
         return;
     }
index 8f012c0..c64ea11 100644 (file)
@@ -533,6 +533,7 @@ int LinearScan::BuildNode(GenTree* tree)
 #ifdef FEATURE_HW_INTRINSICS
         case GT_HW_INTRINSIC_CHK:
 #endif // FEATURE_HW_INTRINSICS
+
             // Consumes arrLen & index - has no result
             srcCount = 2;
             assert(dstCount == 0);
@@ -2311,7 +2312,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
         if (op1->OperIsList())
         {
             assert(op2 == nullptr);
-            assert(numArgs == 3);
+            assert(numArgs >= 3);
 
             GenTreeArgList* argList = op1->AsArgList();
 
@@ -2321,10 +2322,16 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
             op2     = argList->Current();
             argList = argList->Rest();
 
-            op3     = argList->Current();
+            op3 = argList->Current();
+
+            while (argList->Rest() != nullptr)
+            {
+                argList = argList->Rest();
+            }
+
+            lastOp  = argList->Current();
             argList = argList->Rest();
 
-            lastOp = op3;
             assert(argList == nullptr);
         }
         else if (op2 != nullptr)
@@ -2590,6 +2597,45 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
             {
                 assert(numArgs == 1);
                 srcCount += BuildDelayFreeUses(op1);
+
+                buildUses = false;
+                break;
+            }
+
+            case NI_AVX2_GatherVector128:
+            case NI_AVX2_GatherVector256:
+            {
+                assert(numArgs == 3);
+                // Any pair of the index, mask, or destination registers should be different
+                srcCount += BuildOperandUses(op1);
+                srcCount += BuildDelayFreeUses(op2);
+
+                // get a tmp register for mask that will be cleared by gather instructions
+                buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs());
+                setInternalRegsDelayFree = true;
+
+                buildUses = false;
+                break;
+            }
+
+            case NI_AVX2_GatherMaskVector128:
+            case NI_AVX2_GatherMaskVector256:
+            {
+                assert(numArgs == 5);
+                // Any pair of the index, mask, or destination registers should be different
+                srcCount += BuildOperandUses(op1);
+                srcCount += BuildOperandUses(op2);
+                srcCount += BuildDelayFreeUses(op3);
+
+                assert(intrinsicTree->gtGetOp1()->OperIsList());
+                GenTreeArgList* argList = intrinsicTree->gtGetOp1()->AsArgList();
+                GenTree*        op4     = argList->Rest()->Rest()->Rest()->Current();
+                srcCount += BuildDelayFreeUses(op4);
+
+                // get a tmp register for mask that will be cleared by gather instructions
+                buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs());
+                setInternalRegsDelayFree = true;
+
                 buildUses = false;
                 break;
             }
index 314579c..187ef3f 100644 (file)
@@ -7,7 +7,7 @@
 
 // Named jit intrinsics
 
-enum NamedIntrinsic : unsigned int
+enum NamedIntrinsic : unsigned short
 {
     NI_Illegal                                                 = 0,
     NI_System_Enum_HasFlag                                     = 1,