Implement AVX2.BroadcastScalarToVector128/256
authorFei Peng <fei.peng@intel.com>
Wed, 24 Oct 2018 19:27:31 +0000 (12:27 -0700)
committerFei Peng <fei.peng@intel.com>
Thu, 25 Oct 2018 18:34:31 +0000 (11:34 -0700)
src/jit/gentree.cpp
src/jit/hwintrinsiccodegenxarch.cpp
src/jit/hwintrinsiclistxarch.h
src/jit/hwintrinsicxarch.cpp
src/jit/hwintrinsicxarch.h

index 37feba6..2e73857 100644 (file)
@@ -5274,7 +5274,7 @@ bool GenTree::OperMayThrow(Compiler* comp)
         {
             GenTreeHWIntrinsic* hwIntrinsicNode = this->AsHWIntrinsic();
             assert(hwIntrinsicNode != nullptr);
-            if (hwIntrinsicNode->OperIsMemoryStore() || hwIntrinsicNode->OperIsMemoryLoad())
+            if (hwIntrinsicNode->OperIsMemoryLoadOrStore())
             {
                 // This operation contains an implicit indirection
                 //   it could throw a null reference exception.
@@ -17355,29 +17355,47 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad()
     {
         return true;
     }
-    else if (category == HW_Category_IMM)
+    else if (HWIntrinsicInfo::MaybeMemoryLoad(gtHWIntrinsicId))
     {
-        // Some AVX instructions here also have MemoryLoad sematics
-
-        // Do we have less than 3 operands?
-        if (HWIntrinsicInfo::lookupNumArgs(this) < 3)
+        // Some AVX intrinsic (without HW_Category_MemoryLoad) also have MemoryLoad semantics
+        if (category == HW_Category_SIMDScalar)
         {
-            return false;
+            // Avx2.BroadcastScalarToVector128/256 have vector and pointer overloads both, e.g.,
+            // Vector128<byte> BroadcastScalarToVector128(Vector128<byte> value)
+            // Vector128<byte> BroadcastScalarToVector128(byte* source)
+            // So, we need to check the argument's type is memory-reference (TYP_I_IMPL) or not
+            assert(HWIntrinsicInfo::lookupNumArgs(this) == 1);
+            return (gtHWIntrinsicId == NI_AVX2_BroadcastScalarToVector128 ||
+                    gtHWIntrinsicId == NI_AVX2_BroadcastScalarToVector256) &&
+                   gtOp.gtOp1->TypeGet() == TYP_I_IMPL;
         }
-        else // We have 3 or more operands/args
+        else if (category == HW_Category_IMM)
         {
-            if (HWIntrinsicInfo::isAVX2GatherIntrinsic(gtHWIntrinsicId))
+            // Do we have less than 3 operands?
+            if (HWIntrinsicInfo::lookupNumArgs(this) < 3)
             {
-                return true;
+                return false;
             }
+            else // We have 3 or more operands/args
+            {
+                // All the Avx2.Gather* are "load" instructions
+                if (HWIntrinsicInfo::isAVX2GatherIntrinsic(gtHWIntrinsicId))
+                {
+                    return true;
+                }
 
-            GenTreeArgList* argList = gtOp.gtOp1->AsArgList();
+                GenTreeArgList* argList = gtOp.gtOp1->AsArgList();
 
-            if ((gtHWIntrinsicId == NI_AVX_InsertVector128 || gtHWIntrinsicId == NI_AVX2_InsertVector128) &&
-                (argList->Rest()->Current()->TypeGet() == TYP_I_IMPL)) // Is the type of the second arg TYP_I_IMPL?
-            {
-                // This is Avx/Avx2.InsertVector128
-                return true;
+                // Avx/Avx2.InsertVector128 have vector and pointer overloads both, e.g.,
+                // Vector256<sbyte> InsertVector128(Vector256<sbyte> value, Vector128<sbyte> data, byte index)
+                // Vector256<sbyte> InsertVector128(Vector256<sbyte> value, sbyte* address, byte index)
+                // So, we need to check the second argument's type is memory-reference (TYP_I_IMPL) or not
+                if ((gtHWIntrinsicId == NI_AVX_InsertVector128 || gtHWIntrinsicId == NI_AVX2_InsertVector128) &&
+                    (argList->Rest()->Current()->TypeGet() == TYP_I_IMPL)) // Is the type of the second arg TYP_I_IMPL?
+                {
+                    // This is Avx/Avx2.InsertVector128
+                    return true;
+                }
             }
         }
     }
@@ -17395,22 +17413,18 @@ bool GenTreeHWIntrinsic::OperIsMemoryStore()
     {
         return true;
     }
-    else if (category == HW_Category_IMM)
+    else if (HWIntrinsicInfo::MaybeMemoryStore(gtHWIntrinsicId) && category == HW_Category_IMM)
     {
-        // Some AVX instructions here also have MemoryStore sematics
+        // Some AVX intrinsic (without HW_Category_MemoryStore) also have MemoryStore semantics
 
-        // Do we have 3 operands?
-        if (HWIntrinsicInfo::lookupNumArgs(this) != 3)
-        {
-            return false;
-        }
-        else // We have 3 operands/args
+        // Avx/Avx2.InsertVector128 have vector and pointer overloads both, e.g.,
+        // Vector128<sbyte> ExtractVector128(Vector256<sbyte> value, byte index)
+        // void ExtractVector128(sbyte* address, Vector256<sbyte> value, byte index)
+        // So, the 3-argument form is MemoryStore
+        if ((HWIntrinsicInfo::lookupNumArgs(this) == 3) &&
+            (gtHWIntrinsicId == NI_AVX_ExtractVector128 || gtHWIntrinsicId == NI_AVX2_ExtractVector128))
         {
-            if ((gtHWIntrinsicId == NI_AVX_ExtractVector128 || gtHWIntrinsicId == NI_AVX2_ExtractVector128))
-            {
-                // This is Avx/Avx2.ExtractVector128
-                return true;
-            }
+            return true;
         }
     }
 #endif // _TARGET_XARCH_
index b4c3910..3e50f66 100644 (file)
@@ -112,7 +112,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                 genConsumeOperands(node);
                 op1Reg = op1->gtRegNum;
 
-                if (category == HW_Category_MemoryLoad)
+                if (node->OperIsMemoryLoad())
                 {
                     emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
                 }
index 4fe5345..4052e4b 100644 (file)
@@ -355,13 +355,13 @@ HARDWARE_INTRINSIC(AVX_DuplicateEvenIndexed,                        "DuplicateEv
 HARDWARE_INTRINSIC(AVX_DuplicateOddIndexed,                         "DuplicateOddIndexed",                          AVX,          -1,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movshdup,       INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_Extract,                                     "Extract",                                      AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_BaseTypeFromFirstArg|HW_Flag_FullRangeIMM|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX_ExtendToVector256,                           "ExtendToVector256",                            AVX,          -1,              32,           1,     {INS_movdqa,            INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movaps,         INS_movapd},            HW_Category_Helper,                 HW_Flag_NoContainment|HW_Flag_OneTypeGeneric|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_ExtractVector128,                            "ExtractVector128",                             AVX,          -1,              32,          -1,     {INS_vextractf128,      INS_vextractf128,   INS_vextractf128,   INS_vextractf128,   INS_vextractf128,   INS_vextractf128,   INS_vextractf128,   INS_vextractf128,   INS_vextractf128,   INS_vextractf128},      HW_Category_IMM,                    HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX_ExtractVector128,                            "ExtractVector128",                             AVX,          -1,              32,          -1,     {INS_vextractf128,      INS_vextractf128,   INS_vextractf128,   INS_vextractf128,   INS_vextractf128,   INS_vextractf128,   INS_vextractf128,   INS_vextractf128,   INS_vextractf128,   INS_vextractf128},      HW_Category_IMM,                    HW_Flag_MaybeMemoryStore|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX_Floor,                                       "Floor",                                        AVX,           9,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_roundps,        INS_roundpd},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_GetLowerHalf,                                "GetLowerHalf",                                 AVX,          -1,              32,           1,     {INS_movdqa,            INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movaps,         INS_movapd},            HW_Category_Helper,                 HW_Flag_NoContainment|HW_Flag_OneTypeGeneric|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_HorizontalAdd,                               "HorizontalAdd",                                AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_haddps,         INS_haddpd},            HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_HorizontalSubtract,                          "HorizontalSubtract",                           AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_hsubps,         INS_hsubpd},            HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_Insert,                                      "Insert",                                       AVX,          -1,              32,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM|HW_Flag_NoCodeGen|HW_Flag_SecondArgMaybe64Bit)
-HARDWARE_INTRINSIC(AVX_InsertVector128,                             "InsertVector128",                              AVX,          -1,              32,           3,     {INS_vinsertf128,       INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128},       HW_Category_IMM,                    HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX_InsertVector128,                             "InsertVector128",                              AVX,          -1,              32,           3,     {INS_vinsertf128,       INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128},       HW_Category_IMM,                    HW_Flag_MaybeMemoryLoad|HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AVX_LoadAlignedVector256,                        "LoadAlignedVector256",                         AVX,          -1,              32,           1,     {INS_movdqa,            INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movaps,         INS_movapd},            HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_LoadDquVector256,                            "LoadDquVector256",                             AVX,          -1,              32,           1,     {INS_lddqu,             INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_LoadVector256,                               "LoadVector256",                                AVX,          -1,              32,           1,     {INS_movdqu,            INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movups,         INS_movupd},            HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
@@ -415,12 +415,12 @@ HARDWARE_INTRINSIC(AVX2_AndNot,                                     "AndNot",
 HARDWARE_INTRINSIC(AVX2_Average,                                    "Average",                                      AVX2,         -1,              32,           2,     {INS_invalid,           INS_pavgb,          INS_invalid,        INS_pavgw,          INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2_Blend,                                      "Blend",                                        AVX2,         -1,               0,           3,     {INS_invalid,           INS_invalid,        INS_pblendw,        INS_pblendw,        INS_vpblendd,       INS_vpblendd,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_UnfixedSIMDSize|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2_BlendVariable,                              "BlendVariable",                                AVX2,         -1,              32,           3,     {INS_vpblendvb,         INS_vpblendvb,      INS_vpblendvb,      INS_vpblendvb,      INS_vpblendvb,      INS_vpblendvb,      INS_vpblendvb,      INS_vpblendvb,      INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector128,                 "BroadcastScalarToVector128",                   AVX2,         -1,              16,           1,     {INS_vpbroadcastb,      INS_vpbroadcastb,   INS_vpbroadcastw,   INS_vpbroadcastw,   INS_vpbroadcastd,   INS_vpbroadcastd,   INS_vpbroadcastq,   INS_vpbroadcastq,   INS_vbroadcastss,   INS_movddup},           HW_Category_SIMDScalar,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector256,                 "BroadcastScalarToVector256",                   AVX2,         -1,              32,           1,     {INS_vpbroadcastb,      INS_vpbroadcastb,   INS_vpbroadcastw,   INS_vpbroadcastw,   INS_vpbroadcastd,   INS_vpbroadcastd,   INS_vpbroadcastq,   INS_vpbroadcastq,   INS_vbroadcastss,   INS_vbroadcastsd},      HW_Category_SIMDScalar,             HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector128,                 "BroadcastScalarToVector128",                   AVX2,         -1,              16,           1,     {INS_vpbroadcastb,      INS_vpbroadcastb,   INS_vpbroadcastw,   INS_vpbroadcastw,   INS_vpbroadcastd,   INS_vpbroadcastd,   INS_vpbroadcastq,   INS_vpbroadcastq,   INS_vbroadcastss,   INS_movddup},           HW_Category_SIMDScalar,             HW_Flag_NoContainment|HW_Flag_MaybeMemoryLoad)
+HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector256,                 "BroadcastScalarToVector256",                   AVX2,         -1,              32,           1,     {INS_vpbroadcastb,      INS_vpbroadcastb,   INS_vpbroadcastw,   INS_vpbroadcastw,   INS_vpbroadcastd,   INS_vpbroadcastd,   INS_vpbroadcastq,   INS_vpbroadcastq,   INS_vbroadcastss,   INS_vbroadcastsd},      HW_Category_SIMDScalar,             HW_Flag_NoContainment|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2_BroadcastVector128ToVector256,              "BroadcastVector128ToVector256",                AVX2,         -1,              32,           1,     {INS_vbroadcasti128,    INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(AVX2_CompareEqual,                               "CompareEqual",                                 AVX2,         -1,              32,           2,     {INS_pcmpeqb,           INS_pcmpeqb,        INS_pcmpeqw,        INS_pcmpeqw,        INS_pcmpeqd,        INS_pcmpeqd,        INS_pcmpeqq,        INS_pcmpeqq,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2_CompareGreaterThan,                         "CompareGreaterThan",                           AVX2,         -1,              32,           2,     {INS_pcmpgtb,           INS_invalid,        INS_pcmpgtw,        INS_invalid,        INS_pcmpgtd,        INS_invalid,        INS_pcmpgtq,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2_ExtractVector128,                           "ExtractVector128",                             AVX2,         -1,              32,          -1,     {INS_vextracti128,      INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2_ExtractVector128,                           "ExtractVector128",                             AVX2,         -1,              32,          -1,     {INS_vextracti128,      INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_MaybeMemoryStore|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2_ConvertToDouble,                            "ConvertToDouble",                              AVX2,         -1,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movsdsse2},         HW_Category_Helper,                 HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX2_ConvertToInt32,                             "ConvertToInt32",                               AVX2,         -1,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_mov_xmm2i,      INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX2_ConvertToUInt32,                            "ConvertToUInt32",                              AVX2,         -1,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_mov_xmm2i,      INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
@@ -430,15 +430,15 @@ HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int32,                    "ConvertToVe
 HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt32,                   "ConvertToVector256UInt32",                     AVX2,         -1,              32,           1,     {INS_invalid,           INS_pmovzxbd,       INS_invalid,        INS_pmovzxwd,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int64,                    "ConvertToVector256Int64",                      AVX2,         -1,              32,           1,     {INS_pmovsxbq,          INS_invalid,        INS_pmovsxwq,       INS_invalid,        INS_pmovsxdq,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt64,                   "ConvertToVector256UInt64",                     AVX2,         -1,              32,           1,     {INS_invalid,           INS_pmovzxbq,       INS_invalid,        INS_pmovzxwq,       INS_invalid,        INS_pmovzxdq,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(AVX2_GatherVector128,                            "GatherVector128",                              AVX2,         -1,              16,            3,     {INS_invalid,           INS_invalid,        INS_invalid,       INS_invalid,        INS_vpgatherdd,     INS_vpgatherdd,     INS_vpgatherdq,     INS_vpgatherdq,     INS_vgatherdps,     INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2_GatherVector256,                            "GatherVector256",                              AVX2,         -1,              32,            3,     {INS_invalid,           INS_invalid,        INS_invalid,       INS_invalid,        INS_vpgatherdd,     INS_vpgatherdd,     INS_vpgatherdq,     INS_vpgatherdq,     INS_vgatherdps,     INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2_GatherMaskVector128,                        "GatherMaskVector128",                          AVX2,         -1,              16,            5,     {INS_invalid,           INS_invalid,        INS_invalid,       INS_invalid,        INS_vpgatherdd,     INS_vpgatherdd,     INS_vpgatherdq,     INS_vpgatherdq,     INS_vgatherdps,     INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2_GatherMaskVector256,                        "GatherMaskVector256",                          AVX2,         -1,              32,            5,     {INS_invalid,           INS_invalid,        INS_invalid,       INS_invalid,        INS_vpgatherdd,     INS_vpgatherdd,     INS_vpgatherdq,     INS_vpgatherdq,     INS_vgatherdps,     INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX2_GatherVector128,                            "GatherVector128",                              AVX2,         -1,              16,            3,     {INS_invalid,           INS_invalid,        INS_invalid,       INS_invalid,        INS_vpgatherdd,     INS_vpgatherdd,     INS_vpgatherdq,     INS_vpgatherdq,     INS_vgatherdps,     INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX2_GatherVector256,                            "GatherVector256",                              AVX2,         -1,              32,            3,     {INS_invalid,           INS_invalid,        INS_invalid,       INS_invalid,        INS_vpgatherdd,     INS_vpgatherdd,     INS_vpgatherdq,     INS_vpgatherdq,     INS_vgatherdps,     INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX2_GatherMaskVector128,                        "GatherMaskVector128",                          AVX2,         -1,              16,            5,     {INS_invalid,           INS_invalid,        INS_invalid,       INS_invalid,        INS_vpgatherdd,     INS_vpgatherdd,     INS_vpgatherdq,     INS_vpgatherdq,     INS_vgatherdps,     INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX2_GatherMaskVector256,                        "GatherMaskVector256",                          AVX2,         -1,              32,            5,     {INS_invalid,           INS_invalid,        INS_invalid,       INS_invalid,        INS_vpgatherdd,     INS_vpgatherdd,     INS_vpgatherdq,     INS_vpgatherdq,     INS_vgatherdps,     INS_vgatherdpd},        HW_Category_IMM,                    HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(AVX2_HorizontalAdd,                              "HorizontalAdd",                                AVX2,         -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_phaddw,         INS_invalid,        INS_phaddd,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_HorizontalAddSaturate,                      "HorizontalAddSaturate",                        AVX2,         -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_phaddsw,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_HorizontalSubtract,                         "HorizontalSubtract",                           AVX2,         -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_phsubw,         INS_invalid,        INS_phsubd,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_HorizontalSubtractSaturate,                 "HorizontalSubtractSaturate",                   AVX2,         -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_phsubsw,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2_InsertVector128,                            "InsertVector128",                              AVX2,         -1,              32,           3,     {INS_vinserti128,       INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX2_InsertVector128,                            "InsertVector128",                              AVX2,         -1,              32,           3,     {INS_vinserti128,       INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_MaybeMemoryLoad|HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AVX2_LoadAlignedVector256NonTemporal,            "LoadAlignedVector256NonTemporal",              AVX2,         -1,              32,           1,     {INS_movntdqa,          INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(AVX2_MaskLoad,                                   "MaskLoad",                                     AVX2,         -1,               0,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_vpmaskmovd,     INS_vpmaskmovd,     INS_vpmaskmovq,     INS_vpmaskmovq,     INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize)
 HARDWARE_INTRINSIC(AVX2_MaskStore,                                  "MaskStore",                                    AVX2,         -1,               0,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_vpmaskmovd,     INS_vpmaskmovd,     INS_vpmaskmovq,     INS_vpmaskmovq,     INS_invalid,        INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize|HW_Flag_BaseTypeFromSecondArg)
index 3948273..9f8a67e 100644 (file)
@@ -443,7 +443,6 @@ bool HWIntrinsicInfo::isFullyImplementedIsa(InstructionSet isa)
     switch (isa)
     {
         // These ISAs are partially implemented
-        case InstructionSet_AVX2:
         case InstructionSet_BMI1:
         case InstructionSet_BMI2:
         case InstructionSet_SSE42:
@@ -454,6 +453,7 @@ bool HWIntrinsicInfo::isFullyImplementedIsa(InstructionSet isa)
         // These ISAs are fully implemented
         case InstructionSet_AES:
         case InstructionSet_AVX:
+        case InstructionSet_AVX2:
         case InstructionSet_FMA:
         case InstructionSet_LZCNT:
         case InstructionSet_PCLMULQDQ:
index d74709a..3b4634b 100644 (file)
@@ -72,16 +72,13 @@ enum HWIntrinsicFlag : unsigned int
     // - overloaded on multiple vector sizes (SIMD size in the table is unreliable)
     HW_Flag_UnfixedSIMDSize = 0x20,
 
-    // Complex overload
-    // - the codegen of overloads cannot be determined by intrinsicID and base type
-    HW_Flag_ComplexOverloads = 0x40,
-
     // Multi-instruction
     // - that one intrinsic can generate multiple instructions
     HW_Flag_MultiIns = 0x80,
 
     // NoContainment
-    // the intrinsic cannot be contained
+    // the intrinsic cannot be handled by comtainment,
+    // all the intrinsic that have explicit memory load/store semantics should have this flag
     HW_Flag_NoContainment = 0x100,
 
     // Copy Upper bits
@@ -123,6 +120,11 @@ enum HWIntrinsicFlag : unsigned int
     // the intrinsics need special rules in importer,
     // but may be table-driven in the back-end
     HW_Flag_SpecialImport = 0x80000,
+
+    // Maybe Memory Load/Store
+    // - some intrinsics may have pointer overloads but without HW_Category_MemoryLoad/HW_Category_MemoryStore
+    HW_Flag_MaybeMemoryLoad  = 0x100000,
+    HW_Flag_MaybeMemoryStore = 0x200000,
 };
 
 struct HWIntrinsicInfo
@@ -241,12 +243,6 @@ struct HWIntrinsicInfo
         return (flags & HW_Flag_UnfixedSIMDSize) == 0;
     }
 
-    static const bool HasComplexOverloads(NamedIntrinsic id)
-    {
-        HWIntrinsicFlag flags = lookupFlags(id);
-        return (flags & HW_Flag_ComplexOverloads) != 0;
-    }
-
     static const bool GeneratesMultipleIns(NamedIntrinsic id)
     {
         HWIntrinsicFlag flags = lookupFlags(id);
@@ -283,6 +279,18 @@ struct HWIntrinsicInfo
         return (flags & HW_Flag_MaybeIMM) != 0;
     }
 
+    static const bool MaybeMemoryLoad(NamedIntrinsic id)
+    {
+        HWIntrinsicFlag flags = lookupFlags(id);
+        return (flags & HW_Flag_MaybeMemoryLoad) != 0;
+    }
+
+    static const bool MaybeMemoryStore(NamedIntrinsic id)
+    {
+        HWIntrinsicFlag flags = lookupFlags(id);
+        return (flags & HW_Flag_MaybeMemoryStore) != 0;
+    }
+
     static const bool NoJmpTableImm(NamedIntrinsic id)
     {
         HWIntrinsicFlag flags = lookupFlags(id);