Implement more AVX/AVX2 intrinsics
author Fei <fei.peng@intel.com>
Tue, 20 Mar 2018 10:07:59 +0000 (02:07 -0800)
committer Fei <fei.peng@intel.com>
Tue, 20 Mar 2018 10:07:59 +0000 (02:07 -0800)
src/jit/emitxarch.cpp
src/jit/hwintrinsiccodegenxarch.cpp
src/jit/hwintrinsiclistxarch.h
src/jit/instrsxarch.h

index 4e1bec9..117bc21 100644 (file)
@@ -223,7 +223,12 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
         case INS_unpcklpd:
         case INS_vinsertf128:
         case INS_vinserti128:
+        case INS_vmaskmovps:
+        case INS_vmaskmovpd:
         case INS_vperm2i128:
+        case INS_vperm2f128:
+        case INS_vpermilpsvar:
+        case INS_vpermilpdvar:
         case INS_vpsrlvd:
         case INS_vpsrlvq:
         case INS_vpsravd:
index 1d2b51e..1d6380e 100644 (file)
@@ -134,7 +134,14 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                 }
                 else if (category == HW_Category_MemoryLoad)
                 {
-                    emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
+                    if (intrinsicID == NI_AVX_MaskLoad)
+                    {
+                        emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op2Reg, op1Reg);
+                    }
+                    else
+                    {
+                        emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
+                    }
                 }
                 else if (Compiler::isImmHWIntrinsic(intrinsicID, op2))
                 {
@@ -1331,7 +1338,6 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
 
             if (op1Reg != targetReg)
             {
-                instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
                 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD32), targetReg, op1Reg);
             }
             break;
index 7435c28..216284f 100644 (file)
@@ -333,6 +333,7 @@ HARDWARE_INTRINSIC(AVX_BlendVariable,                                "BlendVaria
 HARDWARE_INTRINSIC(AVX_Ceiling,                                      "Ceiling",                                          AVX,        10,           32,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_roundps,   INS_roundpd},           HW_Category_SimpleSIMD,                        HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector128,                   "BroadcastScalarToVector128",                       AVX,        -1,           16,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_vbroadcastss, INS_invalid},        HW_Category_MemoryLoad,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector256,                   "BroadcastScalarToVector256",                       AVX,        -1,           32,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_vbroadcastss, INS_vbroadcastsd},   HW_Category_MemoryLoad,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX_BroadcastVector128ToVector256,                "BroadcastVector128ToVector256",                    AVX,        -1,           32,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,INS_vbroadcastf128,INS_vbroadcastf128},   HW_Category_MemoryLoad,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_Compare,                                      "Compare",                                          AVX,        -1,           32,           3,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpps,     INS_cmppd},             HW_Category_IMM,                               HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_CompareScalar,                                "CompareScalar",                                    AVX,        -1,           16,           3,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpss,     INS_cmpsd},             HW_Category_IMM,                               HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(AVX_ConvertToSingle,                              "ConvertToSingle",                                  AVX,        -1,           32,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoRMWSemantics)
@@ -359,9 +360,13 @@ HARDWARE_INTRINSIC(AVX_LoadDquVector256,                             "LoadDquVec
 HARDWARE_INTRINSIC(AVX_LoadVector256,                                "LoadVector256",                                    AVX,        -1,           32,           1,           {INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movups,    INS_movupd},            HW_Category_MemoryLoad,                        HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_Max,                                          "Max",                                              AVX,        -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_maxps,     INS_maxpd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX_Min,                                          "Min",                                              AVX,        -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_minps,     INS_minpd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX_MaskLoad,                                     "MaskLoad",                                         AVX,        -1,           0,            2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_vmaskmovps,INS_vmaskmovpd},        HW_Category_MemoryLoad,                        HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(AVX_MoveMask,                                     "MoveMask",                                         AVX,        -1,           32,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movmskps,  INS_movmskpd},          HW_Category_SimpleSIMD,                        HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX_Multiply,                                     "Multiply",                                         AVX,        -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mulps,     INS_mulpd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX_Or,                                           "Or",                                               AVX,        -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_orps,      INS_orpd},              HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX_Permute,                                      "Permute",                                          AVX,        -1,           0,            2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_vpermilps, INS_vpermilpd},         HW_Category_IMM,                               HW_Flag_FullRangeIMM|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(AVX_Permute2x128,                                 "Permute2x128",                                     AVX,        -1,           32,           3,           {INS_vperm2f128,INS_vperm2f128,INS_vperm2f128,INS_vperm2f128,INS_vperm2f128,INS_vperm2f128,INS_vperm2f128,INS_vperm2f128,INS_vperm2f128,INS_vperm2f128},        HW_Category_IMM,                               HW_Flag_OneTypeGeneric|HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX_PermuteVar,                                   "PermuteVar",                                       AVX,        -1,           0,            2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_vpermilpsvar,INS_vpermilpdvar},    HW_Category_SimpleSIMD,                        HW_Flag_UnfixedSIMDSize)
 HARDWARE_INTRINSIC(AVX_Reciprocal,                                   "Reciprocal",                                       AVX,        -1,           32,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rcpps,     INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_ReciprocalSqrt,                               "ReciprocalSqrt",                                   AVX,        -1,           32,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rsqrtps,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_RoundCurrentDirection,                        "RoundCurrentDirection",                            AVX,         4,           32,           1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_roundps,   INS_roundpd},           HW_Category_SimpleSIMD,                        HW_Flag_NoRMWSemantics)
@@ -395,17 +400,28 @@ HARDWARE_INTRINSIC(AVX2_Average,                                     "Average",
 HARDWARE_INTRINSIC(AVX2_BlendVariable,                               "BlendVariable",                                    AVX2,       -1,           32,           3,           {INS_vpblendvb, INS_vpblendvb, INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector128,                  "BroadcastScalarToVector128",                       AVX2,       -1,           16,           1,           {INS_vpbroadcastb,INS_vpbroadcastb,INS_vpbroadcastw,INS_vpbroadcastw,INS_vpbroadcastd,INS_vpbroadcastd,INS_vpbroadcastq,INS_vpbroadcastq,INS_vbroadcastss,INS_movddup},        HW_Category_SimpleSIMD,         HW_Flag_OneTypeGeneric)
 HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector256,                  "BroadcastScalarToVector256",                       AVX2,       -1,           32,           1,           {INS_vpbroadcastb,INS_vpbroadcastb,INS_vpbroadcastw,INS_vpbroadcastw,INS_vpbroadcastd,INS_vpbroadcastd,INS_vpbroadcastq,INS_vpbroadcastq,INS_vbroadcastss,INS_vbroadcastsd},   HW_Category_SimpleSIMD,         HW_Flag_OneTypeGeneric)
+HARDWARE_INTRINSIC(AVX2_BroadcastVector128ToVector256,               "BroadcastVector128ToVector256",                    AVX2,       -1,           32,           1,           {INS_vbroadcasti128,INS_vbroadcasti128,INS_vbroadcasti128,INS_vbroadcasti128,INS_vbroadcasti128,INS_vbroadcasti128,INS_vbroadcasti128,INS_vbroadcasti128,INS_invalid,INS_invalid},HW_Category_MemoryLoad,      HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_CompareEqual,                                "CompareEqual",                                     AVX2,       -1,           32,           2,           {INS_pcmpeqb,   INS_pcmpeqb,   INS_pcmpeqw,   INS_pcmpeqw,   INS_pcmpeqd,   INS_pcmpeqd,   INS_pcmpeqq,   INS_pcmpeqq,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2_CompareGreaterThan,                          "CompareGreaterThan",                               AVX2,       -1,           32,           2,           {INS_pcmpgtb,   INS_invalid,   INS_pcmpgtw,   INS_invalid,   INS_pcmpgtd,   INS_invalid,   INS_pcmpgtq,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_ExtractVector128,                            "ExtractVector128",                                 AVX2,       -1,           32,          -1,           {INS_vextracti128,INS_vextracti128,INS_vextracti128,INS_vextracti128,INS_vextracti128,INS_vextracti128,INS_vextracti128,INS_vextracti128,INS_invalid, INS_invalid},HW_Category_IMM,                            HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int16,                     "ConvertToVector256Int16",                          AVX2,       -1,           32,           1,           {INS_pmovsxbw,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt16,                    "ConvertToVector256UInt16",                         AVX2,       -1,           32,           1,           {INS_invalid,   INS_pmovzxbw,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int32,                     "ConvertToVector256Int32",                          AVX2,       -1,           32,           1,           {INS_pmovsxbd,  INS_invalid,   INS_pmovsxwd,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt32,                    "ConvertToVector256UInt32",                         AVX2,       -1,           32,           1,           {INS_invalid,   INS_pmovzxbd,  INS_invalid,   INS_pmovzxwd,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int64,                     "ConvertToVector256Int64",                          AVX2,       -1,           32,           1,           {INS_pmovsxbq,  INS_invalid,   INS_pmovsxwq,  INS_invalid,   INS_pmovsxdq,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt64,                    "ConvertToVector256UInt64",                         AVX2,       -1,           32,           1,           {INS_invalid,   INS_pmovzxbq,  INS_invalid,   INS_pmovzxwq,  INS_invalid,   INS_pmovzxdq,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX2_HorizontalAdd,                               "HorizontalAdd",                                    AVX2,       -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_phaddw,    INS_invalid,   INS_phaddd,    INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_HorizontalAddSaturate,                       "HorizontalAddSaturate",                            AVX2,       -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_phaddsw,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_HorizontalSubtract,                          "HorizontalSubtract",                               AVX2,       -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_phsubw,    INS_invalid,   INS_phsubd,    INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_HorizontalSubtractSaturate,                  "HorizontalSubtractSaturate",                       AVX2,       -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_phsubsw,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_InsertVector128,                             "InsertVector128",                                  AVX2,       -1,           32,           3,           {INS_vinserti128,INS_vinserti128,INS_vinserti128,INS_vinserti128,INS_vinserti128,INS_vinserti128,INS_vinserti128,INS_vinserti128,INS_invalid, INS_invalid},     HW_Category_IMM,                               HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AVX2_LoadAlignedVector256NonTemporal,             "LoadAlignedVector256NonTemporal",                  AVX2,       -1,           32,           1,           {INS_movntdqa,  INS_movntdqa,  INS_movntdqa,  INS_movntdqa,  INS_movntdqa,  INS_movntdqa,  INS_movntdqa,  INS_movntdqa,  INS_invalid,   INS_invalid},           HW_Category_MemoryLoad,                        HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX2_Max,                                         "Max",                                              AVX2,       -1,           32,           2,           {INS_pmaxsb,    INS_pmaxub,    INS_pmaxsw,    INS_pmaxuw,    INS_pmaxsd,    INS_pmaxud,    INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX2_Min,                                         "Min",                                              AVX2,       -1,           32,           2,           {INS_pminsb,    INS_pminub,    INS_pminsw,    INS_pminuw,    INS_pminsd,    INS_pminud,    INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX2_MoveMask,                                    "MoveMask",                                         AVX2,       -1,           32,           1,           {INS_pmovmskb,  INS_pmovmskb,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},            HW_Category_SimpleSIMD,                        HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX2_Multiply,                                    "Multiply",                                         AVX2,       -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_pmuldq,    INS_pmuludq,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2_Or,                                          "Or",                                               AVX2,       -1,           32,           2,           {INS_por,       INS_por,       INS_por,       INS_por,       INS_por,       INS_por,       INS_por,       INS_por,       INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX2_Permute2x128,                                "Permute2x128",                                     AVX2,       -1,           32,           3,           {INS_vperm2i128,INS_vperm2i128,INS_vperm2i128,INS_vperm2i128,INS_vperm2i128,INS_vperm2i128,INS_vperm2i128,INS_vperm2i128,INS_invalid,   INS_invalid},           HW_Category_IMM,                               HW_Flag_OneTypeGeneric|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2_ShiftLeftLogical,                            "ShiftLeftLogical",                                 AVX2,       -1,           32,           2,           {INS_invalid,   INS_invalid,   INS_psllw,     INS_psllw,     INS_pslld,     INS_pslld,     INS_psllq,     INS_psllq,     INS_invalid,   INS_invalid},           HW_Category_IMM,                               HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2_ShiftLeftLogical128BitLane,                  "ShiftLeftLogical128BitLane",                       AVX2,       -1,           32,           2,           {INS_pslldq,    INS_pslldq,    INS_pslldq,    INS_pslldq,    INS_pslldq,    INS_pslldq,    INS_pslldq,    INS_pslldq,    INS_invalid,   INS_invalid},           HW_Category_IMM,                               HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2_ShiftLeftLogicalVariable,                    "ShiftLeftLogicalVariable",                         AVX2,       -1,            0,           2,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_vpsllvd,   INS_vpsllvd,   INS_vpsllvq,   INS_vpsllvq,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_UnfixedSIMDSize|HW_Flag_NoContainment)
index c4caf9a..23af97a 100644 (file)
@@ -474,8 +474,8 @@ INST3( vpbroadcastb, "pbroadcastb" , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SS
 INST3( vpbroadcastw, "pbroadcastw" , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x79))   // Broadcast int16 value from reg/memory to entire ymm register
 INST3( vpbroadcastd, "pbroadcastd" , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x58))   // Broadcast int32 value from reg/memory to entire ymm register
 INST3( vpbroadcastq, "pbroadcastq" , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x59))   // Broadcast int64 value from reg/memory to entire ymm register
-INST3( vextractf128, "extractf128" , 0, IUM_WR, 0, 0, SSE3A(0x19),  BAD_CODE, BAD_CODE)      // Extract 128-bit packed floating point values
-INST3( vextracti128, "extracti128" , 0, IUM_WR, 0, 0, SSE3A(0x39),  BAD_CODE, BAD_CODE)      // Extract 128-bit packed integer values
+INST3( vextractf128, "extractf128" , 0, IUM_WR, 0, 0, SSE3A(0x19),  BAD_CODE, SSE3A(0x19))   // Extract 128-bit packed floating point values
+INST3( vextracti128, "extracti128" , 0, IUM_WR, 0, 0, SSE3A(0x39),  BAD_CODE, SSE3A(0x39))   // Extract 128-bit packed integer values
 INST3( vinsertf128,  "insertf128"  , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x18))   // Insert 128-bit packed floating point values
 INST3( vinserti128,  "inserti128"  , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x38))   // Insert 128-bit packed integer values
 INST3( vzeroupper,   "zeroupper"   , 0, IUM_WR, 0, 0, 0xC577F8,     BAD_CODE, BAD_CODE)      // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix)
@@ -493,7 +493,13 @@ INST3( vpsllvd,      "psllvd"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SS
 INST3( vpsllvq,      "psllvq"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x47))   // Variable Bit Shift Left Logical
 INST3( vpermilps,    "permilps"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x04))   // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values
 INST3( vpermilpd,    "permilpd"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x05))   // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values
-
+INST3( vpermilpsvar, "permilpsvar" , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x0C))   // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values
+INST3( vpermilpdvar, "permilpdvar" , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x0D))   // Permute In-Lane of Pairs of Double-Precision Floating-Point Values
+INST3( vperm2f128,   "perm2f128"   , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x06))   // Permute Floating-Point Values
+INST3(vbroadcastf128,"broadcastf128",0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x1A))   // Broadcast packed float values read from memory to entire ymm register
+INST3(vbroadcasti128,"broadcasti128",0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x5A))   // Broadcast packed integer values read from memory to entire ymm register
+INST3(vmaskmovps,    "maskmovps"    ,0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x2C))   // Conditional SIMD Packed Loads Float
+INST3(vmaskmovpd,    "maskmovpd"    ,0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x2D))   // Conditional SIMD Packed Loads Double
 INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
 
 // Scalar instructions in SSE4.2