Implementing the ParallelBitDeposit and ParallelBitExtract Bmi2 HWIntrinsics
author Tanner Gooding <tagoo@outlook.com>
Fri, 6 Jul 2018 15:31:10 +0000 (08:31 -0700)
committer Tanner Gooding <tagoo@outlook.com>
Mon, 9 Jul 2018 23:21:18 +0000 (16:21 -0700)
src/jit/emitxarch.cpp
src/jit/hwintrinsiccodegenxarch.cpp
src/jit/hwintrinsiclistxarch.h
src/jit/hwintrinsicxarch.cpp
src/jit/instrsxarch.h

index c52d7108e93dece9a7d410edff13b94378c97731..75d9b733bb36ad046204c9274dbfe3fb85da2c0e 100644 (file)
@@ -217,6 +217,8 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
         case INS_pcmpgtd:
         case INS_pcmpgtq:
         case INS_pcmpgtw:
+        case INS_pdep:
+        case INS_pext:
         case INS_phaddd:
         case INS_phaddsw:
         case INS_phaddw:
@@ -620,6 +622,8 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr)
             case INS_mov_xmm2i:
             case INS_mov_i2xmm:
             case INS_movnti:
+            case INS_pdep:
+            case INS_pext:
                 return true;
             default:
                 return false;
@@ -850,7 +854,33 @@ unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, c
                 switch (sizePrefix)
                 {
                     case 0x66:
-                        vexPrefix |= IsBMIInstruction(ins) ? 0x00 : 0x01;
+                        if (IsBMIInstruction(ins))
+                        {
+                            switch (ins)
+                            {
+                                case INS_pdep:
+                                {
+                                    vexPrefix |= 0x03;
+                                    break;
+                                }
+
+                                case INS_pext:
+                                {
+                                    vexPrefix |= 0x02;
+                                    break;
+                                }
+
+                                default:
+                                {
+                                    vexPrefix |= 0x00;
+                                    break;
+                                }
+                            }
+                        }
+                        else
+                        {
+                            vexPrefix |= 0x01;
+                        }
                         break;
                     case 0xF3:
                         vexPrefix |= 0x02;
index edb3c307eddbb23e03e8c4c21f3c17fac9122c55..8a006f8d305d18d347dfc9318f7a121b46fda103 100644 (file)
@@ -2182,7 +2182,43 @@ void CodeGen::genBMI1Intrinsic(GenTreeHWIntrinsic* node)
 //
 void CodeGen::genBMI2Intrinsic(GenTreeHWIntrinsic* node)
 {
-    NYI("Implement BMI2 intrinsic code generation");
+    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
+    regNumber      targetReg   = node->gtRegNum;
+    GenTree*       op1         = node->gtGetOp1();
+    GenTree*       op2         = node->gtGetOp2();
+    var_types      baseType    = node->gtSIMDBaseType; // NOTE(review): never read again in this function
+    var_types      targetType  = node->TypeGet();
+    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, targetType); // keyed on the node's own type
+    emitter*       emit        = getEmitter(); // NOTE(review): not used below; emission goes through genHWIntrinsic_R_R_RM
+
+    assert(targetReg != REG_NA); // a destination register must already be assigned to the node
+    assert(op1 != nullptr);
+
+    if (!op1->OperIsList()) // operands are consumed here only when op1 is not a list node
+    {
+        genConsumeOperands(node);
+    }
+
+    switch (intrinsicId)
+    {
+        case NI_BMI2_ParallelBitDeposit:
+        case NI_BMI2_ParallelBitExtract:
+        { // both intrinsics share one two-operand emission path; only `ins` differs
+            assert(op2 != nullptr);
+            assert(op1->TypeGet() == op2->TypeGet()); // both operands must have the same type
+            assert((targetType == TYP_INT) || (targetType == TYP_LONG)); // only int and long results are accepted
+            genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet())); // size from the node's type; presumably the reg, reg, reg/mem form - confirm
+            break;
+        }
+
+        default:
+        {
+            unreached(); // no other intrinsic id is handled by this function
+            break;
+        }
+    }
+
+    genProduceReg(node); // runs after the switch for every handled intrinsic
 }
 
 //------------------------------------------------------------------------
index ed00c05876954d72455a77a42fe2b89f134ab157..5c76d8911e34562af56d4307e75e197e6379df9d 100644 (file)
@@ -474,6 +474,8 @@ HARDWARE_INTRINSIC(BMI1_TrailingZeroCount,                          "TrailingZer
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //  BMI2 Intrinsics
 HARDWARE_INTRINSIC(BMI2_IsSupported,                                "get_IsSupported",                              BMI2,         -1,               0,           0,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_IsSupportedProperty,    HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(BMI2_ParallelBitDeposit,                         "ParallelBitDeposit",                           BMI2,         -1,               0,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_pdep,           INS_pdep,           INS_pdep,           INS_pdep,           INS_invalid,        INS_invalid},           HW_Category_Scalar,                 HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(BMI2_ParallelBitExtract,                         "ParallelBitExtract",                           BMI2,         -1,               0,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_pext,           INS_pext,           INS_pext,           INS_pext,           INS_invalid,        INS_invalid},           HW_Category_Scalar,                 HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
 
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //                 Intrinsic ID                                     Function name                                   ISA         ival        SIMD size       NumArg                                                                                                     instructions                                                                                                     Category                            Flags
index 0a5e891a8172bd19ff2c4457167246109abb3ade..3d2915930caca7a682e3ffdc8608e4c46311572f 100644 (file)
@@ -379,7 +379,6 @@ bool HWIntrinsicInfo::isFullyImplementedIsa(InstructionSet isa)
     {
         // These ISAs have no implementation
         case InstructionSet_AES:
-        case InstructionSet_BMI2:
         case InstructionSet_PCLMULQDQ:
         {
             return false;
@@ -389,6 +388,7 @@ bool HWIntrinsicInfo::isFullyImplementedIsa(InstructionSet isa)
         case InstructionSet_AVX:
         case InstructionSet_AVX2:
         case InstructionSet_BMI1:
+        case InstructionSet_BMI2:
         case InstructionSet_SSE42:
         {
             return true;
@@ -1324,7 +1324,27 @@ GenTree* Compiler::impBMI2Intrinsic(NamedIntrinsic        intrinsic,
                                     CORINFO_SIG_INFO*     sig,
                                     bool                  mustExpand)
 {
-    return nullptr;
+    var_types callType = JITtype2varType(sig->retType);
+
+    switch (intrinsic)
+    {
+        case NI_BMI2_ParallelBitDeposit:
+        case NI_BMI2_ParallelBitExtract:
+        {
+            assert(sig->numArgs == 2);
+
+            GenTree* op2 = impPopStack().val;
+            GenTree* op1 = impPopStack().val;
+
+            return gtNewScalarHWIntrinsicNode(callType, op1, op2, intrinsic);
+        }
+
+        default:
+        {
+            unreached();
+            return nullptr;
+        }
+    }
 }
 
 GenTree* Compiler::impFMAIntrinsic(NamedIntrinsic        intrinsic,
index 477420e7af869a1d2462d624ed165da50254c261..995cb4cc13cb3139209e48d65bbbba3b7eae6306 100644 (file)
@@ -571,6 +571,9 @@ INST3(blsi,           "blsi",          0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,
 INST3(blsmsk,         "blsmsk",        0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0xF3))   // Get Mask Up to Lowest Set Bit
 INST3(blsr,           "blsr",          0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0xF3))   // Reset Lowest Set Bit
 
+// BMI2
+INST3(pdep,           "pdep",          0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0xF5))   // Parallel Bits Deposit
+INST3(pext,           "pext",          0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0xF5))   // Parallel Bits Extract
 INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
 
 INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)