From 2ebb1ff93b00eeb90811c56cb5bf6a749479df8b Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Fri, 6 Jul 2018 08:31:10 -0700 Subject: [PATCH] Implementing the ParallelBitDeposit and ParallelBitExtract Bmi2 HWIntrinsics Commit migrated from https://github.com/dotnet/coreclr/commit/5ec86ef02448839f02bd94dad8ef662e55770a19 --- src/coreclr/src/jit/emitxarch.cpp | 32 +++++++++++++++- .../src/jit/hwintrinsiccodegenxarch.cpp | 38 ++++++++++++++++++- src/coreclr/src/jit/hwintrinsiclistxarch.h | 2 + src/coreclr/src/jit/hwintrinsicxarch.cpp | 24 +++++++++++- src/coreclr/src/jit/instrsxarch.h | 3 ++ 5 files changed, 95 insertions(+), 4 deletions(-) diff --git a/src/coreclr/src/jit/emitxarch.cpp b/src/coreclr/src/jit/emitxarch.cpp index c52d7108e93..75d9b733bb3 100644 --- a/src/coreclr/src/jit/emitxarch.cpp +++ b/src/coreclr/src/jit/emitxarch.cpp @@ -217,6 +217,8 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins) case INS_pcmpgtd: case INS_pcmpgtq: case INS_pcmpgtw: + case INS_pdep: + case INS_pext: case INS_phaddd: case INS_phaddsw: case INS_phaddw: @@ -620,6 +622,8 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr) case INS_mov_xmm2i: case INS_mov_i2xmm: case INS_movnti: + case INS_pdep: + case INS_pext: return true; default: return false; @@ -850,7 +854,33 @@ unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, c switch (sizePrefix) { case 0x66: - vexPrefix |= IsBMIInstruction(ins) ? 0x00 : 0x01; + if (IsBMIInstruction(ins)) + { + switch (ins) + { + case INS_pdep: + { + vexPrefix |= 0x03; + break; + } + + case INS_pext: + { + vexPrefix |= 0x02; + break; + } + + default: + { + vexPrefix |= 0x00; + break; + } + } + } + else + { + vexPrefix |= 0x01; + } break; case 0xF3: vexPrefix |= 0x02; diff --git a/src/coreclr/src/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/src/jit/hwintrinsiccodegenxarch.cpp index edb3c307edd..8a006f8d305 100644 --- a/src/coreclr/src/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/src/jit/hwintrinsiccodegenxarch.cpp @@ -2182,7 +2182,43 @@ void CodeGen::genBMI1Intrinsic(GenTreeHWIntrinsic* node) // void CodeGen::genBMI2Intrinsic(GenTreeHWIntrinsic* node) { - NYI("Implement BMI2 intrinsic code generation"); + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + regNumber targetReg = node->gtRegNum; + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + var_types baseType = node->gtSIMDBaseType; + var_types targetType = node->TypeGet(); + instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, targetType); + emitter* emit = getEmitter(); + + assert(targetReg != REG_NA); + assert(op1 != nullptr); + + if (!op1->OperIsList()) + { + genConsumeOperands(node); + } + + switch (intrinsicId) + { + case NI_BMI2_ParallelBitDeposit: + case NI_BMI2_ParallelBitExtract: + { + assert(op2 != nullptr); + assert(op1->TypeGet() == op2->TypeGet()); + assert((targetType == TYP_INT) || (targetType == TYP_LONG)); + genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet())); + break; + } + + default: + { + unreached(); + break; + } + } + + genProduceReg(node); } //------------------------------------------------------------------------ diff --git a/src/coreclr/src/jit/hwintrinsiclistxarch.h b/src/coreclr/src/jit/hwintrinsiclistxarch.h index ed00c058769..5c76d8911e3 100644 --- a/src/coreclr/src/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/src/jit/hwintrinsiclistxarch.h @@ -474,6 +474,8 @@ HARDWARE_INTRINSIC(BMI1_TrailingZeroCount, "TrailingZer // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI2 Intrinsics HARDWARE_INTRINSIC(BMI2_IsSupported, "get_IsSupported", BMI2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(BMI2_ParallelBitDeposit, "ParallelBitDeposit", BMI2, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_pdep, INS_pdep, INS_pdep, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI2_ParallelBitExtract, "ParallelBitExtract", BMI2, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_pext, INS_pext, INS_pext, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // Intrinsic ID Function name ISA ival SIMD size NumArg instructions Category Flags diff --git a/src/coreclr/src/jit/hwintrinsicxarch.cpp b/src/coreclr/src/jit/hwintrinsicxarch.cpp index 0a5e891a817..3d2915930ca 100644 --- a/src/coreclr/src/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/src/jit/hwintrinsicxarch.cpp @@ -379,7 +379,6 @@ bool HWIntrinsicInfo::isFullyImplementedIsa(InstructionSet isa) { // These ISAs have no implementation case InstructionSet_AES: - case InstructionSet_BMI2: case InstructionSet_PCLMULQDQ: { return false; @@ -389,6 +388,7 @@ bool HWIntrinsicInfo::isFullyImplementedIsa(InstructionSet isa) case InstructionSet_AVX: case InstructionSet_AVX2: case InstructionSet_BMI1: + case InstructionSet_BMI2: case InstructionSet_SSE42: { return true; @@ -1324,7 +1324,27 @@ GenTree* Compiler::impBMI2Intrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig, bool mustExpand) { - return nullptr; + var_types callType = JITtype2varType(sig->retType); + + switch (intrinsic) + { + case NI_BMI2_ParallelBitDeposit: + case NI_BMI2_ParallelBitExtract: + { + assert(sig->numArgs == 2); + + GenTree* op2 = impPopStack().val; + GenTree* op1 = impPopStack().val; + + return gtNewScalarHWIntrinsicNode(callType, op1, op2, intrinsic); + } + + default: + { + unreached(); + return nullptr; + } + } } GenTree* Compiler::impFMAIntrinsic(NamedIntrinsic intrinsic, diff --git a/src/coreclr/src/jit/instrsxarch.h b/src/coreclr/src/jit/instrsxarch.h index 477420e7af8..995cb4cc13c 100644 --- a/src/coreclr/src/jit/instrsxarch.h +++ b/src/coreclr/src/jit/instrsxarch.h @@ -571,6 +571,9 @@ INST3(blsi, "blsi", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, INST3(blsmsk, "blsmsk", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xF3)) // Get Mask Up to Lowest Set Bit INST3(blsr, "blsr", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xF3)) // Reset Lowest Set Bit +// BMI2 +INST3(pdep, "pdep", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xF5)) // Parallel Bits Deposit +INST3(pext, "pext", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xF5)) // Parallel Bits Extract INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) -- 2.34.1