1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
8 XX Intel hardware intrinsic Code Generator XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
18 #ifdef FEATURE_HW_INTRINSICS
22 #include "sideeffects.h"
25 #include "gcinfoencoder.h"
27 //------------------------------------------------------------------------
28 // assertIsContainableHWIntrinsicOp: Asserts that op is containable by node
31 // lowering - The lowering phase from the compiler
32 // node - The HWIntrinsic node that has the contained node
33 // op - The op that is contained
35 static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op)
38 // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register allocation
39 // logic. However, this check is still important due to the various containment rules that SIMD intrinsics follow.
41 // We use isContainable to track the special HWIntrinsic node containment rules (for things like LoadAligned and
42 // LoadUnaligned) and we use the supportsRegOptional check to support general-purpose loads (both from stack
44 // and for isUsedFromMemory contained nodes, in the case where the register allocator decided to not allocate a
46 // register in the first place).
48 bool supportsRegOptional = false;
49 bool isContainable = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional);
50 assert(isContainable || supportsRegOptional);
54 //------------------------------------------------------------------------
55 // genIsTableDrivenHWIntrinsic: Returns whether the given hardware intrinsic can be handled by table-driven codegen
58 // category - category of a HW intrinsic
61 // returns true if this category can be table-driven in CodeGen
63 static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicCategory category)
65 // TODO - move more categories into the table-driven framework
66 // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen
67 const bool tableDrivenCategory =
68 (category != HW_Category_Special) && (category != HW_Category_Scalar) && (category != HW_Category_Helper);
69 const bool tableDrivenFlag =
70 !HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId) && !HWIntrinsicInfo::HasSpecialCodegen(intrinsicId);
71 return tableDrivenCategory && tableDrivenFlag;
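// For example, a simple SIMD intrinsic such as Sse.Add satisfies both checks and is emitted straight
// from the intrinsic table in genHWIntrinsic, whereas something like Sse.CompareScalarOrderedEqual is
// marked MultiIns (it needs a compare plus several setcc/movzx instructions) and falls through to the
// per-ISA handlers below.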
74 //------------------------------------------------------------------------
75 // genHWIntrinsic: Generates the code for a given hardware intrinsic node.
78 // node - The hardware intrinsic node
80 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
82 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
83 InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsicId);
84 HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId);
85 int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
86 int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
88 assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));
90 if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
92 GenTree* op1 = node->gtGetOp1();
93 GenTree* op2 = node->gtGetOp2();
94 regNumber targetReg = node->gtRegNum;
95 var_types targetType = node->TypeGet();
96 var_types baseType = node->gtSIMDBaseType;
98 regNumber op1Reg = REG_NA;
99 regNumber op2Reg = REG_NA;
100 emitter* emit = getEmitter();
102 assert(numArgs >= 0);
103 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
104 assert(ins != INS_invalid);
105 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
106 assert(simdSize != 0);
112 if (node->OperIsMemoryLoad())
114 genConsumeAddress(op1);
115 // Until we improve the handling of addressing modes in the emitter, we'll create a
116 // temporary GT_IND to generate code with.
117 GenTreeIndir load = indirForm(node->TypeGet(), op1);
118 emit->emitInsLoadInd(ins, simdSize, node->gtRegNum, &load);
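// For a one-operand memory-load intrinsic such as Sse.LoadVector128(float*), this path emits a plain
// vector load from [op1], roughly "movups xmm, xmmword ptr [addr]" (movaps/movdqa for the aligned
// variants), using the instruction looked up from the table above.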
123 op1Reg = op1->gtRegNum;
125 if ((ival != -1) && varTypeIsFloating(baseType))
127 assert((ival >= 0) && (ival <= 127));
128 if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
130 assert(!op1->isContained());
131 emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op1Reg,
132 static_cast<int8_t>(ival));
136 genHWIntrinsic_R_RM_I(node, ins, static_cast<int8_t>(ival));
139 else if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
141 emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
145 genHWIntrinsic_R_RM(node, ins, simdSize);
153 if (category == HW_Category_MemoryStore)
155 genConsumeAddress(op1);
157 // Until we improve the handling of addressing modes in the emitter, we'll create a
158 // temporary GT_STORE_IND to generate code with.
159 GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
160 emit->emitInsStoreInd(ins, simdSize, &store);
166 op1Reg = op1->gtRegNum;
167 op2Reg = op2->gtRegNum;
169 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
171 // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
173 // For non-commutative intrinsics, we should have ensured that op2 was marked
174 // delay free in order to prevent it from getting assigned the same register
175 // as target. However, for commutative intrinsics, we can just swap the operands
176 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
178 noway_assert(node->OperIsCommutative());
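// For example, with the legacy (non-VEX) encoding "addps xmm1, xmm2" computes xmm1 = xmm1 + xmm2,
// so when targetReg already holds op2 we rely on commutativity and emit the operation as
// "targetReg = targetReg op op1Reg" instead.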
183 if ((ival != -1) && varTypeIsFloating(baseType))
185 assert((ival >= 0) && (ival <= 127));
186 genHWIntrinsic_R_R_RM_I(node, ins, static_cast<int8_t>(ival));
188 else if (category == HW_Category_MemoryLoad)
190 // Get the address and the 'other' register.
193 if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
203 // Until we improve the handling of addressing modes in the emitter, we'll create a
204 // temporary GT_IND to generate code with.
205 GenTreeIndir load = indirForm(node->TypeGet(), addr);
206 genHWIntrinsic_R_R_RM(node, ins, simdSize, targetReg, otherReg, &load);
208 else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
212 if (intrinsicId == NI_SSE2_Extract)
214 // extract instructions return their result in a GP register, so use int size as the emit size
215 simdSize = emitTypeSize(TYP_INT);
218 auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };
220 if (op2->IsCnsIntOrI())
222 ssize_t ival = op2->AsIntCon()->IconValue();
223 assert((ival >= 0) && (ival <= 255));
224 emitSwCase((int8_t)ival);
228 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
229 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
230 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
231 regNumber baseReg = node->ExtractTempReg();
232 regNumber offsReg = node->GetSingleTempReg();
233 genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase);
238 genHWIntrinsic_R_R_RM(node, ins, EA_ATTR(node->gtSIMDSize));
245 GenTreeArgList* argList = op1->AsArgList();
246 op1 = argList->Current();
248 op1Reg = op1->gtRegNum;
250 argList = argList->Rest();
251 op2 = argList->Current();
253 op2Reg = op2->gtRegNum;
255 argList = argList->Rest();
256 GenTree* op3 = argList->Current();
258 regNumber op3Reg = op3->gtRegNum;
260 if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
264 auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, i); };
266 if (op3->IsCnsIntOrI())
268 ssize_t ival = op3->AsIntCon()->IconValue();
269 assert((ival >= 0) && (ival <= 255));
270 emitSwCase((int8_t)ival);
274 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
275 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
276 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
277 regNumber baseReg = node->ExtractTempReg();
278 regNumber offsReg = node->GetSingleTempReg();
279 genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase);
282 else if (category == HW_Category_MemoryStore)
284 // The Mask instructions do not currently support containment of the address.
285 assert(!op2->isContained());
286 if (intrinsicId == NI_AVX_MaskStore || intrinsicId == NI_AVX2_MaskStore)
288 emit->emitIns_AR_R_R(ins, simdSize, op2Reg, op3Reg, op1Reg, 0);
292 assert(intrinsicId == NI_SSE2_MaskMove);
293 assert(targetReg == REG_NA);
295 // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
296 if (op3Reg != REG_EDI)
298 emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
300 emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
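// The resulting sequence is roughly:
//   mov        rdi, op3Reg      ; only if the destination is not already in (R|E)DI
//   maskmovdqu op1Reg, op2Reg   ; stores the bytes of op1 selected by the op2 mask to [rdi]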
307 case NI_SSE41_BlendVariable:
308 case NI_AVX_BlendVariable:
309 case NI_AVX2_BlendVariable:
311 genHWIntrinsic_R_R_RM_R(node, ins);
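// In the VEX encoding the selector is a fourth register operand, roughly
// "vblendvps targetReg, op1Reg, op2/m128, op3Reg"; genHWIntrinsic_R_R_RM_R below unpacks the
// three operands from the node's argument list.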
335 case InstructionSet_Vector128:
336 case InstructionSet_Vector256:
337 genBaseIntrinsic(node);
339 case InstructionSet_SSE:
340 case InstructionSet_SSE_X64:
341 genSSEIntrinsic(node);
343 case InstructionSet_SSE2:
344 case InstructionSet_SSE2_X64:
345 genSSE2Intrinsic(node);
347 case InstructionSet_SSE41:
348 case InstructionSet_SSE41_X64:
349 genSSE41Intrinsic(node);
351 case InstructionSet_SSE42:
352 case InstructionSet_SSE42_X64:
353 genSSE42Intrinsic(node);
355 case InstructionSet_AVX:
356 case InstructionSet_AVX2:
357 genAvxOrAvx2Intrinsic(node);
359 case InstructionSet_AES:
360 genAESIntrinsic(node);
362 case InstructionSet_BMI1:
363 case InstructionSet_BMI1_X64:
364 case InstructionSet_BMI2:
365 case InstructionSet_BMI2_X64:
366 genBMI1OrBMI2Intrinsic(node);
368 case InstructionSet_FMA:
369 genFMAIntrinsic(node);
371 case InstructionSet_LZCNT:
372 case InstructionSet_LZCNT_X64:
373 genLZCNTIntrinsic(node);
375 case InstructionSet_PCLMULQDQ:
376 genPCLMULQDQIntrinsic(node);
378 case InstructionSet_POPCNT:
379 case InstructionSet_POPCNT_X64:
380 genPOPCNTIntrinsic(node);
388 //------------------------------------------------------------------------
389 // genHWIntrinsic_R_RM: Generates the code for a hardware intrinsic node that takes a
390 // register/memory operand and that returns a value in register
393 // node - The hardware intrinsic node
394 // ins - The instruction being generated
395 // attr - The emit attribute for the instruction being generated
397 void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
399 var_types targetType = node->TypeGet();
400 regNumber targetReg = node->gtRegNum;
401 GenTree* op1 = node->gtGetOp1();
402 GenTree* op2 = node->gtGetOp2();
403 emitter* emit = getEmitter();
407 // The CompareScalarOrdered* and CompareScalarUnordered* intrinsics come down this
408 // code path. They are all MultiIns, as the result comes from the flags, and the node
409 // has two operands rather than the usual one.
411 assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId));
412 assert(targetReg != REG_NA);
414 targetReg = op1->gtRegNum;
420 assert(!node->OperIsCommutative());
423 assert(targetReg != REG_NA);
424 assert(op2 == nullptr);
426 if (op1->isContained() || op1->isUsedFromSpillTemp())
428 assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
429 assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
431 TempDsc* tmpDsc = nullptr;
432 unsigned varNum = BAD_VAR_NUM;
433 unsigned offset = (unsigned)-1;
435 if (op1->isUsedFromSpillTemp())
437 assert(op1->IsRegOptional());
439 tmpDsc = getSpillTempDsc(op1);
440 varNum = tmpDsc->tdTempNum();
443 regSet.tmpRlsTemp(tmpDsc);
445 else if (op1->isIndir() || op1->OperIsHWIntrinsic())
448 GenTreeIndir* memIndir = nullptr;
452 memIndir = op1->AsIndir();
453 addr = memIndir->Addr();
457 assert(op1->AsHWIntrinsic()->OperIsMemoryLoad());
458 assert(HWIntrinsicInfo::lookupNumArgs(op1->AsHWIntrinsic()) == 1);
459 addr = op1->gtGetOp1();
462 switch (addr->OperGet())
464 case GT_LCL_VAR_ADDR:
466 varNum = addr->AsLclVarCommon()->GetLclNum();
471 case GT_CLS_VAR_ADDR:
473 emit->emitIns_R_C(ins, attr, targetReg, addr->gtClsVar.gtClsVarHnd, 0);
479 if (memIndir == nullptr)
481 // This is the HW intrinsic load case.
482 // Until we improve the handling of addressing modes in the emitter, we'll create a
483 // temporary GT_IND to generate code with.
484 GenTreeIndir load = indirForm(op1->TypeGet(), addr);
487 emit->emitIns_R_A(ins, attr, targetReg, memIndir);
494 switch (op1->OperGet())
498 GenTreeLclFld* lclField = op1->AsLclFld();
500 varNum = lclField->GetLclNum();
501 offset = lclField->gtLclFld.gtLclOffs;
507 assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
508 varNum = op1->AsLclVar()->GetLclNum();
521 // Ensure we got a good varNum and offset.
522 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
523 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
524 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
525 assert(offset != (unsigned)-1);
527 emit->emitIns_R_S(ins, attr, targetReg, varNum, offset);
531 regNumber op1Reg = op1->gtRegNum;
532 emit->emitIns_R_R(ins, attr, targetReg, op1Reg);
536 //------------------------------------------------------------------------
537 // genHWIntrinsic_R_RM_I: Generates the code for a hardware intrinsic node that takes a register/memory operand,
538 // an immediate operand, and that returns a value in register
541 // node - The hardware intrinsic node
542 // ins - The instruction being generated
543 // ival - The immediate value
545 void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
547 var_types targetType = node->TypeGet();
548 regNumber targetReg = node->gtRegNum;
549 GenTree* op1 = node->gtGetOp1();
550 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
551 emitter* emit = getEmitter();
553 // TODO-XArch-CQ: Commutative operations can have op1 be contained
554 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
556 assert(targetReg != REG_NA);
557 assert(!node->OperIsCommutative()); // One operand intrinsics cannot be commutative
559 if (op1->isContained() || op1->isUsedFromSpillTemp())
561 assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
562 assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
564 inst_RV_TT_IV(ins, simdSize, targetReg, op1, ival);
567 //------------------------------------------------------------------------
568 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
569 // register/memory operand, and that returns a value in register
572 // node - The hardware intrinsic node
573 // ins - The instruction being generated
574 // attr - The emit attribute for the instruction being generated
576 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
578 regNumber targetReg = node->gtRegNum;
579 GenTree* op1 = node->gtGetOp1();
580 GenTree* op2 = node->gtGetOp2();
581 regNumber op1Reg = op1->gtRegNum;
583 assert(targetReg != REG_NA);
584 assert(op1Reg != REG_NA);
586 genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2);
589 //------------------------------------------------------------------------
590 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
591 // register/memory operand, and that returns a value in register
594 // node - The hardware intrinsic node
595 // ins - The instruction being generated
596 // attr - The emit attribute for the instruction being generated
597 // targetReg - The register allocated to the result
598 // op1Reg - The register allocated to the first operand
599 // op2 - Another operand that may be in register or memory
601 void CodeGen::genHWIntrinsic_R_R_RM(
602 GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2)
604 emitter* emit = getEmitter();
606 // TODO-XArch-CQ: Commutative operations can have op1 be contained
607 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
609 assert(targetReg != REG_NA);
610 assert(op1Reg != REG_NA);
612 if (op2->isContained() || op2->isUsedFromSpillTemp())
614 assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
615 assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
617 TempDsc* tmpDsc = nullptr;
618 unsigned varNum = BAD_VAR_NUM;
619 unsigned offset = (unsigned)-1;
621 if (op2->isUsedFromSpillTemp())
623 assert(op2->IsRegOptional());
625 tmpDsc = getSpillTempDsc(op2);
626 varNum = tmpDsc->tdTempNum();
629 regSet.tmpRlsTemp(tmpDsc);
631 else if (op2->isIndir() || op2->OperIsHWIntrinsic())
634 GenTreeIndir* memIndir = nullptr;
638 memIndir = op2->AsIndir();
639 addr = memIndir->Addr();
643 assert(op2->AsHWIntrinsic()->OperIsMemoryLoad());
644 assert(HWIntrinsicInfo::lookupNumArgs(op2->AsHWIntrinsic()) == 1);
645 addr = op2->gtGetOp1();
648 switch (addr->OperGet())
650 case GT_LCL_VAR_ADDR:
652 varNum = addr->AsLclVarCommon()->GetLclNum();
657 case GT_CLS_VAR_ADDR:
659 emit->emitIns_SIMD_R_R_C(ins, attr, targetReg, op1Reg, addr->gtClsVar.gtClsVarHnd, 0);
665 if (memIndir == nullptr)
667 // This is the HW intrinsic load case.
668 // Until we improve the handling of addressing modes in the emitter, we'll create a
669 // temporary GT_IND to generate code with.
670 GenTreeIndir load = indirForm(op2->TypeGet(), addr);
673 emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, memIndir);
680 switch (op2->OperGet())
684 GenTreeLclFld* lclField = op2->AsLclFld();
686 varNum = lclField->GetLclNum();
687 offset = lclField->gtLclFld.gtLclOffs;
693 assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
694 varNum = op2->AsLclVar()->GetLclNum();
705 // Ensure we got a good varNum and offset.
706 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
707 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
708 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
709 assert(offset != (unsigned)-1);
711 emit->emitIns_SIMD_R_R_S(ins, attr, targetReg, op1Reg, varNum, offset);
715 regNumber op2Reg = op2->gtRegNum;
717 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
719 // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
721 // For non-commutative intrinsics, we should have ensured that op2 was marked
722 // delay free in order to prevent it from getting assigned the same register
723 // as target. However, for commutative intrinsics, we can just swap the operands
724 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
726 noway_assert(node->OperIsCommutative());
731 emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg);
735 //------------------------------------------------------------------------
736 // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
737 // register/memory operand, an immediate operand, and that returns a value in register
740 // node - The hardware intrinsic node
741 // ins - The instruction being generated
742 // ival - The immediate value
744 void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
746 var_types targetType = node->TypeGet();
747 regNumber targetReg = node->gtRegNum;
748 GenTree* op1 = node->gtGetOp1();
749 GenTree* op2 = node->gtGetOp2();
750 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
751 emitter* emit = getEmitter();
753 // TODO-XArch-CQ: Commutative operations can have op1 be contained
754 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
756 if (op1->OperIsList())
758 assert(op2 == nullptr);
760 GenTreeArgList* argList = op1->AsArgList();
762 op1 = argList->Current();
763 argList = argList->Rest();
765 op2 = argList->Current();
766 argList = argList->Rest();
768 assert(argList->Current() != nullptr);
769 assert(argList->Rest() == nullptr);
772 regNumber op1Reg = op1->gtRegNum;
774 assert(targetReg != REG_NA);
775 assert(op1Reg != REG_NA);
777 if (op2->isContained() || op2->isUsedFromSpillTemp())
779 assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
780 assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
782 TempDsc* tmpDsc = nullptr;
783 unsigned varNum = BAD_VAR_NUM;
784 unsigned offset = (unsigned)-1;
786 if (op2->isUsedFromSpillTemp())
788 assert(op2->IsRegOptional());
790 tmpDsc = getSpillTempDsc(op2);
791 varNum = tmpDsc->tdTempNum();
794 regSet.tmpRlsTemp(tmpDsc);
796 else if (op2->isIndir() || op2->OperIsHWIntrinsic())
799 GenTreeIndir* memIndir = nullptr;
803 memIndir = op2->AsIndir();
804 addr = memIndir->Addr();
808 assert(op2->AsHWIntrinsic()->OperIsMemoryLoad());
809 assert(HWIntrinsicInfo::lookupNumArgs(op2->AsHWIntrinsic()) == 1);
810 addr = op2->gtGetOp1();
813 switch (addr->OperGet())
815 case GT_LCL_VAR_ADDR:
817 varNum = addr->AsLclVarCommon()->GetLclNum();
822 case GT_CLS_VAR_ADDR:
824 emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, addr->gtClsVar.gtClsVarHnd, 0, ival);
830 if (memIndir == nullptr)
832 // This is the HW intrinsic load case.
833 // Until we improve the handling of addressing modes in the emitter, we'll create a
834 // temporary GT_IND to generate code with.
835 GenTreeIndir load = indirForm(op2->TypeGet(), addr);
838 emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
845 switch (op2->OperGet())
849 GenTreeLclFld* lclField = op2->AsLclFld();
851 varNum = lclField->GetLclNum();
852 offset = lclField->gtLclFld.gtLclOffs;
858 assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
859 varNum = op2->AsLclVar()->GetLclNum();
870 // Ensure we got a good varNum and offset.
871 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
872 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
873 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
874 assert(offset != (unsigned)-1);
876 emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
880 regNumber op2Reg = op2->gtRegNum;
882 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
884 // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
886 // For non-commutative intrinsics, we should have ensured that op2 was marked
887 // delay free in order to prevent it from getting assigned the same register
888 // as target. However, for commutative intrinsics, we can just swap the operands
889 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
891 noway_assert(node->OperIsCommutative());
896 emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
900 //------------------------------------------------------------------------
901 // genHWIntrinsic_R_R_RM_R: Generates the code for a hardware intrinsic node that takes a register operand, a
902 // register/memory operand, another register operand, and that returns a value in register
905 // node - The hardware intrinsic node
906 // ins - The instruction being generated
908 void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins)
910 var_types targetType = node->TypeGet();
911 regNumber targetReg = node->gtRegNum;
912 GenTree* op1 = node->gtGetOp1();
913 GenTree* op2 = node->gtGetOp2();
914 GenTree* op3 = nullptr;
915 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
916 emitter* emit = getEmitter();
918 assert(op1->OperIsList());
919 assert(op2 == nullptr);
921 GenTreeArgList* argList = op1->AsArgList();
923 op1 = argList->Current();
924 argList = argList->Rest();
926 op2 = argList->Current();
927 argList = argList->Rest();
929 op3 = argList->Current();
930 assert(argList->Rest() == nullptr);
932 regNumber op1Reg = op1->gtRegNum;
933 regNumber op3Reg = op3->gtRegNum;
935 assert(targetReg != REG_NA);
936 assert(op1Reg != REG_NA);
937 assert(op3Reg != REG_NA);
939 if (op2->isContained() || op2->isUsedFromSpillTemp())
941 assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
942 assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
944 TempDsc* tmpDsc = nullptr;
945 unsigned varNum = BAD_VAR_NUM;
946 unsigned offset = (unsigned)-1;
948 if (op2->isUsedFromSpillTemp())
950 assert(op2->IsRegOptional());
952 // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
953 // pattern. It could probably be extracted to its own method.
954 tmpDsc = getSpillTempDsc(op2);
955 varNum = tmpDsc->tdTempNum();
958 regSet.tmpRlsTemp(tmpDsc);
960 else if (op2->isIndir() || op2->OperIsHWIntrinsic())
963 GenTreeIndir* memIndir = nullptr;
967 memIndir = op2->AsIndir();
968 addr = memIndir->Addr();
972 assert(op2->AsHWIntrinsic()->OperIsMemoryLoad());
973 assert(HWIntrinsicInfo::lookupNumArgs(op2->AsHWIntrinsic()) == 1);
974 addr = op2->gtGetOp1();
977 switch (addr->OperGet())
979 case GT_LCL_VAR_ADDR:
981 varNum = addr->AsLclVarCommon()->GetLclNum();
986 case GT_CLS_VAR_ADDR:
988 emit->emitIns_SIMD_R_R_C_R(ins, simdSize, targetReg, op1Reg, op3Reg, addr->gtClsVar.gtClsVarHnd, 0);
994 if (memIndir == nullptr)
996 // This is the HW intrinsic load case.
997 // Until we improve the handling of addressing modes in the emitter, we'll create a
998 // temporary GT_IND to generate code with.
999 GenTreeIndir load = indirForm(op2->TypeGet(), addr);
1002 emit->emitIns_SIMD_R_R_A_R(ins, simdSize, targetReg, op1Reg, op3Reg, memIndir);
1009 switch (op2->OperGet())
1013 GenTreeLclFld* lclField = op2->AsLclFld();
1015 varNum = lclField->GetLclNum();
1016 offset = lclField->gtLclFld.gtLclOffs;
1022 assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
1023 varNum = op2->AsLclVar()->GetLclNum();
1034 // Ensure we got a good varNum and offset.
1035 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
1036 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
1037 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
1038 assert(offset != (unsigned)-1);
1040 emit->emitIns_SIMD_R_R_S_R(ins, simdSize, targetReg, op1Reg, op3Reg, varNum, offset);
1044 emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, op3Reg);
1048 //------------------------------------------------------------------------
1049 // genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
1050 // a register/memory operand, and that returns a value in register
1053 // ins - The instruction being generated
1054 // attr - The emit attribute
1055 // targetReg - The target register
1056 // op1Reg - The register of the first operand
1057 // op2Reg - The register of the second operand
1058 // op3 - The third operand
1060 void CodeGen::genHWIntrinsic_R_R_R_RM(
1061 instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
1063 assert(targetReg != REG_NA);
1064 assert(op1Reg != REG_NA);
1065 assert(op2Reg != REG_NA);
1067 emitter* emit = getEmitter();
1069 if (op3->isContained() || op3->isUsedFromSpillTemp())
1071 TempDsc* tmpDsc = nullptr;
1072 unsigned varNum = BAD_VAR_NUM;
1073 unsigned offset = (unsigned)-1;
1075 if (op3->isUsedFromSpillTemp())
1077 assert(op3->IsRegOptional());
1079 // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
1080 // pattern. It could probably be extracted to its own method.
1081 tmpDsc = getSpillTempDsc(op3);
1082 varNum = tmpDsc->tdTempNum();
1085 regSet.tmpRlsTemp(tmpDsc);
1087 else if (op3->isIndir() || op3->OperIsHWIntrinsic())
1090 GenTreeIndir* memIndir = nullptr;
1093 memIndir = op3->AsIndir();
1094 addr = memIndir->Addr();
1098 assert(op3->AsHWIntrinsic()->OperIsMemoryLoad());
1099 assert(HWIntrinsicInfo::lookupNumArgs(op3->AsHWIntrinsic()) == 1);
1100 addr = op3->gtGetOp1();
1103 switch (addr->OperGet())
1105 case GT_LCL_VAR_ADDR:
1107 varNum = addr->AsLclVarCommon()->GetLclNum();
1112 case GT_CLS_VAR_ADDR:
1114 emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, addr->gtClsVar.gtClsVarHnd, 0);
1120 if (memIndir == nullptr)
1122 // This is the HW intrinsic load case.
1123 // Until we improve the handling of addressing modes in the emitter, we'll create a
1124 // temporary GT_IND to generate code with.
1125 GenTreeIndir load = indirForm(op3->TypeGet(), addr);
1128 emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
1135 switch (op3->OperGet())
1139 GenTreeLclFld* lclField = op3->AsLclFld();
1141 varNum = lclField->GetLclNum();
1142 offset = lclField->gtLclFld.gtLclOffs;
1148 assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
1149 varNum = op3->AsLclVar()->GetLclNum();
1160 // Ensure we got a good varNum and offset.
1161 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
1162 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
1163 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
1164 assert(offset != (unsigned)-1);
1166 emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
1170 emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
1174 // genHWIntrinsicJumpTableFallback: generates the jump-table fallback for imm-intrinsics
1175 // with a non-constant argument
1178 // intrinsic - intrinsic ID
1179 // nonConstImmReg - the register that contains the non-constant imm8 argument
1180 // baseReg - a register for the start of the switch table
1181 // offsReg - a register for the offset into the switch table
1182 // emitSwCase - the lambda to generate a switch case
1185 // Generates the jump-table fallback for imm-intrinsics with a non-constant argument.
1187 // This function can be used for all imm-intrinsics (whether full-range or not).
1188 // The compiler front-end (i.e. the importer) is responsible for inserting a range-check IR node
1189 // (GT_HW_INTRINSIC_CHK) for the imm8 argument, so this function does not need to perform a range check.
1191 template <typename HWIntrinsicSwitchCaseBody>
1192 void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic,
1193 regNumber nonConstImmReg,
1196 HWIntrinsicSwitchCaseBody emitSwCase)
1198 assert(nonConstImmReg != REG_NA);
1199 // AVX2 Gather intrinsics use a managed non-const fallback since they have a discrete imm8 value range
1200 // that does not work with the compiler-generated jump-table fallback
1201 assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
1202 emitter* emit = getEmitter();
1204 const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
1205 assert(maxByte <= 256);
1206 BasicBlock* jmpTable[256];
1208 unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
1209 unsigned jmpTableOffs = 0;
1211 // Emit the jump table
1212 for (unsigned i = 0; i < maxByte; i++)
1214 jmpTable[i] = genCreateTempLabel();
1215 emit->emitDataGenData(i, jmpTable[i]);
1218 emit->emitDataGenEnd();
1220 // Compute and jump to the appropriate offset in the switch table
1221 emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
1223 emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
1224 emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
1225 emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
1226 emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
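// The computed jump emitted above is roughly:
//   lea  offsReg, [jump table data]
//   mov  offsReg, dword ptr [offsReg + nonConstImmReg * 4]   ; load the 32-bit offset for this imm value
//   lea  baseReg, [first basic block of the method]
//   add  offsReg, baseReg
//   jmp  offsReg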
1228 // Emit the switch table entries
1230 BasicBlock* switchTableBeg = genCreateTempLabel();
1231 BasicBlock* switchTableEnd = genCreateTempLabel();
1233 genDefineTempLabel(switchTableBeg);
1235 for (unsigned i = 0; i < maxByte; i++)
1237 genDefineTempLabel(jmpTable[i]);
1238 emitSwCase((int8_t)i);
1239 emit->emitIns_J(INS_jmp, switchTableEnd);
1242 genDefineTempLabel(switchTableEnd);
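// A typical call site, matching the pattern used in genHWIntrinsic above, looks like:
//   auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };
//   genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase);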
1245 //------------------------------------------------------------------------
1246 // genBaseIntrinsic: Generates the code for a base hardware intrinsic node
1249 // node - The hardware intrinsic node
1252 // We currently assume that all base intrinsics have zero or one operand.
1254 void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
1256 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1257 regNumber targetReg = node->gtRegNum;
1258 var_types targetType = node->TypeGet();
1259 var_types baseType = node->gtSIMDBaseType;
1261 assert(compiler->compSupports(InstructionSet_SSE));
1262 assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));
1264 GenTree* op1 = node->gtGetOp1();
1266 genConsumeHWIntrinsicOperands(node);
1267 regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->gtRegNum;
1269 assert(node->gtGetOp2() == nullptr);
1271 emitter* emit = getEmitter();
1272 emitAttr attr = EA_ATTR(node->gtSIMDSize);
1273 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1275 switch (intrinsicId)
1277 case NI_Vector128_CreateScalarUnsafe:
1278 case NI_Vector256_CreateScalarUnsafe:
1280 if (varTypeIsIntegral(baseType))
1282 genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType));
1286 assert(varTypeIsFloating(baseType));
1288 attr = emitTypeSize(baseType);
1290 if (op1->isContained() || op1->isUsedFromSpillTemp())
1292 genHWIntrinsic_R_RM(node, ins, attr);
1294 else if (targetReg != op1Reg)
1296 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1297 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1303 case NI_Vector128_ToScalar:
1304 case NI_Vector256_ToScalar:
1306 assert(varTypeIsFloating(baseType));
1308 attr = emitTypeSize(TYP_SIMD16);
1310 if (op1->isContained() || op1->isUsedFromSpillTemp())
1312 genHWIntrinsic_R_RM(node, ins, attr);
1314 else if (targetReg != op1Reg)
1316 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1317 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1322 case NI_Vector128_ToVector256:
1324 // ToVector256 has zero-extend semantics in order to ensure it is deterministic
1325 // We always emit a move to the target register, even when op1Reg == targetReg,
1326 // in order to ensure that bits MAXVL-1:128 are zeroed.
1328 attr = emitTypeSize(TYP_SIMD16);
1330 if (op1->isContained() || op1->isUsedFromSpillTemp())
1332 genHWIntrinsic_R_RM(node, ins, attr);
1336 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1337 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1342 case NI_Vector128_ToVector256Unsafe:
1343 case NI_Vector256_GetLower:
1345 if (op1->isContained() || op1->isUsedFromSpillTemp())
1347 genHWIntrinsic_R_RM(node, ins, attr);
1349 else if (targetReg != op1Reg)
1351 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1352 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1357 case NI_Vector128_Zero:
1358 case NI_Vector256_Zero:
1360 assert(op1 == nullptr);
1361 emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
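// Zero is produced by xor'ing the target register with itself (e.g. xorps/vxorps targetReg,
// targetReg, targetReg), so no constant load is needed and no input register is read.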
1372 genProduceReg(node);
1375 //------------------------------------------------------------------------
1376 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
1379 // node - The hardware intrinsic node
1381 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
1383 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1384 GenTree* op1 = node->gtGetOp1();
1385 GenTree* op2 = node->gtGetOp2();
1386 GenTree* op3 = nullptr;
1387 GenTree* op4 = nullptr;
1388 regNumber targetReg = node->gtRegNum;
1389 var_types targetType = node->TypeGet();
1390 var_types baseType = node->gtSIMDBaseType;
1392 regNumber op1Reg = REG_NA;
1393 regNumber op2Reg = REG_NA;
1394 regNumber op3Reg = REG_NA;
1395 regNumber op4Reg = REG_NA;
1396 emitter* emit = getEmitter();
1398 genConsumeHWIntrinsicOperands(node);
1400 switch (intrinsicId)
1402 case NI_SSE_CompareScalarOrderedEqual:
1403 case NI_SSE_CompareScalarUnorderedEqual:
1405 assert(baseType == TYP_FLOAT);
1406 regNumber tmpReg = node->GetSingleTempReg();
1407 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1409 // Ensure we aren't overwriting targetReg
1410 assert(tmpReg != targetReg);
1412 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1413 emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1414 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1415 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1416 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
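// The full sequence is roughly:
//   (u)comiss op1, op2    ; compare, setting ZF/PF/CF
//   setnp     targetReg   ; PF == 0: operands were not unordered (no NaN)
//   sete      tmpReg      ; ZF == 1: equal
//   and       tmpReg, targetReg
//   movzx     targetReg, tmpReg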
1420 case NI_SSE_CompareScalarOrderedGreaterThan:
1421 case NI_SSE_CompareScalarUnorderedGreaterThan:
1423 assert(baseType == TYP_FLOAT);
1424 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1426 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1427 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1428 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1432 case NI_SSE_CompareScalarOrderedGreaterThanOrEqual:
1433 case NI_SSE_CompareScalarUnorderedGreaterThanOrEqual:
1435 assert(baseType == TYP_FLOAT);
1436 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1438 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1439 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1440 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1444 case NI_SSE_CompareScalarOrderedLessThan:
1445 case NI_SSE_CompareScalarUnorderedLessThan:
1447 assert(baseType == TYP_FLOAT);
1448 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1450 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1451 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1452 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1456 case NI_SSE_CompareScalarOrderedLessThanOrEqual:
1457 case NI_SSE_CompareScalarUnorderedLessThanOrEqual:
1459 assert(baseType == TYP_FLOAT);
1460 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1462 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1463 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1464 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1468 case NI_SSE_CompareScalarOrderedNotEqual:
1469 case NI_SSE_CompareScalarUnorderedNotEqual:
1471 assert(baseType == TYP_FLOAT);
1472 regNumber tmpReg = node->GetSingleTempReg();
1473 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1475 // Ensure we aren't overwriting targetReg
1476 assert(tmpReg != targetReg);
1478 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1479 emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1480 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1481 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1482 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1486 case NI_SSE_X64_ConvertToInt64:
1487 case NI_SSE_X64_ConvertToInt64WithTruncation:
1489 assert(targetType == TYP_LONG);
1490 assert(op1 != nullptr);
1491 assert(op2 == nullptr);
1492 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1493 genHWIntrinsic_R_RM(node, ins, EA_8BYTE);
1497 case NI_SSE_X64_ConvertScalarToVector128Single:
1499 assert(baseType == TYP_LONG);
1500 assert(op1 != nullptr);
1501 assert(op2 != nullptr);
1502 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1503 genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1507 case NI_SSE_Prefetch0:
1508 case NI_SSE_Prefetch1:
1509 case NI_SSE_Prefetch2:
1510 case NI_SSE_PrefetchNonTemporal:
1512 assert(baseType == TYP_UBYTE);
1513 assert(op2 == nullptr);
1515 // These do not support containment.
1516 assert(!op1->isContained());
1517 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1518 op1Reg = op1->gtRegNum;
1519 emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
1523 case NI_SSE_StoreFence:
1525 assert(baseType == TYP_VOID);
1526 assert(op1 == nullptr);
1527 assert(op2 == nullptr);
1528 emit->emitIns(INS_sfence);
1537 genProduceReg(node);
1540 //------------------------------------------------------------------------
1541 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
1544 // node - The hardware intrinsic node
1546 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
1548 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1549 GenTree* op1 = node->gtGetOp1();
1550 GenTree* op2 = node->gtGetOp2();
1551 regNumber targetReg = node->gtRegNum;
1552 var_types targetType = node->TypeGet();
1553 var_types baseType = node->gtSIMDBaseType;
1554 regNumber op1Reg = REG_NA;
1555 regNumber op2Reg = REG_NA;
1556 emitter* emit = getEmitter();
1558 genConsumeHWIntrinsicOperands(node);
1560 switch (intrinsicId)
1562 // All integer overloads are handled by table codegen
1563 case NI_SSE2_CompareLessThan:
1565 assert(op1 != nullptr);
1566 assert(op2 != nullptr);
1568 assert(baseType == TYP_DOUBLE);
1570 int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
1571 assert((ival >= 0) && (ival <= 127));
1573 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1574 op1Reg = op1->gtRegNum;
1575 op2Reg = op2->gtRegNum;
1576 emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
1581 case NI_SSE2_CompareScalarOrderedEqual:
1582 case NI_SSE2_CompareScalarUnorderedEqual:
1584 assert(baseType == TYP_DOUBLE);
1585 regNumber tmpReg = node->GetSingleTempReg();
1586 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1588 // Ensure we aren't overwriting targetReg
1589 assert(tmpReg != targetReg);
1591 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1592 emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1593 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1594 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1595 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1599 case NI_SSE2_CompareScalarOrderedGreaterThan:
1600 case NI_SSE2_CompareScalarUnorderedGreaterThan:
1602 assert(baseType == TYP_DOUBLE);
1603 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1605 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1606 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1607 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1611 case NI_SSE2_CompareScalarOrderedGreaterThanOrEqual:
1612 case NI_SSE2_CompareScalarUnorderedGreaterThanOrEqual:
1614 assert(baseType == TYP_DOUBLE);
1615 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1617 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1618 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1619 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1623 case NI_SSE2_CompareScalarOrderedLessThan:
1624 case NI_SSE2_CompareScalarUnorderedLessThan:
1626 assert(baseType == TYP_DOUBLE);
1627 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1629 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1630 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1631 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1635 case NI_SSE2_CompareScalarOrderedLessThanOrEqual:
1636 case NI_SSE2_CompareScalarUnorderedLessThanOrEqual:
1638 assert(baseType == TYP_DOUBLE);
1639 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1641 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1642 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1643 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1647 case NI_SSE2_CompareScalarOrderedNotEqual:
1648 case NI_SSE2_CompareScalarUnorderedNotEqual:
1650 assert(baseType == TYP_DOUBLE);
1651 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1652 regNumber tmpReg = node->GetSingleTempReg();
1654 // Ensure we aren't overwriting targetReg
1655 assert(tmpReg != targetReg);
1657 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1658 emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1659 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1660 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1661 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1665 case NI_SSE2_X64_ConvertScalarToVector128Double:
1667 assert(baseType == TYP_LONG);
1668 assert(op1 != nullptr);
1669 assert(op2 != nullptr);
1670 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1671 genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1675 case NI_SSE2_X64_ConvertScalarToVector128Int64:
1676 case NI_SSE2_X64_ConvertScalarToVector128UInt64:
1678 assert(baseType == TYP_LONG || baseType == TYP_ULONG);
1679 assert(op1 != nullptr);
1680 assert(op2 == nullptr);
1681 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1682 genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType));
1686 case NI_SSE2_ConvertToInt32:
1687 case NI_SSE2_ConvertToInt32WithTruncation:
1688 case NI_SSE2_ConvertToUInt32:
1689 case NI_SSE2_X64_ConvertToInt64:
1690 case NI_SSE2_X64_ConvertToInt64WithTruncation:
1691 case NI_SSE2_X64_ConvertToUInt64:
1693 assert(op2 == nullptr);
1694 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1696 if (varTypeIsIntegral(baseType))
1698 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1699 op1Reg = op1->gtRegNum;
1700 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1704 assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT);
1705 genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType));
1710 case NI_SSE2_LoadFence:
1712 assert(baseType == TYP_VOID);
1713 assert(op1 == nullptr);
1714 assert(op2 == nullptr);
1715 emit->emitIns(INS_lfence);
1719 case NI_SSE2_MemoryFence:
1721 assert(baseType == TYP_VOID);
1722 assert(op1 == nullptr);
1723 assert(op2 == nullptr);
1724 emit->emitIns(INS_mfence);
1728 case NI_SSE2_StoreNonTemporal:
1729 case NI_SSE2_X64_StoreNonTemporal:
1731 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1732 assert(op1 != nullptr);
1733 assert(op2 != nullptr);
1735 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1736 GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
1737 emit->emitInsStoreInd(ins, emitTypeSize(baseType), &store);
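// e.g. Sse2.StoreNonTemporal(address, value) comes out as roughly "movnti [op1], op2Reg",
// an integer store that bypasses the cache hierarchy.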
1746 genProduceReg(node);
1749 //------------------------------------------------------------------------
1750 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1753 // node - The hardware intrinsic node
1755 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1757 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1758 GenTree* op1 = node->gtGetOp1();
1759 GenTree* op2 = node->gtGetOp2();
1760 GenTree* op3 = nullptr;
1761 GenTree* op4 = nullptr;
1762 regNumber targetReg = node->gtRegNum;
1763 var_types targetType = node->TypeGet();
1764 var_types baseType = node->gtSIMDBaseType;
1766 regNumber op1Reg = REG_NA;
1767 regNumber op2Reg = REG_NA;
1768 regNumber op3Reg = REG_NA;
1769 regNumber op4Reg = REG_NA;
1770 emitter* emit = getEmitter();
1772 genConsumeHWIntrinsicOperands(node);
1774 switch (intrinsicId)
1776 case NI_SSE41_ConvertToVector128Int16:
1777 case NI_SSE41_ConvertToVector128Int32:
1778 case NI_SSE41_ConvertToVector128Int64:
1780 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1782 if (!varTypeIsSIMD(op1->gtType))
1784 // Until we improve the handling of addressing modes in the emitter, we'll create a
1785 // temporary GT_IND to generate code with.
1786 GenTreeIndir load = indirForm(node->TypeGet(), op1);
1787 emit->emitInsLoadInd(ins, emitTypeSize(TYP_SIMD16), node->gtRegNum, &load);
1791 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1796 case NI_SSE41_TestZ:
1798 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1799 genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1800 emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1801 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1805 case NI_SSE41_TestC:
1807 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1808 genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1809 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1810 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1814 case NI_SSE41_TestNotZAndNotC:
1816 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1817 genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1818 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1819 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
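// All three Test* helpers map ptest's flag outputs back to a bool: ptest sets
// ZF = ((op1 & op2) == 0) and CF = ((~op1 & op2) == 0), so TestZ uses sete, TestC uses setb,
// and TestNotZAndNotC uses seta (i.e. ZF == 0 and CF == 0).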
1823 case NI_SSE41_Extract:
1824 case NI_SSE41_X64_Extract:
1826 regNumber tmpTargetReg = REG_NA;
1827 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1828 if (baseType == TYP_FLOAT)
1830 tmpTargetReg = node->ExtractTempReg();
1833 auto emitSwCase = [&](int8_t i) {
1834 if (baseType == TYP_FLOAT)
1836 // extract instructions return their result in a GP register, so use int size as the emit size
1837 inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i);
1838 emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
1842 inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i);
1846 if (op2->IsCnsIntOrI())
1848 ssize_t ival = op2->AsIntCon()->IconValue();
1849 assert((ival >= 0) && (ival <= 255));
1850 emitSwCase((int8_t)ival);
1854 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
1855 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
1856 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
1857 regNumber baseReg = node->ExtractTempReg();
1858 regNumber offsReg = node->GetSingleTempReg();
1859 genHWIntrinsicJumpTableFallback(intrinsicId, op2->gtRegNum, baseReg, offsReg, emitSwCase);
1869 genProduceReg(node);
1872 //------------------------------------------------------------------------
1873 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1876 // node - The hardware intrinsic node
1878 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
1880 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1881 regNumber targetReg = node->gtRegNum;
1882 GenTree* op1 = node->gtGetOp1();
1883 GenTree* op2 = node->gtGetOp2();
1884 var_types baseType = node->gtSIMDBaseType;
1885 var_types targetType = node->TypeGet();
1886 emitter* emit = getEmitter();
1888 genConsumeHWIntrinsicOperands(node);
1889 regNumber op1Reg = op1->gtRegNum;
1891 assert(targetReg != REG_NA);
1892 assert(op1Reg != REG_NA);
1893 assert(op2 != nullptr);
1894 assert(!node->OperIsCommutative());
1896 switch (intrinsicId)
1898 case NI_SSE42_Crc32:
1899 case NI_SSE42_X64_Crc32:
1901 if (op1Reg != targetReg)
1903 assert(op2->gtRegNum != targetReg);
1904 emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg);
1907 // This makes the genHWIntrinsic_R_RM code much simpler, as we don't need an
1908 // overload that explicitly takes the operands.
1910 node->gtOp2 = nullptr;
1912 if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument
1914 assert(targetType == TYP_INT);
1915 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType));
1919 assert(op1->TypeGet() == op2->TypeGet());
1920 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
1921 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType));
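// e.g. Sse42.Crc32(crc, data) ends up as roughly:
//   mov   targetReg, op1Reg   ; only if the accumulator is not already in the target register
//   crc32 targetReg, op2      ; op2 may be a register or a memory operand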
1934 genProduceReg(node);
1937 //------------------------------------------------------------------------
1938 // genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
1941 // node - The hardware intrinsic node
1943 void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
1945 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1946 var_types baseType = node->gtSIMDBaseType;
1947 emitAttr attr = EA_ATTR(node->gtSIMDSize);
1948 var_types targetType = node->TypeGet();
1949 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1950 int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
1951 GenTree* op1 = node->gtGetOp1();
1952 GenTree* op2 = node->gtGetOp2();
1953 regNumber op1Reg = REG_NA;
1954 regNumber op2Reg = REG_NA;
1955 regNumber targetReg = node->gtRegNum;
1956 emitter* emit = getEmitter();
1958 genConsumeHWIntrinsicOperands(node);
1960 switch (intrinsicId)
1962 case NI_AVX2_ConvertToInt32:
1963 case NI_AVX2_ConvertToUInt32:
1965 op1Reg = op1->gtRegNum;
1966 assert(numArgs == 1);
1967 assert((baseType == TYP_INT) || (baseType == TYP_UINT));
1968 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1969 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1973 case NI_AVX2_ConvertToVector256Int16:
1974 case NI_AVX2_ConvertToVector256Int32:
1975 case NI_AVX2_ConvertToVector256Int64:
1977 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1979 if (!varTypeIsSIMD(op1->gtType))
1981 // Until we improve the handling of addressing modes in the emitter, we'll create a
1982 // temporary GT_IND to generate code with.
1983 GenTreeIndir load = indirForm(node->TypeGet(), op1);
1984 emit->emitInsLoadInd(ins, emitTypeSize(TYP_SIMD32), node->gtRegNum, &load);
1988 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD32));
1993 case NI_AVX2_GatherVector128:
1994 case NI_AVX2_GatherVector256:
1995 case NI_AVX2_GatherMaskVector128:
1996 case NI_AVX2_GatherMaskVector256:
1998 GenTreeArgList* list = op1->AsArgList();
1999 op1 = list->Current();
2000 op1Reg = op1->gtRegNum;
2002 list = list->Rest();
2003 op2 = list->Current();
2004 op2Reg = op2->gtRegNum;
2006 list = list->Rest();
2007 GenTree* op3 = list->Current();
2009 list = list->Rest();
2010 GenTree* op4 = nullptr;
2011 GenTree* lastOp = nullptr;
2012 GenTree* indexOp = nullptr;
2014 regNumber op3Reg = REG_NA;
2015 regNumber op4Reg = REG_NA;
2016 regNumber addrBaseReg = REG_NA;
2017 regNumber addrIndexReg = REG_NA;
2018 regNumber maskReg = node->ExtractTempReg(RBM_ALLFLOAT);
2022 assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
2023 op4 = list->Current();
2024 list = list->Rest();
2025 lastOp = list->Current();
2026 op3Reg = op3->gtRegNum;
2027 op4Reg = op4->gtRegNum;
2028 addrBaseReg = op2Reg;
2029 addrIndexReg = op3Reg;
2032 // copy op4Reg into the tmp mask register;
2033 // the mask register will be cleared by the gather instruction
2034 emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);
2036 if (targetReg != op1Reg)
2038 // copy source vector to the target register for masking merge
2039 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
2044 assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
2045 addrBaseReg = op1Reg;
2046 addrIndexReg = op2Reg;
2050 // generate an all-ones mask vector
2051 emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
2054 bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);
2056 // hwintrinsiclistxarch.h uses dword-index instructions by default
2057 if (varTypeIsLong(node->gtIndexBaseType))
2061 case INS_vpgatherdd:
2062 ins = INS_vpgatherqd;
2063 if (isVector128GatherWithVector256Index)
2065 // YMM index in address mode
2066 attr = emitTypeSize(TYP_SIMD32);
2069 case INS_vpgatherdq:
2070 ins = INS_vpgatherqq;
2072 case INS_vgatherdps:
2073 ins = INS_vgatherqps;
2074 if (isVector128GatherWithVector256Index)
2076 // YMM index in address mode
2077 attr = emitTypeSize(TYP_SIMD32);
2080 case INS_vgatherdpd:
2081 ins = INS_vgatherqpd;
2088 assert(lastOp->IsCnsIntOrI());
2089 ssize_t ival = lastOp->AsIntCon()->IconValue();
2090 assert((ival >= 0) && (ival <= 255));
2092 assert(targetReg != maskReg);
2093 assert(targetReg != addrIndexReg);
2094 assert(maskReg != addrIndexReg);
2095 emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);
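// The emitted form is roughly "vpgatherdd targetReg, [addrBaseReg + addrIndexReg * scale], maskReg"
// (or the q/ps/pd variant selected above), where the scale comes from the constant last operand.
// The gather clears maskReg as it completes, which is why the mask was copied to a temporary earlier.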
2102 genHWIntrinsic_R_RM(node, ins, attr);
2103 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
2104 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2108 case NI_AVX_TestNotZAndNotC:
2110 genHWIntrinsic_R_RM(node, ins, attr);
2111 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
2112 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2118 genHWIntrinsic_R_RM(node, ins, attr);
2119 emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
2120 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2129 genProduceReg(node);
2132 //------------------------------------------------------------------------
2133 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
2136 // node - The hardware intrinsic node
2138 void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
2140 NYI("Implement AES intrinsic code generation");
2143 //------------------------------------------------------------------------
2144 // genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 and BMI2 hardware intrinsic node
2147 // node - The hardware intrinsic node
2149 void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
2151 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2152 regNumber targetReg = node->gtRegNum;
2153 GenTree* op1 = node->gtGetOp1();
2154 GenTree* op2 = node->gtGetOp2();
2155 var_types targetType = node->TypeGet();
2156 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);
2157 emitter* emit = getEmitter();
2159 assert(targetReg != REG_NA);
2160 assert(op1 != nullptr);
2162 genConsumeHWIntrinsicOperands(node);
2164 switch (intrinsicId)
2166 case NI_BMI1_AndNot:
2167 case NI_BMI1_X64_AndNot:
2168 case NI_BMI1_BitFieldExtract:
2169 case NI_BMI1_X64_BitFieldExtract:
2170 case NI_BMI2_ParallelBitDeposit:
2171 case NI_BMI2_ParallelBitExtract:
2172 case NI_BMI2_X64_ParallelBitDeposit:
2173 case NI_BMI2_X64_ParallelBitExtract:
2174 case NI_BMI2_ZeroHighBits:
2175 case NI_BMI2_X64_ZeroHighBits:
2177 assert(op2 != nullptr);
2178 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2179 genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2183 case NI_BMI1_ExtractLowestSetBit:
2184 case NI_BMI1_GetMaskUpToLowestSetBit:
2185 case NI_BMI1_ResetLowestSetBit:
2186 case NI_BMI1_X64_ExtractLowestSetBit:
2187 case NI_BMI1_X64_GetMaskUpToLowestSetBit:
2188 case NI_BMI1_X64_ResetLowestSetBit:
2190 assert(op2 == nullptr);
2191 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2192 genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
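// BLSI/BLSMSK/BLSR are unary VEX instructions: the destination register is encoded in VEX.vvvv and the single
// source is the r/m operand, so the two-operand R_RM helper suffices and the operand may be contained.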
2196 case NI_BMI1_TrailingZeroCount:
2197 case NI_BMI1_X64_TrailingZeroCount:
2199 assert(op2 == nullptr);
2200 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2201 genXCNTIntrinsic(node, ins);
2205 case NI_BMI2_MultiplyNoFlags:
2206 case NI_BMI2_X64_MultiplyNoFlags:
2208 int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
2209 assert(numArgs == 2 || numArgs == 3);
2211 regNumber op1Reg = REG_NA;
2212 regNumber op2Reg = REG_NA;
2213 regNumber op3Reg = REG_NA;
2214 regNumber lowReg = REG_NA;
2218 op1Reg = op1->gtRegNum;
2219 op2Reg = op2->gtRegNum;
2224 GenTreeArgList* argList = op1->AsArgList();
2225 op1 = argList->Current();
2226 op1Reg = op1->gtRegNum;
2227 argList = argList->Rest();
2228 op2 = argList->Current();
2229 op2Reg = op2->gtRegNum;
2230 argList = argList->Rest();
2231 GenTree* op3 = argList->Current();
2232 op3Reg = op3->gtRegNum;
2233 assert(!op3->isContained());
2234 assert(op3Reg != op1Reg);
2235 assert(op3Reg != targetReg);
2236 assert(op3Reg != REG_EDX);
2237 lowReg = node->GetSingleTempReg();
2238 assert(op3Reg != lowReg);
2239 assert(lowReg != targetReg);
2242 // These do not support containment
2243 assert(!op2->isContained());
2244 emitAttr attr = emitTypeSize(targetType);
2245 // move the first operand into the implicit source register EDX/RDX
2246 if (op1Reg != REG_EDX)
2248 assert(op2Reg != REG_EDX);
2249 emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg);
2252 // generate code for MULX
2253 genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2);
2255 // If the lower half of the result is needed, store it to the memory pointed to by op3
2258 emit->emitIns_AR_R(INS_mov, attr, lowReg, op3Reg, 0);
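// Roughly, the three-argument MultiplyNoFlags is emitted as:
//   mov  edx, op1Reg              ; MULX multiplies the implicit EDX/RDX source by its r/m operand
//   mulx targetReg, lowReg, op2   ; high half -> targetReg (the return value), low half -> lowReg
//   mov  [op3Reg], lowReg         ; publish the low half through the out pointer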
2271 genProduceReg(node);
2274 //------------------------------------------------------------------------
2275 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
2278 // node - The hardware intrinsic node
2280 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
2282 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2283 var_types baseType = node->gtSIMDBaseType;
2284 emitAttr attr = EA_ATTR(node->gtSIMDSize);
2285 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
2286 GenTree* op1 = node->gtGetOp1();
2287 regNumber targetReg = node->gtRegNum;
2289 assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);
2291 genConsumeHWIntrinsicOperands(node);
2292 GenTreeArgList* argList = op1->AsArgList();
2293 op1 = argList->Current();
2295 argList = argList->Rest();
2296 GenTree* op2 = argList->Current();
2298 argList = argList->Rest();
2299 GenTree* op3 = argList->Current();
2304 bool isCommutative = false;
2305 const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);
2307 // Intrinsics with CopyUpperBits semantics cannot have op1 contained
2308 assert(!copiesUpperBits || !op1->isContained());
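// Each FMA operation exists in three forms that differ in which operand may come from memory and which
// register is overwritten: 132 (op1 = op1 * op3 + op2), 213 (op1 = op2 * op1 + op3), and
// 231 (op3 = op2 * op3 + op1). The table lookup above yields the 213 form; the adjustment below relies on the
// 132/213/231 variants being adjacent `instruction` values, so `ins - 1` selects 132 and `ins + 1` selects 231.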
2310 if (op3->isContained() || op3->isUsedFromSpillTemp())
2312 // 213 form: op1 = (op2 * op1) + [op3]
2314 op1Reg = op1->gtRegNum;
2315 op2Reg = op2->gtRegNum;
2317 isCommutative = !copiesUpperBits;
2319 else if (op2->isContained() || op2->isUsedFromSpillTemp())
2321 // 132 form: op1 = (op1 * op3) + [op2]
2323 ins = (instruction)(ins - 1);
2324 op1Reg = op1->gtRegNum;
2325 op2Reg = op3->gtRegNum;
2328 else if (op1->isContained() || op1->isUsedFromSpillTemp())
2330 // 231 form: op3 = (op2 * op3) + [op1]
2332 ins = (instruction)(ins + 1);
2333 op1Reg = op3->gtRegNum;
2334 op2Reg = op2->gtRegNum;
2339 // 213 form: op1 = (op2 * op1) + op3
2341 op1Reg = op1->gtRegNum;
2342 op2Reg = op2->gtRegNum;
2344 isCommutative = !copiesUpperBits;
2347 if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
2349 assert(node->isRMWHWIntrinsic(compiler));
2351 // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
2353 // For non-commutative intrinsics, we should have ensured that op2 was marked
2354 // delay free in order to prevent it from getting assigned the same register
2355 // as target. However, for commutative intrinsics, we can just swap the operands
2356 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
2362 genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
2363 genProduceReg(node);
2366 //------------------------------------------------------------------------
2367 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
2370 // node - The hardware intrinsic node
2372 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
2374 assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount ||
2375 node->gtHWIntrinsicId == NI_LZCNT_X64_LeadingZeroCount);
2377 genConsumeOperands(node);
2378 genXCNTIntrinsic(node, INS_lzcnt);
2379 genProduceReg(node);
2382 //------------------------------------------------------------------------
2383 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
2386 // node - The hardware intrinsic node
2388 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
2390 NYI("Implement PCLMULQDQ intrinsic code generation");
2393 //------------------------------------------------------------------------
2394 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
2397 // node - The hardware intrinsic node
2399 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
2401 assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount || node->gtHWIntrinsicId == NI_POPCNT_X64_PopCount);
2403 genConsumeOperands(node);
2404 genXCNTIntrinsic(node, INS_popcnt);
2405 genProduceReg(node);
2408 //------------------------------------------------------------------------
2409 // genXCNTIntrinsic: Generates the code for a lzcnt/tzcnt/popcnt hardware intrinsic node and breaks the false
2410 // dependency on the target register
2413 // node - The hardware intrinsic node
2414 // ins - The instruction being generated
2416 void CodeGen::genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins)
2418 // LZCNT/TZCNT/POPCNT have a false dependency on the target register on Intel Sandy Bridge and Haswell processors
2419 // (POPCNT still has it on Skylake), so insert an `xor target, target` to break the dependency via the renamer's
2420 // zeroing idiom, but skip it when the target register is also a source register, i.e. when the dependency is real.
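// For example, for a PopCount whose operand lives in ecx and whose result goes to eax, this emits roughly:
//   xor    eax, eax      ; zeroing idiom, eliminated by the renamer, breaks the false dependency
//   popcnt eax, ecx
// whereas for `popcnt eax, eax` the xor is skipped because the dependency on eax is real.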
2422 GenTree* op1 = node->gtGetOp1();
2423 regNumber sourceReg1 = REG_NA;
2424 regNumber sourceReg2 = REG_NA;
2426 if (!op1->isContained())
2428 sourceReg1 = op1->gtRegNum;
2430 else if (op1->isIndir())
2432 GenTreeIndir* indir = op1->AsIndir();
2433 GenTree* memBase = indir->Base();
2435 if (memBase != nullptr)
2437 sourceReg1 = memBase->gtRegNum;
2440 if (indir->HasIndex())
2442 sourceReg2 = indir->Index()->gtRegNum;
2446 regNumber targetReg = node->gtRegNum;
2447 if ((targetReg != sourceReg1) && (targetReg != sourceReg2))
2449 getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
2451 genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2454 #endif // FEATURE_HW_INTRINSICS