1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
8 XX Intel hardware intrinsic Code Generator XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
18 #ifdef FEATURE_HW_INTRINSICS
22 #include "sideeffects.h"
25 #include "gcinfoencoder.h"
27 //------------------------------------------------------------------------
28 // assertIsContainableHWIntrinsicOp: Asserts that op is containable by node
31 // lowering - The lowering phase from the compiler
32 // node - The HWIntrinsic node that has the contained node
33 // op - The op that is contained
35 static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op)
38 // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register allocation
39 // logic. However, this check is still important due to the various containment rules that SIMD intrinsics follow.
41 // We use isContainable to track the special HWIntrinsic node containment rules (for things like LoadAligned and
42 // LoadUnaligned) and we use the supportsRegOptional check to support general-purpose loads (both from the stack
44 // and for isUsedFromMemory contained nodes, in the case where the register allocator decided to not allocate a
46 // register in the first place).
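// For example, when a LoadAligned node is contained by an Add intrinsic, codegen can fold the
// load and emit "addps xmm0, [mem]" instead of first materializing the loaded vector in a register.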
48 bool supportsRegOptional = false;
49 bool isContainable = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional);
50 assert(isContainable || supportsRegOptional);
54 //------------------------------------------------------------------------
55 // genIsTableDrivenHWIntrinsic:
58 // category - category of a HW intrinsic
61 // returns true if this category can be table-driven in CodeGen
63 static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicCategory category)
65 // TODO - move more categories to the table-driven framework
66 // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen
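// For example, a simple packed add maps to a single instruction and can be emitted generically from
// the table, while intrinsics like SSE42 Crc32 (HW_Category_Scalar) or the Compare*OrderedScalar
// family (HW_Flag_MultiIns) fall through to the hand-written codegen below.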
67 const bool tableDrivenCategory =
68 (category != HW_Category_Special) && (category != HW_Category_Scalar) && (category != HW_Category_Helper);
69 const bool tableDrivenFlag =
70 !HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId) && !HWIntrinsicInfo::HasSpecialCodegen(intrinsicId);
71 return tableDrivenCategory && tableDrivenFlag;
74 //------------------------------------------------------------------------
75 // genHWIntrinsic: Generates the code for a given hardware intrinsic node.
78 // node - The hardware intrinsic node
80 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
82 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
83 InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsicId);
84 HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId);
85 int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
86 int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
88 assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));
90 if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
92 GenTree* op1 = node->gtGetOp1();
93 GenTree* op2 = node->gtGetOp2();
94 regNumber targetReg = node->gtRegNum;
95 var_types targetType = node->TypeGet();
96 var_types baseType = node->gtSIMDBaseType;
98 regNumber op1Reg = REG_NA;
99 regNumber op2Reg = REG_NA;
100 emitter* emit = getEmitter();
102 assert(numArgs >= 0);
103 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
104 assert(ins != INS_invalid);
105 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
106 assert(simdSize != 0);
112 genConsumeOperands(node);
113 op1Reg = op1->gtRegNum;
115 if (node->OperIsMemoryLoad())
117 emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
119 else if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
121 emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
123 else if ((ival != -1) && varTypeIsFloating(baseType))
125 assert((ival >= 0) && (ival <= 127));
126 genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
130 genHWIntrinsic_R_RM(node, ins, simdSize);
137 genConsumeOperands(node);
139 op1Reg = op1->gtRegNum;
140 op2Reg = op2->gtRegNum;
142 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
144 // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
146 // For non-commutative intrinsics, we should have ensured that op2 was marked
147 // delay free in order to prevent it from getting assigned the same register
148 // as target. However, for commutative intrinsics, we can just swap the operands
149 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
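// For example, with a commutative intrinsic like packed Add where op1 is in xmm1 and op2 (and the
// target) is in xmm0, swapping the operands lets us emit "addps xmm0, xmm1" with no extra move.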
151 noway_assert(node->OperIsCommutative());
156 if (category == HW_Category_MemoryStore)
158 emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0);
160 else if ((ival != -1) && varTypeIsFloating(baseType))
162 assert((ival >= 0) && (ival <= 127));
163 genHWIntrinsic_R_R_RM_I(node, ins, ival);
165 else if (category == HW_Category_MemoryLoad)
167 if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
169 emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op2Reg, op1Reg);
173 emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
176 else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
180 if (intrinsicId == NI_SSE2_Extract)
182 // the extract instructions return their result in a general-purpose register, so int size is needed as the emit size
183 simdSize = emitTypeSize(TYP_INT);
186 auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };
188 if (op2->IsCnsIntOrI())
190 ssize_t ival = op2->AsIntCon()->IconValue();
191 assert((ival >= 0) && (ival <= 255));
192 emitSwCase((int8_t)ival);
196 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
197 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
198 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
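// (e.g. calling Sse2.Extract with an index that is a method parameter rather than a literal)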
199 regNumber baseReg = node->ExtractTempReg();
200 regNumber offsReg = node->GetSingleTempReg();
201 genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase);
206 genHWIntrinsic_R_R_RM(node, ins, EA_ATTR(node->gtSIMDSize));
213 assert(op1->OperIsList());
214 assert(op1->gtGetOp2()->OperIsList());
215 assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
217 GenTreeArgList* argList = op1->AsArgList();
218 op1 = argList->Current();
220 op1Reg = op1->gtRegNum;
222 argList = argList->Rest();
223 op2 = argList->Current();
225 op2Reg = op2->gtRegNum;
227 argList = argList->Rest();
228 GenTree* op3 = argList->Current();
230 regNumber op3Reg = op3->gtRegNum;
232 if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
236 auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, i); };
238 if (op3->IsCnsIntOrI())
240 ssize_t ival = op3->AsIntCon()->IconValue();
241 assert((ival >= 0) && (ival <= 255));
242 emitSwCase((int8_t)ival);
246 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
247 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
248 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
249 regNumber baseReg = node->ExtractTempReg();
250 regNumber offsReg = node->GetSingleTempReg();
251 genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase);
254 else if (category == HW_Category_MemoryStore)
256 if (intrinsicId == NI_AVX_MaskStore || intrinsicId == NI_AVX2_MaskStore)
258 emit->emitIns_AR_R_R(ins, simdSize, op2Reg, op3Reg, op1Reg, 0);
262 assert(intrinsicId == NI_SSE2_MaskMove);
263 assert(targetReg == REG_NA);
265 // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
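// (maskmovdqu implicitly stores through the address in (R/E)DI, so the destination pointer
// must be moved there if the register allocator placed it elsewhere)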
266 if (op3Reg != REG_EDI)
268 emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
270 emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
277 case NI_SSE41_BlendVariable:
278 case NI_AVX_BlendVariable:
279 case NI_AVX2_BlendVariable:
281 genHWIntrinsic_R_R_RM_R(node, ins);
305 case InstructionSet_Base:
306 genBaseIntrinsic(node);
308 case InstructionSet_SSE:
309 case InstructionSet_SSE_X64:
310 genSSEIntrinsic(node);
312 case InstructionSet_SSE2:
313 case InstructionSet_SSE2_X64:
314 genSSE2Intrinsic(node);
316 case InstructionSet_SSE41:
317 case InstructionSet_SSE41_X64:
318 genSSE41Intrinsic(node);
320 case InstructionSet_SSE42:
321 case InstructionSet_SSE42_X64:
322 genSSE42Intrinsic(node);
324 case InstructionSet_AVX:
325 case InstructionSet_AVX2:
326 genAvxOrAvx2Intrinsic(node);
328 case InstructionSet_AES:
329 genAESIntrinsic(node);
331 case InstructionSet_BMI1:
332 case InstructionSet_BMI1_X64:
333 case InstructionSet_BMI2:
334 case InstructionSet_BMI2_X64:
335 genBMI1OrBMI2Intrinsic(node);
337 case InstructionSet_FMA:
338 genFMAIntrinsic(node);
340 case InstructionSet_LZCNT:
341 case InstructionSet_LZCNT_X64:
342 genLZCNTIntrinsic(node);
344 case InstructionSet_PCLMULQDQ:
345 genPCLMULQDQIntrinsic(node);
347 case InstructionSet_POPCNT:
348 case InstructionSet_POPCNT_X64:
349 genPOPCNTIntrinsic(node);
357 //------------------------------------------------------------------------
358 // genHWIntrinsic_R_RM: Generates the code for a hardware intrinsic node that takes a
359 // register/memory operand and that returns a value in a register
362 // node - The hardware intrinsic node
363 // ins - The instruction being generated
364 // attr - The emit attribute for the instruction being generated
366 void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
368 var_types targetType = node->TypeGet();
369 regNumber targetReg = node->gtRegNum;
370 GenTree* op1 = node->gtGetOp1();
371 GenTree* op2 = node->gtGetOp2();
372 emitter* emit = getEmitter();
376 // The Compare*OrderedScalar and Compare*UnorderedScalar intrinsics come down this
377 // code path. They are all MultiIns, as the return value comes from the flags and
378 // we have two operands here instead of the usual one.
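// (e.g. for CompareEqualOrderedScalar the (u)comiss emitted here only sets EFLAGS; the caller,
// such as genSSEIntrinsic, then materializes the boolean result with setnp/sete/and/movzx)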
380 assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId));
381 assert(targetReg != REG_NA);
383 targetReg = op1->gtRegNum;
389 assert(!node->OperIsCommutative());
392 assert(targetReg != REG_NA);
393 assert(op2 == nullptr);
395 if (op1->isContained() || op1->isUsedFromSpillTemp())
397 assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
398 assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
400 TempDsc* tmpDsc = nullptr;
401 unsigned varNum = BAD_VAR_NUM;
402 unsigned offset = (unsigned)-1;
404 if (op1->isUsedFromSpillTemp())
406 assert(op1->IsRegOptional());
408 tmpDsc = getSpillTempDsc(op1);
409 varNum = tmpDsc->tdTempNum();
412 regSet.tmpRlsTemp(tmpDsc);
414 else if (op1->OperIsHWIntrinsic())
416 emit->emitIns_R_AR(ins, attr, targetReg, op1->gtGetOp1()->gtRegNum, 0);
419 else if (op1->isIndir())
421 GenTreeIndir* memIndir = op1->AsIndir();
422 GenTree* memBase = memIndir->gtOp1;
424 switch (memBase->OperGet())
426 case GT_LCL_VAR_ADDR:
428 varNum = memBase->AsLclVarCommon()->GetLclNum();
431 // Ensure that all the GenTreeIndir values are set to their defaults.
432 assert(!memIndir->HasIndex());
433 assert(memIndir->Scale() == 1);
434 assert(memIndir->Offset() == 0);
439 case GT_CLS_VAR_ADDR:
441 emit->emitIns_R_C(ins, attr, targetReg, memBase->gtClsVar.gtClsVarHnd, 0);
447 emit->emitIns_R_A(ins, attr, targetReg, memIndir);
454 switch (op1->OperGet())
458 GenTreeLclFld* lclField = op1->AsLclFld();
460 varNum = lclField->GetLclNum();
461 offset = lclField->gtLclFld.gtLclOffs;
467 assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
468 varNum = op1->AsLclVar()->GetLclNum();
481 // Ensure we got a good varNum and offset.
482 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
483 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
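// (BAD_VAR_NUM is UINT_MAX, which is the same bit pattern as (unsigned)-1)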
484 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
485 assert(offset != (unsigned)-1);
487 emit->emitIns_R_S(ins, attr, targetReg, varNum, offset);
491 regNumber op1Reg = op1->gtRegNum;
492 emit->emitIns_R_R(ins, attr, targetReg, op1Reg);
496 //------------------------------------------------------------------------
497 // genHWIntrinsic_R_RM_I: Generates the code for a hardware intrinsic node that takes a register/memory operand,
498 // an immediate operand, and that returns a value in a register
501 // node - The hardware intrinsic node
502 // ins - The instruction being generated
503 // ival - The immediate value
505 void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
507 var_types targetType = node->TypeGet();
508 regNumber targetReg = node->gtRegNum;
509 GenTree* op1 = node->gtGetOp1();
510 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
511 emitter* emit = getEmitter();
513 // TODO-XArch-CQ: Commutative operations can have op1 be contained
514 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
516 assert(targetReg != REG_NA);
517 assert(!node->OperIsCommutative()); // One operand intrinsics cannot be commutative
519 if (op1->isContained() || op1->isUsedFromSpillTemp())
521 assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
522 assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
524 TempDsc* tmpDsc = nullptr;
525 unsigned varNum = BAD_VAR_NUM;
526 unsigned offset = (unsigned)-1;
528 if (op1->isUsedFromSpillTemp())
530 assert(op1->IsRegOptional());
532 tmpDsc = getSpillTempDsc(op1);
533 varNum = tmpDsc->tdTempNum();
536 regSet.tmpRlsTemp(tmpDsc);
538 else if (op1->OperIsHWIntrinsic())
540 emit->emitIns_R_AR_I(ins, simdSize, targetReg, op1->gtGetOp1()->gtRegNum, 0, ival);
543 else if (op1->isIndir())
545 GenTreeIndir* memIndir = op1->AsIndir();
546 GenTree* memBase = memIndir->gtOp1;
548 switch (memBase->OperGet())
550 case GT_LCL_VAR_ADDR:
552 varNum = memBase->AsLclVarCommon()->GetLclNum();
555 // Ensure that all the GenTreeIndir values are set to their defaults.
556 assert(!memIndir->HasIndex());
557 assert(memIndir->Scale() == 1);
558 assert(memIndir->Offset() == 0);
563 case GT_CLS_VAR_ADDR:
565 emit->emitIns_R_C_I(ins, simdSize, targetReg, memBase->gtClsVar.gtClsVarHnd, 0, ival);
571 emit->emitIns_R_A_I(ins, simdSize, targetReg, memIndir, ival);
578 switch (op1->OperGet())
582 GenTreeLclFld* lclField = op1->AsLclFld();
584 varNum = lclField->GetLclNum();
585 offset = lclField->gtLclFld.gtLclOffs;
591 assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
592 varNum = op1->AsLclVar()->GetLclNum();
603 // Ensure we got a good varNum and offset.
604 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
605 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
606 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
607 assert(offset != (unsigned)-1);
609 emit->emitIns_R_S_I(ins, simdSize, targetReg, varNum, offset, ival);
613 regNumber op1Reg = op1->gtRegNum;
614 emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, ival);
618 //------------------------------------------------------------------------
619 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
620 // register/memory operand, and that returns a value in a register
623 // node - The hardware intrinsic node
624 // ins - The instruction being generated
625 // attr - The emit attribute for the instruction being generated
627 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
629 regNumber targetReg = node->gtRegNum;
630 GenTree* op1 = node->gtGetOp1();
631 GenTree* op2 = node->gtGetOp2();
632 regNumber op1Reg = op1->gtRegNum;
634 assert(targetReg != REG_NA);
635 assert(op1Reg != REG_NA);
637 genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2);
640 //------------------------------------------------------------------------
641 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
642 // register/memory operand, and that returns a value in a register
645 // node - The hardware intrinsic node
646 // ins - The instruction being generated
647 // attr - The emit attribute for the instruction being generated
648 // targetReg - The register allocated to the result
649 // op1Reg - The register allocated to the first operand
650 // op2 - Another operand that may be in a register or memory
652 void CodeGen::genHWIntrinsic_R_R_RM(
653 GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2)
655 emitter* emit = getEmitter();
657 // TODO-XArch-CQ: Commutative operations can have op1 be contained
658 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
660 assert(targetReg != REG_NA);
661 assert(op1Reg != REG_NA);
663 if (op2->isContained() || op2->isUsedFromSpillTemp())
665 assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
666 assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
668 TempDsc* tmpDsc = nullptr;
669 unsigned varNum = BAD_VAR_NUM;
670 unsigned offset = (unsigned)-1;
672 if (op2->isUsedFromSpillTemp())
674 assert(op2->IsRegOptional());
676 tmpDsc = getSpillTempDsc(op2);
677 varNum = tmpDsc->tdTempNum();
680 regSet.tmpRlsTemp(tmpDsc);
682 else if (op2->OperIsHWIntrinsic())
684 emit->emitIns_SIMD_R_R_AR(ins, attr, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum);
687 else if (op2->isIndir())
689 GenTreeIndir* memIndir = op2->AsIndir();
690 GenTree* memBase = memIndir->gtOp1;
692 switch (memBase->OperGet())
694 case GT_LCL_VAR_ADDR:
696 varNum = memBase->AsLclVarCommon()->GetLclNum();
699 // Ensure that all the GenTreeIndir values are set to their defaults.
700 assert(!memIndir->HasIndex());
701 assert(memIndir->Scale() == 1);
702 assert(memIndir->Offset() == 0);
707 case GT_CLS_VAR_ADDR:
709 emit->emitIns_SIMD_R_R_C(ins, attr, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
715 emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, memIndir);
722 switch (op2->OperGet())
726 GenTreeLclFld* lclField = op2->AsLclFld();
728 varNum = lclField->GetLclNum();
729 offset = lclField->gtLclFld.gtLclOffs;
735 assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
736 varNum = op2->AsLclVar()->GetLclNum();
747 // Ensure we got a good varNum and offset.
748 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
749 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
750 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
751 assert(offset != (unsigned)-1);
753 emit->emitIns_SIMD_R_R_S(ins, attr, targetReg, op1Reg, varNum, offset);
757 regNumber op2Reg = op2->gtRegNum;
759 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
761 // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
763 // For non-commutative intrinsics, we should have ensured that op2 was marked
764 // delay free in order to prevent it from getting assigned the same register
765 // as target. However, for commutative intrinsics, we can just swap the operands
766 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
768 noway_assert(node->OperIsCommutative());
773 emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg);
777 //------------------------------------------------------------------------
778 // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
779 // register/memory operand, an immediate operand, and that returns a value in a register
782 // node - The hardware intrinsic node
783 // ins - The instruction being generated
784 // ival - The immediate value
786 void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
788 var_types targetType = node->TypeGet();
789 regNumber targetReg = node->gtRegNum;
790 GenTree* op1 = node->gtGetOp1();
791 GenTree* op2 = node->gtGetOp2();
792 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
793 emitter* emit = getEmitter();
795 // TODO-XArch-CQ: Commutative operations can have op1 be contained
796 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
798 if (op1->OperIsList())
800 assert(op2 == nullptr);
802 GenTreeArgList* argList = op1->AsArgList();
804 op1 = argList->Current();
805 argList = argList->Rest();
807 op2 = argList->Current();
808 argList = argList->Rest();
810 assert(argList->Current() != nullptr);
811 assert(argList->Rest() == nullptr);
814 regNumber op1Reg = op1->gtRegNum;
816 assert(targetReg != REG_NA);
817 assert(op1Reg != REG_NA);
819 if (op2->isContained() || op2->isUsedFromSpillTemp())
821 assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
822 assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
824 TempDsc* tmpDsc = nullptr;
825 unsigned varNum = BAD_VAR_NUM;
826 unsigned offset = (unsigned)-1;
828 if (op2->isUsedFromSpillTemp())
830 assert(op2->IsRegOptional());
832 tmpDsc = getSpillTempDsc(op2);
833 varNum = tmpDsc->tdTempNum();
836 regSet.tmpRlsTemp(tmpDsc);
838 else if (op2->OperIsHWIntrinsic())
840 emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
843 else if (op2->isIndir())
845 GenTreeIndir* memIndir = op2->AsIndir();
846 GenTree* memBase = memIndir->gtOp1;
848 switch (memBase->OperGet())
850 case GT_LCL_VAR_ADDR:
852 varNum = memBase->AsLclVarCommon()->GetLclNum();
855 // Ensure that all the GenTreeIndir values are set to their defaults.
856 assert(!memIndir->HasIndex());
857 assert(memIndir->Scale() == 1);
858 assert(memIndir->Offset() == 0);
863 case GT_CLS_VAR_ADDR:
865 emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
872 emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
879 switch (op2->OperGet())
883 GenTreeLclFld* lclField = op2->AsLclFld();
885 varNum = lclField->GetLclNum();
886 offset = lclField->gtLclFld.gtLclOffs;
892 assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
893 varNum = op2->AsLclVar()->GetLclNum();
904 // Ensure we got a good varNum and offset.
905 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
906 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
907 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
908 assert(offset != (unsigned)-1);
910 emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
914 regNumber op2Reg = op2->gtRegNum;
916 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
918 // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
920 // For non-commutative intrinsics, we should have ensured that op2 was marked
921 // delay free in order to prevent it from getting assigned the same register
922 // as target. However, for commutative intrinsics, we can just swap the operands
923 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
925 noway_assert(node->OperIsCommutative());
930 emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
934 //------------------------------------------------------------------------
935 // genHWIntrinsic_R_R_RM_R: Generates the code for a hardware intrinsic node that takes a register operand, a
936 // register/memory operand, another register operand, and that returns a value in a register
939 // node - The hardware intrinsic node
940 // ins - The instruction being generated
942 void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins)
944 var_types targetType = node->TypeGet();
945 regNumber targetReg = node->gtRegNum;
946 GenTree* op1 = node->gtGetOp1();
947 GenTree* op2 = node->gtGetOp2();
948 GenTree* op3 = nullptr;
949 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
950 emitter* emit = getEmitter();
952 assert(op1->OperIsList());
953 assert(op2 == nullptr);
955 GenTreeArgList* argList = op1->AsArgList();
957 op1 = argList->Current();
958 argList = argList->Rest();
960 op2 = argList->Current();
961 argList = argList->Rest();
963 op3 = argList->Current();
964 assert(argList->Rest() == nullptr);
966 regNumber op1Reg = op1->gtRegNum;
967 regNumber op3Reg = op3->gtRegNum;
969 assert(targetReg != REG_NA);
970 assert(op1Reg != REG_NA);
971 assert(op3Reg != REG_NA);
973 if (op2->isContained() || op2->isUsedFromSpillTemp())
975 assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
976 assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
978 TempDsc* tmpDsc = nullptr;
979 unsigned varNum = BAD_VAR_NUM;
980 unsigned offset = (unsigned)-1;
982 if (op2->isUsedFromSpillTemp())
984 assert(op2->IsRegOptional());
986 // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
987 // pattern. It could probably be extracted to its own method.
988 tmpDsc = getSpillTempDsc(op2);
989 varNum = tmpDsc->tdTempNum();
992 regSet.tmpRlsTemp(tmpDsc);
994 else if (op2->OperIsHWIntrinsic())
996 emit->emitIns_SIMD_R_R_AR_R(ins, simdSize, targetReg, op1Reg, op3Reg, op2->gtGetOp1()->gtRegNum);
999 else if (op2->isIndir())
1001 GenTreeIndir* memIndir = op2->AsIndir();
1002 GenTree* memBase = memIndir->gtOp1;
1004 switch (memBase->OperGet())
1006 case GT_LCL_VAR_ADDR:
1008 varNum = memBase->AsLclVarCommon()->GetLclNum();
1011 // Ensure that all the GenTreeIndir values are set to their defaults.
1012 assert(!memIndir->HasIndex());
1013 assert(memIndir->Scale() == 1);
1014 assert(memIndir->Offset() == 0);
1019 case GT_CLS_VAR_ADDR:
1021 emit->emitIns_SIMD_R_R_C_R(ins, simdSize, targetReg, op1Reg, op3Reg, memBase->gtClsVar.gtClsVarHnd,
1028 emit->emitIns_SIMD_R_R_A_R(ins, simdSize, targetReg, op1Reg, op3Reg, memIndir);
1035 switch (op2->OperGet())
1039 GenTreeLclFld* lclField = op2->AsLclFld();
1041 varNum = lclField->GetLclNum();
1042 offset = lclField->gtLclFld.gtLclOffs;
1048 assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
1049 varNum = op2->AsLclVar()->GetLclNum();
1060 // Ensure we got a good varNum and offset.
1061 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
1062 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
1063 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
1064 assert(offset != (unsigned)-1);
1066 emit->emitIns_SIMD_R_R_S_R(ins, simdSize, targetReg, op1Reg, op3Reg, varNum, offset);
1070 emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, op3Reg);
1074 //------------------------------------------------------------------------
1075 // genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
1076 // a register/memory operand, and that returns a value in a register
1079 // ins - The instruction being generated
1080 // attr - The emit attribute
1081 // targetReg - The target register
1082 // op1Reg - The register of the first operand
1083 // op2Reg - The register of the second operand
1084 // op3 - The third operand
1086 void CodeGen::genHWIntrinsic_R_R_R_RM(
1087 instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
1089 assert(targetReg != REG_NA);
1090 assert(op1Reg != REG_NA);
1091 assert(op2Reg != REG_NA);
1093 emitter* emit = getEmitter();
1095 if (op3->isContained() || op3->isUsedFromSpillTemp())
1097 TempDsc* tmpDsc = nullptr;
1098 unsigned varNum = BAD_VAR_NUM;
1099 unsigned offset = (unsigned)-1;
1101 if (op3->isUsedFromSpillTemp())
1103 assert(op3->IsRegOptional());
1105 // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
1106 // pattern. It could probably be extracted to its own method.
1107 tmpDsc = getSpillTempDsc(op3);
1108 varNum = tmpDsc->tdTempNum();
1111 regSet.tmpRlsTemp(tmpDsc);
1113 else if (op3->OperIsHWIntrinsic())
1115 emit->emitIns_SIMD_R_R_R_AR(ins, attr, targetReg, op1Reg, op2Reg, op3->gtGetOp1()->gtRegNum);
1118 else if (op3->isIndir())
1120 GenTreeIndir* memIndir = op3->AsIndir();
1121 GenTree* memBase = memIndir->gtOp1;
1123 switch (memBase->OperGet())
1125 case GT_LCL_VAR_ADDR:
1127 varNum = memBase->AsLclVarCommon()->GetLclNum();
1130 // Ensure that all the GenTreeIndir values are set to their defaults.
1131 assert(!memIndir->HasIndex());
1132 assert(memIndir->Scale() == 1);
1133 assert(memIndir->Offset() == 0);
1138 case GT_CLS_VAR_ADDR:
1140 emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, memBase->gtClsVar.gtClsVarHnd, 0);
1146 emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
1153 switch (op3->OperGet())
1157 GenTreeLclFld* lclField = op3->AsLclFld();
1159 varNum = lclField->GetLclNum();
1160 offset = lclField->gtLclFld.gtLclOffs;
1166 assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
1167 varNum = op3->AsLclVar()->GetLclNum();
1178 // Ensure we got a good varNum and offset.
1179 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
1180 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
1181 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
1182 assert(offset != (unsigned)-1);
1184 emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
1188 emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
1192 // genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
1193 // with non-constant argument
1196 // intrinsic - intrinsic ID
1197 // nonConstImmReg - the register that contains the non-constant imm8 argument
1198 // baseReg - a register for the start of the switch table
1199 // offsReg - a register for the offset into the switch table
1200 // emitSwCase - the lambda to generate a switch case
1203 // generate the jump-table fallback for imm-intrinsics with non-constant argument.
1205 // This function can be used for all imm-intrinsics (whether full-range or not).
1206 // The compiler front-end (i.e. the importer) is responsible for inserting a range-check IR node
1207 // (GT_HW_INTRINSIC_CHK) for the imm8 argument, so this function does not need to perform a range check.
1209 template <typename HWIntrinsicSwitchCaseBody>
1210 void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic,
1211 regNumber nonConstImmReg,
1214 HWIntrinsicSwitchCaseBody emitSwCase)
1216 assert(nonConstImmReg != REG_NA);
1217 // AVX2 Gather intrinsics use a managed non-const fallback since they have a discrete imm8 value range
1218 // that does not work with the compiler-generated jump-table fallback
1219 assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
1220 emitter* emit = getEmitter();
1222 const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
1223 assert(maxByte <= 256);
1224 BasicBlock* jmpTable[256];
1226 unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
1227 unsigned jmpTableOffs = 0;
1229 // Emit the jump table
1230 for (unsigned i = 0; i < maxByte; i++)
1232 jmpTable[i] = genCreateTempLabel();
1233 emit->emitDataGenData(i, jmpTable[i]);
1236 emit->emitDataGenEnd();
1238 // Compute and jump to the appropriate offset in the switch table
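// The emitted dispatch sequence is roughly:
//   lea  offsReg, [jump table data]
//   mov  offsReg, dword ptr [offsReg + nonConstImmReg * 4]  ; 32-bit offset of the selected case
//   lea  baseReg, [first basic block of the method]
//   add  offsReg, baseReg
//   jmp  offsReg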
1239 emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
1241 emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
1242 emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
1243 emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
1244 emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
1246 // Emit the switch table entries
1248 BasicBlock* switchTableBeg = genCreateTempLabel();
1249 BasicBlock* switchTableEnd = genCreateTempLabel();
1251 genDefineTempLabel(switchTableBeg);
1253 for (unsigned i = 0; i < maxByte; i++)
1255 genDefineTempLabel(jmpTable[i]);
1256 emitSwCase((int8_t)i);
1257 emit->emitIns_J(INS_jmp, switchTableEnd);
1260 genDefineTempLabel(switchTableEnd);
1263 //------------------------------------------------------------------------
1264 // genBaseIntrinsic: Generates the code for a base hardware intrinsic node
1267 // node - The hardware intrinsic node
1270 // We currently assume that all base intrinsics have at most one operand.
1272 void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
1274 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1275 regNumber targetReg = node->gtRegNum;
1276 var_types targetType = node->TypeGet();
1277 var_types baseType = node->gtSIMDBaseType;
1279 assert(compiler->compSupports(InstructionSet_SSE));
1280 assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));
1282 GenTree* op1 = node->gtGetOp1();
1283 regNumber op1Reg = REG_NA;
1287 assert(!op1->OperIsList());
1288 op1Reg = op1->gtRegNum;
1289 genConsumeOperands(node);
1292 assert(node->gtGetOp2() == nullptr);
1294 emitter* emit = getEmitter();
1295 emitAttr attr = EA_ATTR(node->gtSIMDSize);
1296 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1298 switch (intrinsicId)
1300 case NI_Base_Vector128_CreateScalarUnsafe:
1301 case NI_Base_Vector256_CreateScalarUnsafe:
1303 if (varTypeIsIntegral(baseType))
1305 genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType));
1309 assert(varTypeIsFloating(baseType));
1311 attr = emitTypeSize(baseType);
1313 if (op1->isContained() || op1->isUsedFromSpillTemp())
1315 genHWIntrinsic_R_RM(node, ins, attr);
1317 else if (targetReg != op1Reg)
1319 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1320 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1326 case NI_Base_Vector128_ToScalar:
1327 case NI_Base_Vector256_ToScalar:
1329 assert(varTypeIsFloating(baseType));
1331 attr = emitTypeSize(TYP_SIMD16);
1333 if (op1->isContained() || op1->isUsedFromSpillTemp())
1335 genHWIntrinsic_R_RM(node, ins, attr);
1337 else if (targetReg != op1Reg)
1339 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1340 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1345 case NI_Base_Vector128_ToVector256:
1347 // ToVector256 has zero-extend semantics in order to ensure it is deterministic
1348 // We always emit a move to the target register, even when op1Reg == targetReg,
1349 // in order to ensure that Bits MAXVL-1:128 are zeroed.
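// (when AVX is available the 128-bit movaps below is VEX-encoded, and a VEX-encoded 128-bit move
// zeroes the upper bits of the destination, so even a same-register move is not a no-op here)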
1351 attr = emitTypeSize(TYP_SIMD16);
1353 if (op1->isContained() || op1->isUsedFromSpillTemp())
1355 genHWIntrinsic_R_RM(node, ins, attr);
1359 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1360 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1365 case NI_Base_Vector128_ToVector256Unsafe:
1366 case NI_Base_Vector256_GetLower:
1368 if (op1->isContained() || op1->isUsedFromSpillTemp())
1370 genHWIntrinsic_R_RM(node, ins, attr);
1372 else if (targetReg != op1Reg)
1374 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1375 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1380 case NI_Base_Vector128_Zero:
1381 case NI_Base_Vector256_Zero:
1383 assert(op1 == nullptr);
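// The table instruction here is a self-xor (e.g. xorps targetReg, targetReg), the standard
// idiom for zeroing a SIMD register without needing a source operand.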
1384 emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
1395 genProduceReg(node);
1398 //------------------------------------------------------------------------
1399 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
1402 // node - The hardware intrinsic node
1404 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
1406 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1407 GenTree* op1 = node->gtGetOp1();
1408 GenTree* op2 = node->gtGetOp2();
1409 GenTree* op3 = nullptr;
1410 GenTree* op4 = nullptr;
1411 regNumber targetReg = node->gtRegNum;
1412 var_types targetType = node->TypeGet();
1413 var_types baseType = node->gtSIMDBaseType;
1415 regNumber op1Reg = REG_NA;
1416 regNumber op2Reg = REG_NA;
1417 regNumber op3Reg = REG_NA;
1418 regNumber op4Reg = REG_NA;
1419 emitter* emit = getEmitter();
1421 if ((op1 != nullptr) && !op1->OperIsList())
1423 op1Reg = op1->gtRegNum;
1424 genConsumeOperands(node);
1427 switch (intrinsicId)
1429 case NI_SSE_CompareEqualOrderedScalar:
1430 case NI_SSE_CompareEqualUnorderedScalar:
1432 assert(baseType == TYP_FLOAT);
1433 regNumber tmpReg = node->GetSingleTempReg();
1434 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1436 // Ensure we aren't overwriting targetReg
1437 assert(tmpReg != targetReg);
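// (u)comiss sets PF for an unordered (NaN) comparison and ZF for equality, so combining
// setnp (ordered) with sete (equal) via "and" yields the boolean result.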
1439 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1440 emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1441 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1442 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1443 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1447 case NI_SSE_CompareGreaterThanOrderedScalar:
1448 case NI_SSE_CompareGreaterThanUnorderedScalar:
1450 assert(baseType == TYP_FLOAT);
1451 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1453 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1454 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1455 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1459 case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
1460 case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
1462 assert(baseType == TYP_FLOAT);
1463 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1465 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1466 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1467 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1471 case NI_SSE_CompareLessThanOrderedScalar:
1472 case NI_SSE_CompareLessThanUnorderedScalar:
1474 assert(baseType == TYP_FLOAT);
1475 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1477 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1478 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1479 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1483 case NI_SSE_CompareLessThanOrEqualOrderedScalar:
1484 case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
1486 assert(baseType == TYP_FLOAT);
1487 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1489 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1490 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1491 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1495 case NI_SSE_CompareNotEqualOrderedScalar:
1496 case NI_SSE_CompareNotEqualUnorderedScalar:
1498 assert(baseType == TYP_FLOAT);
1499 regNumber tmpReg = node->GetSingleTempReg();
1500 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1502 // Ensure we aren't overwriting targetReg
1503 assert(tmpReg != targetReg);
1505 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1506 emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1507 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1508 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1509 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1513 case NI_SSE_X64_ConvertScalarToVector128Single:
1515 assert(baseType == TYP_LONG);
1516 assert(op1 != nullptr);
1517 assert(op2 != nullptr);
1518 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1519 genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1523 case NI_SSE_Prefetch0:
1524 case NI_SSE_Prefetch1:
1525 case NI_SSE_Prefetch2:
1526 case NI_SSE_PrefetchNonTemporal:
1528 assert(baseType == TYP_UBYTE);
1529 assert(op2 == nullptr);
1531 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1532 emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
1536 case NI_SSE_StoreFence:
1538 assert(baseType == TYP_VOID);
1539 assert(op1 == nullptr);
1540 assert(op2 == nullptr);
1541 emit->emitIns(INS_sfence);
1550 genProduceReg(node);
1553 //------------------------------------------------------------------------
1554 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
1557 // node - The hardware intrinsic node
1559 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
1561 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1562 GenTree* op1 = node->gtGetOp1();
1563 GenTree* op2 = node->gtGetOp2();
1564 regNumber targetReg = node->gtRegNum;
1565 var_types targetType = node->TypeGet();
1566 var_types baseType = node->gtSIMDBaseType;
1567 regNumber op1Reg = REG_NA;
1568 regNumber op2Reg = REG_NA;
1569 emitter* emit = getEmitter();
1571 if ((op1 != nullptr) && !op1->OperIsList())
1573 op1Reg = op1->gtRegNum;
1574 genConsumeOperands(node);
1577 switch (intrinsicId)
1579 // All integer overloads are handled by table codegen
1580 case NI_SSE2_CompareLessThan:
1582 assert(op1 != nullptr);
1583 assert(op2 != nullptr);
1585 assert(baseType == TYP_DOUBLE);
1587 int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
1588 assert((ival >= 0) && (ival <= 127));
1590 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1591 op2Reg = op2->gtRegNum;
1592 emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
1597 case NI_SSE2_CompareEqualOrderedScalar:
1598 case NI_SSE2_CompareEqualUnorderedScalar:
1600 assert(baseType == TYP_DOUBLE);
1601 regNumber tmpReg = node->GetSingleTempReg();
1602 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1604 // Ensure we aren't overwriting targetReg
1605 assert(tmpReg != targetReg);
1607 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1608 emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1609 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1610 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1611 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1615 case NI_SSE2_CompareGreaterThanOrderedScalar:
1616 case NI_SSE2_CompareGreaterThanUnorderedScalar:
1618 assert(baseType == TYP_DOUBLE);
1619 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1621 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1622 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1623 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1627 case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
1628 case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
1630 assert(baseType == TYP_DOUBLE);
1631 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1633 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1634 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1635 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1639 case NI_SSE2_CompareLessThanOrderedScalar:
1640 case NI_SSE2_CompareLessThanUnorderedScalar:
1642 assert(baseType == TYP_DOUBLE);
1643 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1645 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1646 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1647 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1651 case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
1652 case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
1654 assert(baseType == TYP_DOUBLE);
1655 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1657 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1658 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1659 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1663 case NI_SSE2_CompareNotEqualOrderedScalar:
1664 case NI_SSE2_CompareNotEqualUnorderedScalar:
1666 assert(baseType == TYP_DOUBLE);
1667 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1668 regNumber tmpReg = node->GetSingleTempReg();
1670 // Ensure we aren't overwriting targetReg
1671 assert(tmpReg != targetReg);
1673 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1674 emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1675 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1676 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1677 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1681 case NI_SSE2_X64_ConvertScalarToVector128Double:
1683 assert(baseType == TYP_LONG);
1684 assert(op1 != nullptr);
1685 assert(op2 != nullptr);
1686 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1687 genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1691 case NI_SSE2_X64_ConvertScalarToVector128Int64:
1692 case NI_SSE2_X64_ConvertScalarToVector128UInt64:
1694 assert(baseType == TYP_LONG || baseType == TYP_ULONG);
1695 assert(op1 != nullptr);
1696 assert(op2 == nullptr);
1697 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1698 genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType));
1702 case NI_SSE2_ConvertToInt32:
1703 case NI_SSE2_ConvertToInt32WithTruncation:
1704 case NI_SSE2_ConvertToUInt32:
1705 case NI_SSE2_X64_ConvertToUInt64:
1706 case NI_SSE2_X64_ConvertToInt64:
1708 assert(op2 == nullptr);
1709 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1711 if (varTypeIsIntegral(baseType))
1713 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1714 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1718 assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT);
1719 genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType));
1724 case NI_SSE2_LoadFence:
1726 assert(baseType == TYP_VOID);
1727 assert(op1 == nullptr);
1728 assert(op2 == nullptr);
1729 emit->emitIns(INS_lfence);
1733 case NI_SSE2_MemoryFence:
1735 assert(baseType == TYP_VOID);
1736 assert(op1 == nullptr);
1737 assert(op2 == nullptr);
1738 emit->emitIns(INS_mfence);
1742 case NI_SSE2_StoreNonTemporal:
1743 case NI_SSE2_X64_StoreNonTemporal:
1745 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1746 assert(op1 != nullptr);
1747 assert(op2 != nullptr);
1749 op2Reg = op2->gtRegNum;
1750 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1751 emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
1760 genProduceReg(node);
1763 //------------------------------------------------------------------------
1764 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1767 // node - The hardware intrinsic node
1769 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1771 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1772 GenTree* op1 = node->gtGetOp1();
1773 GenTree* op2 = node->gtGetOp2();
1774 GenTree* op3 = nullptr;
1775 GenTree* op4 = nullptr;
1776 regNumber targetReg = node->gtRegNum;
1777 var_types targetType = node->TypeGet();
1778 var_types baseType = node->gtSIMDBaseType;
1780 regNumber op1Reg = REG_NA;
1781 regNumber op2Reg = REG_NA;
1782 regNumber op3Reg = REG_NA;
1783 regNumber op4Reg = REG_NA;
1784 emitter* emit = getEmitter();
1786 if ((op1 != nullptr) && !op1->OperIsList())
1788 op1Reg = op1->gtRegNum;
1789 genConsumeOperands(node);
1792 switch (intrinsicId)
1794 case NI_SSE41_TestAllOnes:
1796 regNumber tmpReg = node->GetSingleTempReg();
1797 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
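// pcmpeqd of a register with itself produces all-ones; ptest then sets CF when
// (allOnes AND NOT op1) == 0, i.e. when op1 is all ones, which setb materializes below.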
1798 emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
1799 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
1800 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1801 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1805 case NI_SSE41_TestAllZeros:
1806 case NI_SSE41_TestZ:
1808 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1809 genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1810 emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1811 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1815 case NI_SSE41_TestC:
1817 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1818 genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1819 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1820 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1824 case NI_SSE41_TestMixOnesZeros:
1825 case NI_SSE41_TestNotZAndNotC:
1827 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1828 genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1829 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1830 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1834 case NI_SSE41_Extract:
1835 case NI_SSE41_X64_Extract:
1837 regNumber tmpTargetReg = REG_NA;
1838 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1839 if (baseType == TYP_FLOAT)
1841 tmpTargetReg = node->ExtractTempReg();
1844 auto emitSwCase = [&](int8_t i) {
1845 if (baseType == TYP_FLOAT)
1847 // the extract instructions return their result in a general-purpose register, so int size is needed as the emit size
1848 emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1Reg, i);
1849 emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
1853 emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), targetReg, op1Reg, i);
1857 if (op2->IsCnsIntOrI())
1859 ssize_t ival = op2->AsIntCon()->IconValue();
1860 assert((ival >= 0) && (ival <= 255));
1861 emitSwCase((int8_t)ival);
1865 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
1866 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
1867 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
1868 regNumber baseReg = node->ExtractTempReg();
1869 regNumber offsReg = node->GetSingleTempReg();
1870 genHWIntrinsicJumpTableFallback(intrinsicId, op2->gtRegNum, baseReg, offsReg, emitSwCase);
1880 genProduceReg(node);
1883 //------------------------------------------------------------------------
1884 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1887 // node - The hardware intrinsic node
1889 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
1891 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1892 regNumber targetReg = node->gtRegNum;
1893 GenTree* op1 = node->gtGetOp1();
1894 GenTree* op2 = node->gtGetOp2();
1895 var_types baseType = node->gtSIMDBaseType;
1896 var_types targetType = node->TypeGet();
1897 emitter* emit = getEmitter();
1899 regNumber op1Reg = op1->gtRegNum;
1900 genConsumeOperands(node);
1902 assert(targetReg != REG_NA);
1903 assert(op1Reg != REG_NA);
1904 assert(op2 != nullptr);
1905 assert(!node->OperIsCommutative());
1907 switch (intrinsicId)
1909 case NI_SSE42_Crc32:
1910 case NI_SSE42_X64_Crc32:
1912 if (op1Reg != targetReg)
1914 assert(op2->gtRegNum != targetReg);
1915 emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg);
1918 // This makes the genHWIntrinsic_R_RM code much simpler, as we don't need an
1919 // overload that explicitly takes the operands.
1921 node->gtOp2 = nullptr;
1923 if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument
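// e.g. Crc32(uint, byte) uses the r/m8 form of crc32 here, while the wider data
// overloads below use the full operand size of the result.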
1925 assert(targetType == TYP_INT);
1926 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType));
1930 assert(op1->TypeGet() == op2->TypeGet());
1931 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
1932 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType));
1945 genProduceReg(node);
1948 //------------------------------------------------------------------------
1949 // genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
1952 // node - The hardware intrinsic node
1954 void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
1956 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1957 var_types baseType = node->gtSIMDBaseType;
1958 emitAttr attr = EA_ATTR(node->gtSIMDSize);
1959 var_types targetType = node->TypeGet();
1960 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1961 int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
1962 GenTree* op1 = node->gtGetOp1();
1963 GenTree* op2 = node->gtGetOp2();
1964 regNumber op1Reg = REG_NA;
1965 regNumber op2Reg = REG_NA;
1966 regNumber targetReg = node->gtRegNum;
1967 emitter* emit = getEmitter();
1969 if ((op1 != nullptr) && !op1->OperIsList())
1971 op1Reg = op1->gtRegNum;
1972 genConsumeOperands(node);
1975 switch (intrinsicId)
1977 case NI_AVX2_ConvertToInt32:
1978 case NI_AVX2_ConvertToUInt32:
1980 assert(op2 == nullptr);
1981 assert((baseType == TYP_INT) || (baseType == TYP_UINT));
1982 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1983 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1987 case NI_AVX2_GatherVector128:
1988 case NI_AVX2_GatherVector256:
1989 case NI_AVX2_GatherMaskVector128:
1990 case NI_AVX2_GatherMaskVector256:
1992 GenTreeArgList* list = op1->AsArgList();
1993 op1 = list->Current();
1994 op1Reg = op1->gtRegNum;
1995 genConsumeRegs(op1);
1997 list = list->Rest();
1998 op2 = list->Current();
1999 op2Reg = op2->gtRegNum;
2000 genConsumeRegs(op2);
2002 list = list->Rest();
2003 GenTree* op3 = list->Current();
2004 genConsumeRegs(op3);
2006 list = list->Rest();
2007 GenTree* op4 = nullptr;
2008 GenTree* lastOp = nullptr;
2009 GenTree* indexOp = nullptr;
2011 regNumber op3Reg = REG_NA;
2012 regNumber op4Reg = REG_NA;
2013 regNumber addrBaseReg = REG_NA;
2014 regNumber addrIndexReg = REG_NA;
2015 regNumber maskReg = node->ExtractTempReg(RBM_ALLFLOAT);
2019 assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
2020 op4 = list->Current();
2021 list = list->Rest();
2022 lastOp = list->Current();
2023 op3Reg = op3->gtRegNum;
2024 op4Reg = op4->gtRegNum;
2025 genConsumeRegs(op4);
2026 addrBaseReg = op2Reg;
2027 addrIndexReg = op3Reg;
2030 // copy op4Reg into the tmp mask register,
2031 // because the mask register will be cleared by the gather instruction
2032 emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);
2034 if (targetReg != op1Reg)
2036 // copy source vector to the target register for masking merge
2037 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
2042 assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
2043 addrBaseReg = op1Reg;
2044 addrIndexReg = op2Reg;
2048 // generate all-one mask vector
2049 emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
2052 bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);
2054 // hwintrinsiclistxarch.h uses the dword-index instructions by default
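// e.g. a Vector128 gather with a Vector128<long> index must use vgatherqps/vpgatherqd
// instead of the dword-index form.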
2055 if (varTypeIsLong(node->gtIndexBaseType))
2059 case INS_vpgatherdd:
2060 ins = INS_vpgatherqd;
2061 if (isVector128GatherWithVector256Index)
2063 // YMM index in address mode
2064 attr = emitTypeSize(TYP_SIMD32);
2067 case INS_vpgatherdq:
2068 ins = INS_vpgatherqq;
2070 case INS_vgatherdps:
2071 ins = INS_vgatherqps;
2072 if (isVector128GatherWithVector256Index)
2074 // YMM index in address mode
2075 attr = emitTypeSize(TYP_SIMD32);
2078 case INS_vgatherdpd:
2079 ins = INS_vgatherqpd;
2086 assert(lastOp->IsCnsIntOrI());
2087 ssize_t ival = lastOp->AsIntCon()->IconValue();
2088 assert((ival >= 0) && (ival <= 255));
2090 assert(targetReg != maskReg);
2091 assert(targetReg != addrIndexReg);
2092 assert(maskReg != addrIndexReg);
2093 emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);
2100 genHWIntrinsic_R_RM(node, ins, attr);
2101 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
2102 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2106 case NI_AVX_TestNotZAndNotC:
2108 genHWIntrinsic_R_RM(node, ins, attr);
2109 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
2110 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2116 genHWIntrinsic_R_RM(node, ins, attr);
2117 emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
2118 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2127 genProduceReg(node);
2130 //------------------------------------------------------------------------
2131 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
2134 // node - The hardware intrinsic node
2136 void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
2138 NYI("Implement AES intrinsic code generation");
2141 //------------------------------------------------------------------------
2142 // genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 and BMI2 hardware intrinsic node
2145 // node - The hardware intrinsic node
2147 void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
2149 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2150 regNumber targetReg = node->gtRegNum;
2151 GenTree* op1 = node->gtGetOp1();
2152 GenTree* op2 = node->gtGetOp2();
2153 var_types targetType = node->TypeGet();
2154 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);
2155 emitter* emit = getEmitter();
2157 assert(targetReg != REG_NA);
2158 assert(op1 != nullptr);
2160 if (!op1->OperIsList())
2162 genConsumeOperands(node);
2165 switch (intrinsicId)
2167 case NI_BMI1_AndNot:
2168 case NI_BMI1_X64_AndNot:
2169 case NI_BMI1_BitFieldExtract:
2170 case NI_BMI1_X64_BitFieldExtract:
2171 case NI_BMI2_ParallelBitDeposit:
2172 case NI_BMI2_ParallelBitExtract:
2173 case NI_BMI2_X64_ParallelBitDeposit:
2174 case NI_BMI2_X64_ParallelBitExtract:
2175 case NI_BMI2_ZeroHighBits:
2176 case NI_BMI2_X64_ZeroHighBits:
2178 assert(op2 != nullptr);
2179 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2180 genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2184 case NI_BMI1_ExtractLowestSetBit:
2185 case NI_BMI1_GetMaskUpToLowestSetBit:
2186 case NI_BMI1_ResetLowestSetBit:
2187 case NI_BMI1_X64_ExtractLowestSetBit:
2188 case NI_BMI1_X64_GetMaskUpToLowestSetBit:
2189 case NI_BMI1_X64_ResetLowestSetBit:
2191 assert(op2 == nullptr);
2192 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
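// BLSI/BLSMSK/BLSR take a single explicit source (the destination is encoded separately in the VEX prefix),
// so they use the reg, reg/mem path with no second operand.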
2193 genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2197 case NI_BMI1_TrailingZeroCount:
2198 case NI_BMI1_X64_TrailingZeroCount:
2200 assert(op2 == nullptr);
2201 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
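// TZCNT gets the same false-dependency workaround as LZCNT/POPCNT, so it is emitted via genXCNTIntrinsic
// rather than a plain reg, reg/mem emit.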
2202 genXCNTIntrinsic(node, ins);
2206 case NI_BMI2_MultiplyNoFlags:
2207 case NI_BMI2_X64_MultiplyNoFlags:
2209 int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
2210 assert(numArgs == 2 || numArgs == 3);
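// The two-operand form only produces the high half of the product; the three-operand form additionally
// takes a pointer through which the low half is stored (written below through op3Reg).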
2212 regNumber op1Reg = REG_NA;
2213 regNumber op2Reg = REG_NA;
2214 regNumber op3Reg = REG_NA;
2215 regNumber lowReg = REG_NA;
if (numArgs == 2)
2219 op1Reg = op1->gtRegNum;
2220 op2Reg = op2->gtRegNum;
lowReg = targetReg;
else
2225 GenTreeArgList* argList = op1->AsArgList();
2226 op1 = argList->Current();
2227 genConsumeRegs(op1);
2228 op1Reg = op1->gtRegNum;
2229 argList = argList->Rest();
2230 op2 = argList->Current();
2231 genConsumeRegs(op2);
2232 op2Reg = op2->gtRegNum;
2233 argList = argList->Rest();
2234 GenTree* op3 = argList->Current();
2235 genConsumeRegs(op3);
2236 op3Reg = op3->gtRegNum;
2237 assert(op3Reg != op1Reg);
2238 assert(op3Reg != targetReg);
2239 assert(op3Reg != REG_EDX);
2240 lowReg = node->GetSingleTempReg();
2241 assert(op3Reg != lowReg);
2242 assert(lowReg != targetReg);
2245 emitAttr attr = emitTypeSize(targetType);
2246 // move the first operand into the implicit source register EDX/RDX
2247 if (op1Reg != REG_EDX)
2249 assert(op2Reg != REG_EDX);
2250 emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg);
2253 // generate code for MULX
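// MULX takes its other source implicitly in EDX/RDX, leaves the flags untouched, and writes the high half
// of the product to its first destination (targetReg here) and the low half to its second (lowReg).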
2254 genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2);
2256 // If the lower half of the result is required, store it in the memory pointed to by op3
if (numArgs == 3)
2259 emit->emitIns_AR_R(INS_mov, attr, lowReg, op3Reg, 0);
2272 genProduceReg(node);
2275 //------------------------------------------------------------------------
2276 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
2279 // node - The hardware intrinsic node
2281 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
2283 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2284 var_types baseType = node->gtSIMDBaseType;
2285 emitAttr attr = EA_ATTR(node->gtSIMDSize);
2286 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
2287 GenTree* op1 = node->gtGetOp1();
2288 regNumber targetReg = node->gtRegNum;
2290 assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);
2291 assert(op1 != nullptr);
2292 assert(op1->OperIsList());
2293 assert(op1->gtGetOp2()->OperIsList());
2294 assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
2296 GenTreeArgList* argList = op1->AsArgList();
2297 op1 = argList->Current();
2298 genConsumeRegs(op1);
2300 argList = argList->Rest();
2301 GenTree* op2 = argList->Current();
2302 genConsumeRegs(op2);
2304 argList = argList->Rest();
2305 GenTree* op3 = argList->Current();
2306 genConsumeRegs(op3);
regNumber op1Reg;
regNumber op2Reg;
2311 bool isCommutative = false;
2312 const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);
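// For the scalar (CopiesUpperBits) forms, the upper vector elements of the result come from the first
// source operand, so those intrinsics cannot be treated as commutative below.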
2314 // Intrinsics with CopyUpperBits semantics cannot have op1 contained
2315 assert(!copiesUpperBits || !op1->isContained());
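// lookupIns returns the 213 form of the FMA instruction; the code below relies on the 132 and 231 forms
// being adjacent to it in the instruction table (ins - 1 and ins + 1) and picks the form so that the
// contained (memory or spill-temp) operand ends up as the instruction's r/m operand.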
2317 if (op3->isContained() || op3->isUsedFromSpillTemp())
2319 // 213 form: op1 = (op2 * op1) + [op3]
2321 op1Reg = op1->gtRegNum;
2322 op2Reg = op2->gtRegNum;
2324 isCommutative = !copiesUpperBits;
2326 else if (op2->isContained() || op2->isUsedFromSpillTemp())
2328 // 132 form: op1 = (op1 * op3) + [op2]
2330 ins = (instruction)(ins - 1);
2331 op1Reg = op1->gtRegNum;
2332 op2Reg = op3->gtRegNum;
op3 = op2;
2335 else if (op1->isContained() || op1->isUsedFromSpillTemp())
2337 // 231 form: op3 = (op2 * op3) + [op1]
2339 ins = (instruction)(ins + 1);
2340 op1Reg = op3->gtRegNum;
2341 op2Reg = op2->gtRegNum;
op3 = op1;
else
2346 // 213 form: op1 = (op2 * op1) + op3
2348 op1Reg = op1->gtRegNum;
2349 op2Reg = op2->gtRegNum;
2351 isCommutative = !copiesUpperBits;
2354 if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
2356 assert(node->isRMWHWIntrinsic(compiler));
2358 // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
2360 // For non-commutative intrinsics, we should have ensured that op2 was marked
2361 // delay free in order to prevent it from getting assigned the same register
2362 // as target. However, for commutative intrinsics, we can just swap the operands
2363 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
op2Reg = op1Reg;
op1Reg = targetReg;
2369 genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
2370 genProduceReg(node);
2373 //------------------------------------------------------------------------
2374 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
2377 // node - The hardware intrinsic node
2379 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
2381 assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount ||
2382 node->gtHWIntrinsicId == NI_LZCNT_X64_LeadingZeroCount);
2384 genConsumeOperands(node);
2385 genXCNTIntrinsic(node, INS_lzcnt);
2386 genProduceReg(node);
2389 //------------------------------------------------------------------------
2390 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
2393 // node - The hardware intrinsic node
2395 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
2397 NYI("Implement PCLMULQDQ intrinsic code generation");
2400 //------------------------------------------------------------------------
2401 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
2404 // node - The hardware intrinsic node
2406 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
2408 assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount || node->gtHWIntrinsicId == NI_POPCNT_X64_PopCount);
2410 genConsumeOperands(node);
2411 genXCNTIntrinsic(node, INS_popcnt);
2412 genProduceReg(node);
2415 //------------------------------------------------------------------------
2416 // genXCNTIntrinsic: Generates the code for a lzcnt/tzcnt/popcnt hardware intrinsic node and breaks any false
2417 // dependency on the target register
2420 // node - The hardware intrinsic node
2421 // ins - The instruction being generated
2423 void CodeGen::genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins)
2425 // LZCNT/TZCNT/POPCNT have a false dependency on the target register on Intel Sandy Bridge, Haswell, and Skylake
2426 // (POPCNT only) processors, so insert a `XOR target, target` to break the dependency: the XOR triggers register
2427 // renaming. This is done only when the target register is not also a source, i.e. when the dependency is false.
2429 GenTree* op1 = node->gtGetOp1();
2430 regNumber sourceReg1 = REG_NA;
2431 regNumber sourceReg2 = REG_NA;
2433 if (!op1->isContained())
2435 sourceReg1 = op1->gtRegNum;
2437 else if (op1->isIndir())
2439 GenTreeIndir* indir = op1->AsIndir();
2440 GenTree* memBase = indir->Base();
2442 if (memBase != nullptr)
2444 sourceReg1 = memBase->gtRegNum;
2447 if (indir->HasIndex())
2449 sourceReg2 = indir->Index()->gtRegNum;
2453 regNumber targetReg = node->gtRegNum;
2454 if ((targetReg != sourceReg1) && (targetReg != sourceReg2))
2456 getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
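// xor reg, reg is a zeroing idiom that the register renamer recognizes as having no input dependency, so it
// severs the stale dependency on targetReg without introducing a new one.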
2458 genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2461 #endif // FEATURE_HW_INTRINSICS