1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
8 XX Intel hardware intrinsic Code Generator XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
18 #ifdef FEATURE_HW_INTRINSICS
22 #include "sideeffects.h"
25 #include "gcinfoencoder.h"
27 //------------------------------------------------------------------------
28 // assertIsContainableHWIntrinsicOp: Asserts that op is containable by node
31 // lowering - The lowering phase from the compiler
32 // node - The HWIntrinsic node that has the contained node
33 // op - The op that is contained
35 static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op)
38 // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register allocation
39 // logic. However, this check is still important due to the various containment rules that SIMD intrinsics follow.
41 // We use isContainable to track the special HWIntrinsic node containment rules (for things like LoadAligned and
42 // LoadUnaligned) and we use the supportsRegOptional check to support general-purpose loads (both from stack
44 // and for isUsedFromMemory contained nodes, in the case where the register allocator decided to not allocate a
46 // in the first place).
48 bool supportsRegOptional = false;
49 bool isContainable = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional);
50 assert(isContainable || supportsRegOptional);
54 //------------------------------------------------------------------------
55 // genIsTableDrivenHWIntrinsic:
58 // category - category of a HW intrinsic
61 // returns true if this category can be table-driven in CodeGen
63 static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicCategory category)
65 // TODO - make more categories to the table-driven framework
66 // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen
67 const bool tableDrivenCategory =
68 (category != HW_Category_Special) && (category != HW_Category_Scalar) && (category != HW_Category_Helper);
69 const bool tableDrivenFlag =
70 !HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId) && !HWIntrinsicInfo::HasSpecialCodegen(intrinsicId);
71 return tableDrivenCategory && tableDrivenFlag;
//------------------------------------------------------------------------
// genHWIntrinsic: Generates the code for a given hardware intrinsic node.
//
// Arguments:
//    node - The hardware intrinsic node
//
// Notes:
//    Intrinsics with a regular shape (see genIsTableDrivenHWIntrinsic) are emitted inline
//    here, keyed off the argument count, category and immediate; everything else is
//    dispatched to a per-ISA helper.
//
//    NOTE(review): several structural lines (braces, 'else' clauses, 'break's and switch
//    labels) appear elided in this view of the file; the statements below are kept
//    token-for-token and only comments were added.
//
void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsicId);
    HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId);
    int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
    int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
    // Intrinsics that were fully handled earlier in the pipeline must not reach codegen.
    assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));
    if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
        GenTree* op1 = node->gtGetOp1();
        GenTree* op2 = node->gtGetOp2();
        regNumber targetReg = node->gtRegNum;
        var_types targetType = node->TypeGet();
        var_types baseType = node->gtSIMDBaseType;
        regNumber op1Reg = REG_NA;
        regNumber op2Reg = REG_NA;
        emitter* emit = getEmitter();
        assert(numArgs >= 0);
        // The instruction is selected per base (element) type, e.g. ps vs. pd forms.
        instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
        assert(ins != INS_invalid);
        emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
        assert(simdSize != 0);
        // --- one-operand intrinsics (numArgs == 1; switch labels elided in this view) ---
        if (node->OperIsMemoryLoad())
            genConsumeAddress(op1);
            // Until we improve the handling of addressing modes in the emitter, we'll create a
            // temporary GT_IND to generate code with.
            GenTreeIndir load = indirForm(node->TypeGet(), op1);
            emit->emitInsLoadInd(ins, simdSize, node->gtRegNum, &load);
        op1Reg = op1->gtRegNum;
        if ((ival != -1) && varTypeIsFloating(baseType))
            // Floating-point forms carry their own encoded immediate (e.g. compare predicates).
            assert((ival >= 0) && (ival <= 127));
            genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
        else if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
            // op1 is passed as both sources so the upper bits of the result come from op1.
            emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
            genHWIntrinsic_R_RM(node, ins, simdSize);
        // --- two-operand intrinsics (numArgs == 2) ---
        if (category == HW_Category_MemoryStore)
            genConsumeAddress(op1);
            // Until we improve the handling of addressing modes in the emitter, we'll create a
            // temporary GT_STORE_IND to generate code with.
            GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
            emit->emitInsStoreInd(ins, simdSize, &store);
        op1Reg = op1->gtRegNum;
        op2Reg = op2->gtRegNum;
        if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
            // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
            //
            // For non-commutative intrinsics, we should have ensured that op2 was marked
            // delay free in order to prevent it from getting assigned the same register
            // as target. However, for commutative intrinsics, we can just swap the operands
            // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
            noway_assert(node->OperIsCommutative());
        if ((ival != -1) && varTypeIsFloating(baseType))
            assert((ival >= 0) && (ival <= 127));
            genHWIntrinsic_R_R_RM_I(node, ins, static_cast<int8_t>(ival));
        else if (category == HW_Category_MemoryLoad)
            // Get the address and the 'other' register.
            // (MaskLoad takes the address in op1; the address/otherReg selection lines are
            // elided in this view — TODO confirm against the full file.)
            if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
                // Until we improve the handling of addressing modes in the emitter, we'll create a
                // temporary GT_IND to generate code with.
                GenTreeIndir load = indirForm(node->TypeGet(), addr);
                genHWIntrinsic_R_R_RM(node, ins, simdSize, targetReg, otherReg, &load);
        else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
            if (intrinsicId == NI_SSE2_Extract)
                // extract instructions return to GP-registers, so it needs int size as the emitsize
                simdSize = emitTypeSize(TYP_INT);
            auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };
            if (op2->IsCnsIntOrI())
                ssize_t ival = op2->AsIntCon()->IconValue();
                assert((ival >= 0) && (ival <= 255));
                emitSwCase((int8_t)ival);
                // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                regNumber baseReg = node->ExtractTempReg();
                regNumber offsReg = node->GetSingleTempReg();
                genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase);
            genHWIntrinsic_R_R_RM(node, ins, EA_ATTR(node->gtSIMDSize));
        // --- three-operand intrinsics (numArgs == 3): operands arrive as a GT_LIST in op1 ---
        GenTreeArgList* argList = op1->AsArgList();
        op1 = argList->Current();
        op1Reg = op1->gtRegNum;
        argList = argList->Rest();
        op2 = argList->Current();
        op2Reg = op2->gtRegNum;
        argList = argList->Rest();
        GenTree* op3 = argList->Current();
        regNumber op3Reg = op3->gtRegNum;
        if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
            auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, i); };
            if (op3->IsCnsIntOrI())
                ssize_t ival = op3->AsIntCon()->IconValue();
                assert((ival >= 0) && (ival <= 255));
                emitSwCase((int8_t)ival);
                // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                regNumber baseReg = node->ExtractTempReg();
                regNumber offsReg = node->GetSingleTempReg();
                genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase);
        else if (category == HW_Category_MemoryStore)
            // The Mask instructions do not currently support containment of the address.
            assert(!op2->isContained());
            if (intrinsicId == NI_AVX_MaskStore || intrinsicId == NI_AVX2_MaskStore)
                emit->emitIns_AR_R_R(ins, simdSize, op2Reg, op3Reg, op1Reg, 0);
                assert(intrinsicId == NI_SSE2_MaskMove);
                // MASKMOVDQU writes through an implicit pointer, so there is no register result.
                assert(targetReg == REG_NA);
                // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
                if (op3Reg != REG_EDI)
                    emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
                emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
        // BlendVariable encodes its mask as a fourth register operand.
        case NI_SSE41_BlendVariable:
        case NI_AVX_BlendVariable:
        case NI_AVX2_BlendVariable:
            genHWIntrinsic_R_R_RM_R(node, ins);
    // --- non-table-driven intrinsics: dispatch on the owning instruction set ---
    case InstructionSet_Vector128:
    case InstructionSet_Vector256:
        genBaseIntrinsic(node);
    case InstructionSet_SSE:
    case InstructionSet_SSE_X64:
        genSSEIntrinsic(node);
    case InstructionSet_SSE2:
    case InstructionSet_SSE2_X64:
        genSSE2Intrinsic(node);
    case InstructionSet_SSE41:
    case InstructionSet_SSE41_X64:
        genSSE41Intrinsic(node);
    case InstructionSet_SSE42:
    case InstructionSet_SSE42_X64:
        genSSE42Intrinsic(node);
    case InstructionSet_AVX:
    case InstructionSet_AVX2:
        genAvxOrAvx2Intrinsic(node);
    case InstructionSet_AES:
        genAESIntrinsic(node);
    case InstructionSet_BMI1:
    case InstructionSet_BMI1_X64:
    case InstructionSet_BMI2:
    case InstructionSet_BMI2_X64:
        genBMI1OrBMI2Intrinsic(node);
    case InstructionSet_FMA:
        genFMAIntrinsic(node);
    case InstructionSet_LZCNT:
    case InstructionSet_LZCNT_X64:
        genLZCNTIntrinsic(node);
    case InstructionSet_PCLMULQDQ:
        genPCLMULQDQIntrinsic(node);
    case InstructionSet_POPCNT:
    case InstructionSet_POPCNT_X64:
        genPOPCNTIntrinsic(node);
//------------------------------------------------------------------------
// genHWIntrinsic_R_RM: Generates the code for a hardware intrinsic node that takes a
//                      register/memory operand and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    attr - The emit attribute for the instruction being generated
//
// Notes:
//    NOTE(review): braces, 'else' clauses and some switch labels appear elided in this
//    view of the file; statements are kept token-for-token and only comments were added.
//
void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
    var_types targetType = node->TypeGet();
    regNumber targetReg = node->gtRegNum;
    GenTree* op1 = node->gtGetOp1();
    GenTree* op2 = node->gtGetOp2();
    emitter* emit = getEmitter();
    // The CompareScalarOrdered* and CompareScalarUnordered* intrinsics come down this
    // code path. They are all MultiIns, as the return value comes from the flags and
    // we have two operands instead.
    assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId));
    assert(targetReg != REG_NA);
    // For the two-operand (flags-producing) shape, emit against op1's register instead.
    targetReg = op1->gtRegNum;
    assert(!node->OperIsCommutative());
    assert(targetReg != REG_NA);
    assert(op2 == nullptr);
    if (op1->isContained() || op1->isUsedFromSpillTemp())
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;
        if (op1->isUsedFromSpillTemp())
            assert(op1->IsRegOptional());
            tmpDsc = getSpillTempDsc(op1);
            varNum = tmpDsc->tdTempNum();
            // Release the temp now; the emit below still addresses it by varNum/offset.
            regSet.tmpRlsTemp(tmpDsc);
        else if (op1->isIndir() || op1->OperIsHWIntrinsic())
            GenTreeIndir* memIndir = nullptr;
            // Plain indirection: take the address from the GT_IND.
            memIndir = op1->AsIndir();
            addr = memIndir->Addr();
            // Contained HWIntrinsic: must be a single-operand memory load; its op1 is the address.
            assert(op1->AsHWIntrinsic()->OperIsMemoryLoad());
            assert(HWIntrinsicInfo::lookupNumArgs(op1->AsHWIntrinsic()) == 1);
            addr = op1->gtGetOp1();
            switch (addr->OperGet())
                case GT_LCL_VAR_ADDR:
                    // Local address: fold to a stack-frame (varNum) access.
                    varNum = addr->AsLclVarCommon()->GetLclNum();
                case GT_CLS_VAR_ADDR:
                    // Static field address: emit directly against the class-var handle.
                    emit->emitIns_R_C(ins, attr, targetReg, addr->gtClsVar.gtClsVarHnd, 0);
                    if (memIndir == nullptr)
                        // This is the HW intrinsic load case.
                        // Until we improve the handling of addressing modes in the emitter, we'll create a
                        // temporary GT_IND to generate code with.
                        GenTreeIndir load = indirForm(op1->TypeGet(), addr);
                    emit->emitIns_R_A(ins, attr, targetReg, memIndir);
        // Neither spill temp nor indirection: a contained local (GT_LCL_FLD / GT_LCL_VAR;
        // the case labels are elided in this view).
        switch (op1->OperGet())
            GenTreeLclFld* lclField = op1->AsLclFld();
            varNum = lclField->GetLclNum();
            offset = lclField->gtLclFld.gtLclOffs;
            // A contained lcl var must either be reg-optional or not be a register candidate.
            assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
            varNum = op1->AsLclVar()->GetLclNum();
        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);
        emit->emitIns_R_S(ins, attr, targetReg, varNum, offset);
    // Not contained: plain register-to-register form.
    regNumber op1Reg = op1->gtRegNum;
    emit->emitIns_R_R(ins, attr, targetReg, op1Reg);
527 //------------------------------------------------------------------------
528 // genHWIntrinsic_R_RM_I: Generates the code for a hardware intrinsic node that takes a register/memory operand,
529 // an immediate operand, and that returns a value in register
532 // node - The hardware intrinsic node
533 // ins - The instruction being generated
534 // ival - The immediate value
536 void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
538 var_types targetType = node->TypeGet();
539 regNumber targetReg = node->gtRegNum;
540 GenTree* op1 = node->gtGetOp1();
541 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
542 emitter* emit = getEmitter();
544 // TODO-XArch-CQ: Commutative operations can have op1 be contained
545 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
547 assert(targetReg != REG_NA);
548 assert(!node->OperIsCommutative()); // One operand intrinsics cannot be commutative
550 if (op1->isContained() || op1->isUsedFromSpillTemp())
552 assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
553 assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
555 inst_RV_TT_IV(ins, simdSize, targetReg, op1, ival);
558 //------------------------------------------------------------------------
559 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
560 // register/memory operand, and that returns a value in register
563 // node - The hardware intrinsic node
564 // ins - The instruction being generated
565 // attr - The emit attribute for the instruciton being generated
567 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
569 regNumber targetReg = node->gtRegNum;
570 GenTree* op1 = node->gtGetOp1();
571 GenTree* op2 = node->gtGetOp2();
572 regNumber op1Reg = op1->gtRegNum;
574 assert(targetReg != REG_NA);
575 assert(op1Reg != REG_NA);
577 genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2);
//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
//                        register/memory operand, and that returns a value in register
//
// Arguments:
//    node      - The hardware intrinsic node
//    ins       - The instruction being generated
//    attr      - The emit attribute for the instruction being generated
//    targetReg - The register allocated to the result
//    op1Reg    - The register allocated to the first operand
//    op2       - Another operand that maybe in register or memory
//
// Notes:
//    NOTE(review): braces, 'else' clauses and some switch labels appear elided in this
//    view of the file; statements are kept token-for-token and only comments were added.
//
void CodeGen::genHWIntrinsic_R_R_RM(
    GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2)
    emitter* emit = getEmitter();
    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    if (op2->isContained() || op2->isUsedFromSpillTemp())
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;
        if (op2->isUsedFromSpillTemp())
            assert(op2->IsRegOptional());
            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            // Release the temp now; the emit below still addresses it by varNum/offset.
            regSet.tmpRlsTemp(tmpDsc);
        else if (op2->isIndir() || op2->OperIsHWIntrinsic())
            GenTreeIndir* memIndir = nullptr;
            // Plain indirection: take the address from the GT_IND.
            memIndir = op2->AsIndir();
            addr = memIndir->Addr();
            // Contained HWIntrinsic: must be a single-operand memory load; its op1 is the address.
            assert(op2->AsHWIntrinsic()->OperIsMemoryLoad());
            assert(HWIntrinsicInfo::lookupNumArgs(op2->AsHWIntrinsic()) == 1);
            addr = op2->gtGetOp1();
            switch (addr->OperGet())
                case GT_LCL_VAR_ADDR:
                    // Local address: fold to a stack-frame (varNum) access.
                    varNum = addr->AsLclVarCommon()->GetLclNum();
                case GT_CLS_VAR_ADDR:
                    // Static field address: emit directly against the class-var handle.
                    emit->emitIns_SIMD_R_R_C(ins, attr, targetReg, op1Reg, addr->gtClsVar.gtClsVarHnd, 0);
                    if (memIndir == nullptr)
                        // This is the HW intrinsic load case.
                        // Until we improve the handling of addressing modes in the emitter, we'll create a
                        // temporary GT_IND to generate code with.
                        GenTreeIndir load = indirForm(op2->TypeGet(), addr);
                    emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, memIndir);
        // Neither spill temp nor indirection: a contained local (GT_LCL_FLD / GT_LCL_VAR;
        // the case labels are elided in this view).
        switch (op2->OperGet())
            GenTreeLclFld* lclField = op2->AsLclFld();
            varNum = lclField->GetLclNum();
            offset = lclField->gtLclFld.gtLclOffs;
            // A contained lcl var must either be reg-optional or not be a register candidate.
            assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
            varNum = op2->AsLclVar()->GetLclNum();
        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);
        emit->emitIns_SIMD_R_R_S(ins, attr, targetReg, op1Reg, varNum, offset);
    // Not contained: register form, with the RMW operand-swap check.
    regNumber op2Reg = op2->gtRegNum;
    if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
        //
        // For non-commutative intrinsics, we should have ensured that op2 was marked
        // delay free in order to prevent it from getting assigned the same register
        // as target. However, for commutative intrinsics, we can just swap the operands
        // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
        noway_assert(node->OperIsCommutative());
    emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg);
//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
//                          register/memory operand, an immediate operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    ival - The immediate value
//
// Notes:
//    NOTE(review): braces, 'else' clauses and some switch labels appear elided in this
//    view of the file; statements are kept token-for-token and only comments were added.
//
void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
    var_types targetType = node->TypeGet();
    regNumber targetReg = node->gtRegNum;
    GenTree* op1 = node->gtGetOp1();
    GenTree* op2 = node->gtGetOp2();
    emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
    emitter* emit = getEmitter();
    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
    // Three-operand shapes arrive as a GT_LIST in op1; unpack the first two operands
    // (the third is the immediate, already passed in as 'ival').
    if (op1->OperIsList())
        assert(op2 == nullptr);
        GenTreeArgList* argList = op1->AsArgList();
        op1 = argList->Current();
        argList = argList->Rest();
        op2 = argList->Current();
        argList = argList->Rest();
        assert(argList->Current() != nullptr);
        assert(argList->Rest() == nullptr);
    regNumber op1Reg = op1->gtRegNum;
    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    if (op2->isContained() || op2->isUsedFromSpillTemp())
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;
        if (op2->isUsedFromSpillTemp())
            assert(op2->IsRegOptional());
            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            // Release the temp now; the emit below still addresses it by varNum/offset.
            regSet.tmpRlsTemp(tmpDsc);
        else if (op2->isIndir() || op2->OperIsHWIntrinsic())
            GenTreeIndir* memIndir = nullptr;
            // Plain indirection: take the address from the GT_IND.
            memIndir = op2->AsIndir();
            addr = memIndir->Addr();
            // Contained HWIntrinsic: must be a single-operand memory load; its op1 is the address.
            assert(op2->AsHWIntrinsic()->OperIsMemoryLoad());
            assert(HWIntrinsicInfo::lookupNumArgs(op2->AsHWIntrinsic()) == 1);
            addr = op2->gtGetOp1();
            switch (addr->OperGet())
                case GT_LCL_VAR_ADDR:
                    // Local address: fold to a stack-frame (varNum) access.
                    varNum = addr->AsLclVarCommon()->GetLclNum();
                case GT_CLS_VAR_ADDR:
                    // Static field address: emit directly against the class-var handle.
                    emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, addr->gtClsVar.gtClsVarHnd, 0, ival);
                    if (memIndir == nullptr)
                        // This is the HW intrinsic load case.
                        // Until we improve the handling of addressing modes in the emitter, we'll create a
                        // temporary GT_IND to generate code with.
                        GenTreeIndir load = indirForm(op2->TypeGet(), addr);
                    emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
        // Neither spill temp nor indirection: a contained local (GT_LCL_FLD / GT_LCL_VAR;
        // the case labels are elided in this view).
        switch (op2->OperGet())
            GenTreeLclFld* lclField = op2->AsLclFld();
            varNum = lclField->GetLclNum();
            offset = lclField->gtLclFld.gtLclOffs;
            // A contained lcl var must either be reg-optional or not be a register candidate.
            assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
            varNum = op2->AsLclVar()->GetLclNum();
        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);
        emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
    // Not contained: register form, with the RMW operand-swap check.
    regNumber op2Reg = op2->gtRegNum;
    if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
        //
        // For non-commutative intrinsics, we should have ensured that op2 was marked
        // delay free in order to prevent it from getting assigned the same register
        // as target. However, for commutative intrinsics, we can just swap the operands
        // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
        noway_assert(node->OperIsCommutative());
    emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM_R: Generates the code for a hardware intrinsic node that takes a register operand, a
//                          register/memory operand, another register operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//
// Notes:
//    Used for the variable-blend instructions, where op3 is the mask register.
//    NOTE(review): braces, 'else' clauses and some switch labels appear elided in this
//    view of the file; statements are kept token-for-token and only comments were added.
//
void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins)
    var_types targetType = node->TypeGet();
    regNumber targetReg = node->gtRegNum;
    GenTree* op1 = node->gtGetOp1();
    GenTree* op2 = node->gtGetOp2();
    GenTree* op3 = nullptr;
    emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
    emitter* emit = getEmitter();
    // The three operands arrive as a GT_LIST in op1.
    assert(op1->OperIsList());
    assert(op2 == nullptr);
    GenTreeArgList* argList = op1->AsArgList();
    op1 = argList->Current();
    argList = argList->Rest();
    op2 = argList->Current();
    argList = argList->Rest();
    op3 = argList->Current();
    assert(argList->Rest() == nullptr);
    regNumber op1Reg = op1->gtRegNum;
    regNumber op3Reg = op3->gtRegNum;
    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op3Reg != REG_NA);
    if (op2->isContained() || op2->isUsedFromSpillTemp())
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;
        if (op2->isUsedFromSpillTemp())
            assert(op2->IsRegOptional());
            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            // pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            // Release the temp now; the emit below still addresses it by varNum/offset.
            regSet.tmpRlsTemp(tmpDsc);
        else if (op2->isIndir() || op2->OperIsHWIntrinsic())
            GenTreeIndir* memIndir = nullptr;
            // Plain indirection: take the address from the GT_IND.
            memIndir = op2->AsIndir();
            addr = memIndir->Addr();
            // Contained HWIntrinsic: must be a single-operand memory load; its op1 is the address.
            assert(op2->AsHWIntrinsic()->OperIsMemoryLoad());
            assert(HWIntrinsicInfo::lookupNumArgs(op2->AsHWIntrinsic()) == 1);
            addr = op2->gtGetOp1();
            switch (addr->OperGet())
                case GT_LCL_VAR_ADDR:
                    // Local address: fold to a stack-frame (varNum) access.
                    varNum = addr->AsLclVarCommon()->GetLclNum();
                case GT_CLS_VAR_ADDR:
                    // Static field address: emit directly against the class-var handle.
                    emit->emitIns_SIMD_R_R_C_R(ins, simdSize, targetReg, op1Reg, op3Reg, addr->gtClsVar.gtClsVarHnd, 0);
                    if (memIndir == nullptr)
                        // This is the HW intrinsic load case.
                        // Until we improve the handling of addressing modes in the emitter, we'll create a
                        // temporary GT_IND to generate code with.
                        GenTreeIndir load = indirForm(op2->TypeGet(), addr);
                    emit->emitIns_SIMD_R_R_A_R(ins, simdSize, targetReg, op1Reg, op3Reg, memIndir);
        // Neither spill temp nor indirection: a contained local (GT_LCL_FLD / GT_LCL_VAR;
        // the case labels are elided in this view).
        switch (op2->OperGet())
            GenTreeLclFld* lclField = op2->AsLclFld();
            varNum = lclField->GetLclNum();
            offset = lclField->gtLclFld.gtLclOffs;
            // A contained lcl var must either be reg-optional or not be a register candidate.
            assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
            varNum = op2->AsLclVar()->GetLclNum();
        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);
        emit->emitIns_SIMD_R_R_S_R(ins, simdSize, targetReg, op1Reg, op3Reg, varNum, offset);
    // Not contained: full register form.
    emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, op3Reg);
//------------------------------------------------------------------------
// genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
//                          a register/memory operand, and that returns a value in register
//
// Arguments:
//    ins       - The instruction being generated
//    attr      - The emit attribute
//    targetReg - The target register
//    op1Reg    - The register of the first operand
//    op2Reg    - The register of the second operand
//    op3       - The third operand
//
// Notes:
//    Unlike the other helpers, this one receives registers directly rather than a node.
//    NOTE(review): braces, 'else' clauses and some switch labels appear elided in this
//    view of the file; statements are kept token-for-token and only comments were added.
//
void CodeGen::genHWIntrinsic_R_R_R_RM(
    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op2Reg != REG_NA);
    emitter* emit = getEmitter();
    if (op3->isContained() || op3->isUsedFromSpillTemp())
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;
        if (op3->isUsedFromSpillTemp())
            assert(op3->IsRegOptional());
            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            // pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op3);
            varNum = tmpDsc->tdTempNum();
            // Release the temp now; the emit below still addresses it by varNum/offset.
            regSet.tmpRlsTemp(tmpDsc);
        else if (op3->isIndir() || op3->OperIsHWIntrinsic())
            GenTreeIndir* memIndir = nullptr;
            // Plain indirection: take the address from the GT_IND.
            memIndir = op3->AsIndir();
            addr = memIndir->Addr();
            // Contained HWIntrinsic: must be a single-operand memory load; its op1 is the address.
            assert(op3->AsHWIntrinsic()->OperIsMemoryLoad());
            assert(HWIntrinsicInfo::lookupNumArgs(op3->AsHWIntrinsic()) == 1);
            addr = op3->gtGetOp1();
            switch (addr->OperGet())
                case GT_LCL_VAR_ADDR:
                    // Local address: fold to a stack-frame (varNum) access.
                    varNum = addr->AsLclVarCommon()->GetLclNum();
                case GT_CLS_VAR_ADDR:
                    // Static field address: emit directly against the class-var handle.
                    emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, addr->gtClsVar.gtClsVarHnd, 0);
                    if (memIndir == nullptr)
                        // This is the HW intrinsic load case.
                        // Until we improve the handling of addressing modes in the emitter, we'll create a
                        // temporary GT_IND to generate code with.
                        GenTreeIndir load = indirForm(op3->TypeGet(), addr);
                    emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
        // Neither spill temp nor indirection: a contained local (GT_LCL_FLD / GT_LCL_VAR;
        // the case labels are elided in this view).
        switch (op3->OperGet())
            GenTreeLclFld* lclField = op3->AsLclFld();
            varNum = lclField->GetLclNum();
            offset = lclField->gtLclFld.gtLclOffs;
            // A contained lcl var must either be reg-optional or not be a register candidate.
            assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
            varNum = op3->AsLclVar()->GetLclNum();
        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);
        emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
    // Not contained: full register form.
    emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
// genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
//                                   with non-constant argument
//
// Arguments:
//    intrinsic      - intrinsic ID
//    nonConstImmReg - the register contains non-constant imm8 argument
//    baseReg        - a register for the start of the switch table
//    offsReg        - a register for the offset into the switch table
//    emitSwCase     - the lambda to generate a switch case
//
// Notes:
//    generate the jump-table fallback for imm-intrinsics with non-constant argument.
//    This function can be used for all imm-intrinsics (whether full-range or not),
//    The compiler front-end (i.e. importer) is responsible to insert a range-check IR
//    (GT_HW_INTRINSIC_CHK) for imm8 argument, so this function does not need to do range-check.
//
//    NOTE(review): the parameter-declaration lines for baseReg/offsReg appear truncated in
//    this view of the file (the body uses both) — confirm against the full source.
//
template <typename HWIntrinsicSwitchCaseBody>
void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic,
                                              regNumber nonConstImmReg,
                                              HWIntrinsicSwitchCaseBody emitSwCase)
    assert(nonConstImmReg != REG_NA);
    // AVX2 Gather intrinsics use managed non-const fallback since they have discrete imm8 value range
    // that does work with the current compiler generated jump-table fallback
    assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
    emitter* emit = getEmitter();
    // One table slot per possible immediate value, [0, upper bound].
    const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
    assert(maxByte <= 256);
    BasicBlock* jmpTable[256];
    unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
    unsigned jmpTableOffs = 0;
    // Emit the jump table
    for (unsigned i = 0; i < maxByte; i++)
        jmpTable[i] = genCreateTempLabel();
        emit->emitDataGenData(i, jmpTable[i]);
    emit->emitDataGenEnd();
    // Compute and jump to the appropriate offset in the switch table
    emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
    // Scale by 4: each table entry is a 4-byte offset relative to the method start.
    emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
    emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
    emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
    emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
    // Emit the switch table entries
    BasicBlock* switchTableBeg = genCreateTempLabel();
    BasicBlock* switchTableEnd = genCreateTempLabel();
    genDefineTempLabel(switchTableBeg);
    // Each case emits the instruction with its constant immediate, then jumps past the table.
    for (unsigned i = 0; i < maxByte; i++)
        genDefineTempLabel(jmpTable[i]);
        emitSwCase((int8_t)i);
        emit->emitIns_J(INS_jmp, switchTableEnd);
    genDefineTempLabel(switchTableEnd);
1236 //------------------------------------------------------------------------
1237 // genBaseIntrinsic: Generates the code for a base hardware intrinsic node
1240 // node - The hardware intrinsic node
1243 // We currently assume that all base intrinsics have zero or one operand.
// genBaseIntrinsic: code generation for the "base" Vector128/Vector256 intrinsics
// (CreateScalarUnsafe, ToScalar, ToVector256, ToVector256Unsafe, GetLower, Zero).
// These all take zero or one operand (asserted below) and mostly reduce to a single
// table-driven instruction or a register move.
1245 void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
1247 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1248 regNumber targetReg = node->gtRegNum;
1249 var_types targetType = node->TypeGet();
1250 var_types baseType = node->gtSIMDBaseType;
1252 assert(compiler->compSupports(InstructionSet_SSE));
1253 assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));
1255 GenTree* op1 = node->gtGetOp1();
1257 genConsumeHWIntrinsicOperands(node);
// op1 may legitimately be null (e.g. Vector128/256.Zero below).
1258 regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->gtRegNum;
1260 assert(node->gtGetOp2() == nullptr);
1262 emitter* emit = getEmitter();
1263 emitAttr attr = EA_ATTR(node->gtSIMDSize);
1264 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1266 switch (intrinsicId)
1268 case NI_Vector128_CreateScalarUnsafe:
1269 case NI_Vector256_CreateScalarUnsafe:
// Integral element: table instruction moves a GP register into the vector register.
1271 if (varTypeIsIntegral(baseType))
1273 genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType));
1277 assert(varTypeIsFloating(baseType));
1279 attr = emitTypeSize(baseType);
// Floating element: either load from the contained memory/spill operand, or (if the
// value already lives in a different xmm register) just move it across.
1281 if (op1->isContained() || op1->isUsedFromSpillTemp())
1283 genHWIntrinsic_R_RM(node, ins, attr);
1285 else if (targetReg != op1Reg)
1287 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1288 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1294 case NI_Vector128_ToScalar:
1295 case NI_Vector256_ToScalar:
1297 assert(varTypeIsFloating(baseType));
1299 attr = emitTypeSize(TYP_SIMD16);
1301 if (op1->isContained() || op1->isUsedFromSpillTemp())
1303 genHWIntrinsic_R_RM(node, ins, attr);
1305 else if (targetReg != op1Reg)
1307 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1308 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1313 case NI_Vector128_ToVector256:
1315 // ToVector256 has zero-extend semantics in order to ensure it is deterministic
1316 // We always emit a move to the target register, even when op1Reg == targetReg,
1317 // in order to ensure that Bits MAXVL-1:128 are zeroed.
1319 attr = emitTypeSize(TYP_SIMD16);
1321 if (op1->isContained() || op1->isUsedFromSpillTemp())
1323 genHWIntrinsic_R_RM(node, ins, attr);
1327 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1328 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1333 case NI_Vector128_ToVector256Unsafe:
1334 case NI_Vector256_GetLower:
// Unlike ToVector256 above, these make no promise about the upper bits, so the
// reg->reg move can be elided when source and destination coincide.
1336 if (op1->isContained() || op1->isUsedFromSpillTemp())
1338 genHWIntrinsic_R_RM(node, ins, attr);
1340 else if (targetReg != op1Reg)
1342 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1343 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1348 case NI_Vector128_Zero:
1349 case NI_Vector256_Zero:
1351 assert(op1 == nullptr);
// Standard zeroing idiom: the table instruction (an xor/pcmp-style op) applied to the
// same register for all three operands yields an all-zero vector without a load.
1352 emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
1363 genProduceReg(node);
1366 //------------------------------------------------------------------------
1367 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
1370 // node - The hardware intrinsic node
// genSSEIntrinsic: code generation for SSE intrinsics not handled by the table-driven path,
// primarily the scalar ordered/unordered comparisons (which materialize EFLAGS into a bool),
// the int64 conversions, prefetch, and store fence.
1372 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
1374 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1375 GenTree* op1 = node->gtGetOp1();
1376 GenTree* op2 = node->gtGetOp2();
1377 GenTree* op3 = nullptr;
1378 GenTree* op4 = nullptr;
1379 regNumber targetReg = node->gtRegNum;
1380 var_types targetType = node->TypeGet();
1381 var_types baseType = node->gtSIMDBaseType;
1383 regNumber op1Reg = REG_NA;
1384 regNumber op2Reg = REG_NA;
1385 regNumber op3Reg = REG_NA;
1386 regNumber op4Reg = REG_NA;
1387 emitter* emit = getEmitter();
1389 genConsumeHWIntrinsicOperands(node);
1391 switch (intrinsicId)
1393 case NI_SSE_CompareScalarOrderedEqual:
1394 case NI_SSE_CompareScalarUnorderedEqual:
1396 assert(baseType == TYP_FLOAT);
1397 regNumber tmpReg = node->GetSingleTempReg();
1398 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1400 // Ensure we aren't overwriting targetReg
1401 assert(tmpReg != targetReg);
// (u)comiss sets ZF=1 on equal and PF=1 on an unordered (NaN) comparison.
// Equality therefore requires ZF==1 AND PF==0, so NaN == NaN reports false.
1403 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1404 emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1405 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1406 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1407 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1411 case NI_SSE_CompareScalarOrderedGreaterThan:
1412 case NI_SSE_CompareScalarUnorderedGreaterThan:
1414 assert(baseType == TYP_FLOAT);
1415 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
// seta == (CF==0 && ZF==0); an unordered result sets CF and ZF, so NaN yields false.
1417 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1418 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1419 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1423 case NI_SSE_CompareScalarOrderedGreaterThanOrEqual:
1424 case NI_SSE_CompareScalarUnorderedGreaterThanOrEqual:
1426 assert(baseType == TYP_FLOAT);
1427 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
// setae == (CF==0); unordered sets CF, so NaN yields false.
1429 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1430 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1431 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1435 case NI_SSE_CompareScalarOrderedLessThan:
1436 case NI_SSE_CompareScalarUnorderedLessThan:
1438 assert(baseType == TYP_FLOAT);
1439 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
// NOTE(review): LessThan also uses seta (not setb) — the operands are presumably
// swapped upstream (lowering) so that a > b computes b < a; confirm against Lowering.
1441 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1442 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1443 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1447 case NI_SSE_CompareScalarOrderedLessThanOrEqual:
1448 case NI_SSE_CompareScalarUnorderedLessThanOrEqual:
1450 assert(baseType == TYP_FLOAT);
1451 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
// Same presumed operand swap as LessThan above: setae on the reversed comparison.
1453 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1454 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1455 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1459 case NI_SSE_CompareScalarOrderedNotEqual:
1460 case NI_SSE_CompareScalarUnorderedNotEqual:
1462 assert(baseType == TYP_FLOAT);
1463 regNumber tmpReg = node->GetSingleTempReg();
1464 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1466 // Ensure we aren't overwriting targetReg
1467 assert(tmpReg != targetReg);
// Dual of the Equal case: not-equal is ZF==0 OR PF==1, so NaN != NaN reports true.
1469 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1470 emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1471 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1472 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1473 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1477 case NI_SSE_X64_ConvertToInt64:
1478 case NI_SSE_X64_ConvertToInt64WithTruncation:
1480 assert(targetType == TYP_LONG);
1481 assert(op1 != nullptr);
1482 assert(op2 == nullptr);
1483 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1484 genHWIntrinsic_R_RM(node, ins, EA_8BYTE);
1488 case NI_SSE_X64_ConvertScalarToVector128Single:
1490 assert(baseType == TYP_LONG);
1491 assert(op1 != nullptr);
1492 assert(op2 != nullptr);
1493 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1494 genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1498 case NI_SSE_Prefetch0:
1499 case NI_SSE_Prefetch1:
1500 case NI_SSE_Prefetch2:
1501 case NI_SSE_PrefetchNonTemporal:
1503 assert(baseType == TYP_UBYTE);
1504 assert(op2 == nullptr);
1506 // These do not support containment.
1507 assert(!op1->isContained());
1508 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1509 op1Reg = op1->gtRegNum;
// op1 holds the address to prefetch; emit [op1Reg + 0] as the memory operand.
1510 emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
1514 case NI_SSE_StoreFence:
1516 assert(baseType == TYP_VOID);
1517 assert(op1 == nullptr);
1518 assert(op2 == nullptr);
1519 emit->emitIns(INS_sfence);
1528 genProduceReg(node);
1531 //------------------------------------------------------------------------
1532 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
1535 // node - The hardware intrinsic node
// genSSE2Intrinsic: code generation for SSE2 intrinsics not handled by the table-driven path:
// the double scalar ordered/unordered comparisons (flag materialization mirrors the SSE float
// versions above), scalar<->vector conversions, fences, and non-temporal stores.
1537 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
1539 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1540 GenTree* op1 = node->gtGetOp1();
1541 GenTree* op2 = node->gtGetOp2();
1542 regNumber targetReg = node->gtRegNum;
1543 var_types targetType = node->TypeGet();
1544 var_types baseType = node->gtSIMDBaseType;
1545 regNumber op1Reg = REG_NA;
1546 regNumber op2Reg = REG_NA;
1547 emitter* emit = getEmitter();
1549 genConsumeHWIntrinsicOperands(node);
1551 switch (intrinsicId)
1553 // All integer overloads are handled by table codegen
1554 case NI_SSE2_CompareLessThan:
1556 assert(op1 != nullptr);
1557 assert(op2 != nullptr);
1559 assert(baseType == TYP_DOUBLE);
// The comparison predicate imm8 comes from the intrinsic's Ival table entry.
1561 int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
1562 assert((ival >= 0) && (ival <= 127));
1564 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1565 op1Reg = op1->gtRegNum;
1566 op2Reg = op2->gtRegNum;
1567 emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
1572 case NI_SSE2_CompareScalarOrderedEqual:
1573 case NI_SSE2_CompareScalarUnorderedEqual:
1575 assert(baseType == TYP_DOUBLE);
1576 regNumber tmpReg = node->GetSingleTempReg();
1577 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1579 // Ensure we aren't overwriting targetReg
1580 assert(tmpReg != targetReg);
// (u)comisd sets ZF=1 on equal and PF=1 on unordered (NaN); equality requires
// ZF==1 AND PF==0, so NaN == NaN reports false.
1582 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1583 emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1584 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1585 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1586 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1590 case NI_SSE2_CompareScalarOrderedGreaterThan:
1591 case NI_SSE2_CompareScalarUnorderedGreaterThan:
1593 assert(baseType == TYP_DOUBLE);
1594 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
// seta == (CF==0 && ZF==0); unordered sets CF and ZF, so NaN yields false.
1596 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1597 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1598 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1602 case NI_SSE2_CompareScalarOrderedGreaterThanOrEqual:
1603 case NI_SSE2_CompareScalarUnorderedGreaterThanOrEqual:
1605 assert(baseType == TYP_DOUBLE);
1606 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1608 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1609 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1610 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1614 case NI_SSE2_CompareScalarOrderedLessThan:
1615 case NI_SSE2_CompareScalarUnorderedLessThan:
1617 assert(baseType == TYP_DOUBLE);
1618 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
// NOTE(review): uses seta (not setb) — operands are presumably swapped upstream in
// lowering so b < a is computed as a > b; confirm against Lowering.
1620 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1621 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1622 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1626 case NI_SSE2_CompareScalarOrderedLessThanOrEqual:
1627 case NI_SSE2_CompareScalarUnorderedLessThanOrEqual:
1629 assert(baseType == TYP_DOUBLE);
1630 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1632 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1633 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1634 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1638 case NI_SSE2_CompareScalarOrderedNotEqual:
1639 case NI_SSE2_CompareScalarUnorderedNotEqual:
1641 assert(baseType == TYP_DOUBLE);
1642 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1643 regNumber tmpReg = node->GetSingleTempReg();
1645 // Ensure we aren't overwriting targetReg
1646 assert(tmpReg != targetReg);
// Dual of Equal: not-equal is ZF==0 OR PF==1, so NaN != NaN reports true.
1648 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1649 emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1650 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1651 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1652 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1656 case NI_SSE2_X64_ConvertScalarToVector128Double:
1658 assert(baseType == TYP_LONG);
1659 assert(op1 != nullptr);
1660 assert(op2 != nullptr);
1661 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1662 genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1666 case NI_SSE2_X64_ConvertScalarToVector128Int64:
1667 case NI_SSE2_X64_ConvertScalarToVector128UInt64:
1669 assert(baseType == TYP_LONG || baseType == TYP_ULONG);
1670 assert(op1 != nullptr);
1671 assert(op2 == nullptr);
1672 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1673 genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType));
1677 case NI_SSE2_ConvertToInt32:
1678 case NI_SSE2_ConvertToInt32WithTruncation:
1679 case NI_SSE2_ConvertToUInt32:
1680 case NI_SSE2_X64_ConvertToInt64:
1681 case NI_SSE2_X64_ConvertToInt64WithTruncation:
1682 case NI_SSE2_X64_ConvertToUInt64:
1684 assert(op2 == nullptr);
1685 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
// Integral baseType: the source is the vector's integer lane (a movd/movq-style
// transfer); floating baseType: a cvt* conversion via the common R_RM helper.
1687 if (varTypeIsIntegral(baseType))
1689 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1690 op1Reg = op1->gtRegNum;
1691 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1695 assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT);
1696 genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType));
1701 case NI_SSE2_LoadFence:
1703 assert(baseType == TYP_VOID);
1704 assert(op1 == nullptr);
1705 assert(op2 == nullptr);
1706 emit->emitIns(INS_lfence);
1710 case NI_SSE2_MemoryFence:
1712 assert(baseType == TYP_VOID);
1713 assert(op1 == nullptr);
1714 assert(op2 == nullptr);
1715 emit->emitIns(INS_mfence);
1719 case NI_SSE2_StoreNonTemporal:
1720 case NI_SSE2_X64_StoreNonTemporal:
1722 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1723 assert(op1 != nullptr);
1724 assert(op2 != nullptr);
// Build a temporary store-indirection node so the emitter's existing store-ind
// path can handle the addressing mode (op1 = address, op2 = value).
1726 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1727 GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
1728 emit->emitInsStoreInd(ins, emitTypeSize(baseType), &store);
1737 genProduceReg(node);
1740 //------------------------------------------------------------------------
1741 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1744 // node - The hardware intrinsic node
// genSSE41Intrinsic: code generation for SSE4.1 intrinsics not handled by the table-driven
// path: the widening ConvertToVector128* forms, the ptest-based Test* intrinsics, and
// Extract (including a jump-table fallback when the imm8 index is not a constant).
1746 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1748 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1749 GenTree* op1 = node->gtGetOp1();
1750 GenTree* op2 = node->gtGetOp2();
1751 GenTree* op3 = nullptr;
1752 GenTree* op4 = nullptr;
1753 regNumber targetReg = node->gtRegNum;
1754 var_types targetType = node->TypeGet();
1755 var_types baseType = node->gtSIMDBaseType;
1757 regNumber op1Reg = REG_NA;
1758 regNumber op2Reg = REG_NA;
1759 regNumber op3Reg = REG_NA;
1760 regNumber op4Reg = REG_NA;
1761 emitter* emit = getEmitter();
1763 genConsumeHWIntrinsicOperands(node);
1765 switch (intrinsicId)
1767 case NI_SSE41_ConvertToVector128Int16:
1768 case NI_SSE41_ConvertToVector128Int32:
1769 case NI_SSE41_ConvertToVector128Int64:
1771 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
// Non-SIMD op1 means the source is a memory operand (pmovsx/pmovzx can load
// directly); otherwise use the common reg/mem helper.
1773 if (!varTypeIsSIMD(op1->gtType))
1775 // Until we improve the handling of addressing modes in the emitter, we'll create a
1776 // temporary GT_IND to generate code with.
1777 GenTreeIndir load = indirForm(node->TypeGet(), op1);
1778 emit->emitInsLoadInd(ins, emitTypeSize(TYP_SIMD16), node->gtRegNum, &load);
1782 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1787 case NI_SSE41_TestZ:
// ptest sets ZF when (op1 AND op2) == 0, so TestZ materializes ZF via sete.
1789 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1790 genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1791 emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1792 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1796 case NI_SSE41_TestC:
// ptest sets CF when (NOT op1 AND op2) == 0, so TestC materializes CF via setb.
1798 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1799 genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1800 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1801 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1805 case NI_SSE41_TestNotZAndNotC:
// seta == (CF==0 && ZF==0), matching the "neither Z nor C" contract.
1807 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1808 genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1809 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1810 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1814 case NI_SSE41_Extract:
1815 case NI_SSE41_X64_Extract:
1817 regNumber tmpTargetReg = REG_NA;
1818 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
// extractps writes to a GP register, but a float result must end up in an xmm
// register, so route through a GP temp and move it back below.
1819 if (baseType == TYP_FLOAT)
1821 tmpTargetReg = node->ExtractTempReg();
1824 auto emitSwCase = [&](int8_t i) {
1825 if (baseType == TYP_FLOAT)
1827 // extract instructions return to GP-registers, so it needs int size as the emitsize
1828 inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i);
1829 emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
1833 inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i);
1837 if (op2->IsCnsIntOrI())
1839 ssize_t ival = op2->AsIntCon()->IconValue();
1840 assert((ival >= 0) && (ival <= 255));
1841 emitSwCase((int8_t)ival);
1845 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
1846 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
1847 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
1848 regNumber baseReg = node->ExtractTempReg();
1849 regNumber offsReg = node->GetSingleTempReg();
1850 genHWIntrinsicJumpTableFallback(intrinsicId, op2->gtRegNum, baseReg, offsReg, emitSwCase);
1860 genProduceReg(node);
1863 //------------------------------------------------------------------------
1864 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1867 // node - The hardware intrinsic node
// genSSE42Intrinsic: code generation for SSE4.2 intrinsics; currently only the Crc32
// family, which is read-modify-write on its destination register.
1869 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
1871 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1872 regNumber targetReg = node->gtRegNum;
1873 GenTree* op1 = node->gtGetOp1();
1874 GenTree* op2 = node->gtGetOp2();
1875 var_types baseType = node->gtSIMDBaseType;
1876 var_types targetType = node->TypeGet();
1877 emitter* emit = getEmitter();
1879 genConsumeHWIntrinsicOperands(node);
1880 regNumber op1Reg = op1->gtRegNum;
1882 assert(targetReg != REG_NA);
1883 assert(op1Reg != REG_NA);
1884 assert(op2 != nullptr);
1885 assert(!node->OperIsCommutative());
1887 switch (intrinsicId)
1889 case NI_SSE42_Crc32:
1890 case NI_SSE42_X64_Crc32:
// crc32 accumulates into its destination: the running crc (op1) must be in
// targetReg before the instruction executes.
1892 if (op1Reg != targetReg)
1894 assert(op2->gtRegNum != targetReg);
1895 emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg);
1898 // This makes the genHWIntrinsic_R_RM code much simpler, as we don't need an
1899 // overload that explicitly takes the operands.
// Null out op2 so the helper treats op2 (the data operand) as the single RM operand.
1901 node->gtOp2 = nullptr;
1903 if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument
1905 assert(targetType == TYP_INT);
1906 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType));
1910 assert(op1->TypeGet() == op2->TypeGet());
1911 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
1912 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType));
1925 genProduceReg(node);
1928 //------------------------------------------------------------------------
1929 // genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
1932 // node - The hardware intrinsic node
// genAvxOrAvx2Intrinsic: code generation for AVX/AVX2 intrinsics not handled by the
// table-driven path: scalar conversions, widening conversions, the Gather family
// (including the masked variants), and the vptest-based Test* intrinsics.
1934 void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
1936 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1937 var_types baseType = node->gtSIMDBaseType;
1938 emitAttr attr = EA_ATTR(node->gtSIMDSize);
1939 var_types targetType = node->TypeGet();
1940 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1941 int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
1942 GenTree* op1 = node->gtGetOp1();
1943 GenTree* op2 = node->gtGetOp2();
1944 regNumber op1Reg = REG_NA;
1945 regNumber op2Reg = REG_NA;
1946 regNumber targetReg = node->gtRegNum;
1947 emitter* emit = getEmitter();
1949 genConsumeHWIntrinsicOperands(node);
1951 switch (intrinsicId)
1953 case NI_AVX2_ConvertToInt32:
1954 case NI_AVX2_ConvertToUInt32:
1956 op1Reg = op1->gtRegNum;
1957 assert(numArgs == 1);
1958 assert((baseType == TYP_INT) || (baseType == TYP_UINT));
1959 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1960 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1964 case NI_AVX2_ConvertToVector256Int16:
1965 case NI_AVX2_ConvertToVector256Int32:
1966 case NI_AVX2_ConvertToVector256Int64:
1968 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
// Non-SIMD op1 means the source is memory (pmovsx/pmovzx can load directly).
1970 if (!varTypeIsSIMD(op1->gtType))
1972 // Until we improve the handling of addressing modes in the emitter, we'll create a
1973 // temporary GT_IND to generate code with.
1974 GenTreeIndir load = indirForm(node->TypeGet(), op1);
1975 emit->emitInsLoadInd(ins, emitTypeSize(TYP_SIMD32), node->gtRegNum, &load);
1979 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD32));
1984 case NI_AVX2_GatherVector128:
1985 case NI_AVX2_GatherVector256:
1986 case NI_AVX2_GatherMaskVector128:
1987 case NI_AVX2_GatherMaskVector256:
// Gather operands arrive as an argument list:
//   Gather(base, index, scale)  or  GatherMask(source, base, index, mask, scale).
1989 GenTreeArgList* list = op1->AsArgList();
1990 op1 = list->Current();
1991 op1Reg = op1->gtRegNum;
1993 list = list->Rest();
1994 op2 = list->Current();
1995 op2Reg = op2->gtRegNum;
1997 list = list->Rest();
1998 GenTree* op3 = list->Current();
2000 list = list->Rest();
2001 GenTree* op4 = nullptr;
2002 GenTree* lastOp = nullptr;
2003 GenTree* indexOp = nullptr;
2005 regNumber op3Reg = REG_NA;
2006 regNumber op4Reg = REG_NA;
2007 regNumber addrBaseReg = REG_NA;
2008 regNumber addrIndexReg = REG_NA;
2009 regNumber maskReg = node->ExtractTempReg(RBM_ALLFLOAT);
// Masked variants: op1 = source, op2 = base, op3 = index, op4 = mask, lastOp = scale.
2013 assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
2014 op4 = list->Current();
2015 list = list->Rest();
2016 lastOp = list->Current();
2017 op3Reg = op3->gtRegNum;
2018 op4Reg = op4->gtRegNum;
2019 addrBaseReg = op2Reg;
2020 addrIndexReg = op3Reg;
2023 // copy op4Reg into the tmp mask register,
2024 // the mask register will be cleared by gather instructions
2025 emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);
2027 if (targetReg != op1Reg)
2029 // copy source vector to the target register for masking merge
2030 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
// Unmasked variants: op1 = base, op2 = index; the mask is synthesized as all-ones.
2035 assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
2036 addrBaseReg = op1Reg;
2037 addrIndexReg = op2Reg;
2041 // generate all-one mask vector
2042 emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
2045 bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);
2047 // hwintrinsiclistxarch.h uses Dword index instructions in default
// Long indices require switching from the d-index to the q-index form of the gather.
2048 if (varTypeIsLong(node->gtIndexBaseType))
2052 case INS_vpgatherdd:
2053 ins = INS_vpgatherqd;
2054 if (isVector128GatherWithVector256Index)
2056 // YMM index in address mode
2057 attr = emitTypeSize(TYP_SIMD32);
2060 case INS_vpgatherdq:
2061 ins = INS_vpgatherqq;
2063 case INS_vgatherdps:
2064 ins = INS_vgatherqps;
2065 if (isVector128GatherWithVector256Index)
2067 // YMM index in address mode
2068 attr = emitTypeSize(TYP_SIMD32);
2071 case INS_vgatherdpd:
2072 ins = INS_vgatherqpd;
// The scale operand must be a JIT-time constant (lowering guarantees this; the
// managed fallback handles the non-constant case for gathers).
2079 assert(lastOp->IsCnsIntOrI());
2080 ssize_t ival = lastOp->AsIntCon()->IconValue();
2081 assert((ival >= 0) && (ival <= 255));
// The gather clobbers maskReg and reads targetReg/addrIndexReg; all three must be distinct.
2083 assert(targetReg != maskReg);
2084 assert(targetReg != addrIndexReg);
2085 assert(maskReg != addrIndexReg);
2086 emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);
// vptest-based Test* intrinsics: materialize the relevant EFLAGS bit as a bool.
// setb => CF (TestC), seta => CF==0 && ZF==0 (TestNotZAndNotC), sete => ZF (TestZ).
2093 genHWIntrinsic_R_RM(node, ins, attr);
2094 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
2095 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2099 case NI_AVX_TestNotZAndNotC:
2101 genHWIntrinsic_R_RM(node, ins, attr);
2102 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
2103 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2109 genHWIntrinsic_R_RM(node, ins, attr);
2110 emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
2111 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2120 genProduceReg(node);
2123 //------------------------------------------------------------------------
2124 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
2127 // node - The hardware intrinsic node
// genAESIntrinsic: placeholder — AES intrinsic code generation is not yet implemented.
2129 void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
2131 NYI("Implement AES intrinsic code generation");
2134 //------------------------------------------------------------------------
2135 // genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 and BMI2 hardware intrinsic node
2138 // node - The hardware intrinsic node
// genBMI1OrBMI2Intrinsic: code generation for BMI1/BMI2 intrinsics. Most forms map to a
// single two- or three-operand instruction; MultiplyNoFlags (MULX) needs special handling
// because of its implicit EDX/RDX source and optional low-half out-parameter.
2140 void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
2142 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2143 regNumber targetReg = node->gtRegNum;
2144 GenTree* op1 = node->gtGetOp1();
2145 GenTree* op2 = node->gtGetOp2();
2146 var_types targetType = node->TypeGet();
2147 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);
2148 emitter* emit = getEmitter();
2150 assert(targetReg != REG_NA);
2151 assert(op1 != nullptr);
2153 genConsumeHWIntrinsicOperands(node);
2155 switch (intrinsicId)
// Two-source forms: dst = op(src1, src2).
2157 case NI_BMI1_AndNot:
2158 case NI_BMI1_X64_AndNot:
2159 case NI_BMI1_BitFieldExtract:
2160 case NI_BMI1_X64_BitFieldExtract:
2161 case NI_BMI2_ParallelBitDeposit:
2162 case NI_BMI2_ParallelBitExtract:
2163 case NI_BMI2_X64_ParallelBitDeposit:
2164 case NI_BMI2_X64_ParallelBitExtract:
2165 case NI_BMI2_ZeroHighBits:
2166 case NI_BMI2_X64_ZeroHighBits:
2168 assert(op2 != nullptr);
2169 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2170 genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet()));
// Single-source forms: dst = op(src).
2174 case NI_BMI1_ExtractLowestSetBit:
2175 case NI_BMI1_GetMaskUpToLowestSetBit:
2176 case NI_BMI1_ResetLowestSetBit:
2177 case NI_BMI1_X64_ExtractLowestSetBit:
2178 case NI_BMI1_X64_GetMaskUpToLowestSetBit:
2179 case NI_BMI1_X64_ResetLowestSetBit:
2181 assert(op2 == nullptr)
2182 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2183 genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2187 case NI_BMI1_TrailingZeroCount:
2188 case NI_BMI1_X64_TrailingZeroCount:
2190 assert(op2 == nullptr);
2191 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
// Shares the false-dependency-breaking helper with lzcnt/popcnt.
2192 genXCNTIntrinsic(node, ins);
2196 case NI_BMI2_MultiplyNoFlags:
2197 case NI_BMI2_X64_MultiplyNoFlags:
// 2-arg form returns only the high half; 3-arg form also stores the low half
// through the pointer passed as op3.
2199 int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
2200 assert(numArgs == 2 || numArgs == 3);
2202 regNumber op1Reg = REG_NA;
2203 regNumber op2Reg = REG_NA;
2204 regNumber op3Reg = REG_NA;
2205 regNumber lowReg = REG_NA;
2209 op1Reg = op1->gtRegNum;
2210 op2Reg = op2->gtRegNum;
2215 GenTreeArgList* argList = op1->AsArgList();
2216 op1 = argList->Current();
2217 op1Reg = op1->gtRegNum;
2218 argList = argList->Rest();
2219 op2 = argList->Current();
2220 op2Reg = op2->gtRegNum;
2221 argList = argList->Rest();
2222 GenTree* op3 = argList->Current();
2223 op3Reg = op3->gtRegNum;
2224 assert(!op3->isContained());
2225 assert(op3Reg != op1Reg);
2226 assert(op3Reg != targetReg);
2227 assert(op3Reg != REG_EDX);
2228 lowReg = node->GetSingleTempReg();
2229 assert(op3Reg != lowReg);
2230 assert(lowReg != targetReg);
2233 // These do not support containment
2234 assert(!op2->isContained());
2235 emitAttr attr = emitTypeSize(targetType);
2236 // mov the first operand into implicit source operand EDX/RDX
2237 if (op1Reg != REG_EDX)
2239 assert(op2Reg != REG_EDX);
2240 emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg);
2243 // generate code for MULX
// MULX writes two destinations: high half -> targetReg, low half -> lowReg.
2244 genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2);
2246 // If requires the lower half result, store in the memory pointed to by op3
2249 emit->emitIns_AR_R(INS_mov, attr, lowReg, op3Reg, 0);
2262 genProduceReg(node);
2265 //------------------------------------------------------------------------
2266 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
2269 // node - The hardware intrinsic node
// genFMAIntrinsic: code generation for FMA intrinsics. Picks the 132/213/231 instruction
// form based on which operand (if any) is contained in memory, so the contained operand
// becomes the instruction's reg/mem source.
2271 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
2273 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2274 var_types baseType = node->gtSIMDBaseType;
2275 emitAttr attr = EA_ATTR(node->gtSIMDSize);
2276 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
2277 GenTree* op1 = node->gtGetOp1();
2278 regNumber targetReg = node->gtRegNum;
2280 assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);
2282 genConsumeHWIntrinsicOperands(node);
2283 GenTreeArgList* argList = op1->AsArgList();
2284 op1 = argList->Current();
2286 argList = argList->Rest();
2287 GenTree* op2 = argList->Current();
2289 argList = argList->Rest();
2290 GenTree* op3 = argList->Current();
2295 bool isCommutative = false;
2296 const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);
2298 // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
2299 assert(!copiesUpperBits || !op1->isContained());
// The instruction table stores the 213 form; ins-1 / ins+1 select the adjacent 132 / 231
// variants — NOTE(review): this relies on the 132/213/231 encodings being laid out
// consecutively in the instruction enum; confirm against the instruction table.
2301 if (op3->isContained() || op3->isUsedFromSpillTemp())
2303 // 213 form: op1 = (op2 * op1) + [op3]
2305 op1Reg = op1->gtRegNum;
2306 op2Reg = op2->gtRegNum;
2308 isCommutative = !copiesUpperBits;
2310 else if (op2->isContained() || op2->isUsedFromSpillTemp())
2312 // 132 form: op1 = (op1 * op3) + [op2]
2314 ins = (instruction)(ins - 1);
2315 op1Reg = op1->gtRegNum;
2316 op2Reg = op3->gtRegNum;
2319 else if (op1->isContained() || op1->isUsedFromSpillTemp())
2321 // 231 form: op3 = (op2 * op3) + [op1]
2323 ins = (instruction)(ins + 1);
2324 op1Reg = op3->gtRegNum;
2325 op2Reg = op2->gtRegNum;
2330 // 213 form: op1 = (op2 * op1) + op3
2332 op1Reg = op1->gtRegNum;
2333 op2Reg = op2->gtRegNum;
2335 isCommutative = !copiesUpperBits;
2338 if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
2340 assert(node->isRMWHWIntrinsic(compiler));
2342 // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
2344 // For non-commutative intrinsics, we should have ensured that op2 was marked
2345 // delay free in order to prevent it from getting assigned the same register
2346 // as target. However, for commutative intrinsics, we can just swap the operands
2347 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
2353 genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
2354 genProduceReg(node);
2357 //------------------------------------------------------------------------
2358 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
2361 // node - The hardware intrinsic node
// genLZCNTIntrinsic: code generation for LeadingZeroCount; delegates to genXCNTIntrinsic,
// which also breaks the lzcnt false target-register dependency.
2363 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
2365 assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount ||
2366 node->gtHWIntrinsicId == NI_LZCNT_X64_LeadingZeroCount);
2368 genConsumeOperands(node);
2369 genXCNTIntrinsic(node, INS_lzcnt);
2370 genProduceReg(node);
2373 //------------------------------------------------------------------------
2374 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
2377 // node - The hardware intrinsic node
// genPCLMULQDQIntrinsic: placeholder — PCLMULQDQ intrinsic code generation is not yet implemented.
2379 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
2381 NYI("Implement PCLMULQDQ intrinsic code generation");
2384 //------------------------------------------------------------------------
2385 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
2388 // node - The hardware intrinsic node
// genPOPCNTIntrinsic: code generation for PopCount; delegates to genXCNTIntrinsic,
// which also breaks the popcnt false target-register dependency.
2390 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
2392 assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount || node->gtHWIntrinsicId == NI_POPCNT_X64_PopCount);
2394 genConsumeOperands(node);
2395 genXCNTIntrinsic(node, INS_popcnt);
2396 genProduceReg(node);
2399 //------------------------------------------------------------------------
2400 // genXCNTIntrinsic: Generates the code for a lzcnt/tzcnt/popcnt hardware intrinsic node, breaks false dependencies on
2401 // the target register
2404 // node - The hardware intrinsic node
2405 // ins - The instruction being generated
2407 void CodeGen::genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins)
2409 // LZCNT/TZCNT/POPCNT have a false dependency on the target register on Intel Sandy Bridge, Haswell, and Skylake
2410 // (POPCNT only) processors, so insert a `XOR target, target` to break the dependency via XOR triggering register
2411 // renaming, but only if it's not an actual dependency.
// Collect every register the source operand actually reads (the value register for a
// register operand; base and index registers for a memory operand) so the xor below is
// only emitted when targetReg genuinely carries no input.
2413 GenTree* op1 = node->gtGetOp1();
2414 regNumber sourceReg1 = REG_NA;
2415 regNumber sourceReg2 = REG_NA;
2417 if (!op1->isContained())
2419 sourceReg1 = op1->gtRegNum;
2421 else if (op1->isIndir())
2423 GenTreeIndir* indir = op1->AsIndir();
2424 GenTree* memBase = indir->Base();
2426 if (memBase != nullptr)
2428 sourceReg1 = memBase->gtRegNum;
2431 if (indir->HasIndex())
2433 sourceReg2 = indir->Index()->gtRegNum;
2437 regNumber targetReg = node->gtRegNum;
2438 if ((targetReg != sourceReg1) && (targetReg != sourceReg2))
// xor reg, reg zeroes the register and is recognized as a dependency-breaking idiom.
2440 getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
2442 genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2445 #endif // FEATURE_HW_INTRINSICS