// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                  Intel hardware intrinsic Code Generator                  XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/

#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif

#ifdef FEATURE_HW_INTRINSICS

#include "emit.h"
#include "codegen.h"
#include "sideeffects.h"
#include "lower.h"
#include "gcinfo.h"
#include "gcinfoencoder.h"
//------------------------------------------------------------------------
// assertIsContainableHWIntrinsicOp: Asserts that op is containable by node
//
// Arguments:
//    lowering - The lowering phase from the compiler
//    node     - The HWIntrinsic node that has the contained node
//    op       - The op that is contained
//
static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op)
{
#if DEBUG
    // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register allocation
    // logic. However, this check is still important due to the various containment rules that SIMD intrinsics follow.
    //
    // We use isContainable to track the special HWIntrinsic node containment rules (for things like LoadAligned and
    // LoadUnaligned) and we use the supportsRegOptional check to support general-purpose loads (both from the stack
    // and for isUsedFromMemory contained nodes, in the case where the register allocator decided not to allocate a
    // register in the first place).

    bool supportsRegOptional = false;
    bool isContainable       = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional);
    assert(isContainable || supportsRegOptional);
#endif // DEBUG
}
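
// For example, for Sse.Add(v, Sse.LoadAlignedVector128(p)), lowering contains the LoadAligned node inside the
// Add node so that codegen can fold the load into the arithmetic instruction's memory operand. The assert above
// is a DEBUG-only sanity check that the operand codegen was handed really was containable, or at least
// reg-optional. (Illustrative scenario; the containment rules themselves live in lowering.)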

//------------------------------------------------------------------------
// genIsTableDrivenHWIntrinsic:
//
// Arguments:
//    intrinsicId - The hardware intrinsic ID
//    category    - The category of the HW intrinsic
//
// Return Value:
//    returns true if this category can be table-driven in CodeGen
//
static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicCategory category)
{
    // TODO - move more categories into the table-driven framework
    // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen
    const bool tableDrivenCategory =
        (category != HW_Category_Special) && (category != HW_Category_Scalar) && (category != HW_Category_Helper);
    const bool tableDrivenFlag =
        !HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId) && !HWIntrinsicInfo::HasSpecialCodegen(intrinsicId);
    return tableDrivenCategory && tableDrivenFlag;
}
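
// For example, a simple one-instruction SIMD intrinsic such as NI_SSE_Add is handled entirely by the
// table-driven path below, while the Compare*OrderedScalar/Compare*UnorderedScalar intrinsics are marked
// HW_Flag_MultiIns and fall through to the per-ISA handlers (genSSEIntrinsic and friends).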

//------------------------------------------------------------------------
// genHWIntrinsic: Generates the code for a given hardware intrinsic node.
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic      intrinsicId = node->gtHWIntrinsicId;
    InstructionSet      isa         = HWIntrinsicInfo::lookupIsa(intrinsicId);
    HWIntrinsicCategory category    = HWIntrinsicInfo::lookupCategory(intrinsicId);
    int                 ival        = HWIntrinsicInfo::lookupIval(intrinsicId);
    int                 numArgs     = HWIntrinsicInfo::lookupNumArgs(node);

    assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));

    if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
    {
        GenTree*  op1       = node->gtGetOp1();
        GenTree*  op2       = node->gtGetOp2();
        regNumber targetReg = node->gtRegNum;
        var_types baseType  = node->gtSIMDBaseType;

        regNumber op1Reg = REG_NA;
        regNumber op2Reg = REG_NA;
        emitter*  emit   = getEmitter();

        assert(numArgs >= 0);
        instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
        assert(ins != INS_invalid);
        emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
        assert(simdSize != 0);

        switch (numArgs)
        {
            case 1:
            {
                if (node->OperIsMemoryLoad())
                {
                    genConsumeAddress(op1);
                    // Until we improve the handling of addressing modes in the emitter, we'll create a
                    // temporary GT_IND to generate code with.
                    GenTreeIndir load = indirForm(node->TypeGet(), op1);
                    emit->emitInsLoadInd(ins, simdSize, node->gtRegNum, &load);
                }
                else
                {
                    genConsumeRegs(op1);
                    op1Reg = op1->gtRegNum;

                    if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
                    {
                        emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
                    }
                    else if ((ival != -1) && varTypeIsFloating(baseType))
                    {
                        assert((ival >= 0) && (ival <= 127));
                        genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
                    }
                    else
                    {
                        genHWIntrinsic_R_RM(node, ins, simdSize);
                    }
                }
                break;
            }

            case 2:
            {
                if (category == HW_Category_MemoryStore)
                {
                    genConsumeAddress(op1);
                    genConsumeReg(op2);
                    // Until we improve the handling of addressing modes in the emitter, we'll create a
                    // temporary GT_STORE_IND to generate code with.
                    GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
                    emit->emitInsStoreInd(ins, simdSize, &store);
                    break;
                }
                genConsumeRegs(op1);
                genConsumeRegs(op2);

                op1Reg = op1->gtRegNum;
                op2Reg = op2->gtRegNum;

                if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
                {
                    // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
                    //
                    // For non-commutative intrinsics, we should have ensured that op2 was marked
                    // delay free in order to prevent it from getting assigned the same register
                    // as target. However, for commutative intrinsics, we can just swap the operands
                    // in order to have "reg2 = reg2 op reg1", which will end up producing the right code.

                    noway_assert(node->OperIsCommutative());
                    op2Reg = op1Reg;
                    op1Reg = targetReg;
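
                    // For example, if Sse.Add(v1, v2) was allocated targetReg == op2Reg == xmm1 and
                    // op1Reg == xmm0, the swap lets us emit "addps xmm1, xmm0" directly: by commutativity
                    // this computes the same value without an extra register-to-register move.
                    // (Illustrative register assignment only.)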
                }

                if ((ival != -1) && varTypeIsFloating(baseType))
                {
                    assert((ival >= 0) && (ival <= 127));
                    genHWIntrinsic_R_R_RM_I(node, ins, (int8_t)ival);
                }
                else if (category == HW_Category_MemoryLoad)
                {
                    // Get the address and the 'other' register.
                    GenTree*  addr;
                    regNumber otherReg;
                    if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
                    {
                        addr     = op1;
                        otherReg = op2Reg;
                    }
                    else
                    {
                        addr     = op2;
                        otherReg = op1Reg;
                    }
                    // Until we improve the handling of addressing modes in the emitter, we'll create a
                    // temporary GT_IND to generate code with.
                    GenTreeIndir load = indirForm(node->TypeGet(), addr);
                    genHWIntrinsic_R_R_RM(node, ins, simdSize, targetReg, otherReg, &load);
                }
                else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
                {
                    if (intrinsicId == NI_SSE2_Extract)
                    {
                        // The extract instructions return their result in a GP register, so the emit size
                        // needs to be the int size.
                        simdSize = emitTypeSize(TYP_INT);
                    }

                    auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };

                    if (op2->IsCnsIntOrI())
                    {
                        ssize_t ival = op2->AsIntCon()->IconValue();
                        assert((ival >= 0) && (ival <= 255));
                        emitSwCase((int8_t)ival);
                    }
                    else
                    {
                        // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                        // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                        // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                        regNumber baseReg = node->ExtractTempReg();
                        regNumber offsReg = node->GetSingleTempReg();
                        genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase);
                    }
                }
                else
                {
                    genHWIntrinsic_R_R_RM(node, ins, EA_ATTR(node->gtSIMDSize));
                }
                break;
            }

            case 3:
            {
                GenTreeArgList* argList = op1->AsArgList();
                op1                     = argList->Current();
                genConsumeRegs(op1);
                op1Reg = op1->gtRegNum;

                argList = argList->Rest();
                op2     = argList->Current();
                genConsumeRegs(op2);
                op2Reg = op2->gtRegNum;

                argList = argList->Rest();
                GenTree* op3 = argList->Current();
                genConsumeRegs(op3);
                regNumber op3Reg = op3->gtRegNum;

                if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
                {
                    auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, i); };

                    if (op3->IsCnsIntOrI())
                    {
                        ssize_t ival = op3->AsIntCon()->IconValue();
                        assert((ival >= 0) && (ival <= 255));
                        emitSwCase((int8_t)ival);
                    }
                    else
                    {
                        // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                        // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                        // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                        regNumber baseReg = node->ExtractTempReg();
                        regNumber offsReg = node->GetSingleTempReg();
                        genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase);
                    }
                }
                else if (category == HW_Category_MemoryStore)
                {
                    if (intrinsicId == NI_AVX_MaskStore || intrinsicId == NI_AVX2_MaskStore)
                    {
                        emit->emitIns_AR_R_R(ins, simdSize, op2Reg, op3Reg, op1Reg, 0);
                    }
                    else
                    {
                        assert(intrinsicId == NI_SSE2_MaskMove);
                        assert(targetReg == REG_NA);

                        // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
                        if (op3Reg != REG_EDI)
                        {
                            emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
                        }
                        emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
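                        // The emitted maskmovdqu stores the bytes of op1Reg selected by the sign bits of
                        // op2Reg through the address in (R|E)DI; the destination address is architecturally
                        // implicit, which is why op3Reg is moved into EDI above. (Descriptive note; see the
                        // SSE2 MASKMOVDQU documentation for the exact semantics.)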
                    }
                }
                else
                {
                    switch (intrinsicId)
                    {
                        case NI_SSE41_BlendVariable:
                        case NI_AVX_BlendVariable:
                        case NI_AVX2_BlendVariable:
                        {
                            genHWIntrinsic_R_R_RM_R(node, ins);
                            break;
                        }

                        default:
                        {
                            unreached();
                            break;
                        }
                    }
                }
                break;
            }

            default:
                unreached();
                break;
        }

        genProduceReg(node);
        return;
    }

    switch (isa)
    {
        case InstructionSet_Base:
            genBaseIntrinsic(node);
            break;
        case InstructionSet_SSE:
        case InstructionSet_SSE_X64:
            genSSEIntrinsic(node);
            break;
        case InstructionSet_SSE2:
        case InstructionSet_SSE2_X64:
            genSSE2Intrinsic(node);
            break;
        case InstructionSet_SSE41:
        case InstructionSet_SSE41_X64:
            genSSE41Intrinsic(node);
            break;
        case InstructionSet_SSE42:
        case InstructionSet_SSE42_X64:
            genSSE42Intrinsic(node);
            break;
        case InstructionSet_AVX:
        case InstructionSet_AVX2:
            genAvxOrAvx2Intrinsic(node);
            break;
        case InstructionSet_AES:
            genAESIntrinsic(node);
            break;
        case InstructionSet_BMI1:
        case InstructionSet_BMI1_X64:
        case InstructionSet_BMI2:
        case InstructionSet_BMI2_X64:
            genBMI1OrBMI2Intrinsic(node);
            break;
        case InstructionSet_FMA:
            genFMAIntrinsic(node);
            break;
        case InstructionSet_LZCNT:
        case InstructionSet_LZCNT_X64:
            genLZCNTIntrinsic(node);
            break;
        case InstructionSet_PCLMULQDQ:
            genPCLMULQDQIntrinsic(node);
            break;
        case InstructionSet_POPCNT:
        case InstructionSet_POPCNT_X64:
            genPOPCNTIntrinsic(node);
            break;
        default:
            unreached();
            break;
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_RM: Generates the code for a hardware intrinsic node that takes a
//                      register/memory operand and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    attr - The emit attribute for the instruction being generated
//
void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
{
    regNumber targetReg = node->gtRegNum;
    GenTree*  op1       = node->gtGetOp1();
    GenTree*  op2       = node->gtGetOp2();
    emitter*  emit      = getEmitter();

    if (op2 != nullptr)
    {
        // The Compare*OrderedScalar and Compare*UnorderedScalar intrinsics come down this
        // code path. They are all MultiIns, as the return value comes from the flags, and
        // they have two operands instead of one.

        assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId));
        assert(targetReg != REG_NA);

        targetReg = op1->gtRegNum;
        op1       = op2;
        op2       = nullptr;
    }
    else
    {
        assert(!node->OperIsCommutative());
    }

    assert(targetReg != REG_NA);
    assert(op2 == nullptr);

    if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op1->isUsedFromSpillTemp())
        {
            assert(op1->IsRegOptional());

            tmpDsc = getSpillTempDsc(op1);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op1->OperIsHWIntrinsic())
        {
            emit->emitIns_R_AR(ins, attr, targetReg, op1->gtGetOp1()->gtRegNum, 0);
            return;
        }
        else if (op1->isIndir())
        {
            GenTreeIndir* memIndir = op1->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);
                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_R_C(ins, attr, targetReg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    emit->emitIns_R_A(ins, attr, targetReg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op1->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op1->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op1->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_R_S(ins, attr, targetReg, varNum, offset);
    }
    else
    {
        regNumber op1Reg = op1->gtRegNum;
        emit->emitIns_R_R(ins, attr, targetReg, op1Reg);
    }
}
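
// For illustration: for a one-operand intrinsic such as Sse.Sqrt with op1 spilled to a stack local, the path
// above funnels into emitIns_R_S and produces something like "sqrtps xmm0, xmmword ptr [V02 rbp-0x20]"; with
// op1 in a register it is simply "sqrtps xmm0, xmm1". (Hypothetical register and local assignments.)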

//------------------------------------------------------------------------
// genHWIntrinsic_R_RM_I: Generates the code for a hardware intrinsic node that takes a register/memory operand,
//                        an immediate operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    ival - The immediate value
//
void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
{
    regNumber targetReg = node->gtRegNum;
    GenTree*  op1       = node->gtGetOp1();
    emitAttr  simdSize  = EA_ATTR(node->gtSIMDSize);
    emitter*  emit      = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    assert(targetReg != REG_NA);
    assert(!node->OperIsCommutative()); // One-operand intrinsics cannot be commutative

    if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
    }
    inst_RV_TT_IV(ins, simdSize, targetReg, op1, ival);
}
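
// For example, Sse2.ShuffleLow(value, 0x1B) reaches this helper via the table-driven IMM path and emits
// "pshuflw xmm0, xmm1, 0x1B" (or "pshuflw xmm0, [mem], 0x1B" when op1 is contained). Illustrative operands;
// the actual instruction comes from the intrinsic's entry in the lookup table.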

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
//                        register/memory operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    attr - The emit attribute for the instruction being generated
//
void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
{
    regNumber targetReg = node->gtRegNum;
    GenTree*  op1       = node->gtGetOp1();
    GenTree*  op2       = node->gtGetOp2();
    regNumber op1Reg    = op1->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2);
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
//                        register/memory operand, and that returns a value in register
//
// Arguments:
//    node      - The hardware intrinsic node
//    ins       - The instruction being generated
//    attr      - The emit attribute for the instruction being generated
//    targetReg - The register allocated to the result
//    op1Reg    - The register allocated to the first operand
//    op2       - Another operand that may be in a register or memory
//
void CodeGen::genHWIntrinsic_R_R_RM(
    GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2)
{
    emitter* emit = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->OperIsHWIntrinsic())
        {
            GenTree* addr = op2->gtGetOp1();
            // Until we improve the handling of addressing modes in the emitter, we'll create a
            // temporary GT_IND to generate code with.
            GenTreeIndir load = indirForm(node->TypeGet(), addr);
            emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, &load);
            return;
        }
        else if (op2->isIndir())
        {
            GenTreeIndir* memIndir = op2->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);
                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C(ins, attr, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S(ins, attr, targetReg, op1Reg, varNum, offset);
    }
    else
    {
        regNumber op2Reg = op2->gtRegNum;

        if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        {
            // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
            //
            // For non-commutative intrinsics, we should have ensured that op2 was marked
            // delay free in order to prevent it from getting assigned the same register
            // as target. However, for commutative intrinsics, we can just swap the operands
            // in order to have "reg2 = reg2 op reg1", which will end up producing the right code.

            noway_assert(node->OperIsCommutative());
            op2Reg = op1Reg;
            op1Reg = targetReg;
        }

        emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
//                          register/memory operand, an immediate operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    ival - The immediate value
//
void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
{
    regNumber targetReg = node->gtRegNum;
    GenTree*  op1       = node->gtGetOp1();
    GenTree*  op2       = node->gtGetOp2();
    emitAttr  simdSize  = EA_ATTR(node->gtSIMDSize);
    emitter*  emit      = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    if (op1->OperIsList())
    {
        assert(op2 == nullptr);

        GenTreeArgList* argList = op1->AsArgList();

        op1     = argList->Current();
        argList = argList->Rest();

        op2     = argList->Current();
        argList = argList->Rest();

        assert(argList->Current() != nullptr);
        assert(argList->Rest() == nullptr);
    }

    regNumber op1Reg = op1->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->OperIsHWIntrinsic())
        {
            emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
            return;
        }
        else if (op2->isIndir())
        {
            GenTreeIndir* memIndir = op2->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);
                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
                                               ival);
                    return;
                }

                default:
                {
                    emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
    }
    else
    {
        regNumber op2Reg = op2->gtRegNum;

        if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        {
            // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
            //
            // For non-commutative intrinsics, we should have ensured that op2 was marked
            // delay free in order to prevent it from getting assigned the same register
            // as target. However, for commutative intrinsics, we can just swap the operands
            // in order to have "reg2 = reg2 op reg1", which will end up producing the right code.

            noway_assert(node->OperIsCommutative());
            op2Reg = op1Reg;
            op1Reg = targetReg;
        }

        emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
    }
}
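
// For example, Sse.Shuffle(a, b, 0x1B) comes through here via the table-driven IMM path and, with VEX encoding
// available, emits "vshufps xmm0, xmm1, xmm2, 0x1B"; without VEX the emitter falls back to the two-operand RMW
// form. (Illustrative registers; the instruction itself comes from the intrinsic table.)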

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM_R: Generates the code for a hardware intrinsic node that takes a register operand, a
//                          register/memory operand, another register operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//
void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins)
{
    regNumber targetReg = node->gtRegNum;
    GenTree*  op1       = node->gtGetOp1();
    GenTree*  op2       = node->gtGetOp2();
    GenTree*  op3       = nullptr;
    emitAttr  simdSize  = EA_ATTR(node->gtSIMDSize);
    emitter*  emit      = getEmitter();

    assert(op1->OperIsList());
    assert(op2 == nullptr);

    GenTreeArgList* argList = op1->AsArgList();

    op1     = argList->Current();
    argList = argList->Rest();

    op2     = argList->Current();
    argList = argList->Rest();

    op3 = argList->Current();
    assert(argList->Rest() == nullptr);

    regNumber op1Reg = op1->gtRegNum;
    regNumber op3Reg = op3->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op3Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            // pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->OperIsHWIntrinsic())
        {
            emit->emitIns_SIMD_R_R_AR_R(ins, simdSize, targetReg, op1Reg, op3Reg, op2->gtGetOp1()->gtRegNum);
            return;
        }
        else if (op2->isIndir())
        {
            GenTreeIndir* memIndir = op2->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);
                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C_R(ins, simdSize, targetReg, op1Reg, op3Reg, memBase->gtClsVar.gtClsVarHnd,
                                               0);
                    return;
                }

                default:
                {
                    emit->emitIns_SIMD_R_R_A_R(ins, simdSize, targetReg, op1Reg, op3Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S_R(ins, simdSize, targetReg, op1Reg, op3Reg, varNum, offset);
    }
    else
    {
        emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, op3Reg);
    }
}
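
// This shape currently serves the variable-blend intrinsics: e.g. Sse41.BlendVariable may emit roughly
// "vblendvps xmm0, xmm1, xmm2, xmm3" under VEX, where op3 supplies the per-element mask. The non-VEX blendv
// encodings take the mask in the implicit xmm0 register, a constraint that register allocation has to honor;
// the emitter picks the concrete encoding. (Illustrative registers only.)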

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
//                          a register/memory operand, and that returns a value in register
//
// Arguments:
//    ins       - The instruction being generated
//    attr      - The emit attribute
//    targetReg - The target register
//    op1Reg    - The register of the first operand
//    op2Reg    - The register of the second operand
//    op3       - The third operand
//
void CodeGen::genHWIntrinsic_R_R_R_RM(
    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
{
    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op2Reg != REG_NA);

    emitter* emit = getEmitter();

    if (op3->isContained() || op3->isUsedFromSpillTemp())
    {
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op3->isUsedFromSpillTemp())
        {
            assert(op3->IsRegOptional());

            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            // pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op3);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op3->OperIsHWIntrinsic())
        {
            emit->emitIns_SIMD_R_R_R_AR(ins, attr, targetReg, op1Reg, op2Reg, op3->gtGetOp1()->gtRegNum);
            return;
        }
        else if (op3->isIndir())
        {
            GenTreeIndir* memIndir = op3->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);
                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op3->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op3->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op3->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
    }
    else
    {
        emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsicJumpTableFallback: Generates the jump-table fallback for imm-intrinsics
//                                  with a non-constant argument
//
// Arguments:
//    intrinsic      - The intrinsic ID
//    nonConstImmReg - The register containing the non-constant imm8 argument
//    baseReg        - A register for the start of the switch table
//    offsReg        - A register for the offset into the switch table
//    emitSwCase     - The lambda to generate a switch case
//
// Return Value:
//    None.
//
// Note:
//    This function can be used for all imm-intrinsics (whether full-range or not).
//    The compiler front-end (i.e. the importer) is responsible for inserting a range-check IR node
//    (GT_HW_INTRINSIC_CHK) for the imm8 argument, so this function does not need to perform a range check.
//
template <typename HWIntrinsicSwitchCaseBody>
void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsic,
                                              regNumber                 nonConstImmReg,
                                              regNumber                 baseReg,
                                              regNumber                 offsReg,
                                              HWIntrinsicSwitchCaseBody emitSwCase)
{
    assert(nonConstImmReg != REG_NA);
    // AVX2 Gather intrinsics use a managed non-const fallback since they have a discrete imm8 value range
    // that does not work with the compiler-generated jump-table fallback below.
    assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
    emitter* emit = getEmitter();

    const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
    assert(maxByte <= 256);
    BasicBlock* jmpTable[256];

    unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);

    // Emit the jump table
    for (unsigned i = 0; i < maxByte; i++)
    {
        jmpTable[i] = genCreateTempLabel();
        emit->emitDataGenData(i, jmpTable[i]);
    }

    emit->emitDataGenEnd();

    // Compute and jump to the appropriate offset in the switch table
    emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);

    emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
    emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
    emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
    emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);

    // Emit the switch table entries

    BasicBlock* switchTableBeg = genCreateTempLabel();
    BasicBlock* switchTableEnd = genCreateTempLabel();

    genDefineTempLabel(switchTableBeg);

    for (unsigned i = 0; i < maxByte; i++)
    {
        genDefineTempLabel(jmpTable[i]);
        emitSwCase((int8_t)i);
        emit->emitIns_J(INS_jmp, switchTableEnd);
    }

    genDefineTempLabel(switchTableEnd);
}
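
// For a two-entry table (maxByte == 2) the emitted fallback looks roughly like the following, shown here only
// as an illustration of the scheme above:
//
//        lea   offsReg, [jump-table data in the data section]
//        mov   offsReg, dword ptr [offsReg + 4 * nonConstImmReg]   ; load the 32-bit entry for the imm value
//        lea   baseReg, [method code base]                         ; EA_PTR_DSP_RELOC, relative to fgFirstBB
//        add   offsReg, baseReg
//        jmp   offsReg
//    L0: <emitSwCase(0)>
//        jmp   LEnd
//    L1: <emitSwCase(1)>
//        jmp   LEnd
//  LEnd: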

//------------------------------------------------------------------------
// genBaseIntrinsic: Generates the code for a base hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
// Note:
//    We currently assume that all base intrinsics have zero or one operand.
//
void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    var_types      targetType  = node->TypeGet();
    var_types      baseType    = node->gtSIMDBaseType;

    assert(compiler->compSupports(InstructionSet_SSE));
    assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));

    GenTree* op1 = node->gtGetOp1();

    genConsumeHWIntrinsicOperands(node);
    regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->gtRegNum;

    assert(node->gtGetOp2() == nullptr);

    emitter*    emit = getEmitter();
    emitAttr    attr = EA_ATTR(node->gtSIMDSize);
    instruction ins  = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

    switch (intrinsicId)
    {
        case NI_Base_Vector128_CreateScalarUnsafe:
        case NI_Base_Vector256_CreateScalarUnsafe:
        {
            if (varTypeIsIntegral(baseType))
            {
                genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType));
            }
            else
            {
                assert(varTypeIsFloating(baseType));

                attr = emitTypeSize(baseType);

                if (op1->isContained() || op1->isUsedFromSpillTemp())
                {
                    genHWIntrinsic_R_RM(node, ins, attr);
                }
                else if (targetReg != op1Reg)
                {
                    // Just use movaps for reg->reg moves as it has zero latency on modern CPUs
                    emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
                }
            }
            break;
        }

        case NI_Base_Vector128_ToScalar:
        case NI_Base_Vector256_ToScalar:
        {
            assert(varTypeIsFloating(baseType));

            attr = emitTypeSize(TYP_SIMD16);

            if (op1->isContained() || op1->isUsedFromSpillTemp())
            {
                genHWIntrinsic_R_RM(node, ins, attr);
            }
            else if (targetReg != op1Reg)
            {
                // Just use movaps for reg->reg moves as it has zero latency on modern CPUs
                emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
            }
            break;
        }

        case NI_Base_Vector128_ToVector256:
        {
            // ToVector256 has zero-extend semantics in order to ensure it is deterministic.
            // We always emit a move to the target register, even when op1Reg == targetReg,
            // in order to ensure that bits MAXVL-1:128 are zeroed.

            attr = emitTypeSize(TYP_SIMD16);

            if (op1->isContained() || op1->isUsedFromSpillTemp())
            {
                genHWIntrinsic_R_RM(node, ins, attr);
            }
            else
            {
                // Just use movaps for reg->reg moves as it has zero latency on modern CPUs
                emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
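                // Note: a 128-bit vmovaps with VEX encoding zeroes the upper bits (MAXVL-1:128) of the full
                // register as a side effect, which is what gives the move above its zero-extend semantics even
                // when op1Reg == targetReg. (Descriptive note; ToVector256 requires AVX, so VEX is available.)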
            }
            break;
        }

        case NI_Base_Vector128_ToVector256Unsafe:
        case NI_Base_Vector256_GetLower:
        {
            if (op1->isContained() || op1->isUsedFromSpillTemp())
            {
                genHWIntrinsic_R_RM(node, ins, attr);
            }
            else if (targetReg != op1Reg)
            {
                // Just use movaps for reg->reg moves as it has zero latency on modern CPUs
                emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
            }
            break;
        }

        case NI_Base_Vector128_Zero:
        case NI_Base_Vector256_Zero:
        {
            assert(op1 == nullptr);
            emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
            break;
        }

        default:
        {
            unreached();
            break;
        }
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    regNumber      targetReg   = node->gtRegNum;
    var_types      targetType  = node->TypeGet();
    var_types      baseType    = node->gtSIMDBaseType;
    regNumber      op1Reg      = REG_NA;
    emitter*       emit        = getEmitter();

    genConsumeHWIntrinsicOperands(node);

    switch (intrinsicId)
    {
        case NI_SSE_CompareEqualOrderedScalar:
        case NI_SSE_CompareEqualUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            regNumber   tmpReg = node->GetSingleTempReg();
            instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            // Ensure we aren't overwriting targetReg
            assert(tmpReg != targetReg);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
            emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
            emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
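            // The scalar compare instruction sets PF=1 when either input is NaN (unordered). Combining
            // setnp ("ordered") with sete ("equal") via AND therefore yields 1 only for an ordered, equal
            // compare, and the movzx clears the upper bits of the 1-byte result. Roughly:
            //     ucomiss xmm0, xmm1 ; setnp al ; sete cl ; and cl, al ; movzx eax, cl
            // (Illustrative registers; the compare instruction comes from the intrinsic table.)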
            break;
        }

        case NI_SSE_CompareGreaterThanOrderedScalar:
        case NI_SSE_CompareGreaterThanUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
        case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE_CompareLessThanOrderedScalar:
        case NI_SSE_CompareLessThanUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE_CompareLessThanOrEqualOrderedScalar:
        case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE_CompareNotEqualOrderedScalar:
        case NI_SSE_CompareNotEqualUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            regNumber   tmpReg = node->GetSingleTempReg();
            instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            // Ensure we aren't overwriting targetReg
            assert(tmpReg != targetReg);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
            emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
            emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
            break;
        }

        case NI_SSE_X64_ConvertToInt64:
        case NI_SSE_X64_ConvertToInt64WithTruncation:
        {
            assert(targetType == TYP_LONG);
            assert(op1 != nullptr);
            assert(op2 == nullptr);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            genHWIntrinsic_R_RM(node, ins, EA_8BYTE);
            break;
        }

        case NI_SSE_X64_ConvertScalarToVector128Single:
        {
            assert(baseType == TYP_LONG);
            assert(op1 != nullptr);
            assert(op2 != nullptr);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
            break;
        }

        case NI_SSE_Prefetch0:
        case NI_SSE_Prefetch1:
        case NI_SSE_Prefetch2:
        case NI_SSE_PrefetchNonTemporal:
        {
            assert(baseType == TYP_UBYTE);
            assert(op2 == nullptr);

            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
            op1Reg          = op1->gtRegNum;
            emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
            break;
        }

        case NI_SSE_StoreFence:
        {
            assert(baseType == TYP_VOID);
            assert(op1 == nullptr);
            assert(op2 == nullptr);
            emit->emitIns(INS_sfence);
            break;
        }

        default:
            unreached();
            break;
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    regNumber      targetReg   = node->gtRegNum;
    var_types      targetType  = node->TypeGet();
    var_types      baseType    = node->gtSIMDBaseType;
    regNumber      op1Reg      = REG_NA;
    regNumber      op2Reg      = REG_NA;
    emitter*       emit        = getEmitter();

    genConsumeHWIntrinsicOperands(node);

    switch (intrinsicId)
    {
        // All integer overloads are handled by table codegen
        case NI_SSE2_CompareLessThan:
        {
            assert(op1 != nullptr);
            assert(op2 != nullptr);

            assert(baseType == TYP_DOUBLE);

            int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
            assert((ival >= 0) && (ival <= 127));

            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            op1Reg          = op1->gtRegNum;
            op2Reg          = op2->gtRegNum;
            emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
            break;
        }

        case NI_SSE2_CompareEqualOrderedScalar:
        case NI_SSE2_CompareEqualUnorderedScalar:
        {
            assert(baseType == TYP_DOUBLE);
            regNumber   tmpReg = node->GetSingleTempReg();
            instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            // Ensure we aren't overwriting targetReg
            assert(tmpReg != targetReg);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
            emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
            emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
            break;
        }

        case NI_SSE2_CompareGreaterThanOrderedScalar:
        case NI_SSE2_CompareGreaterThanUnorderedScalar:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
        case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE2_CompareLessThanOrderedScalar:
        case NI_SSE2_CompareLessThanUnorderedScalar:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
        case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE2_CompareNotEqualOrderedScalar:
        case NI_SSE2_CompareNotEqualUnorderedScalar:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            regNumber   tmpReg = node->GetSingleTempReg();

            // Ensure we aren't overwriting targetReg
            assert(tmpReg != targetReg);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
            emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
            emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
            break;
        }

        case NI_SSE2_X64_ConvertScalarToVector128Double:
        {
            assert(baseType == TYP_LONG);
            assert(op1 != nullptr);
            assert(op2 != nullptr);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
            break;
        }

        case NI_SSE2_X64_ConvertScalarToVector128Int64:
        case NI_SSE2_X64_ConvertScalarToVector128UInt64:
        {
            assert(baseType == TYP_LONG || baseType == TYP_ULONG);
            assert(op1 != nullptr);
            assert(op2 == nullptr);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType));
            break;
        }

        case NI_SSE2_ConvertToInt32:
        case NI_SSE2_ConvertToInt32WithTruncation:
        case NI_SSE2_ConvertToUInt32:
        case NI_SSE2_X64_ConvertToInt64:
        case NI_SSE2_X64_ConvertToInt64WithTruncation:
        case NI_SSE2_X64_ConvertToUInt64:
        {
            assert(op2 == nullptr);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            if (varTypeIsIntegral(baseType))
            {
                assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
                op1Reg = op1->gtRegNum;
                emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
            }
            else
            {
                assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT);
                genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType));
            }
            break;
        }

        case NI_SSE2_LoadFence:
        {
            assert(baseType == TYP_VOID);
            assert(op1 == nullptr);
            assert(op2 == nullptr);
            emit->emitIns(INS_lfence);
            break;
        }

        case NI_SSE2_MemoryFence:
        {
            assert(baseType == TYP_VOID);
            assert(op1 == nullptr);
            assert(op2 == nullptr);
            emit->emitIns(INS_mfence);
            break;
        }

        case NI_SSE2_StoreNonTemporal:
        case NI_SSE2_X64_StoreNonTemporal:
        {
            assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
            assert(op1 != nullptr);
            assert(op2 != nullptr);

            op2Reg = op2->gtRegNum;
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            op1Reg          = op1->gtRegNum;
            emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
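            // This emits movnti, a non-temporal (streaming) store of op2Reg through [op1Reg] that hints the
            // hardware to bypass the cache hierarchy. (Descriptive note only.)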
            break;
        }

        default:
            unreached();
            break;
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    regNumber      targetReg   = node->gtRegNum;
    var_types      baseType    = node->gtSIMDBaseType;
    regNumber      op1Reg      = REG_NA;
    emitter*       emit        = getEmitter();

    genConsumeHWIntrinsicOperands(node);

    switch (intrinsicId)
    {
        case NI_SSE41_TestAllOnes:
        {
            op1Reg           = op1->gtRegNum;
            regNumber tmpReg = node->GetSingleTempReg();
            assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
            emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
            emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
            emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
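            // pcmpeqd tmpReg, tmpReg, tmpReg materializes an all-ones vector; "ptest op1Reg, tmpReg" then sets
            // CF exactly when (~op1Reg & tmpReg) == 0, i.e. when every bit of op1Reg is one, and setb turns CF
            // into the boolean result. (Descriptive note on the instruction sequence above.)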
            break;
        }

        case NI_SSE41_TestAllZeros:
        case NI_SSE41_TestZ:
        {
            assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
            genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE41_TestC:
        {
            assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
            genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE41_TestMixOnesZeros:
        case NI_SSE41_TestNotZAndNotC:
        {
            assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
            genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE41_Extract:
        case NI_SSE41_X64_Extract:
        {
            regNumber   tmpTargetReg = REG_NA;
            instruction ins          = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            if (baseType == TYP_FLOAT)
            {
                tmpTargetReg = node->ExtractTempReg();
            }

            auto emitSwCase = [&](int8_t i) {
                if (baseType == TYP_FLOAT)
                {
                    // The extract instructions return their result in a GP register, so the emit size needs
                    // to be the int size; the result is then moved back into an XMM register.
                    inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i);
                    emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
                }
                else
                {
                    inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i);
                }
            };

            if (op2->IsCnsIntOrI())
            {
                ssize_t ival = op2->AsIntCon()->IconValue();
                assert((ival >= 0) && (ival <= 255));
                emitSwCase((int8_t)ival);
            }
            else
            {
                // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                regNumber baseReg = node->ExtractTempReg();
                regNumber offsReg = node->GetSingleTempReg();
                genHWIntrinsicJumpTableFallback(intrinsicId, op2->gtRegNum, baseReg, offsReg, emitSwCase);
            }
            break;
        }

        default:
            unreached();
            break;
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    var_types      baseType    = node->gtSIMDBaseType;
    var_types      targetType  = node->TypeGet();
    emitter*       emit        = getEmitter();

    genConsumeHWIntrinsicOperands(node);
    regNumber op1Reg = op1->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op2 != nullptr);
    assert(!node->OperIsCommutative());

    switch (intrinsicId)
    {
        case NI_SSE42_Crc32:
        case NI_SSE42_X64_Crc32:
        {
            if (op1Reg != targetReg)
            {
                assert(op2->gtRegNum != targetReg);
                emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg);
            }

            // This makes the genHWIntrinsic_R_RM code much simpler, as we don't need an
            // overload that explicitly takes the operands.
            node->gtOp1 = op2;
            node->gtOp2 = nullptr;

            if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument
            {
                assert(targetType == TYP_INT);
                genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType));
            }
            else
            {
                assert(op1->TypeGet() == op2->TypeGet());
                assert((targetType == TYP_INT) || (targetType == TYP_LONG));
                genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType));
            }
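            // For example, Sse42.Crc32(crc, (byte)b) emits the byte form "crc32 eax, cl", while the uint/ulong
            // overloads use the result's operand size, e.g. "crc32 rax, rcx" for the X64 ulong overload.
            // (Illustrative registers.)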
            break;
        }

        default:
        {
            unreached();
            break;
        }
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    var_types      baseType    = node->gtSIMDBaseType;
    emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
    var_types      targetType  = node->TypeGet();
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
    int            numArgs     = HWIntrinsicInfo::lookupNumArgs(node);
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    regNumber      op1Reg      = REG_NA;
    regNumber      op2Reg      = REG_NA;
    regNumber      targetReg   = node->gtRegNum;
    emitter*       emit        = getEmitter();

    genConsumeHWIntrinsicOperands(node);

    switch (intrinsicId)
    {
        case NI_AVX2_ConvertToInt32:
        case NI_AVX2_ConvertToUInt32:
        {
            op1Reg = op1->gtRegNum;
            assert(numArgs == 1);
            assert((baseType == TYP_INT) || (baseType == TYP_UINT));
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
            break;
        }

        case NI_AVX2_GatherVector128:
        case NI_AVX2_GatherVector256:
        case NI_AVX2_GatherMaskVector128:
        case NI_AVX2_GatherMaskVector256:
        {
            GenTreeArgList* list = op1->AsArgList();
            op1                  = list->Current();
            op1Reg               = op1->gtRegNum;

            list   = list->Rest();
            op2    = list->Current();
            op2Reg = op2->gtRegNum;

            list         = list->Rest();
            GenTree* op3 = list->Current();

            list             = list->Rest();
            GenTree* op4     = nullptr;
            GenTree* lastOp  = nullptr;
            GenTree* indexOp = nullptr;

            regNumber op3Reg       = REG_NA;
            regNumber op4Reg       = REG_NA;
            regNumber addrBaseReg  = REG_NA;
            regNumber addrIndexReg = REG_NA;
            regNumber maskReg      = node->ExtractTempReg(RBM_ALLFLOAT);

            if (numArgs == 5)
            {
                assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
                op4          = list->Current();
                list         = list->Rest();
                lastOp       = list->Current();
                op3Reg       = op3->gtRegNum;
                op4Reg       = op4->gtRegNum;
                addrBaseReg  = op2Reg;
                addrIndexReg = op3Reg;
                indexOp      = op3;

                // Copy op4Reg into the tmp mask register;
                // the mask register will be cleared by the gather instruction.
                emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);

                if (targetReg != op1Reg)
                {
                    // Copy the source vector to the target register for the masking merge.
                    emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
                }
            }
            else
            {
                assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
                addrBaseReg  = op1Reg;
                addrIndexReg = op2Reg;
                indexOp      = op2;
                lastOp       = op3;

                // Generate an all-ones mask vector.
                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
            }

            bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);

            // hwintrinsiclistxarch.h lists the dword-index instructions by default.
            if (varTypeIsLong(node->gtIndexBaseType))
            {
                switch (ins)
                {
                    case INS_vpgatherdd:
                        ins = INS_vpgatherqd;
                        if (isVector128GatherWithVector256Index)
                        {
                            // YMM index in address mode
                            attr = emitTypeSize(TYP_SIMD32);
                        }
                        break;
                    case INS_vpgatherdq:
                        ins = INS_vpgatherqq;
                        break;
                    case INS_vgatherdps:
                        ins = INS_vgatherqps;
                        if (isVector128GatherWithVector256Index)
                        {
                            // YMM index in address mode
                            attr = emitTypeSize(TYP_SIMD32);
                        }
                        break;
                    case INS_vgatherdpd:
                        ins = INS_vgatherqpd;
                        break;
                    default:
                        unreached();
                }
            }

            assert(lastOp->IsCnsIntOrI());
            ssize_t ival = lastOp->AsIntCon()->IconValue();
            assert((ival >= 0) && (ival <= 255));

            assert(targetReg != maskReg);
            assert(targetReg != addrIndexReg);
            assert(maskReg != addrIndexReg);
            emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);
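
            // The line above emits the gather itself, e.g. roughly "vpgatherdd xmm0, [rcx + xmm1 * scale], xmm2"
            // for GatherVector128<int>, where the scale is the imm8 taken from lastOp. The mask register is both
            // an input (which elements to load) and clobbered (zeroed) by the instruction, hence the temporary
            // maskReg and the asserts above. (Illustrative registers only.)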
            break;
        }

        case NI_AVX_TestC:
        {
            genHWIntrinsic_R_RM(node, ins, attr);
            emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_AVX_TestNotZAndNotC:
        {
            genHWIntrinsic_R_RM(node, ins, attr);
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_AVX_TestZ:
        {
            genHWIntrinsic_R_RM(node, ins, attr);
            emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        default:
            unreached();
            break;
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genAESIntrinsic: Generates the code for an AES hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
{
    NYI("Implement AES intrinsic code generation");
}

//------------------------------------------------------------------------
// genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 or BMI2 hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    var_types      targetType  = node->TypeGet();
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);
    emitter*       emit        = getEmitter();

    assert(targetReg != REG_NA);
    assert(op1 != nullptr);

    genConsumeHWIntrinsicOperands(node);

    switch (intrinsicId)
    {
        case NI_BMI1_AndNot:
        case NI_BMI1_X64_AndNot:
        case NI_BMI1_BitFieldExtract:
        case NI_BMI1_X64_BitFieldExtract:
        case NI_BMI2_ParallelBitDeposit:
        case NI_BMI2_ParallelBitExtract:
        case NI_BMI2_X64_ParallelBitDeposit:
        case NI_BMI2_X64_ParallelBitExtract:
        case NI_BMI2_ZeroHighBits:
        case NI_BMI2_X64_ZeroHighBits:
        {
            assert(op2 != nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet()));
            break;
        }

        case NI_BMI1_ExtractLowestSetBit:
        case NI_BMI1_GetMaskUpToLowestSetBit:
        case NI_BMI1_ResetLowestSetBit:
        case NI_BMI1_X64_ExtractLowestSetBit:
        case NI_BMI1_X64_GetMaskUpToLowestSetBit:
        case NI_BMI1_X64_ResetLowestSetBit:
        {
            assert(op2 == nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
            break;
        }

        case NI_BMI1_TrailingZeroCount:
        case NI_BMI1_X64_TrailingZeroCount:
        {
            assert(op2 == nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genXCNTIntrinsic(node, ins);
            break;
        }

        case NI_BMI2_MultiplyNoFlags:
        case NI_BMI2_X64_MultiplyNoFlags:
        {
            int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
            assert(numArgs == 2 || numArgs == 3);

            regNumber op1Reg = REG_NA;
            regNumber op2Reg = REG_NA;
            regNumber op3Reg = REG_NA;
            regNumber lowReg = REG_NA;

            if (numArgs == 2)
            {
                op1Reg = op1->gtRegNum;
                op2Reg = op2->gtRegNum;
                lowReg = targetReg;
            }
            else
            {
                GenTreeArgList* argList = op1->AsArgList();
                op1                     = argList->Current();
                op1Reg                  = op1->gtRegNum;
                argList                 = argList->Rest();
                op2                     = argList->Current();
                op2Reg                  = op2->gtRegNum;
                argList                 = argList->Rest();
                GenTree* op3            = argList->Current();
                op3Reg                  = op3->gtRegNum;
                assert(op3Reg != op1Reg);
                assert(op3Reg != targetReg);
                assert(op3Reg != REG_EDX);
                lowReg = node->GetSingleTempReg();
                assert(op3Reg != lowReg);
                assert(lowReg != targetReg);
            }

            emitAttr attr = emitTypeSize(targetType);
            // Move the first operand into the implicit source operand EDX/RDX.
            if (op1Reg != REG_EDX)
            {
                assert(op2Reg != REG_EDX);
                emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg);
            }

            // Generate the MULX instruction.
            genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2);
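
            // mulx multiplies EDX/RDX by op2 without touching the flags, writing the high half of the product
            // to targetReg and the low half to lowReg, e.g. roughly "mulx rax, rcx, rbx". (Illustrative
            // registers; see the MULX documentation for the exact operand roles.)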

            // If the lower half of the result is required, store it to the memory pointed to by op3.
            if (numArgs == 3)
            {
                emit->emitIns_AR_R(INS_mov, attr, lowReg, op3Reg, 0);
            }

            break;
        }

        default:
        {
            unreached();
            break;
        }
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    var_types      baseType    = node->gtSIMDBaseType;
    emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
    GenTree*       op1         = node->gtGetOp1();
    regNumber      targetReg   = node->gtRegNum;

    assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);

    genConsumeHWIntrinsicOperands(node);
    GenTreeArgList* argList = op1->AsArgList();
    op1                     = argList->Current();

    argList = argList->Rest();
    GenTree* op2 = argList->Current();

    argList = argList->Rest();
    GenTree* op3 = argList->Current();

    regNumber op1Reg = REG_NA;
    regNumber op2Reg = REG_NA;

    bool       isCommutative   = false;
    const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);

    // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
    assert(!copiesUpperBits || !op1->isContained());

    if (op3->isContained() || op3->isUsedFromSpillTemp())
    {
        // 213 form: op1 = (op2 * op1) + [op3]

        op1Reg = op1->gtRegNum;
        op2Reg = op2->gtRegNum;

        isCommutative = !copiesUpperBits;
    }
    else if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        // 132 form: op1 = (op1 * op3) + [op2]

        ins    = (instruction)(ins - 1);
        op1Reg = op1->gtRegNum;
        op2Reg = op3->gtRegNum;
        op3    = op2;
    }
    else if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        // 231 form: op3 = (op2 * op3) + [op1]

        ins    = (instruction)(ins + 1);
        op1Reg = op3->gtRegNum;
        op2Reg = op2->gtRegNum;
        op3    = op1;
    }
    else
    {
        // 213 form: op1 = (op2 * op1) + op3

        op1Reg = op1->gtRegNum;
        op2Reg = op2->gtRegNum;

        isCommutative = !copiesUpperBits;
    }
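
    // The 132/213/231 suffix encodes which operand supplies the register/memory slot: e.g. vfmadd213ps
    // computes op1 = (op2 * op1) + op3 with op3 allowed to be a memory operand. The (instruction)(ins - 1)
    // and (ins + 1) adjustments above assume the three forms are declared consecutively (132, 213, 231) in
    // the instruction table, with the 213 form as the table's default entry.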

    if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
    {
        assert(node->isRMWHWIntrinsic(compiler));

        // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
        //
        // For non-commutative intrinsics, we should have ensured that op2 was marked
        // delay free in order to prevent it from getting assigned the same register
        // as target. However, for commutative intrinsics, we can just swap the operands
        // in order to have "reg2 = reg2 op reg1", which will end up producing the right code.

        op2Reg = op1Reg;
        op1Reg = targetReg;
    }

    genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
    genProduceReg(node);
}

//------------------------------------------------------------------------
// genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
{
    assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount ||
           node->gtHWIntrinsicId == NI_LZCNT_X64_LeadingZeroCount);

    genConsumeOperands(node);
    genXCNTIntrinsic(node, INS_lzcnt);
    genProduceReg(node);
}

//------------------------------------------------------------------------
// genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
{
    NYI("Implement PCLMULQDQ intrinsic code generation");
}

//------------------------------------------------------------------------
// genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
{
    assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount || node->gtHWIntrinsicId == NI_POPCNT_X64_PopCount);

    genConsumeOperands(node);
    genXCNTIntrinsic(node, INS_popcnt);
    genProduceReg(node);
}

//------------------------------------------------------------------------
// genXCNTIntrinsic: Generates the code for a lzcnt/tzcnt/popcnt hardware intrinsic node, breaking any false
//                   dependency on the target register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//
void CodeGen::genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins)
{
    // LZCNT/TZCNT/POPCNT have a false dependency on the target register on Intel Sandy Bridge, Haswell, and Skylake
    // (POPCNT only) processors, so insert a `XOR target, target` to break the dependency via XOR triggering register
    // renaming, but only if it's not an actual dependency.
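    //
    // For example, with the value in memory at [rax] and ecx as the target, we emit roughly:
    //     xor    ecx, ecx              ; breaks the dependence on the previous value of ecx
    //     popcnt ecx, dword ptr [rax]
    // The xor is skipped when the target register is also one of the source registers, since the dependency
    // is then a real one. (Illustrative registers.)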

    GenTree*  op1        = node->gtGetOp1();
    regNumber sourceReg1 = REG_NA;
    regNumber sourceReg2 = REG_NA;

    if (!op1->isContained())
    {
        sourceReg1 = op1->gtRegNum;
    }
    else if (op1->isIndir())
    {
        GenTreeIndir* indir   = op1->AsIndir();
        GenTree*      memBase = indir->Base();

        if (memBase != nullptr)
        {
            sourceReg1 = memBase->gtRegNum;
        }

        if (indir->HasIndex())
        {
            sourceReg2 = indir->Index()->gtRegNum;
        }
    }

    regNumber targetReg = node->gtRegNum;
    if ((targetReg != sourceReg1) && (targetReg != sourceReg2))
    {
        getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
    }
    genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
}

#endif // FEATURE_HW_INTRINSICS