1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
8 XX Intel hardware intrinsic Code Generator XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
18 #ifdef FEATURE_HW_INTRINSICS
22 #include "sideeffects.h"
25 #include "gcinfoencoder.h"
27 //------------------------------------------------------------------------
28 // assertIsContainableHWIntrinsicOp: Asserts that op is containable by node
31 // lowering - The lowering phase from the compiler
32 // node - The HWIntrinsic node that has the contained node
33 // op - The op that is contained
35 static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op)
// Debug-only sanity check: 'op' has already been marked contained (or is being used from memory),
// so verify that Lowering would indeed have considered it containable (or at least reg-optional)
// for 'node'. Fires an assert otherwise; produces no code.
38     // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register allocation
39     // logic. However, this check is still important due to the various containment rules that SIMD intrinsics follow.
41     // We use isContainable to track the special HWIntrinsic node containment rules (for things like LoadAligned and
42     // LoadUnaligned) and we use the supportsRegOptional check to support general-purpose loads (both from stack
44     // and for isUsedFromMemory contained nodes, in the case where the register allocator decided to not allocate a
46     // in the first place).
48     bool supportsRegOptional = false;
49     bool isContainable = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional);
// Either the special containment rules or the general reg-optional support must justify the containment.
50     assert(isContainable || supportsRegOptional);
54 //------------------------------------------------------------------------
55 // genIsTableDrivenHWIntrinsic:
58 // category - category of a HW intrinsic
61 // returns true if this category can be table-driven in CodeGen
63 static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicCategory category)
// Returns true when this intrinsic can be emitted by the generic table-driven path in
// genHWIntrinsic: its category must not be Special/Scalar/Helper, and it must not be
// flagged as generating multiple instructions or requiring special codegen.
65     // TODO - move more categories into the table-driven framework
66     // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen
67     const bool tableDrivenCategory =
68         (category != HW_Category_Special) && (category != HW_Category_Scalar) && (category != HW_Category_Helper);
69     const bool tableDrivenFlag =
70         !HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId) && !HWIntrinsicInfo::HasSpecialCodegen(intrinsicId);
71     return tableDrivenCategory && tableDrivenFlag;
74 //------------------------------------------------------------------------
75 // genHWIntrinsic: Generates the code for a given hardware intrinsic node.
78 // node - The hardware intrinsic node
80 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
// Top-level dispatcher for hardware-intrinsic codegen. Intrinsics that qualify for the
// table-driven framework (see genIsTableDrivenHWIntrinsic) are emitted inline here,
// keyed on arg count and category; everything else falls through to a per-ISA handler.
82     NamedIntrinsic      intrinsicId = node->gtHWIntrinsicId;
83     InstructionSet      isa         = HWIntrinsicInfo::lookupIsa(intrinsicId);
84     HWIntrinsicCategory category    = HWIntrinsicInfo::lookupCategory(intrinsicId);
85     int                 ival        = HWIntrinsicInfo::lookupIval(intrinsicId);
86     int                 numArgs     = HWIntrinsicInfo::lookupNumArgs(node);
88     assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));
90     if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
92         GenTree*  op1        = node->gtGetOp1();
93         GenTree*  op2        = node->gtGetOp2();
94         regNumber targetReg  = node->gtRegNum;
95         var_types targetType = node->TypeGet();
96         var_types baseType   = node->gtSIMDBaseType;
98         regNumber op1Reg = REG_NA;
99         regNumber op2Reg = REG_NA;
100         emitter*  emit   = getEmitter();
102         assert(numArgs >= 0);
// The instruction is selected per base element type from the intrinsic table.
103         instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
104         assert(ins != INS_invalid);
105         emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
106         assert(simdSize != 0);
// One-operand case: either a memory load or a simple R/RM form.
112                 if (node->OperIsMemoryLoad())
114                     genConsumeAddress(op1);
115                     // Until we improve the handling of addressing modes in the emitter, we'll create a
116                     // temporary GT_IND to generate code with.
117                     GenTreeIndir load = indirForm(node->TypeGet(), op1);
118                     emit->emitInsLoadInd(ins, simdSize, node->gtRegNum, &load);
123                     op1Reg = op1->gtRegNum;
125                     if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
// Instructions that copy the upper bits take the same register for both sources here.
127                         emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
129                     else if ((ival != -1) && varTypeIsFloating(baseType))
// The intrinsic carries a fixed immediate (e.g. a comparison predicate); it must fit in imm8.
131                         assert((ival >= 0) && (ival <= 127));
132                         genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
136                         genHWIntrinsic_R_RM(node, ins, simdSize);
// Two-operand case: stores, immediate forms, masked loads, and the general R_R_RM form.
144                 if (category == HW_Category_MemoryStore)
146                     genConsumeAddress(op1);
148                     // Until we improve the handling of addressing modes in the emitter, we'll create a
149                     // temporary GT_STORE_IND to generate code with.
150                     GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
151                     emit->emitInsStoreInd(ins, simdSize, &store);
157                 op1Reg = op1->gtRegNum;
158                 op2Reg = op2->gtRegNum;
160                 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
162                     // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
164                     // For non-commutative intrinsics, we should have ensured that op2 was marked
165                     // delay free in order to prevent it from getting assigned the same register
166                     // as target. However, for commutative intrinsics, we can just swap the operands
167                     // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
169                     noway_assert(node->OperIsCommutative());
174                 if ((ival != -1) && varTypeIsFloating(baseType))
176                     assert((ival >= 0) && (ival <= 127));
177                     genHWIntrinsic_R_R_RM_I(node, ins, ival);
179                 else if (category == HW_Category_MemoryLoad)
181                     // Get the address and the 'other' register.
184                     if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
194                     // Until we improve the handling of addressing modes in the emitter, we'll create a
195                     // temporary GT_IND to generate code with.
196                     GenTreeIndir load = indirForm(node->TypeGet(), addr);
197                     genHWIntrinsic_R_R_RM(node, ins, simdSize, targetReg, otherReg, &load);
199                 else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
203                     if (intrinsicId == NI_SSE2_Extract)
205                         // extract instructions return to GP-registers, so it needs int size as the emitsize
206                         simdSize = emitTypeSize(TYP_INT);
// emitSwCase emits one instruction for a given imm8; reused for both the constant
// and the jump-table (non-constant imm) paths below.
209                     auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };
211                     if (op2->IsCnsIntOrI())
213                         ssize_t ival = op2->AsIntCon()->IconValue();
214                         assert((ival >= 0) && (ival <= 255));
215                         emitSwCase((int8_t)ival);
219                         // We emit a fallback case for the scenario when the imm-op is not a constant. This should
220                         // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
221                         // can also occur if the consumer calls it directly and just doesn't pass a constant value.
222                         regNumber baseReg = node->ExtractTempReg();
223                         regNumber offsReg = node->GetSingleTempReg();
224                         genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase);
229                     genHWIntrinsic_R_R_RM(node, ins, EA_ATTR(node->gtSIMDSize));
// Three-operand case: operands arrive as a GT_LIST; walk it to extract op1..op3.
236                 GenTreeArgList* argList = op1->AsArgList();
237                 op1                     = argList->Current();
239                 op1Reg = op1->gtRegNum;
241                 argList = argList->Rest();
242                 op2     = argList->Current();
244                 op2Reg = op2->gtRegNum;
246                 argList      = argList->Rest();
247                 GenTree* op3 = argList->Current();
249                 regNumber op3Reg = op3->gtRegNum;
251                 if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
255                     auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, i); };
257                     if (op3->IsCnsIntOrI())
259                         ssize_t ival = op3->AsIntCon()->IconValue();
260                         assert((ival >= 0) && (ival <= 255));
261                         emitSwCase((int8_t)ival);
265                         // We emit a fallback case for the scenario when the imm-op is not a constant. This should
266                         // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
267                         // can also occur if the consumer calls it directly and just doesn't pass a constant value.
268                         regNumber baseReg = node->ExtractTempReg();
269                         regNumber offsReg = node->GetSingleTempReg();
270                         genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase);
273                 else if (category == HW_Category_MemoryStore)
275                     if (intrinsicId == NI_AVX_MaskStore || intrinsicId == NI_AVX2_MaskStore)
277                         emit->emitIns_AR_R_R(ins, simdSize, op2Reg, op3Reg, op1Reg, 0);
281                         assert(intrinsicId == NI_SSE2_MaskMove);
282                         assert(targetReg == REG_NA);
284                         // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
285                         if (op3Reg != REG_EDI)
287                             emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
289                         emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
// Variable-blend intrinsics take a third register operand (the mask) rather than an imm8.
296                         case NI_SSE41_BlendVariable:
297                         case NI_AVX_BlendVariable:
298                         case NI_AVX2_BlendVariable:
300                             genHWIntrinsic_R_R_RM_R(node, ins);
// Non-table-driven intrinsics: dispatch to the per-ISA handwritten codegen routine.
324         case InstructionSet_Base:
325             genBaseIntrinsic(node);
327         case InstructionSet_SSE:
328         case InstructionSet_SSE_X64:
329             genSSEIntrinsic(node);
331         case InstructionSet_SSE2:
332         case InstructionSet_SSE2_X64:
333             genSSE2Intrinsic(node);
335         case InstructionSet_SSE41:
336         case InstructionSet_SSE41_X64:
337             genSSE41Intrinsic(node);
339         case InstructionSet_SSE42:
340         case InstructionSet_SSE42_X64:
341             genSSE42Intrinsic(node);
343         case InstructionSet_AVX:
344         case InstructionSet_AVX2:
345             genAvxOrAvx2Intrinsic(node);
347         case InstructionSet_AES:
348             genAESIntrinsic(node);
350         case InstructionSet_BMI1:
351         case InstructionSet_BMI1_X64:
352         case InstructionSet_BMI2:
353         case InstructionSet_BMI2_X64:
354             genBMI1OrBMI2Intrinsic(node);
356         case InstructionSet_FMA:
357             genFMAIntrinsic(node);
359         case InstructionSet_LZCNT:
360         case InstructionSet_LZCNT_X64:
361             genLZCNTIntrinsic(node);
363         case InstructionSet_PCLMULQDQ:
364             genPCLMULQDQIntrinsic(node);
366         case InstructionSet_POPCNT:
367         case InstructionSet_POPCNT_X64:
368             genPOPCNTIntrinsic(node);
376 //------------------------------------------------------------------------
377 // genHWIntrinsic_R_RM: Generates the code for a hardware intrinsic node that takes a
378 // register/memory operand and that returns a value in register
381 // node - The hardware intrinsic node
382 // ins - The instruction being generated
383 // attr       - The emit attribute for the instruction being generated
385 void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
// Emits "targetReg = ins op1" where op1 may live in a register, on the stack, in a spill
// temp, or behind an indirection. Contained operands are folded into the instruction's
// memory operand instead of being loaded first.
387     var_types targetType = node->TypeGet();
388     regNumber targetReg  = node->gtRegNum;
389     GenTree*  op1        = node->gtGetOp1();
390     GenTree*  op2        = node->gtGetOp2();
391     emitter*  emit       = getEmitter();
395         // The Compare*OrderedScalar and Compare*UnorderedScalar intrinsics come down this
396         // code path. They are all MultiIns, as the return value comes from the flags and
397         // we have two operands instead.
399         assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId));
400         assert(targetReg != REG_NA);
// For the flags-producing comparisons the destination of the compare is op1's register.
402         targetReg = op1->gtRegNum;
408         assert(!node->OperIsCommutative());
411     assert(targetReg != REG_NA);
412     assert(op2 == nullptr);
414     if (op1->isContained() || op1->isUsedFromSpillTemp())
416         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
417         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
419         TempDsc* tmpDsc = nullptr;
420         unsigned varNum = BAD_VAR_NUM;
421         unsigned offset = (unsigned)-1;
423         if (op1->isUsedFromSpillTemp())
425             assert(op1->IsRegOptional());
427             tmpDsc = getSpillTempDsc(op1);
428             varNum = tmpDsc->tdTempNum();
// Release the temp now; the emit below only needs its frame slot number.
431             regSet.tmpRlsTemp(tmpDsc);
433         else if (op1->OperIsHWIntrinsic())
// A contained HWIntrinsic load: use its address operand's register as a base address.
435             emit->emitIns_R_AR(ins, attr, targetReg, op1->gtGetOp1()->gtRegNum, 0);
438         else if (op1->isIndir())
440             GenTreeIndir* memIndir = op1->AsIndir();
441             GenTree*      memBase  = memIndir->gtOp1;
443             switch (memBase->OperGet())
445                 case GT_LCL_VAR_ADDR:
447                     varNum = memBase->AsLclVarCommon()->GetLclNum();
450                     // Ensure that all the GenTreeIndir values are set to their defaults.
451                     assert(!memIndir->HasIndex());
452                     assert(memIndir->Scale() == 1);
453                     assert(memIndir->Offset() == 0);
458                 case GT_CLS_VAR_ADDR:
460                     emit->emitIns_R_C(ins, attr, targetReg, memBase->gtClsVar.gtClsVarHnd, 0);
// General indirection: let the emitter fold the full addressing mode.
466                     emit->emitIns_R_A(ins, attr, targetReg, memIndir);
473             switch (op1->OperGet())
477                     GenTreeLclFld* lclField = op1->AsLclFld();
479                     varNum = lclField->GetLclNum();
480                     offset = lclField->gtLclFld.gtLclOffs;
486                     assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
487                     varNum = op1->AsLclVar()->GetLclNum();
500         // Ensure we got a good varNum and offset.
501         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
502         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
503         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
504         assert(offset != (unsigned)-1);
506         emit->emitIns_R_S(ins, attr, targetReg, varNum, offset);
// Not contained: plain register-to-register form.
510         regNumber op1Reg = op1->gtRegNum;
511         emit->emitIns_R_R(ins, attr, targetReg, op1Reg);
515 //------------------------------------------------------------------------
516 // genHWIntrinsic_R_RM_I: Generates the code for a hardware intrinsic node that takes a register/memory operand,
517 // an immediate operand, and that returns a value in register
520 // node - The hardware intrinsic node
521 // ins - The instruction being generated
522 // ival - The immediate value
524 void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
// Emits "targetReg = ins op1, ival" — one register/memory source plus an imm8.
// Containment handling is delegated to inst_RV_TT_IV, which folds a contained op1
// into the instruction's memory operand.
526     var_types targetType = node->TypeGet();
527     regNumber targetReg  = node->gtRegNum;
528     GenTree*  op1        = node->gtGetOp1();
529     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
530     emitter*  emit       = getEmitter();
532     // TODO-XArch-CQ: Commutative operations can have op1 be contained
533     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
535     assert(targetReg != REG_NA);
536     assert(!node->OperIsCommutative()); // One operand intrinsics cannot be commutative
538     if (op1->isContained() || op1->isUsedFromSpillTemp())
540         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
541         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
543     inst_RV_TT_IV(ins, simdSize, targetReg, op1, ival);
546 //------------------------------------------------------------------------
547 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
548 // register/memory operand, and that returns a value in register
551 // node - The hardware intrinsic node
552 // ins - The instruction being generated
553 // attr - The emit attribute for the instruction being generated
555 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
// Convenience overload: pulls targetReg/op1/op2 out of 'node' and forwards to the
// six-argument genHWIntrinsic_R_R_RM, which does the actual containment handling.
557     regNumber targetReg = node->gtRegNum;
558     GenTree*  op1       = node->gtGetOp1();
559     GenTree*  op2       = node->gtGetOp2();
560     regNumber op1Reg    = op1->gtRegNum;
562     assert(targetReg != REG_NA);
563     assert(op1Reg != REG_NA);
565     genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2);
568 //------------------------------------------------------------------------
569 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
570 // register/memory operand, and that returns a value in register
573 // node - The hardware intrinsic node
574 // ins - The instruction being generated
575 // attr      - The emit attribute for the instruction being generated
576 // targetReg - The register allocated to the result
577 // op1Reg - The register allocated to the first operand
578 // op2 - Another operand that maybe in register or memory
580 void CodeGen::genHWIntrinsic_R_R_RM(
581     GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2)
// Emits "targetReg = ins op1Reg, op2" where op2 may be a register, spill temp, local
// (var or field), or indirection. Contained op2 forms are folded into the instruction's
// memory operand; otherwise a three-register SIMD form is emitted.
583     emitter* emit = getEmitter();
585     // TODO-XArch-CQ: Commutative operations can have op1 be contained
586     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
588     assert(targetReg != REG_NA);
589     assert(op1Reg != REG_NA);
591     if (op2->isContained() || op2->isUsedFromSpillTemp())
593         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
594         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
596         TempDsc* tmpDsc = nullptr;
597         unsigned varNum = BAD_VAR_NUM;
598         unsigned offset = (unsigned)-1;
600         if (op2->isUsedFromSpillTemp())
602             assert(op2->IsRegOptional());
604             tmpDsc = getSpillTempDsc(op2);
605             varNum = tmpDsc->tdTempNum();
// Release the temp now; only its frame slot number is needed for the emit below.
608             regSet.tmpRlsTemp(tmpDsc);
610         else if (op2->OperIsHWIntrinsic())
612             GenTree* addr = op2->gtGetOp1();
613             // Until we improve the handling of addressing modes in the emitter, we'll create a
614             // temporary GT_IND to generate code with.
615             GenTreeIndir load = indirForm(node->TypeGet(), addr);
616             emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, &load);
619         else if (op2->isIndir())
621             GenTreeIndir* memIndir = op2->AsIndir();
622             GenTree*      memBase  = memIndir->gtOp1;
624             switch (memBase->OperGet())
626                 case GT_LCL_VAR_ADDR:
628                     varNum = memBase->AsLclVarCommon()->GetLclNum();
631                     // Ensure that all the GenTreeIndir values are set to their defaults.
632                     assert(!memIndir->HasIndex());
633                     assert(memIndir->Scale() == 1);
634                     assert(memIndir->Offset() == 0);
639                 case GT_CLS_VAR_ADDR:
641                     emit->emitIns_SIMD_R_R_C(ins, attr, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
// General indirection: let the emitter fold the full addressing mode.
647                     emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, memIndir);
654             switch (op2->OperGet())
658                     GenTreeLclFld* lclField = op2->AsLclFld();
660                     varNum = lclField->GetLclNum();
661                     offset = lclField->gtLclFld.gtLclOffs;
667                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
668                     varNum = op2->AsLclVar()->GetLclNum();
679         // Ensure we got a good varNum and offset.
680         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
681         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
682         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
683         assert(offset != (unsigned)-1);
685         emit->emitIns_SIMD_R_R_S(ins, attr, targetReg, op1Reg, varNum, offset);
689         regNumber op2Reg = op2->gtRegNum;
691         if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
693             // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
695             // For non-commutative intrinsics, we should have ensured that op2 was marked
696             // delay free in order to prevent it from getting assigned the same register
697             // as target. However, for commutative intrinsics, we can just swap the operands
698             // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
700             noway_assert(node->OperIsCommutative());
705         emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg);
709 //------------------------------------------------------------------------
710 // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
711 // register/memory operand, an immediate operand, and that returns a value in register
714 // node - The hardware intrinsic node
715 // ins - The instruction being generated
716 // ival - The immediate value
718 void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
// Emits "targetReg = ins op1, op2, ival" — register source, register/memory source, imm8.
// Handles both the two-operand node shape and the GT_LIST shape (where the imm node is the
// third list element); contained op2 forms are folded into the memory operand.
720     var_types targetType = node->TypeGet();
721     regNumber targetReg  = node->gtRegNum;
722     GenTree*  op1        = node->gtGetOp1();
723     GenTree*  op2        = node->gtGetOp2();
724     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
725     emitter*  emit       = getEmitter();
727     // TODO-XArch-CQ: Commutative operations can have op1 be contained
728     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
730     if (op1->OperIsList())
732         assert(op2 == nullptr);
734         GenTreeArgList* argList = op1->AsArgList();
736         op1     = argList->Current();
737         argList = argList->Rest();
739         op2     = argList->Current();
740         argList = argList->Rest();
// The third list element is the immediate operand; nothing may follow it.
742         assert(argList->Current() != nullptr);
743         assert(argList->Rest() == nullptr);
746     regNumber op1Reg = op1->gtRegNum;
748     assert(targetReg != REG_NA);
749     assert(op1Reg != REG_NA);
751     if (op2->isContained() || op2->isUsedFromSpillTemp())
753         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
754         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
756         TempDsc* tmpDsc = nullptr;
757         unsigned varNum = BAD_VAR_NUM;
758         unsigned offset = (unsigned)-1;
760         if (op2->isUsedFromSpillTemp())
762             assert(op2->IsRegOptional());
764             tmpDsc = getSpillTempDsc(op2);
765             varNum = tmpDsc->tdTempNum();
// Release the temp now; only its frame slot number is needed for the emit below.
768             regSet.tmpRlsTemp(tmpDsc);
770         else if (op2->OperIsHWIntrinsic())
// A contained HWIntrinsic load: use its address operand's register as a base address.
772             emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
775         else if (op2->isIndir())
777             GenTreeIndir* memIndir = op2->AsIndir();
778             GenTree*      memBase  = memIndir->gtOp1;
780             switch (memBase->OperGet())
782                 case GT_LCL_VAR_ADDR:
784                     varNum = memBase->AsLclVarCommon()->GetLclNum();
787                     // Ensure that all the GenTreeIndir values are set to their defaults.
788                     assert(!memIndir->HasIndex());
789                     assert(memIndir->Scale() == 1);
790                     assert(memIndir->Offset() == 0);
795                 case GT_CLS_VAR_ADDR:
797                     emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
// General indirection: let the emitter fold the full addressing mode.
804                     emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
811             switch (op2->OperGet())
815                     GenTreeLclFld* lclField = op2->AsLclFld();
817                     varNum = lclField->GetLclNum();
818                     offset = lclField->gtLclFld.gtLclOffs;
824                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
825                     varNum = op2->AsLclVar()->GetLclNum();
836         // Ensure we got a good varNum and offset.
837         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
838         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
839         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
840         assert(offset != (unsigned)-1);
842         emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
846         regNumber op2Reg = op2->gtRegNum;
848         if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
850             // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
852             // For non-commutative intrinsics, we should have ensured that op2 was marked
853             // delay free in order to prevent it from getting assigned the same register
854             // as target. However, for commutative intrinsics, we can just swap the operands
855             // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
857             noway_assert(node->OperIsCommutative());
862         emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
866 //------------------------------------------------------------------------
867 // genHWIntrinsic_R_R_RM_R: Generates the code for a hardware intrinsic node that takes a register operand, a
868 // register/memory operand, another register operand, and that returns a value in register
871 // node - The hardware intrinsic node
872 // ins - The instruction being generated
874 void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins)
// Emits "targetReg = ins op1, op2, op3" where op2 may be register or memory and op3 is a
// register (e.g. the mask register of the variable-blend instructions). Operands always
// arrive as a GT_LIST of exactly three elements.
876     var_types targetType = node->TypeGet();
877     regNumber targetReg  = node->gtRegNum;
878     GenTree*  op1        = node->gtGetOp1();
879     GenTree*  op2        = node->gtGetOp2();
880     GenTree*  op3        = nullptr;
881     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
882     emitter*  emit       = getEmitter();
884     assert(op1->OperIsList());
885     assert(op2 == nullptr);
887     GenTreeArgList* argList = op1->AsArgList();
889     op1     = argList->Current();
890     argList = argList->Rest();
892     op2     = argList->Current();
893     argList = argList->Rest();
895     op3 = argList->Current();
896     assert(argList->Rest() == nullptr);
898     regNumber op1Reg = op1->gtRegNum;
899     regNumber op3Reg = op3->gtRegNum;
901     assert(targetReg != REG_NA);
902     assert(op1Reg != REG_NA);
903     assert(op3Reg != REG_NA);
905     if (op2->isContained() || op2->isUsedFromSpillTemp())
907         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
908         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
910         TempDsc* tmpDsc = nullptr;
911         unsigned varNum = BAD_VAR_NUM;
912         unsigned offset = (unsigned)-1;
914         if (op2->isUsedFromSpillTemp())
916             assert(op2->IsRegOptional());
918             // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
919             // pattern. It could probably be extracted to its own method.
920             tmpDsc = getSpillTempDsc(op2);
921             varNum = tmpDsc->tdTempNum();
// Release the temp now; only its frame slot number is needed for the emit below.
924             regSet.tmpRlsTemp(tmpDsc);
926         else if (op2->OperIsHWIntrinsic())
// A contained HWIntrinsic load: use its address operand's register as a base address.
928             emit->emitIns_SIMD_R_R_AR_R(ins, simdSize, targetReg, op1Reg, op3Reg, op2->gtGetOp1()->gtRegNum);
931         else if (op2->isIndir())
933             GenTreeIndir* memIndir = op2->AsIndir();
934             GenTree*      memBase  = memIndir->gtOp1;
936             switch (memBase->OperGet())
938                 case GT_LCL_VAR_ADDR:
940                     varNum = memBase->AsLclVarCommon()->GetLclNum();
943                     // Ensure that all the GenTreeIndir values are set to their defaults.
944                     assert(!memIndir->HasIndex());
945                     assert(memIndir->Scale() == 1);
946                     assert(memIndir->Offset() == 0);
951                 case GT_CLS_VAR_ADDR:
953                     emit->emitIns_SIMD_R_R_C_R(ins, simdSize, targetReg, op1Reg, op3Reg, memBase->gtClsVar.gtClsVarHnd,
// General indirection: let the emitter fold the full addressing mode.
960                     emit->emitIns_SIMD_R_R_A_R(ins, simdSize, targetReg, op1Reg, op3Reg, memIndir);
967             switch (op2->OperGet())
971                     GenTreeLclFld* lclField = op2->AsLclFld();
973                     varNum = lclField->GetLclNum();
974                     offset = lclField->gtLclFld.gtLclOffs;
980                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
981                     varNum = op2->AsLclVar()->GetLclNum();
992         // Ensure we got a good varNum and offset.
993         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
994         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
995         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
996         assert(offset != (unsigned)-1);
998         emit->emitIns_SIMD_R_R_S_R(ins, simdSize, targetReg, op1Reg, op3Reg, varNum, offset);
1002         emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, op3Reg);
1006 //------------------------------------------------------------------------
1007 // genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
1008 // a register/memory operand, and that returns a value in register
1011 // ins - The instruction being generated
1012 // attr - The emit attribute
1013 // targetReg - The target register
1014 // op1Reg - The register of the first operand
1015 // op2Reg - The register of the second operand
1016 // op3 - The third operand
1018 void CodeGen::genHWIntrinsic_R_R_R_RM(
1019     instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
// Emits "targetReg = ins op1Reg, op2Reg, op3" where op3 may be a register, spill temp,
// local, or indirection (used e.g. by FMA). Contained op3 forms are folded into the
// instruction's memory operand.
1021     assert(targetReg != REG_NA);
1022     assert(op1Reg != REG_NA);
1023     assert(op2Reg != REG_NA);
1025     emitter* emit = getEmitter();
1027     if (op3->isContained() || op3->isUsedFromSpillTemp())
1029         TempDsc* tmpDsc = nullptr;
1030         unsigned varNum = BAD_VAR_NUM;
1031         unsigned offset = (unsigned)-1;
1033         if (op3->isUsedFromSpillTemp())
1035             assert(op3->IsRegOptional());
1037             // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
1038             // pattern. It could probably be extracted to its own method.
1039             tmpDsc = getSpillTempDsc(op3);
1040             varNum = tmpDsc->tdTempNum();
// Release the temp now; only its frame slot number is needed for the emit below.
1043             regSet.tmpRlsTemp(tmpDsc);
1045         else if (op3->OperIsHWIntrinsic())
// A contained HWIntrinsic load: use its address operand's register as a base address.
1047             emit->emitIns_SIMD_R_R_R_AR(ins, attr, targetReg, op1Reg, op2Reg, op3->gtGetOp1()->gtRegNum);
1050         else if (op3->isIndir())
1052             GenTreeIndir* memIndir = op3->AsIndir();
1053             GenTree*      memBase  = memIndir->gtOp1;
1055             switch (memBase->OperGet())
1057                 case GT_LCL_VAR_ADDR:
1059                     varNum = memBase->AsLclVarCommon()->GetLclNum();
1062                     // Ensure that all the GenTreeIndir values are set to their defaults.
1063                     assert(!memIndir->HasIndex());
1064                     assert(memIndir->Scale() == 1);
1065                     assert(memIndir->Offset() == 0);
1070                 case GT_CLS_VAR_ADDR:
1072                     emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, memBase->gtClsVar.gtClsVarHnd, 0);
// General indirection: let the emitter fold the full addressing mode.
1078                     emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
1085             switch (op3->OperGet())
1089                     GenTreeLclFld* lclField = op3->AsLclFld();
1091                     varNum = lclField->GetLclNum();
1092                     offset = lclField->gtLclFld.gtLclOffs;
1098                     assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
1099                     varNum = op3->AsLclVar()->GetLclNum();
1110         // Ensure we got a good varNum and offset.
1111         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
1112         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
1113         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
1114         assert(offset != (unsigned)-1);
1116         emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
1120         emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
1124 // genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
1125 // with non-constant argument
1128 // intrinsic - intrinsic ID
1129 // nonConstImmReg - the register contains non-constant imm8 argument
1130 // baseReg - a register for the start of the switch table
1131 // offsReg - a register for the offset into the switch table
1132 // emitSwCase - the lambda to generate a switch case
1135 // generate the jump-table fallback for imm-intrinsics with non-constant argument.
1137 // This function can be used for all imm-intrinsics (whether full-range or not),
1138 // The compiler front-end (i.e. importer) is responsible to insert a range-check IR
1139 // (GT_HW_INTRINSIC_CHK) for imm8 argument, so this function does not need to do range-check.
1141 template <typename HWIntrinsicSwitchCaseBody>
1142 void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsic,
1143                                               regNumber                 nonConstImmReg,
1146                                               HWIntrinsicSwitchCaseBody emitSwCase)
// Generates a jump table indexed by the runtime imm8 value in 'nonConstImmReg': one
// table entry (and one emitSwCase-generated case body) per legal immediate value.
// Range validation happened earlier via GT_HW_INTRINSIC_CHK, so none is done here.
1148     assert(nonConstImmReg != REG_NA);
1149     // AVX2 Gather intrinsics use managed non-const fallback since they have discrete imm8 value range
1150     // that does work with the current compiler generated jump-table fallback
1151     assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
1152     emitter* emit = getEmitter();
// Number of table entries = upper bound of the immediate + 1 (immediates are 0-based).
1154     const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
1155     assert(maxByte <= 256);
1156     BasicBlock* jmpTable[256];
1158     unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
1159     unsigned jmpTableOffs = 0;
1161     // Emit the jump table
1162     for (unsigned i = 0; i < maxByte; i++)
1164         jmpTable[i] = genCreateTempLabel();
1165         emit->emitDataGenData(i, jmpTable[i]);
1168     emit->emitDataGenEnd();
1170     // Compute and jump to the appropriate offset in the switch table
// offsReg <- address of the jump table; then load the 4-byte entry selected by the imm reg.
1171     emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
1173     emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
// Table entries are offsets relative to the method start, so add the method base address.
1174     emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
1175     emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
1176     emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
1178     // Emit the switch table entries
1180     BasicBlock* switchTableBeg = genCreateTempLabel();
1181     BasicBlock* switchTableEnd = genCreateTempLabel();
1183     genDefineTempLabel(switchTableBeg);
// Each case emits the intrinsic for immediate value 'i' and then jumps past the table.
1185     for (unsigned i = 0; i < maxByte; i++)
1187         genDefineTempLabel(jmpTable[i]);
1188         emitSwCase((int8_t)i);
1189         emit->emitIns_J(INS_jmp, switchTableEnd);
1192     genDefineTempLabel(switchTableEnd);
1195 //------------------------------------------------------------------------
1196 // genBaseIntrinsic: Generates the code for a base hardware intrinsic node
1199 // node - The hardware intrinsic node
1202 // We currently assume that all base intrinsics have zero or one operand.
1204 void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
// Handles the Vector128/Vector256 "base" intrinsics. As noted in the header
// comment, all of these take zero or one operand; gtGetOp2() is asserted null.
// NOTE(review): this extract elides some lines (braces/breaks/default case);
// comments below describe only the visible statements.
1206 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1207 regNumber targetReg = node->gtRegNum;
1208 var_types targetType = node->TypeGet();
1209 var_types baseType = node->gtSIMDBaseType;
// Base intrinsics require at least SSE and an arithmetic base type.
1211 assert(compiler->compSupports(InstructionSet_SSE));
1212 assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));
1214 GenTree* op1 = node->gtGetOp1();
1216 genConsumeHWIntrinsicOperands(node);
// op1 may legitimately be null (the Zero cases below take no operands).
1217 regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->gtRegNum;
1219 assert(node->gtGetOp2() == nullptr);
1221 emitter* emit = getEmitter();
1222 emitAttr attr = EA_ATTR(node->gtSIMDSize);
1223 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1225 switch (intrinsicId)
1227 case NI_Base_Vector128_CreateScalarUnsafe:
1228 case NI_Base_Vector256_CreateScalarUnsafe:
// Integral scalar sources go through the generic R_RM path at the scalar's
// actual (stack-normalized) size; floating sources are handled below.
1230 if (varTypeIsIntegral(baseType))
1232 genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType));
1236 assert(varTypeIsFloating(baseType));
1238 attr = emitTypeSize(baseType);
// Contained / spilled operand: load directly from memory into targetReg.
1240 if (op1->isContained() || op1->isUsedFromSpillTemp())
1242 genHWIntrinsic_R_RM(node, ins, attr);
1244 else if (targetReg != op1Reg)
1246 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1247 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1253 case NI_Base_Vector128_ToScalar:
1254 case NI_Base_Vector256_ToScalar:
1256 assert(varTypeIsFloating(baseType));
// The scalar lives in the low element, so a 16-byte SIMD move suffices.
1258 attr = emitTypeSize(TYP_SIMD16);
1260 if (op1->isContained() || op1->isUsedFromSpillTemp())
1262 genHWIntrinsic_R_RM(node, ins, attr);
1264 else if (targetReg != op1Reg)
1266 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1267 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1272 case NI_Base_Vector128_ToVector256:
1274 // ToVector256 has zero-extend semantics in order to ensure it is deterministic
1275 // We always emit a move to the target register, even when op1Reg == targetReg,
1276 // in order to ensure that Bits MAXVL-1:128 are zeroed.
1278 attr = emitTypeSize(TYP_SIMD16);
1280 if (op1->isContained() || op1->isUsedFromSpillTemp())
1282 genHWIntrinsic_R_RM(node, ins, attr);
1286 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1287 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1292 case NI_Base_Vector128_ToVector256Unsafe:
1293 case NI_Base_Vector256_GetLower:
// Unlike ToVector256 above, these have no zero-extend guarantee, so the
// move is skipped entirely when source and target registers coincide.
1295 if (op1->isContained() || op1->isUsedFromSpillTemp())
1297 genHWIntrinsic_R_RM(node, ins, attr);
1299 else if (targetReg != op1Reg)
1301 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1302 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1307 case NI_Base_Vector128_Zero:
1308 case NI_Base_Vector256_Zero:
1310 assert(op1 == nullptr)
// Emit the table-provided instruction with targetReg as all three operands
// (presumably an xor/pxor-style zeroing idiom — confirm against the table).
1311 emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
1322 genProduceReg(node);
1325 //------------------------------------------------------------------------
1326 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
1329 // node - The hardware intrinsic node
1331 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
// Generates the SSE intrinsics that are not table-driven. Most of the cases
// below are the scalar ordered/unordered compares, which combine the flags
// set by the compare instruction (looked up from the intrinsic table) into a
// boolean result in targetReg.
1333 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1334 GenTree* op1 = node->gtGetOp1();
1335 GenTree* op2 = node->gtGetOp2();
1336 GenTree* op3 = nullptr;
1337 GenTree* op4 = nullptr;
1338 regNumber targetReg = node->gtRegNum;
1339 var_types targetType = node->TypeGet();
1340 var_types baseType = node->gtSIMDBaseType;
1342 regNumber op1Reg = REG_NA;
1343 regNumber op2Reg = REG_NA;
1344 regNumber op3Reg = REG_NA;
1345 regNumber op4Reg = REG_NA;
1346 emitter* emit = getEmitter();
1348 genConsumeHWIntrinsicOperands(node);
1350 switch (intrinsicId)
1352 case NI_SSE_CompareEqualOrderedScalar:
1353 case NI_SSE_CompareEqualUnorderedScalar:
1355 assert(baseType == TYP_FLOAT);
1356 regNumber tmpReg = node->GetSingleTempReg();
1357 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1359 // Ensure we aren't overwriting targetReg
1360 assert(tmpReg != targetReg);
// result = ZF (equal) AND !PF (not unordered), so NaN inputs produce false:
1362 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1363 emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1364 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1365 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1366 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1370 case NI_SSE_CompareGreaterThanOrderedScalar:
1371 case NI_SSE_CompareGreaterThanUnorderedScalar:
1373 assert(baseType == TYP_FLOAT);
1374 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
// seta = CF==0 && ZF==0, which is false for unordered results.
1376 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1377 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1378 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1382 case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
1383 case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
1385 assert(baseType == TYP_FLOAT);
1386 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1388 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1389 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1390 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1394 case NI_SSE_CompareLessThanOrderedScalar:
1395 case NI_SSE_CompareLessThanUnorderedScalar:
1397 assert(baseType == TYP_FLOAT);
1398 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
// Uses seta like GreaterThan; presumably the table instruction has the
// operands in swapped order for LessThan — confirm against the table.
1400 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1401 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1402 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1406 case NI_SSE_CompareLessThanOrEqualOrderedScalar:
1407 case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
1409 assert(baseType == TYP_FLOAT);
1410 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1412 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1413 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1414 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1418 case NI_SSE_CompareNotEqualOrderedScalar:
1419 case NI_SSE_CompareNotEqualUnorderedScalar:
1421 assert(baseType == TYP_FLOAT);
1422 regNumber tmpReg = node->GetSingleTempReg();
1423 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1425 // Ensure we aren't overwriting targetReg
1426 assert(tmpReg != targetReg);
// result = !ZF (not equal) OR PF (unordered) — dual of the Equal case above:
1428 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1429 emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1430 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1431 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1432 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1436 case NI_SSE_X64_ConvertToInt64:
1437 case NI_SSE_X64_ConvertToInt64WithTruncation:
1439 assert(targetType == TYP_LONG);
1440 assert(op1 != nullptr);
1441 assert(op2 == nullptr);
1442 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1443 genHWIntrinsic_R_RM(node, ins, EA_8BYTE);
1447 case NI_SSE_X64_ConvertScalarToVector128Single:
1449 assert(baseType == TYP_LONG);
1450 assert(op1 != nullptr);
1451 assert(op2 != nullptr);
1452 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1453 genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1457 case NI_SSE_Prefetch0:
1458 case NI_SSE_Prefetch1:
1459 case NI_SSE_Prefetch2:
1460 case NI_SSE_PrefetchNonTemporal:
1462 assert(baseType == TYP_UBYTE);
1463 assert(op2 == nullptr);
// Prefetch takes only an address (op1) and produces no value.
1465 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1466 op1Reg = op1->gtRegNum;
1467 emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
1471 case NI_SSE_StoreFence:
1473 assert(baseType == TYP_VOID);
1474 assert(op1 == nullptr);
1475 assert(op2 == nullptr);
1476 emit->emitIns(INS_sfence);
1485 genProduceReg(node);
1488 //------------------------------------------------------------------------
1489 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
1492 // node - The hardware intrinsic node
1494 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
// Generates the SSE2 intrinsics that are not table-driven: the double-precision
// scalar ordered/unordered compares (mirroring the float versions in
// genSSEIntrinsic), scalar conversions, fences, and non-temporal stores.
1496 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1497 GenTree* op1 = node->gtGetOp1();
1498 GenTree* op2 = node->gtGetOp2();
1499 regNumber targetReg = node->gtRegNum;
1500 var_types targetType = node->TypeGet();
1501 var_types baseType = node->gtSIMDBaseType;
1502 regNumber op1Reg = REG_NA;
1503 regNumber op2Reg = REG_NA;
1504 emitter* emit = getEmitter();
1506 genConsumeHWIntrinsicOperands(node);
1508 switch (intrinsicId)
1510 // All integer overloads are handled by table codegen
1511 case NI_SSE2_CompareLessThan:
1513 assert(op1 != nullptr);
1514 assert(op2 != nullptr);
1516 assert(baseType == TYP_DOUBLE);
// The table provides an immediate comparison predicate (ival) for the
// cmppd-style encoding; it must fit the instruction's imm8.
1518 int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
1519 assert((ival >= 0) && (ival <= 127));
1521 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1522 op1Reg = op1->gtRegNum;
1523 op2Reg = op2->gtRegNum;
1524 emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
1529 case NI_SSE2_CompareEqualOrderedScalar:
1530 case NI_SSE2_CompareEqualUnorderedScalar:
1532 assert(baseType == TYP_DOUBLE);
1533 regNumber tmpReg = node->GetSingleTempReg();
1534 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1536 // Ensure we aren't overwriting targetReg
1537 assert(tmpReg != targetReg);
// result = ZF (equal) AND !PF (not unordered), so NaN inputs produce false:
1539 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1540 emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1541 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1542 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1543 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1547 case NI_SSE2_CompareGreaterThanOrderedScalar:
1548 case NI_SSE2_CompareGreaterThanUnorderedScalar:
1550 assert(baseType == TYP_DOUBLE);
1551 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
// seta = CF==0 && ZF==0, which is false for unordered results.
1553 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1554 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1555 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1559 case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
1560 case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
1562 assert(baseType == TYP_DOUBLE);
1563 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1565 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1566 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1567 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1571 case NI_SSE2_CompareLessThanOrderedScalar:
1572 case NI_SSE2_CompareLessThanUnorderedScalar:
1574 assert(baseType == TYP_DOUBLE);
1575 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
// Uses seta like GreaterThan; presumably the table instruction has the
// operands in swapped order for LessThan — confirm against the table.
1577 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1578 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1579 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1583 case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
1584 case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
1586 assert(baseType == TYP_DOUBLE);
1587 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1589 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1590 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1591 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1595 case NI_SSE2_CompareNotEqualOrderedScalar:
1596 case NI_SSE2_CompareNotEqualUnorderedScalar:
1598 assert(baseType == TYP_DOUBLE);
1599 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1600 regNumber tmpReg = node->GetSingleTempReg();
1602 // Ensure we aren't overwriting targetReg
1603 assert(tmpReg != targetReg);
// result = !ZF (not equal) OR PF (unordered) — dual of the Equal case above:
1605 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1606 emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1607 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1608 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1609 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1613 case NI_SSE2_X64_ConvertScalarToVector128Double:
1615 assert(baseType == TYP_LONG);
1616 assert(op1 != nullptr);
1617 assert(op2 != nullptr);
1618 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1619 genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1623 case NI_SSE2_X64_ConvertScalarToVector128Int64:
1624 case NI_SSE2_X64_ConvertScalarToVector128UInt64:
1626 assert(baseType == TYP_LONG || baseType == TYP_ULONG);
1627 assert(op1 != nullptr);
1628 assert(op2 == nullptr);
1629 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1630 genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType));
1634 case NI_SSE2_ConvertToInt32:
1635 case NI_SSE2_ConvertToInt32WithTruncation:
1636 case NI_SSE2_ConvertToUInt32:
1637 case NI_SSE2_X64_ConvertToInt64:
1638 case NI_SSE2_X64_ConvertToInt64WithTruncation:
1639 case NI_SSE2_X64_ConvertToUInt64:
1641 assert(op2 == nullptr);
1642 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
// An integral baseType means this is a vector->GP-register extraction
// (note the operand order in emitIns_R_R: op1Reg first, targetReg second);
// a floating baseType is a scalar float->int conversion via the R_RM path.
1644 if (varTypeIsIntegral(baseType))
1646 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1647 op1Reg = op1->gtRegNum;
1648 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1652 assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT);
1653 genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType));
1658 case NI_SSE2_LoadFence:
1660 assert(baseType == TYP_VOID);
1661 assert(op1 == nullptr);
1662 assert(op2 == nullptr);
1663 emit->emitIns(INS_lfence);
1667 case NI_SSE2_MemoryFence:
1669 assert(baseType == TYP_VOID);
1670 assert(op1 == nullptr);
1671 assert(op2 == nullptr);
1672 emit->emitIns(INS_mfence);
1676 case NI_SSE2_StoreNonTemporal:
1677 case NI_SSE2_X64_StoreNonTemporal:
1679 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1680 assert(op1 != nullptr);
1681 assert(op2 != nullptr);
// op1 is the address, op2 is the value: "movnti [op1Reg], op2Reg".
1683 op2Reg = op2->gtRegNum;
1684 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1685 op1Reg = op1->gtRegNum;
1686 emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
1695 genProduceReg(node);
1698 //------------------------------------------------------------------------
1699 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1702 // node - The hardware intrinsic node
1704 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
// Generates the SSE4.1 intrinsics that are not table-driven: the ptest-based
// boolean tests and Extract (which needs a jump-table fallback when its
// immediate operand is not a compile-time constant).
1706 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1707 GenTree* op1 = node->gtGetOp1();
1708 GenTree* op2 = node->gtGetOp2();
1709 GenTree* op3 = nullptr;
1710 GenTree* op4 = nullptr;
1711 regNumber targetReg = node->gtRegNum;
1712 var_types targetType = node->TypeGet();
1713 var_types baseType = node->gtSIMDBaseType;
1715 regNumber op1Reg = REG_NA;
1716 regNumber op2Reg = REG_NA;
1717 regNumber op3Reg = REG_NA;
1718 regNumber op4Reg = REG_NA;
1719 emitter* emit = getEmitter();
1721 genConsumeHWIntrinsicOperands(node);
1723 switch (intrinsicId)
1725 case NI_SSE41_TestAllOnes:
// all-ones test: build an all-ones vector in tmpReg (pcmpeqd reg,reg),
// then "ptest op1, ones" sets CF iff (~op1 & ones) == 0, i.e. op1 is
// all-ones; setb materializes CF as the boolean result.
1727 op1Reg = op1->gtRegNum;
1728 regNumber tmpReg = node->GetSingleTempReg();
1729 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1730 emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
1731 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
1732 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1733 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1737 case NI_SSE41_TestAllZeros:
1738 case NI_SSE41_TestZ:
// ZF from ptest: set iff (op1 & op2) == 0.
1740 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1741 genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1742 emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1743 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1747 case NI_SSE41_TestC:
// CF from ptest: set iff (~op1 & op2) == 0.
1749 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1750 genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1751 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1752 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1756 case NI_SSE41_TestMixOnesZeros:
1757 case NI_SSE41_TestNotZAndNotC:
// seta: true iff both ZF==0 and CF==0 after ptest.
1759 assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1760 genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1761 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1762 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1766 case NI_SSE41_Extract:
1767 case NI_SSE41_X64_Extract:
1769 regNumber tmpTargetReg = REG_NA;
1770 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
// Float extracts need an intermediate GP register before moving the bits
// back into the XMM target with mov_i2xmm.
1771 if (baseType == TYP_FLOAT)
1773 tmpTargetReg = node->ExtractTempReg();
// emitSwCase emits the extract for one specific immediate value i; it is
// either invoked once (constant imm) or once per possible value via the
// jump-table fallback below.
1776 auto emitSwCase = [&](int8_t i) {
1777 if (baseType == TYP_FLOAT)
1779 // extract instructions return to GP-registers, so it needs int size as the emitsize
1780 inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i);
1781 emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
1785 inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i);
1789 if (op2->IsCnsIntOrI())
1791 ssize_t ival = op2->AsIntCon()->IconValue();
1792 assert((ival >= 0) && (ival <= 255));
1793 emitSwCase((int8_t)ival);
1797 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
1798 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
1799 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
1800 regNumber baseReg = node->ExtractTempReg();
1801 regNumber offsReg = node->GetSingleTempReg();
1802 genHWIntrinsicJumpTableFallback(intrinsicId, op2->gtRegNum, baseReg, offsReg, emitSwCase);
1812 genProduceReg(node);
1815 //------------------------------------------------------------------------
1816 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1819 // node - The hardware intrinsic node
1821 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
// Generates the SSE4.2 intrinsics that are not table-driven (the Crc32
// overloads). crc32 is RMW: the accumulator (op1) must be in targetReg and the
// data operand (op2) is the r/m source.
1823 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1824 regNumber targetReg = node->gtRegNum;
1825 GenTree* op1 = node->gtGetOp1();
1826 GenTree* op2 = node->gtGetOp2();
1827 var_types baseType = node->gtSIMDBaseType;
1828 var_types targetType = node->TypeGet();
1829 emitter* emit = getEmitter();
1831 genConsumeHWIntrinsicOperands(node);
1832 regNumber op1Reg = op1->gtRegNum;
1834 assert(targetReg != REG_NA);
1835 assert(op1Reg != REG_NA);
1836 assert(op2 != nullptr);
1837 assert(!node->OperIsCommutative());
1839 switch (intrinsicId)
1841 case NI_SSE42_Crc32:
1842 case NI_SSE42_X64_Crc32:
// Copy the accumulator into targetReg first if needed; op2 must not
// already occupy targetReg or the copy would clobber it.
1844 if (op1Reg != targetReg)
1846 assert(op2->gtRegNum != targetReg);
1847 emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg);
1850 // This makes the genHWIntrinsic_R_RM code much simpler, as we don't need an
1851 // overload that explicitly takes the operands.
// NOTE(review): an elided line here presumably moves op2 into the op1
// position (node->gtOp1 = op2) so genHWIntrinsic_R_RM consumes op2 as the
// r/m operand — confirm against the full source.
1853 node->gtOp2 = nullptr;
1855 if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument
1857 assert(targetType == TYP_INT);
1858 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType));
1862 assert(op1->TypeGet() == op2->TypeGet());
1863 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
1864 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType));
1877 genProduceReg(node);
1880 //------------------------------------------------------------------------
1881 // genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
1884 // node - The hardware intrinsic node
1886 void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
// Generates AVX/AVX2 intrinsics that are not table-driven: scalar extractions,
// the gather family (with their mask-register and qword-index handling), and
// vptest-style boolean tests.
// NOTE(review): this extract elides several lines (branch headers, case labels
// around the trailing test cases, and some assignments such as indexOp) —
// comments describe only the visible statements; confirm against full source.
1888 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1889 var_types baseType = node->gtSIMDBaseType;
1890 emitAttr attr = EA_ATTR(node->gtSIMDSize);
1891 var_types targetType = node->TypeGet();
1892 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1893 int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
1894 GenTree* op1 = node->gtGetOp1();
1895 GenTree* op2 = node->gtGetOp2();
1896 regNumber op1Reg = REG_NA;
1897 regNumber op2Reg = REG_NA;
1898 regNumber targetReg = node->gtRegNum;
1899 emitter* emit = getEmitter();
1901 genConsumeHWIntrinsicOperands(node);
1903 switch (intrinsicId)
1905 case NI_AVX2_ConvertToInt32:
1906 case NI_AVX2_ConvertToUInt32:
// Vector->GP-register extraction; note emitIns_R_R operand order
// (op1Reg first, targetReg second), matching the SSE2 converts.
1908 op1Reg = op1->gtRegNum;
1909 assert(numArgs == 1);
1910 assert((baseType == TYP_INT) || (baseType == TYP_UINT));
1911 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1912 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1916 case NI_AVX2_GatherVector128:
1917 case NI_AVX2_GatherVector256:
1918 case NI_AVX2_GatherMaskVector128:
1919 case NI_AVX2_GatherMaskVector256:
// Operands arrive as an arg list; unpack the first three here.
1921 GenTreeArgList* list = op1->AsArgList();
1922 op1 = list->Current();
1923 op1Reg = op1->gtRegNum;
1925 list = list->Rest();
1926 op2 = list->Current();
1927 op2Reg = op2->gtRegNum;
1929 list = list->Rest();
1930 GenTree* op3 = list->Current();
1932 list = list->Rest();
1933 GenTree* op4 = nullptr;
1934 GenTree* lastOp = nullptr;
1935 GenTree* indexOp = nullptr;
1937 regNumber op3Reg = REG_NA;
1938 regNumber op4Reg = REG_NA;
1939 regNumber addrBaseReg = REG_NA;
1940 regNumber addrIndexReg = REG_NA;
// The gather instructions clobber their mask operand, so a float temp
// register holds the (copied or generated) mask.
1941 regNumber maskReg = node->ExtractTempReg(RBM_ALLFLOAT);
// Masked-gather branch: op1 = source, op2 = base address, op3 = index,
// op4 = mask, lastOp = scale. (The branch condition line is elided.)
1945 assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
1946 op4 = list->Current();
1947 list = list->Rest();
1948 lastOp = list->Current();
1949 op3Reg = op3->gtRegNum;
1950 op4Reg = op4->gtRegNum;
1951 addrBaseReg = op2Reg;
1952 addrIndexReg = op3Reg;
1955 // copy op4Reg into the tmp mask register,
1956 // the mask register will be cleared by gather instructions
1957 emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);
1959 if (targetReg != op1Reg)
1961 // copy source vector to the target register for masking merge
1962 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
// Unmasked-gather branch: op1 = base address, op2 = index.
1967 assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
1968 addrBaseReg = op1Reg;
1969 addrIndexReg = op2Reg;
1973 // generate all-one mask vector
1974 emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
// A 128-bit gather with a 256-bit index vector needs a YMM-sized attr so
// the index register is encoded as YMM in the VSIB address.
1977 bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);
1979 // hwintrinsiclistxarch.h uses Dword index instructions in default
// When the index base type is long, switch the dword-index instruction
// to its qword-index counterpart (dispatch over ins; the enclosing
// switch statement is on an elided line).
1980 if (varTypeIsLong(node->gtIndexBaseType))
1984 case INS_vpgatherdd:
1985 ins = INS_vpgatherqd;
1986 if (isVector128GatherWithVector256Index)
1988 // YMM index in address mode
1989 attr = emitTypeSize(TYP_SIMD32);
1992 case INS_vpgatherdq:
1993 ins = INS_vpgatherqq;
1995 case INS_vgatherdps:
1996 ins = INS_vgatherqps;
1997 if (isVector128GatherWithVector256Index)
1999 // YMM index in address mode
2000 attr = emitTypeSize(TYP_SIMD32);
2003 case INS_vgatherdpd:
2004 ins = INS_vgatherqpd;
// lastOp is the scale and must be a constant byte value.
2011 assert(lastOp->IsCnsIntOrI());
2012 ssize_t ival = lastOp->AsIntCon()->IconValue();
2013 assert((ival >= 0) && (ival <= 255));
// The gather's destination, mask and index registers must all differ.
2015 assert(targetReg != maskReg);
2016 assert(targetReg != addrIndexReg);
2017 assert(maskReg != addrIndexReg);
2018 emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);
// The cases below materialize the CF/ZF result of a vptest-style test;
// their case labels fall on elided lines (presumably NI_AVX_TestC etc. —
// confirm against the full source).
2025 genHWIntrinsic_R_RM(node, ins, attr);
2026 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
2027 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2031 case NI_AVX_TestNotZAndNotC:
2033 genHWIntrinsic_R_RM(node, ins, attr);
2034 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
2035 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2041 genHWIntrinsic_R_RM(node, ins, attr);
2042 emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
2043 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2052 genProduceReg(node);
2055 //------------------------------------------------------------------------
2056 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
2059 // node - The hardware intrinsic node
2061 void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
// Not yet implemented: AES intrinsics currently trip an NYI if reached.
2063 NYI("Implement AES intrinsic code generation");
2066 //------------------------------------------------------------------------
2067 // genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 and BMI2 hardware intrinsic node
2070 // node - The hardware intrinsic node
2072 void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
// Generates the BMI1/BMI2 intrinsics. Most map directly onto a single
// two- or three-operand instruction; MultiplyNoFlags (MULX) needs special
// handling for its implicit EDX/RDX source and optional low-half output.
2074 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2075 regNumber targetReg = node->gtRegNum;
2076 GenTree* op1 = node->gtGetOp1();
2077 GenTree* op2 = node->gtGetOp2();
2078 var_types targetType = node->TypeGet();
2079 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);
2080 emitter* emit = getEmitter();
2082 assert(targetReg != REG_NA);
2083 assert(op1 != nullptr);
2085 genConsumeHWIntrinsicOperands(node);
2087 switch (intrinsicId)
2089 case NI_BMI1_AndNot:
2090 case NI_BMI1_X64_AndNot:
2091 case NI_BMI1_BitFieldExtract:
2092 case NI_BMI1_X64_BitFieldExtract:
2093 case NI_BMI2_ParallelBitDeposit:
2094 case NI_BMI2_ParallelBitExtract:
2095 case NI_BMI2_X64_ParallelBitDeposit:
2096 case NI_BMI2_X64_ParallelBitExtract:
2097 case NI_BMI2_ZeroHighBits:
2098 case NI_BMI2_X64_ZeroHighBits:
// Plain three-operand (dst, src1, src2/mem) instructions.
2100 assert(op2 != nullptr);
2101 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2102 genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2106 case NI_BMI1_ExtractLowestSetBit:
2107 case NI_BMI1_GetMaskUpToLowestSetBit:
2108 case NI_BMI1_ResetLowestSetBit:
2109 case NI_BMI1_X64_ExtractLowestSetBit:
2110 case NI_BMI1_X64_GetMaskUpToLowestSetBit:
2111 case NI_BMI1_X64_ResetLowestSetBit:
// Plain two-operand (dst, src/mem) instructions.
2113 assert(op2 == nullptr);
2114 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2115 genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2119 case NI_BMI1_TrailingZeroCount:
2120 case NI_BMI1_X64_TrailingZeroCount:
// tzcnt goes through genXCNTIntrinsic, which also breaks the false
// dependency on the target register.
2122 assert(op2 == nullptr);
2123 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2124 genXCNTIntrinsic(node, ins);
2128 case NI_BMI2_MultiplyNoFlags:
2129 case NI_BMI2_X64_MultiplyNoFlags:
// MULX: 2-arg form returns only the high half; 3-arg form also stores
// the low half through the pointer in op3.
2131 int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
2132 assert(numArgs == 2 || numArgs == 3);
2134 regNumber op1Reg = REG_NA;
2135 regNumber op2Reg = REG_NA;
2136 regNumber op3Reg = REG_NA;
2137 regNumber lowReg = REG_NA;
// 2-arg case: operands come directly off the node. (NOTE(review): the
// branch header and lowReg assignment here fall on elided lines.)
2141 op1Reg = op1->gtRegNum;
2142 op2Reg = op2->gtRegNum;
// 3-arg case: operands arrive as an arg list; op3 is the low-half
// destination pointer and must not alias any other register in use.
2147 GenTreeArgList* argList = op1->AsArgList();
2148 op1 = argList->Current();
2149 op1Reg = op1->gtRegNum;
2150 argList = argList->Rest();
2151 op2 = argList->Current();
2152 op2Reg = op2->gtRegNum;
2153 argList = argList->Rest();
2154 GenTree* op3 = argList->Current();
2155 op3Reg = op3->gtRegNum;
2156 assert(op3Reg != op1Reg);
2157 assert(op3Reg != targetReg);
2158 assert(op3Reg != REG_EDX);
2159 lowReg = node->GetSingleTempReg();
2160 assert(op3Reg != lowReg);
2161 assert(lowReg != targetReg);
2164 emitAttr attr = emitTypeSize(targetType);
2165 // mov the first operand into implicit source operand EDX/RDX
2166 if (op1Reg != REG_EDX)
2168 assert(op2Reg != REG_EDX);
2169 emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg);
2172 // generate code for MULX
2173 genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2);
2175 // If the lower half result is required, store it to the memory pointed to by op3
2178 emit->emitIns_AR_R(INS_mov, attr, lowReg, op3Reg, 0);
2191 genProduceReg(node);
2194 //------------------------------------------------------------------------
2195 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
2198 // node - The hardware intrinsic node
2200 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
// Generates FMA intrinsics. The table stores the 213 form of each instruction;
// the 132 form is the table entry minus one and the 231 form the table entry
// plus one, chosen based on which operand (if any) is contained/in memory.
2202 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2203 var_types baseType = node->gtSIMDBaseType;
2204 emitAttr attr = EA_ATTR(node->gtSIMDSize);
2205 instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
2206 GenTree* op1 = node->gtGetOp1();
2207 regNumber targetReg = node->gtRegNum;
2209 assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);
// The three operands arrive as an arg list hanging off op1.
2211 genConsumeHWIntrinsicOperands(node);
2212 GenTreeArgList* argList = op1->AsArgList();
2213 op1 = argList->Current();
2215 argList = argList->Rest();
2216 GenTree* op2 = argList->Current();
2218 argList = argList->Rest();
2219 GenTree* op3 = argList->Current();
2224 bool isCommutative = false;
2225 const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);
2227 // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
2228 assert(!copiesUpperBits || !op1->isContained());
// NOTE(review): op1Reg/op2Reg declarations fall on elided lines here.
// Below, op1Reg/op2Reg are the two register operands actually passed to the
// instruction; the contained operand (if any) is emitted as the r/m operand.
2230 if (op3->isContained() || op3->isUsedFromSpillTemp())
2232 // 213 form: op1 = (op2 * op1) + [op3]
2234 op1Reg = op1->gtRegNum;
2235 op2Reg = op2->gtRegNum;
2237 isCommutative = !copiesUpperBits;
2239 else if (op2->isContained() || op2->isUsedFromSpillTemp())
2241 // 132 form: op1 = (op1 * op3) + [op2]
2243 ins = (instruction)(ins - 1);
2244 op1Reg = op1->gtRegNum;
2245 op2Reg = op3->gtRegNum;
2248 else if (op1->isContained() || op1->isUsedFromSpillTemp())
2250 // 231 form: op3 = (op2 * op3) + [op1]
2252 ins = (instruction)(ins + 1);
2253 op1Reg = op3->gtRegNum;
2254 op2Reg = op2->gtRegNum;
2259 // 213 form: op1 = (op2 * op1) + op3
2261 op1Reg = op1->gtRegNum;
2262 op2Reg = op2->gtRegNum;
2264 isCommutative = !copiesUpperBits;
2267 if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
2269 assert(node->isRMWHWIntrinsic(compiler));
2271 // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
2273 // For non-commutative intrinsics, we should have ensured that op2 was marked
2274 // delay free in order to prevent it from getting assigned the same register
2275 // as target. However, for commutative intrinsics, we can just swap the operands
2276 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
// (NOTE(review): the actual register swap statement falls on an elided line.)
2282 genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
2283 genProduceReg(node);
2286 //------------------------------------------------------------------------
2287 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
2290 // node - The hardware intrinsic node
2292 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
// Thin wrapper: delegates to genXCNTIntrinsic, which also breaks the false
// target-register dependency lzcnt has on some Intel processors.
2294 assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount ||
2295 node->gtHWIntrinsicId == NI_LZCNT_X64_LeadingZeroCount);
2297 genConsumeOperands(node);
2298 genXCNTIntrinsic(node, INS_lzcnt);
2299 genProduceReg(node);
2302 //------------------------------------------------------------------------
2303 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
2306 // node - The hardware intrinsic node
2308 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
// Not yet implemented: PCLMULQDQ intrinsics currently trip an NYI if reached.
2310 NYI("Implement PCLMULQDQ intrinsic code generation");
2313 //------------------------------------------------------------------------
2314 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
2317 // node - The hardware intrinsic node
2319 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
// Thin wrapper: delegates to genXCNTIntrinsic, which also breaks the false
// target-register dependency popcnt has on some Intel processors.
2321 assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount || node->gtHWIntrinsicId == NI_POPCNT_X64_PopCount);
2323 genConsumeOperands(node);
2324 genXCNTIntrinsic(node, INS_popcnt);
2325 genProduceReg(node);
2328 //------------------------------------------------------------------------
2329 // genXCNTIntrinsic: Generates the code for a lzcnt/tzcnt/popcnt hardware intrinsic node, breaks false dependencies on
2330 // the target register
2333 // node - The hardware intrinsic node
2334 // ins - The instruction being generated
2336 void CodeGen::genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins)
2338 // LZCNT/TZCNT/POPCNT have a false dependency on the target register on Intel Sandy Bridge, Haswell, and Skylake
2339 // (POPCNT only) processors, so insert a `XOR target, target` to break the dependency via XOR triggering register
2340 // renaming, but only if it's not an actual dependency.
// Gather every register the source operand actually reads (direct register,
// or the base/index registers of a contained indirection) so we can tell
// whether targetReg participates in a real dependency.
2342 GenTree* op1 = node->gtGetOp1();
2343 regNumber sourceReg1 = REG_NA;
2344 regNumber sourceReg2 = REG_NA;
2346 if (!op1->isContained())
2348 sourceReg1 = op1->gtRegNum;
2350 else if (op1->isIndir())
2352 GenTreeIndir* indir = op1->AsIndir();
2353 GenTree* memBase = indir->Base();
2355 if (memBase != nullptr)
2357 sourceReg1 = memBase->gtRegNum;
2360 if (indir->HasIndex())
2362 sourceReg2 = indir->Index()->gtRegNum;
// Only break the dependency when targetReg is not genuinely read; the
// 4-byte xor also zeroes the upper 32 bits on x64.
2366 regNumber targetReg = node->gtRegNum;
2367 if ((targetReg != sourceReg1) && (targetReg != sourceReg2))
2369 getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
2371 genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2374 #endif // FEATURE_HW_INTRINSICS