9660ae346ef124ec09432666d6e78f09dc16efdb
[platform/upstream/coreclr.git] / src / jit / hwintrinsiccodegenxarch.cpp
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7 XX                                                                           XX
8 XX               Intel hardware intrinsic Code Generator                     XX
9 XX                                                                           XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
12 */
13 #include "jitpch.h"
14 #ifdef _MSC_VER
15 #pragma hdrstop
16 #endif
17
18 #ifdef FEATURE_HW_INTRINSICS
19
20 #include "emit.h"
21 #include "codegen.h"
22 #include "sideeffects.h"
23 #include "lower.h"
24 #include "gcinfo.h"
25 #include "gcinfoencoder.h"
26
27 //------------------------------------------------------------------------
28 // assertIsContainableHWIntrinsicOp: Asserts that op is containable by node
29 //
30 // Arguments:
31 //    lowering - The lowering phase from the compiler
32 //    node     - The HWIntrinsic node that has the contained node
33 //    op       - The op that is contained
34 //
static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op)
{
#if DEBUG
    // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register allocation
    // logic. However, this check is still important due to the various containment rules that SIMD intrinsics follow.
    //
    // We use isContainable to track the special HWIntrinsic node containment rules (for things like LoadAligned and
    // LoadUnaligned) and we use the supportsRegOptional check to support general-purpose loads (both from stack
    // spillage and for isUsedFromMemory contained nodes, in the case where the register allocator decided to not
    // allocate a register in the first place).

    bool supportsRegOptional = false;
    bool isContainable       = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional);
    assert(isContainable || supportsRegOptional);
#endif // DEBUG
}
53
54 //------------------------------------------------------------------------
55 // genIsTableDrivenHWIntrinsic:
56 //
57 // Arguments:
58 //    category - category of a HW intrinsic
59 //
60 // Return Value:
61 //    returns true if this category can be table-driven in CodeGen
62 //
63 static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicCategory category)
64 {
65     // TODO - make more categories to the table-driven framework
66     // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen
67     const bool tableDrivenCategory =
68         (category != HW_Category_Special) && (category != HW_Category_Scalar) && (category != HW_Category_Helper);
69     const bool tableDrivenFlag =
70         !HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId) && !HWIntrinsicInfo::HasSpecialCodegen(intrinsicId);
71     return tableDrivenCategory && tableDrivenFlag;
72 }
73
74 //------------------------------------------------------------------------
75 // genHWIntrinsic: Generates the code for a given hardware intrinsic node.
76 //
77 // Arguments:
78 //    node - The hardware intrinsic node
79 //
80 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
81 {
82     NamedIntrinsic      intrinsicId = node->gtHWIntrinsicId;
83     InstructionSet      isa         = HWIntrinsicInfo::lookupIsa(intrinsicId);
84     HWIntrinsicCategory category    = HWIntrinsicInfo::lookupCategory(intrinsicId);
85     int                 ival        = HWIntrinsicInfo::lookupIval(intrinsicId);
86     int                 numArgs     = HWIntrinsicInfo::lookupNumArgs(node);
87
88     assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));
89
90     if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
91     {
92         GenTree*  op1        = node->gtGetOp1();
93         GenTree*  op2        = node->gtGetOp2();
94         regNumber targetReg  = node->gtRegNum;
95         var_types targetType = node->TypeGet();
96         var_types baseType   = node->gtSIMDBaseType;
97
98         regNumber op1Reg = REG_NA;
99         regNumber op2Reg = REG_NA;
100         emitter*  emit   = getEmitter();
101
102         assert(numArgs >= 0);
103         instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
104         assert(ins != INS_invalid);
105         emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
106         assert(simdSize != 0);
107
108         switch (numArgs)
109         {
110             case 1:
111             {
112                 genConsumeOperands(node);
113                 op1Reg = op1->gtRegNum;
114
115                 if (node->OperIsMemoryLoad())
116                 {
117                     emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
118                 }
119                 else if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
120                 {
121                     emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
122                 }
123                 else if ((ival != -1) && varTypeIsFloating(baseType))
124                 {
125                     assert((ival >= 0) && (ival <= 127));
126                     genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
127                 }
128                 else
129                 {
130                     genHWIntrinsic_R_RM(node, ins, simdSize);
131                 }
132                 break;
133             }
134
135             case 2:
136             {
137                 genConsumeOperands(node);
138
139                 op1Reg = op1->gtRegNum;
140                 op2Reg = op2->gtRegNum;
141
142                 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
143                 {
144                     // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
145                     //
146                     // For non-commutative intrinsics, we should have ensured that op2 was marked
147                     // delay free in order to prevent it from getting assigned the same register
148                     // as target. However, for commutative intrinsics, we can just swap the operands
149                     // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
150
151                     noway_assert(node->OperIsCommutative());
152                     op2Reg = op1Reg;
153                     op1Reg = targetReg;
154                 }
155
156                 if (category == HW_Category_MemoryStore)
157                 {
158                     emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0);
159                 }
160                 else if ((ival != -1) && varTypeIsFloating(baseType))
161                 {
162                     assert((ival >= 0) && (ival <= 127));
163                     genHWIntrinsic_R_R_RM_I(node, ins, ival);
164                 }
165                 else if (category == HW_Category_MemoryLoad)
166                 {
167                     if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
168                     {
169                         emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op2Reg, op1Reg);
170                     }
171                     else
172                     {
173                         emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
174                     }
175                 }
176                 else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
177                 {
178                     assert(ival == -1);
179
180                     if (intrinsicId == NI_SSE2_Extract)
181                     {
182                         // extract instructions return to GP-registers, so it needs int size as the emitsize
183                         simdSize = emitTypeSize(TYP_INT);
184                     }
185
186                     auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };
187
188                     if (op2->IsCnsIntOrI())
189                     {
190                         ssize_t ival = op2->AsIntCon()->IconValue();
191                         assert((ival >= 0) && (ival <= 255));
192                         emitSwCase((int8_t)ival);
193                     }
194                     else
195                     {
196                         // We emit a fallback case for the scenario when the imm-op is not a constant. This should
197                         // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
198                         // can also occur if the consumer calls it directly and just doesn't pass a constant value.
199                         regNumber baseReg = node->ExtractTempReg();
200                         regNumber offsReg = node->GetSingleTempReg();
201                         genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase);
202                     }
203                 }
204                 else
205                 {
206                     genHWIntrinsic_R_R_RM(node, ins, EA_ATTR(node->gtSIMDSize));
207                 }
208                 break;
209             }
210
211             case 3:
212             {
213                 assert(op1->OperIsList());
214                 assert(op1->gtGetOp2()->OperIsList());
215                 assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
216
217                 GenTreeArgList* argList = op1->AsArgList();
218                 op1                     = argList->Current();
219                 genConsumeRegs(op1);
220                 op1Reg = op1->gtRegNum;
221
222                 argList = argList->Rest();
223                 op2     = argList->Current();
224                 genConsumeRegs(op2);
225                 op2Reg = op2->gtRegNum;
226
227                 argList      = argList->Rest();
228                 GenTree* op3 = argList->Current();
229                 genConsumeRegs(op3);
230                 regNumber op3Reg = op3->gtRegNum;
231
232                 if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
233                 {
234                     assert(ival == -1);
235
236                     auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, i); };
237
238                     if (op3->IsCnsIntOrI())
239                     {
240                         ssize_t ival = op3->AsIntCon()->IconValue();
241                         assert((ival >= 0) && (ival <= 255));
242                         emitSwCase((int8_t)ival);
243                     }
244                     else
245                     {
246                         // We emit a fallback case for the scenario when the imm-op is not a constant. This should
247                         // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
248                         // can also occur if the consumer calls it directly and just doesn't pass a constant value.
249                         regNumber baseReg = node->ExtractTempReg();
250                         regNumber offsReg = node->GetSingleTempReg();
251                         genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase);
252                     }
253                 }
254                 else if (category == HW_Category_MemoryStore)
255                 {
256                     if (intrinsicId == NI_AVX_MaskStore || intrinsicId == NI_AVX2_MaskStore)
257                     {
258                         emit->emitIns_AR_R_R(ins, simdSize, op2Reg, op3Reg, op1Reg, 0);
259                     }
260                     else
261                     {
262                         assert(intrinsicId == NI_SSE2_MaskMove);
263                         assert(targetReg == REG_NA);
264
265                         // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
266                         if (op3Reg != REG_EDI)
267                         {
268                             emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
269                         }
270                         emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
271                     }
272                 }
273                 else
274                 {
275                     switch (intrinsicId)
276                     {
277                         case NI_SSE41_BlendVariable:
278                         case NI_AVX_BlendVariable:
279                         case NI_AVX2_BlendVariable:
280                         {
281                             genHWIntrinsic_R_R_RM_R(node, ins);
282                             break;
283                         }
284
285                         default:
286                         {
287                             unreached();
288                             break;
289                         };
290                     }
291                 }
292                 break;
293             }
294
295             default:
296                 unreached();
297                 break;
298         }
299         genProduceReg(node);
300         return;
301     }
302
303     switch (isa)
304     {
305         case InstructionSet_Base:
306             genBaseIntrinsic(node);
307             break;
308         case InstructionSet_SSE:
309         case InstructionSet_SSE_X64:
310             genSSEIntrinsic(node);
311             break;
312         case InstructionSet_SSE2:
313         case InstructionSet_SSE2_X64:
314             genSSE2Intrinsic(node);
315             break;
316         case InstructionSet_SSE41:
317         case InstructionSet_SSE41_X64:
318             genSSE41Intrinsic(node);
319             break;
320         case InstructionSet_SSE42:
321         case InstructionSet_SSE42_X64:
322             genSSE42Intrinsic(node);
323             break;
324         case InstructionSet_AVX:
325         case InstructionSet_AVX2:
326             genAvxOrAvx2Intrinsic(node);
327             break;
328         case InstructionSet_AES:
329             genAESIntrinsic(node);
330             break;
331         case InstructionSet_BMI1:
332         case InstructionSet_BMI1_X64:
333         case InstructionSet_BMI2:
334         case InstructionSet_BMI2_X64:
335             genBMI1OrBMI2Intrinsic(node);
336             break;
337         case InstructionSet_FMA:
338             genFMAIntrinsic(node);
339             break;
340         case InstructionSet_LZCNT:
341         case InstructionSet_LZCNT_X64:
342             genLZCNTIntrinsic(node);
343             break;
344         case InstructionSet_PCLMULQDQ:
345             genPCLMULQDQIntrinsic(node);
346             break;
347         case InstructionSet_POPCNT:
348         case InstructionSet_POPCNT_X64:
349             genPOPCNTIntrinsic(node);
350             break;
351         default:
352             unreached();
353             break;
354     }
355 }
356
357 //------------------------------------------------------------------------
358 // genHWIntrinsic_R_RM: Generates the code for a hardware intrinsic node that takes a
359 //                      register/memory operand and that returns a value in register
360 //
361 // Arguments:
362 //    node - The hardware intrinsic node
363 //    ins  - The instruction being generated
364 //    attr - The emit attribute for the instruciton being generated
365 //
void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    GenTree*  op2        = node->gtGetOp2();
    emitter*  emit       = getEmitter();

    if (op2 != nullptr)
    {
        // The Compare*OrderedScalar and Compare*UnorderedScalar intrinsics come down this
        // code path. They are all MultiIns, as the return value comes from the flags and
        // we have two operands instead.

        assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId));
        assert(targetReg != REG_NA);

        // Re-shape into the single-operand form "op1Reg ins op2" by using op1's
        // register as the instruction destination; the result is in the flags.
        targetReg = op1->gtRegNum;
        op1       = op2;
        op2       = nullptr;
    }
    else
    {
        assert(!node->OperIsCommutative());
    }

    assert(targetReg != REG_NA);
    assert(op2 == nullptr);

    if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);

        // varNum/offset describe a stack-based source; the non-stack cases
        // below emit their instruction directly and return early.
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op1->isUsedFromSpillTemp())
        {
            assert(op1->IsRegOptional());

            // Operand was spilled: read it back from its spill temp slot.
            tmpDsc = getSpillTempDsc(op1);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op1->OperIsHWIntrinsic())
        {
            // A contained HWIntrinsic is a load-style intrinsic (LoadAligned/
            // LoadUnaligned containment rules); its op1 holds the address register.
            emit->emitIns_R_AR(ins, attr, targetReg, op1->gtGetOp1()->gtRegNum, 0);
            return;
        }
        else if (op1->isIndir())
        {
            GenTreeIndir* memIndir = op1->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    // Indirection off a local's address: fold into a direct stack access.
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    // Static class field: use the class-variable addressed form.
                    emit->emitIns_R_C(ins, attr, targetReg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    // General addressing mode (base/index/scale/displacement).
                    emit->emitIns_R_A(ins, attr, targetReg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op1->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op1->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    // A contained local must be reg-optional or not a register candidate.
                    assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op1->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                {
                    unreached();
                    break;
                }
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        // Stack-based source: emit the frame-relative form.
        emit->emitIns_R_S(ins, attr, targetReg, varNum, offset);
    }
    else
    {
        // Operand is in a register: simple reg, reg encoding.
        regNumber op1Reg = op1->gtRegNum;
        emit->emitIns_R_R(ins, attr, targetReg, op1Reg);
    }
}
495
496 //------------------------------------------------------------------------
497 // genHWIntrinsic_R_RM_I: Generates the code for a hardware intrinsic node that takes a register/memory operand,
498 //                        an immediate operand, and that returns a value in register
499 //
500 // Arguments:
501 //    node - The hardware intrinsic node
502 //    ins  - The instruction being generated
503 //    ival - The immediate value
504 //
void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
    emitter*  emit       = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    assert(targetReg != REG_NA);
    assert(!node->OperIsCommutative()); // One operand intrinsics cannot be commutative

    if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);

        // varNum/offset describe a stack-based source; the non-stack cases
        // below emit their instruction directly and return early.
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op1->isUsedFromSpillTemp())
        {
            assert(op1->IsRegOptional());

            // Operand was spilled: read it back from its spill temp slot.
            tmpDsc = getSpillTempDsc(op1);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op1->OperIsHWIntrinsic())
        {
            // A contained HWIntrinsic is a load-style intrinsic (LoadAligned/
            // LoadUnaligned containment rules); its op1 holds the address register.
            emit->emitIns_R_AR_I(ins, simdSize, targetReg, op1->gtGetOp1()->gtRegNum, 0, ival);
            return;
        }
        else if (op1->isIndir())
        {
            GenTreeIndir* memIndir = op1->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    // Indirection off a local's address: fold into a direct stack access.
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    // Static class field: use the class-variable addressed form.
                    emit->emitIns_R_C_I(ins, simdSize, targetReg, memBase->gtClsVar.gtClsVarHnd, 0, ival);
                    return;
                }

                default:
                {
                    // General addressing mode (base/index/scale/displacement).
                    emit->emitIns_R_A_I(ins, simdSize, targetReg, memIndir, ival);
                    return;
                }
            }
        }
        else
        {
            switch (op1->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op1->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    // A contained local must be reg-optional or not a register candidate.
                    assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op1->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        // Stack-based source: emit the frame-relative form with the immediate.
        emit->emitIns_R_S_I(ins, simdSize, targetReg, varNum, offset, ival);
    }
    else
    {
        // Operand is in a register: reg, reg, imm encoding.
        regNumber op1Reg = op1->gtRegNum;
        emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, ival);
    }
}
617
618 //------------------------------------------------------------------------
619 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
620 //                        register/memory operand, and that returns a value in register
621 //
622 // Arguments:
623 //    node - The hardware intrinsic node
624 //    ins  - The instruction being generated
625 //    attr - The emit attribute for the instruciton being generated
626 //
627 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
628 {
629     regNumber targetReg = node->gtRegNum;
630     GenTree*  op1       = node->gtGetOp1();
631     GenTree*  op2       = node->gtGetOp2();
632     regNumber op1Reg    = op1->gtRegNum;
633
634     assert(targetReg != REG_NA);
635     assert(op1Reg != REG_NA);
636
637     genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2);
638 }
639
640 //------------------------------------------------------------------------
641 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
642 //                        register/memory operand, and that returns a value in register
643 //
644 // Arguments:
645 //    node - The hardware intrinsic node
646 //    ins  - The instruction being generated
647 //    attr - The emit attribute for the instruciton being generated
648 //    targetReg - The register allocated to the result
649 //    op1Reg    - The register allocated to the first operand
650 //    op2       - Another operand that maybe in register or memory
651 //
void CodeGen::genHWIntrinsic_R_R_RM(
    GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2)
{
    emitter* emit = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        // varNum/offset describe a stack-based second operand; the non-stack
        // cases below emit their instruction directly and return early.
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            // Operand was spilled: read it back from its spill temp slot.
            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->OperIsHWIntrinsic())
        {
            // A contained HWIntrinsic is a load-style intrinsic (LoadAligned/
            // LoadUnaligned containment rules); its op1 holds the address register.
            emit->emitIns_SIMD_R_R_AR(ins, attr, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum);
            return;
        }
        else if (op2->isIndir())
        {
            GenTreeIndir* memIndir = op2->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    // Indirection off a local's address: fold into a direct stack access.
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    // Static class field: use the class-variable addressed form.
                    emit->emitIns_SIMD_R_R_C(ins, attr, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    // General addressing mode (base/index/scale/displacement).
                    emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    // A contained local must be reg-optional or not a register candidate.
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        // Stack-based second operand: emit the frame-relative form.
        emit->emitIns_SIMD_R_R_S(ins, attr, targetReg, op1Reg, varNum, offset);
    }
    else
    {
        regNumber op2Reg = op2->gtRegNum;

        if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        {
            // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
            //
            // For non-commutative intrinsics, we should have ensured that op2 was marked
            // delay free in order to prevent it from getting assigned the same register
            // as target. However, for commutative intrinsics, we can just swap the operands
            // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

            noway_assert(node->OperIsCommutative());
            op2Reg = op1Reg;
            op1Reg = targetReg;
        }

        emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg);
    }
}
776
777 //------------------------------------------------------------------------
778 // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
779 //                        register/memory operand, an immediate operand, and that returns a value in register
780 //
781 // Arguments:
782 //    node - The hardware intrinsic node
783 //    ins  - The instruction being generated
784 //    ival - The immediate value
785 //
786 void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
787 {
788     var_types targetType = node->TypeGet();
789     regNumber targetReg  = node->gtRegNum;
790     GenTree*  op1        = node->gtGetOp1();
791     GenTree*  op2        = node->gtGetOp2();
792     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
793     emitter*  emit       = getEmitter();
794
795     // TODO-XArch-CQ: Commutative operations can have op1 be contained
796     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
797
798     if (op1->OperIsList())
799     {
800         assert(op2 == nullptr);
801
802         GenTreeArgList* argList = op1->AsArgList();
803
804         op1     = argList->Current();
805         argList = argList->Rest();
806
807         op2     = argList->Current();
808         argList = argList->Rest();
809
810         assert(argList->Current() != nullptr);
811         assert(argList->Rest() == nullptr);
812     }
813
814     regNumber op1Reg = op1->gtRegNum;
815
816     assert(targetReg != REG_NA);
817     assert(op1Reg != REG_NA);
818
819     if (op2->isContained() || op2->isUsedFromSpillTemp())
820     {
821         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
822         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
823
824         TempDsc* tmpDsc = nullptr;
825         unsigned varNum = BAD_VAR_NUM;
826         unsigned offset = (unsigned)-1;
827
828         if (op2->isUsedFromSpillTemp())
829         {
830             assert(op2->IsRegOptional());
831
832             tmpDsc = getSpillTempDsc(op2);
833             varNum = tmpDsc->tdTempNum();
834             offset = 0;
835
836             regSet.tmpRlsTemp(tmpDsc);
837         }
838         else if (op2->OperIsHWIntrinsic())
839         {
840             emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
841             return;
842         }
843         else if (op2->isIndir())
844         {
845             GenTreeIndir* memIndir = op2->AsIndir();
846             GenTree*      memBase  = memIndir->gtOp1;
847
848             switch (memBase->OperGet())
849             {
850                 case GT_LCL_VAR_ADDR:
851                 {
852                     varNum = memBase->AsLclVarCommon()->GetLclNum();
853                     offset = 0;
854
855                     // Ensure that all the GenTreeIndir values are set to their defaults.
856                     assert(!memIndir->HasIndex());
857                     assert(memIndir->Scale() == 1);
858                     assert(memIndir->Offset() == 0);
859
860                     break;
861                 }
862
863                 case GT_CLS_VAR_ADDR:
864                 {
865                     emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
866                                                ival);
867                     return;
868                 }
869
870                 default:
871                 {
872                     emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
873                     return;
874                 }
875             }
876         }
877         else
878         {
879             switch (op2->OperGet())
880             {
881                 case GT_LCL_FLD:
882                 {
883                     GenTreeLclFld* lclField = op2->AsLclFld();
884
885                     varNum = lclField->GetLclNum();
886                     offset = lclField->gtLclFld.gtLclOffs;
887                     break;
888                 }
889
890                 case GT_LCL_VAR:
891                 {
892                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
893                     varNum = op2->AsLclVar()->GetLclNum();
894                     offset = 0;
895                     break;
896                 }
897
898                 default:
899                     unreached();
900                     break;
901             }
902         }
903
904         // Ensure we got a good varNum and offset.
905         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
906         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
907         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
908         assert(offset != (unsigned)-1);
909
910         emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
911     }
912     else
913     {
914         regNumber op2Reg = op2->gtRegNum;
915
916         if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
917         {
918             // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
919             //
920             // For non-commutative intrinsics, we should have ensured that op2 was marked
921             // delay free in order to prevent it from getting assigned the same register
922             // as target. However, for commutative intrinsics, we can just swap the operands
923             // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
924
925             noway_assert(node->OperIsCommutative());
926             op2Reg = op1Reg;
927             op1Reg = targetReg;
928         }
929
930         emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
931     }
932 }
933
934 //------------------------------------------------------------------------
935 // genHWIntrinsic_R_R_RM_R: Generates the code for a hardware intrinsic node that takes a register operand, a
936 //                          register/memory operand, another register operand, and that returns a value in register
937 //
938 // Arguments:
939 //    node - The hardware intrinsic node
940 //    ins  - The instruction being generated
941 //
942 void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins)
943 {
944     var_types targetType = node->TypeGet();
945     regNumber targetReg  = node->gtRegNum;
946     GenTree*  op1        = node->gtGetOp1();
947     GenTree*  op2        = node->gtGetOp2();
948     GenTree*  op3        = nullptr;
949     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
950     emitter*  emit       = getEmitter();
951
952     assert(op1->OperIsList());
953     assert(op2 == nullptr);
954
955     GenTreeArgList* argList = op1->AsArgList();
956
957     op1     = argList->Current();
958     argList = argList->Rest();
959
960     op2     = argList->Current();
961     argList = argList->Rest();
962
963     op3 = argList->Current();
964     assert(argList->Rest() == nullptr);
965
966     regNumber op1Reg = op1->gtRegNum;
967     regNumber op3Reg = op3->gtRegNum;
968
969     assert(targetReg != REG_NA);
970     assert(op1Reg != REG_NA);
971     assert(op3Reg != REG_NA);
972
973     if (op2->isContained() || op2->isUsedFromSpillTemp())
974     {
975         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
976         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
977
978         TempDsc* tmpDsc = nullptr;
979         unsigned varNum = BAD_VAR_NUM;
980         unsigned offset = (unsigned)-1;
981
982         if (op2->isUsedFromSpillTemp())
983         {
984             assert(op2->IsRegOptional());
985
986             // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
987             //                     pattern. It could probably be extracted to its own method.
988             tmpDsc = getSpillTempDsc(op2);
989             varNum = tmpDsc->tdTempNum();
990             offset = 0;
991
992             regSet.tmpRlsTemp(tmpDsc);
993         }
994         else if (op2->OperIsHWIntrinsic())
995         {
996             emit->emitIns_SIMD_R_R_AR_R(ins, simdSize, targetReg, op1Reg, op3Reg, op2->gtGetOp1()->gtRegNum);
997             return;
998         }
999         else if (op2->isIndir())
1000         {
1001             GenTreeIndir* memIndir = op2->AsIndir();
1002             GenTree*      memBase  = memIndir->gtOp1;
1003
1004             switch (memBase->OperGet())
1005             {
1006                 case GT_LCL_VAR_ADDR:
1007                 {
1008                     varNum = memBase->AsLclVarCommon()->GetLclNum();
1009                     offset = 0;
1010
1011                     // Ensure that all the GenTreeIndir values are set to their defaults.
1012                     assert(!memIndir->HasIndex());
1013                     assert(memIndir->Scale() == 1);
1014                     assert(memIndir->Offset() == 0);
1015
1016                     break;
1017                 }
1018
1019                 case GT_CLS_VAR_ADDR:
1020                 {
1021                     emit->emitIns_SIMD_R_R_C_R(ins, simdSize, targetReg, op1Reg, op3Reg, memBase->gtClsVar.gtClsVarHnd,
1022                                                0);
1023                     return;
1024                 }
1025
1026                 default:
1027                 {
1028                     emit->emitIns_SIMD_R_R_A_R(ins, simdSize, targetReg, op1Reg, op3Reg, memIndir);
1029                     return;
1030                 }
1031             }
1032         }
1033         else
1034         {
1035             switch (op2->OperGet())
1036             {
1037                 case GT_LCL_FLD:
1038                 {
1039                     GenTreeLclFld* lclField = op2->AsLclFld();
1040
1041                     varNum = lclField->GetLclNum();
1042                     offset = lclField->gtLclFld.gtLclOffs;
1043                     break;
1044                 }
1045
1046                 case GT_LCL_VAR:
1047                 {
1048                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
1049                     varNum = op2->AsLclVar()->GetLclNum();
1050                     offset = 0;
1051                     break;
1052                 }
1053
1054                 default:
1055                     unreached();
1056                     break;
1057             }
1058         }
1059
1060         // Ensure we got a good varNum and offset.
1061         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
1062         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
1063         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
1064         assert(offset != (unsigned)-1);
1065
1066         emit->emitIns_SIMD_R_R_S_R(ins, simdSize, targetReg, op1Reg, op3Reg, varNum, offset);
1067     }
1068     else
1069     {
1070         emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, op3Reg);
1071     }
1072 }
1073
//------------------------------------------------------------------------
// genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
//                          a register/memory operand, and that returns a value in register
//
// Arguments:
//    ins       - The instruction being generated
//    attr      - The emit attribute
//    targetReg - The target register
//    op1Reg    - The register of the first operand
//    op2Reg    - The register of the second operand
//    op3       - The third operand
//
void CodeGen::genHWIntrinsic_R_R_R_RM(
    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
{
    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op2Reg != REG_NA);

    emitter* emit = getEmitter();

    if (op3->isContained() || op3->isUsedFromSpillTemp())
    {
        // op3 is consumed directly from memory; resolve it to a stack slot (varNum/offset),
        // a class variable handle, or a general address mode and use the matching emit method.
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op3->isUsedFromSpillTemp())
        {
            assert(op3->IsRegOptional());

            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            //                     pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op3);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op3->OperIsHWIntrinsic())
        {
            // A contained HWIntrinsic operand supplies its address in its own op1 register.
            emit->emitIns_SIMD_R_R_R_AR(ins, attr, targetReg, op1Reg, op2Reg, op3->gtGetOp1()->gtRegNum);
            return;
        }
        else if (op3->isIndir())
        {
            GenTreeIndir* memIndir = op3->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    // Indirection off of a local's address: address it as a frame slot.
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    // Static field access: emit using the class variable handle.
                    emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    // General address mode: let the emitter decode the indirection.
                    emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op3->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op3->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    // A contained lcl var must either be reg-optional or not enregistered.
                    assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op3->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
    }
    else
    {
        emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
    }
}
1191
1192 // genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
1193 //                       with non-constant argument
1194 //
1195 // Arguments:
1196 //    intrinsic      - intrinsic ID
1197 //    nonConstImmReg - the register contains non-constant imm8 argument
1198 //    baseReg        - a register for the start of the switch table
1199 //    offsReg        - a register for the offset into the switch table
1200 //    emitSwCase     - the lambda to generate a switch case
1201 //
1202 // Return Value:
1203 //    generate the jump-table fallback for imm-intrinsics with non-constant argument.
1204 // Note:
1205 //    This function can be used for all imm-intrinsics (whether full-range or not),
1206 //    The compiler front-end (i.e. importer) is responsible to insert a range-check IR
1207 //    (GT_HW_INTRINSIC_CHK) for imm8 argument, so this function does not need to do range-check.
1208 //
1209 template <typename HWIntrinsicSwitchCaseBody>
1210 void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsic,
1211                                               regNumber                 nonConstImmReg,
1212                                               regNumber                 baseReg,
1213                                               regNumber                 offsReg,
1214                                               HWIntrinsicSwitchCaseBody emitSwCase)
1215 {
1216     assert(nonConstImmReg != REG_NA);
1217     // AVX2 Gather intrinsics use managed non-const fallback since they have discrete imm8 value range
1218     // that does work with the current compiler generated jump-table fallback
1219     assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
1220     emitter* emit = getEmitter();
1221
1222     const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
1223     assert(maxByte <= 256);
1224     BasicBlock* jmpTable[256];
1225
1226     unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
1227     unsigned jmpTableOffs = 0;
1228
1229     // Emit the jump table
1230     for (unsigned i = 0; i < maxByte; i++)
1231     {
1232         jmpTable[i] = genCreateTempLabel();
1233         emit->emitDataGenData(i, jmpTable[i]);
1234     }
1235
1236     emit->emitDataGenEnd();
1237
1238     // Compute and jump to the appropriate offset in the switch table
1239     emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
1240
1241     emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
1242     emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
1243     emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
1244     emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
1245
1246     // Emit the switch table entries
1247
1248     BasicBlock* switchTableBeg = genCreateTempLabel();
1249     BasicBlock* switchTableEnd = genCreateTempLabel();
1250
1251     genDefineTempLabel(switchTableBeg);
1252
1253     for (unsigned i = 0; i < maxByte; i++)
1254     {
1255         genDefineTempLabel(jmpTable[i]);
1256         emitSwCase((int8_t)i);
1257         emit->emitIns_J(INS_jmp, switchTableEnd);
1258     }
1259
1260     genDefineTempLabel(switchTableEnd);
1261 }
1262
1263 //------------------------------------------------------------------------
1264 // genBaseIntrinsic: Generates the code for a base hardware intrinsic node
1265 //
1266 // Arguments:
1267 //    node - The hardware intrinsic node
1268 //
1269 // Note:
1270 //    We currently assume that all base intrinsics only have a single operand.
1271 //
1272 void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
1273 {
1274     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1275     regNumber      targetReg   = node->gtRegNum;
1276     var_types      targetType  = node->TypeGet();
1277     var_types      baseType    = node->gtSIMDBaseType;
1278
1279     assert(compiler->compSupports(InstructionSet_SSE));
1280     assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));
1281
1282     GenTree*  op1    = node->gtGetOp1();
1283     regNumber op1Reg = REG_NA;
1284
1285     if (op1 != nullptr)
1286     {
1287         assert(!op1->OperIsList());
1288         op1Reg = op1->gtRegNum;
1289         genConsumeOperands(node);
1290     }
1291
1292     assert(node->gtGetOp2() == nullptr);
1293
1294     emitter*    emit = getEmitter();
1295     emitAttr    attr = EA_ATTR(node->gtSIMDSize);
1296     instruction ins  = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1297
1298     switch (intrinsicId)
1299     {
1300         case NI_Base_Vector128_CreateScalarUnsafe:
1301         case NI_Base_Vector256_CreateScalarUnsafe:
1302         {
1303             if (varTypeIsIntegral(baseType))
1304             {
1305                 genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType));
1306             }
1307             else
1308             {
1309                 assert(varTypeIsFloating(baseType));
1310
1311                 attr = emitTypeSize(baseType);
1312
1313                 if (op1->isContained() || op1->isUsedFromSpillTemp())
1314                 {
1315                     genHWIntrinsic_R_RM(node, ins, attr);
1316                 }
1317                 else if (targetReg != op1Reg)
1318                 {
1319                     // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1320                     emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1321                 }
1322             }
1323             break;
1324         }
1325
1326         case NI_Base_Vector128_ToScalar:
1327         case NI_Base_Vector256_ToScalar:
1328         {
1329             assert(varTypeIsFloating(baseType));
1330
1331             attr = emitTypeSize(TYP_SIMD16);
1332
1333             if (op1->isContained() || op1->isUsedFromSpillTemp())
1334             {
1335                 genHWIntrinsic_R_RM(node, ins, attr);
1336             }
1337             else if (targetReg != op1Reg)
1338             {
1339                 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1340                 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1341             }
1342             break;
1343         }
1344
1345         case NI_Base_Vector128_ToVector256:
1346         {
1347             // ToVector256 has zero-extend semantics in order to ensure it is deterministic
1348             // We always emit a move to the target register, even when op1Reg == targetReg,
1349             // in order to ensure that Bits MAXVL-1:128 are zeroed.
1350
1351             attr = emitTypeSize(TYP_SIMD16);
1352
1353             if (op1->isContained() || op1->isUsedFromSpillTemp())
1354             {
1355                 genHWIntrinsic_R_RM(node, ins, attr);
1356             }
1357             else
1358             {
1359                 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1360                 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1361             }
1362             break;
1363         }
1364
1365         case NI_Base_Vector128_ToVector256Unsafe:
1366         case NI_Base_Vector256_GetLower:
1367         {
1368             if (op1->isContained() || op1->isUsedFromSpillTemp())
1369             {
1370                 genHWIntrinsic_R_RM(node, ins, attr);
1371             }
1372             else if (targetReg != op1Reg)
1373             {
1374                 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1375                 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1376             }
1377             break;
1378         }
1379
1380         case NI_Base_Vector128_Zero:
1381         case NI_Base_Vector256_Zero:
1382         {
1383             assert(op1 == nullptr);
1384             emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
1385             break;
1386         }
1387
1388         default:
1389         {
1390             unreached();
1391             break;
1392         }
1393     }
1394
1395     genProduceReg(node);
1396 }
1397
1398 //------------------------------------------------------------------------
1399 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
1400 //
1401 // Arguments:
1402 //    node - The hardware intrinsic node
1403 //
1404 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
1405 {
1406     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1407     GenTree*       op1         = node->gtGetOp1();
1408     GenTree*       op2         = node->gtGetOp2();
1409     GenTree*       op3         = nullptr;
1410     GenTree*       op4         = nullptr;
1411     regNumber      targetReg   = node->gtRegNum;
1412     var_types      targetType  = node->TypeGet();
1413     var_types      baseType    = node->gtSIMDBaseType;
1414
1415     regNumber op1Reg = REG_NA;
1416     regNumber op2Reg = REG_NA;
1417     regNumber op3Reg = REG_NA;
1418     regNumber op4Reg = REG_NA;
1419     emitter*  emit   = getEmitter();
1420
1421     if ((op1 != nullptr) && !op1->OperIsList())
1422     {
1423         op1Reg = op1->gtRegNum;
1424         genConsumeOperands(node);
1425     }
1426
1427     switch (intrinsicId)
1428     {
1429         case NI_SSE_CompareEqualOrderedScalar:
1430         case NI_SSE_CompareEqualUnorderedScalar:
1431         {
1432             assert(baseType == TYP_FLOAT);
1433             regNumber   tmpReg = node->GetSingleTempReg();
1434             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1435
1436             // Ensure we aren't overwriting targetReg
1437             assert(tmpReg != targetReg);
1438
1439             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1440             emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1441             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1442             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1443             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1444             break;
1445         }
1446
1447         case NI_SSE_CompareGreaterThanOrderedScalar:
1448         case NI_SSE_CompareGreaterThanUnorderedScalar:
1449         {
1450             assert(baseType == TYP_FLOAT);
1451             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1452
1453             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1454             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1455             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1456             break;
1457         }
1458
1459         case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
1460         case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
1461         {
1462             assert(baseType == TYP_FLOAT);
1463             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1464
1465             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1466             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1467             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1468             break;
1469         }
1470
1471         case NI_SSE_CompareLessThanOrderedScalar:
1472         case NI_SSE_CompareLessThanUnorderedScalar:
1473         {
1474             assert(baseType == TYP_FLOAT);
1475             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1476
1477             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1478             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1479             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1480             break;
1481         }
1482
1483         case NI_SSE_CompareLessThanOrEqualOrderedScalar:
1484         case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
1485         {
1486             assert(baseType == TYP_FLOAT);
1487             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1488
1489             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1490             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1491             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1492             break;
1493         }
1494
1495         case NI_SSE_CompareNotEqualOrderedScalar:
1496         case NI_SSE_CompareNotEqualUnorderedScalar:
1497         {
1498             assert(baseType == TYP_FLOAT);
1499             regNumber   tmpReg = node->GetSingleTempReg();
1500             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1501
1502             // Ensure we aren't overwriting targetReg
1503             assert(tmpReg != targetReg);
1504
1505             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1506             emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1507             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1508             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1509             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1510             break;
1511         }
1512
1513         case NI_SSE_X64_ConvertScalarToVector128Single:
1514         {
1515             assert(baseType == TYP_LONG);
1516             assert(op1 != nullptr);
1517             assert(op2 != nullptr);
1518             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1519             genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1520             break;
1521         }
1522
1523         case NI_SSE_Prefetch0:
1524         case NI_SSE_Prefetch1:
1525         case NI_SSE_Prefetch2:
1526         case NI_SSE_PrefetchNonTemporal:
1527         {
1528             assert(baseType == TYP_UBYTE);
1529             assert(op2 == nullptr);
1530
1531             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1532             emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
1533             break;
1534         }
1535
1536         case NI_SSE_StoreFence:
1537         {
1538             assert(baseType == TYP_VOID);
1539             assert(op1 == nullptr);
1540             assert(op2 == nullptr);
1541             emit->emitIns(INS_sfence);
1542             break;
1543         }
1544
1545         default:
1546             unreached();
1547             break;
1548     }
1549
1550     genProduceReg(node);
1551 }
1552
1553 //------------------------------------------------------------------------
1554 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
1555 //
1556 // Arguments:
1557 //    node - The hardware intrinsic node
1558 //
1559 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
1560 {
1561     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1562     GenTree*       op1         = node->gtGetOp1();
1563     GenTree*       op2         = node->gtGetOp2();
1564     regNumber      targetReg   = node->gtRegNum;
1565     var_types      targetType  = node->TypeGet();
1566     var_types      baseType    = node->gtSIMDBaseType;
1567     regNumber      op1Reg      = REG_NA;
1568     regNumber      op2Reg      = REG_NA;
1569     emitter*       emit        = getEmitter();
1570
1571     if ((op1 != nullptr) && !op1->OperIsList())
1572     {
1573         op1Reg = op1->gtRegNum;
1574         genConsumeOperands(node);
1575     }
1576
1577     switch (intrinsicId)
1578     {
1579         // All integer overloads are handled by table codegen
1580         case NI_SSE2_CompareLessThan:
1581         {
1582             assert(op1 != nullptr);
1583             assert(op2 != nullptr);
1584
1585             assert(baseType == TYP_DOUBLE);
1586
1587             int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
1588             assert((ival >= 0) && (ival <= 127));
1589
1590             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1591             op2Reg          = op2->gtRegNum;
1592             emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
1593
1594             break;
1595         }
1596
1597         case NI_SSE2_CompareEqualOrderedScalar:
1598         case NI_SSE2_CompareEqualUnorderedScalar:
1599         {
1600             assert(baseType == TYP_DOUBLE);
1601             regNumber   tmpReg = node->GetSingleTempReg();
1602             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1603
1604             // Ensure we aren't overwriting targetReg
1605             assert(tmpReg != targetReg);
1606
1607             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1608             emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1609             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1610             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1611             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1612             break;
1613         }
1614
1615         case NI_SSE2_CompareGreaterThanOrderedScalar:
1616         case NI_SSE2_CompareGreaterThanUnorderedScalar:
1617         {
1618             assert(baseType == TYP_DOUBLE);
1619             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1620
1621             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1622             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1623             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1624             break;
1625         }
1626
1627         case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
1628         case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
1629         {
1630             assert(baseType == TYP_DOUBLE);
1631             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1632
1633             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1634             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1635             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1636             break;
1637         }
1638
1639         case NI_SSE2_CompareLessThanOrderedScalar:
1640         case NI_SSE2_CompareLessThanUnorderedScalar:
1641         {
1642             assert(baseType == TYP_DOUBLE);
1643             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1644
1645             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1646             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1647             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1648             break;
1649         }
1650
1651         case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
1652         case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
1653         {
1654             assert(baseType == TYP_DOUBLE);
1655             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1656
1657             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1658             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1659             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1660             break;
1661         }
1662
1663         case NI_SSE2_CompareNotEqualOrderedScalar:
1664         case NI_SSE2_CompareNotEqualUnorderedScalar:
1665         {
1666             assert(baseType == TYP_DOUBLE);
1667             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1668             regNumber   tmpReg = node->GetSingleTempReg();
1669
1670             // Ensure we aren't overwriting targetReg
1671             assert(tmpReg != targetReg);
1672
1673             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1674             emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1675             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1676             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1677             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1678             break;
1679         }
1680
1681         case NI_SSE2_X64_ConvertScalarToVector128Double:
1682         {
1683             assert(baseType == TYP_LONG);
1684             assert(op1 != nullptr);
1685             assert(op2 != nullptr);
1686             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1687             genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1688             break;
1689         }
1690
1691         case NI_SSE2_X64_ConvertScalarToVector128Int64:
1692         case NI_SSE2_X64_ConvertScalarToVector128UInt64:
1693         {
1694             assert(baseType == TYP_LONG || baseType == TYP_ULONG);
1695             assert(op1 != nullptr);
1696             assert(op2 == nullptr);
1697             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1698             genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType));
1699             break;
1700         }
1701
1702         case NI_SSE2_ConvertToInt32:
1703         case NI_SSE2_ConvertToInt32WithTruncation:
1704         case NI_SSE2_ConvertToUInt32:
1705         case NI_SSE2_X64_ConvertToUInt64:
1706         case NI_SSE2_X64_ConvertToInt64:
1707         {
1708             assert(op2 == nullptr);
1709             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1710
1711             if (varTypeIsIntegral(baseType))
1712             {
1713                 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1714                 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1715             }
1716             else
1717             {
1718                 assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT);
1719                 genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType));
1720             }
1721             break;
1722         }
1723
1724         case NI_SSE2_LoadFence:
1725         {
1726             assert(baseType == TYP_VOID);
1727             assert(op1 == nullptr);
1728             assert(op2 == nullptr);
1729             emit->emitIns(INS_lfence);
1730             break;
1731         }
1732
1733         case NI_SSE2_MemoryFence:
1734         {
1735             assert(baseType == TYP_VOID);
1736             assert(op1 == nullptr);
1737             assert(op2 == nullptr);
1738             emit->emitIns(INS_mfence);
1739             break;
1740         }
1741
1742         case NI_SSE2_StoreNonTemporal:
1743         case NI_SSE2_X64_StoreNonTemporal:
1744         {
1745             assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1746             assert(op1 != nullptr);
1747             assert(op2 != nullptr);
1748
1749             op2Reg          = op2->gtRegNum;
1750             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1751             emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
1752             break;
1753         }
1754
1755         default:
1756             unreached();
1757             break;
1758     }
1759
1760     genProduceReg(node);
1761 }
1762
1763 //------------------------------------------------------------------------
1764 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1765 //
1766 // Arguments:
1767 //    node - The hardware intrinsic node
1768 //
1769 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1770 {
1771     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1772     GenTree*       op1         = node->gtGetOp1();
1773     GenTree*       op2         = node->gtGetOp2();
1774     GenTree*       op3         = nullptr;
1775     GenTree*       op4         = nullptr;
1776     regNumber      targetReg   = node->gtRegNum;
1777     var_types      targetType  = node->TypeGet();
1778     var_types      baseType    = node->gtSIMDBaseType;
1779
1780     regNumber op1Reg = REG_NA;
1781     regNumber op2Reg = REG_NA;
1782     regNumber op3Reg = REG_NA;
1783     regNumber op4Reg = REG_NA;
1784     emitter*  emit   = getEmitter();
1785
1786     if ((op1 != nullptr) && !op1->OperIsList())
1787     {
1788         op1Reg = op1->gtRegNum;
1789         genConsumeOperands(node);
1790     }
1791
1792     switch (intrinsicId)
1793     {
1794         case NI_SSE41_TestAllOnes:
1795         {
1796             regNumber tmpReg = node->GetSingleTempReg();
1797             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1798             emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
1799             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
1800             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1801             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1802             break;
1803         }
1804
1805         case NI_SSE41_TestAllZeros:
1806         case NI_SSE41_TestZ:
1807         {
1808             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1809             genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1810             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1811             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1812             break;
1813         }
1814
1815         case NI_SSE41_TestC:
1816         {
1817             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1818             genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1819             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1820             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1821             break;
1822         }
1823
1824         case NI_SSE41_TestMixOnesZeros:
1825         case NI_SSE41_TestNotZAndNotC:
1826         {
1827             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1828             genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1829             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1830             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1831             break;
1832         }
1833
1834         case NI_SSE41_Extract:
1835         case NI_SSE41_X64_Extract:
1836         {
1837             regNumber   tmpTargetReg = REG_NA;
1838             instruction ins          = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1839             if (baseType == TYP_FLOAT)
1840             {
1841                 tmpTargetReg = node->ExtractTempReg();
1842             }
1843
1844             auto emitSwCase = [&](int8_t i) {
1845                 if (baseType == TYP_FLOAT)
1846                 {
1847                     // extract instructions return to GP-registers, so it needs int size as the emitsize
1848                     emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1Reg, i);
1849                     emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
1850                 }
1851                 else
1852                 {
1853                     emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), targetReg, op1Reg, i);
1854                 }
1855             };
1856
1857             if (op2->IsCnsIntOrI())
1858             {
1859                 ssize_t ival = op2->AsIntCon()->IconValue();
1860                 assert((ival >= 0) && (ival <= 255));
1861                 emitSwCase((int8_t)ival);
1862             }
1863             else
1864             {
1865                 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
1866                 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
1867                 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
1868                 regNumber baseReg = node->ExtractTempReg();
1869                 regNumber offsReg = node->GetSingleTempReg();
1870                 genHWIntrinsicJumpTableFallback(intrinsicId, op2->gtRegNum, baseReg, offsReg, emitSwCase);
1871             }
1872             break;
1873         }
1874
1875         default:
1876             unreached();
1877             break;
1878     }
1879
1880     genProduceReg(node);
1881 }
1882
1883 //------------------------------------------------------------------------
1884 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1885 //
1886 // Arguments:
1887 //    node - The hardware intrinsic node
1888 //
void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    var_types      baseType    = node->gtSIMDBaseType;
    var_types      targetType  = node->TypeGet();
    emitter*       emit        = getEmitter();

    regNumber op1Reg = op1->gtRegNum;
    genConsumeOperands(node);

    // Everything routed here (only Crc32 below) is a two-operand,
    // non-commutative intrinsic producing a register result.
    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op2 != nullptr);
    assert(!node->OperIsCommutative());

    switch (intrinsicId)
    {
        case NI_SSE42_Crc32:
        case NI_SSE42_X64_Crc32:
        {
            // crc32 is read-modify-write on its first operand, so the running crc
            // (op1) must be in targetReg before the instruction is emitted.
            if (op1Reg != targetReg)
            {
                assert(op2->gtRegNum != targetReg);
                emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg);
            }

            // This makes the genHWIntrinsic_R_RM code much simpler, as we don't need an
            // overload that explicitly takes the operands.
            // (The data operand op2 becomes the node's only operand; op1 is already in targetReg.)
            node->gtOp1 = op2;
            node->gtOp2 = nullptr;

            if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument
            {
                // Byte/ushort data: emit with the data operand's size; result is 32-bit.
                assert(targetType == TYP_INT);
                genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType));
            }
            else
            {
                // Full-width forms: crc and data have matching widths.
                assert(op1->TypeGet() == op2->TypeGet());
                assert((targetType == TYP_INT) || (targetType == TYP_LONG));
                genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType));
            }

            break;
        }

        default:
        {
            unreached();
            break;
        }
    }

    genProduceReg(node);
}
1947
1948 //------------------------------------------------------------------------
1949 // genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
1950 //
1951 // Arguments:
1952 //    node - The hardware intrinsic node
1953 //
void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    var_types      baseType    = node->gtSIMDBaseType;
    emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
    var_types      targetType  = node->TypeGet();
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
    int            numArgs     = HWIntrinsicInfo::lookupNumArgs(node);
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    regNumber      op1Reg      = REG_NA;
    regNumber      op2Reg      = REG_NA;
    regNumber      targetReg   = node->gtRegNum;
    emitter*       emit        = getEmitter();

    // Non-list operands are consumed up front; the gather intrinsics below
    // carry an argument list and consume each operand as it is unpacked.
    if ((op1 != nullptr) && !op1->OperIsList())
    {
        op1Reg = op1->gtRegNum;
        genConsumeOperands(node);
    }

    switch (intrinsicId)
    {
        case NI_AVX2_ConvertToInt32:
        case NI_AVX2_ConvertToUInt32:
        {
            assert(op2 == nullptr);
            assert((baseType == TYP_INT) || (baseType == TYP_UINT));
            // NOTE: this shadows the outer `ins`; both lookups use the same
            // (intrinsicId, baseType) pair, so the values are identical.
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
            break;
        }

        case NI_AVX2_GatherVector128:
        case NI_AVX2_GatherVector256:
        case NI_AVX2_GatherMaskVector128:
        case NI_AVX2_GatherMaskVector256:
        {
            // The operands arrive as an argument list. Unpack and consume them in order:
            //   Gather (3 args):     op1 = base address, op2 = index vector, op3 = trailing constant
            //   GatherMask (5 args): op1 = source vector, op2 = base address, op3 = index vector,
            //                        op4 = mask vector,   op5 (lastOp) = trailing constant
            GenTreeArgList* list = op1->AsArgList();
            op1                  = list->Current();
            op1Reg               = op1->gtRegNum;
            genConsumeRegs(op1);

            list   = list->Rest();
            op2    = list->Current();
            op2Reg = op2->gtRegNum;
            genConsumeRegs(op2);

            list         = list->Rest();
            GenTree* op3 = list->Current();
            genConsumeRegs(op3);

            list             = list->Rest();
            GenTree* op4     = nullptr;
            GenTree* lastOp  = nullptr;
            GenTree* indexOp = nullptr;

            regNumber op3Reg       = REG_NA;
            regNumber op4Reg       = REG_NA;
            regNumber addrBaseReg  = REG_NA;
            regNumber addrIndexReg = REG_NA;
            regNumber maskReg      = node->ExtractTempReg(RBM_ALLFLOAT);

            if (numArgs == 5)
            {
                assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
                op4    = list->Current();
                list   = list->Rest();
                lastOp = list->Current();
                op3Reg = op3->gtRegNum;
                op4Reg = op4->gtRegNum;
                genConsumeRegs(op4);
                addrBaseReg  = op2Reg;
                addrIndexReg = op3Reg;
                indexOp      = op3;

                // copy op4Reg into the tmp mask register,
                // the mask register will be cleared by gather instructions
                emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);

                if (targetReg != op1Reg)
                {
                    // copy source vector to the target register for masking merge
                    emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
                }
            }
            else
            {
                assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
                addrBaseReg  = op1Reg;
                addrIndexReg = op2Reg;
                indexOp      = op2;
                lastOp       = op3;

                // generate all-one mask vector
                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
            }

            // A 128-bit gather with 64-bit indices takes a 256-bit index vector;
            // the emit attr must then reflect the larger (YMM) index operand.
            bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);

            // hwintrinsiclistxarch.h uses Dword index instructions in default
            if (varTypeIsLong(node->gtIndexBaseType))
            {
                // Remap each Dword-index gather instruction to its Qword-index form.
                switch (ins)
                {
                    case INS_vpgatherdd:
                        ins = INS_vpgatherqd;
                        if (isVector128GatherWithVector256Index)
                        {
                            // YMM index in address mode
                            attr = emitTypeSize(TYP_SIMD32);
                        }
                        break;
                    case INS_vpgatherdq:
                        ins = INS_vpgatherqq;
                        break;
                    case INS_vgatherdps:
                        ins = INS_vgatherqps;
                        if (isVector128GatherWithVector256Index)
                        {
                            // YMM index in address mode
                            attr = emitTypeSize(TYP_SIMD32);
                        }
                        break;
                    case INS_vgatherdpd:
                        ins = INS_vgatherqpd;
                        break;
                    default:
                        unreached();
                }
            }

            // The trailing operand must be a constant byte (presumably the gather
            // scale; it is passed straight through to the address mode).
            assert(lastOp->IsCnsIntOrI());
            ssize_t ival = lastOp->AsIntCon()->IconValue();
            assert((ival >= 0) && (ival <= 255));

            // Gather instructions require the destination, mask, and index
            // registers to all be distinct.
            assert(targetReg != maskReg);
            assert(targetReg != addrIndexReg);
            assert(maskReg != addrIndexReg);
            emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);

            break;
        }

        case NI_AVX_TestC:
        {
            // setb materializes CF from the vtestps/vtestpd/vptest.
            genHWIntrinsic_R_RM(node, ins, attr);
            emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_AVX_TestNotZAndNotC:
        {
            // seta materializes (CF == 0 && ZF == 0).
            genHWIntrinsic_R_RM(node, ins, attr);
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_AVX_TestZ:
        {
            // sete materializes ZF.
            genHWIntrinsic_R_RM(node, ins, attr);
            emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        default:
            unreached();
            break;
    }

    genProduceReg(node);
}
2129
2130 //------------------------------------------------------------------------
2131 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
2132 //
2133 // Arguments:
2134 //    node - The hardware intrinsic node
2135 //
void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
{
    // Not implemented: reaching here raises the standard not-yet-implemented notification.
    NYI("Implement AES intrinsic code generation");
}
2140
2141 //------------------------------------------------------------------------
2142 // genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 and BMI2 hardware intrinsic node
2143 //
2144 // Arguments:
2145 //    node - The hardware intrinsic node
2146 //
void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    var_types      targetType  = node->TypeGet();
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);
    emitter*       emit        = getEmitter();

    assert(targetReg != REG_NA);
    assert(op1 != nullptr);

    // List operands (the 3-arg MultiplyNoFlags form) are consumed in their case block.
    if (!op1->OperIsList())
    {
        genConsumeOperands(node);
    }

    switch (intrinsicId)
    {
        // Two-operand intrinsics: target = ins(op1, op2)
        case NI_BMI1_AndNot:
        case NI_BMI1_X64_AndNot:
        case NI_BMI1_BitFieldExtract:
        case NI_BMI1_X64_BitFieldExtract:
        case NI_BMI2_ParallelBitDeposit:
        case NI_BMI2_ParallelBitExtract:
        case NI_BMI2_X64_ParallelBitDeposit:
        case NI_BMI2_X64_ParallelBitExtract:
        case NI_BMI2_ZeroHighBits:
        case NI_BMI2_X64_ZeroHighBits:
        {
            assert(op2 != nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet()));
            break;
        }

        // One-operand intrinsics: target = ins(op1)
        case NI_BMI1_ExtractLowestSetBit:
        case NI_BMI1_GetMaskUpToLowestSetBit:
        case NI_BMI1_ResetLowestSetBit:
        case NI_BMI1_X64_ExtractLowestSetBit:
        case NI_BMI1_X64_GetMaskUpToLowestSetBit:
        case NI_BMI1_X64_ResetLowestSetBit:
        {
            assert(op2 == nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
            break;
        }

        case NI_BMI1_TrailingZeroCount:
        case NI_BMI1_X64_TrailingZeroCount:
        {
            assert(op2 == nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            // tzcnt goes through genXCNTIntrinsic, which also breaks the false
            // dependency on the target register.
            genXCNTIntrinsic(node, ins);
            break;
        }

        case NI_BMI2_MultiplyNoFlags:
        case NI_BMI2_X64_MultiplyNoFlags:
        {
            // MULX: op1 * op2 with op1 moved into the implicit EDX/RDX source.
            // 2-arg form returns only the high half; the 3-arg form also stores
            // the low half through the pointer in op3.
            int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
            assert(numArgs == 2 || numArgs == 3);

            regNumber op1Reg = REG_NA;
            regNumber op2Reg = REG_NA;
            regNumber op3Reg = REG_NA;
            regNumber lowReg = REG_NA;

            if (numArgs == 2)
            {
                op1Reg = op1->gtRegNum;
                op2Reg = op2->gtRegNum;
                lowReg = targetReg;
            }
            else
            {
                // 3-arg form: walk the argument list, consuming each operand.
                GenTreeArgList* argList = op1->AsArgList();
                op1                     = argList->Current();
                genConsumeRegs(op1);
                op1Reg  = op1->gtRegNum;
                argList = argList->Rest();
                op2     = argList->Current();
                genConsumeRegs(op2);
                op2Reg       = op2->gtRegNum;
                argList      = argList->Rest();
                GenTree* op3 = argList->Current();
                genConsumeRegs(op3);
                op3Reg = op3->gtRegNum;
                // op3 holds the store address and must survive the mulx, so it
                // cannot alias any register the instruction reads or writes.
                assert(op3Reg != op1Reg);
                assert(op3Reg != targetReg);
                assert(op3Reg != REG_EDX);
                lowReg = node->GetSingleTempReg();
                assert(op3Reg != lowReg);
                assert(lowReg != targetReg);
            }

            emitAttr attr = emitTypeSize(targetType);
            // mov the first operand into implicit source operand EDX/RDX
            if (op1Reg != REG_EDX)
            {
                assert(op2Reg != REG_EDX);
                emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg);
            }

            // generate code for MULX
            genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2);

            // If the lower half of the result is required, store it to the memory pointed to by op3
            if (numArgs == 3)
            {
                emit->emitIns_AR_R(INS_mov, attr, lowReg, op3Reg, 0);
            }

            break;
        }

        default:
        {
            unreached();
            break;
        }
    }

    genProduceReg(node);
}
2274
2275 //------------------------------------------------------------------------
2276 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
2277 //
2278 // Arguments:
2279 //    node - The hardware intrinsic node
2280 //
void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    var_types      baseType    = node->gtSIMDBaseType;
    emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
    GenTree*       op1         = node->gtGetOp1();
    regNumber      targetReg   = node->gtRegNum;

    // FMA intrinsics always take exactly three operands, packed in an argument list.
    assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);
    assert(op1 != nullptr);
    assert(op1->OperIsList());
    assert(op1->gtGetOp2()->OperIsList());
    assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());

    GenTreeArgList* argList = op1->AsArgList();
    op1                     = argList->Current();
    genConsumeRegs(op1);

    argList      = argList->Rest();
    GenTree* op2 = argList->Current();
    genConsumeRegs(op2);

    argList      = argList->Rest();
    GenTree* op3 = argList->Current();
    genConsumeRegs(op3);

    regNumber op1Reg;
    regNumber op2Reg;

    bool       isCommutative   = false;
    const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);

    // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
    assert(!copiesUpperBits || !op1->isContained());

    // Pick the FMA form (132/213/231) based on which operand, if any, is the
    // memory/contained one. `ins` from the table is the 213 form; the 132 and
    // 231 variants are assumed adjacent in the instruction enum (ins - 1 / ins + 1).
    if (op3->isContained() || op3->isUsedFromSpillTemp())
    {
        // 213 form: op1 = (op2 * op1) + [op3]

        op1Reg = op1->gtRegNum;
        op2Reg = op2->gtRegNum;

        isCommutative = !copiesUpperBits;
    }
    else if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        // 132 form: op1 = (op1 * op3) + [op2]

        ins    = (instruction)(ins - 1);
        op1Reg = op1->gtRegNum;
        op2Reg = op3->gtRegNum;
        op3    = op2;
    }
    else if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        // 231 form: op3 = (op2 * op3) + [op1]

        ins    = (instruction)(ins + 1);
        op1Reg = op3->gtRegNum;
        op2Reg = op2->gtRegNum;
        op3    = op1;
    }
    else
    {
        // 213 form: op1 = (op2 * op1) + op3

        op1Reg = op1->gtRegNum;
        op2Reg = op2->gtRegNum;

        isCommutative = !copiesUpperBits;
    }

    if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
    {
        assert(node->isRMWHWIntrinsic(compiler));

        // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
        //
        // For non-commutative intrinsics, we should have ensured that op2 was marked
        // delay free in order to prevent it from getting assigned the same register
        // as target. However, for commutative intrinsics, we can just swap the operands
        // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

        op2Reg = op1Reg;
        op1Reg = targetReg;
    }

    genHWIntrinsic_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
    genProduceReg(node);
}
2372
2373 //------------------------------------------------------------------------
2374 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
2375 //
2376 // Arguments:
2377 //    node - The hardware intrinsic node
2378 //
2379 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
2380 {
2381     assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount ||
2382            node->gtHWIntrinsicId == NI_LZCNT_X64_LeadingZeroCount);
2383
2384     genConsumeOperands(node);
2385     genXCNTIntrinsic(node, INS_lzcnt);
2386     genProduceReg(node);
2387 }
2388
2389 //------------------------------------------------------------------------
2390 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
2391 //
2392 // Arguments:
2393 //    node - The hardware intrinsic node
2394 //
void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
{
    // Not implemented: reaching here raises the standard not-yet-implemented notification.
    NYI("Implement PCLMULQDQ intrinsic code generation");
}
2399
2400 //------------------------------------------------------------------------
2401 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
2402 //
2403 // Arguments:
2404 //    node - The hardware intrinsic node
2405 //
2406 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
2407 {
2408     assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount || node->gtHWIntrinsicId == NI_POPCNT_X64_PopCount);
2409
2410     genConsumeOperands(node);
2411     genXCNTIntrinsic(node, INS_popcnt);
2412     genProduceReg(node);
2413 }
2414
2415 //------------------------------------------------------------------------
2416 // genXCNTIntrinsic: Generates the code for a lzcnt/tzcnt/popcnt hardware intrinsic node, breaks false dependencies on
2417 // the target register
2418 //
2419 // Arguments:
2420 //    node - The hardware intrinsic node
2421 //    ins  - The instruction being generated
2422 //
2423 void CodeGen::genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins)
2424 {
2425     // LZCNT/TZCNT/POPCNT have a false dependency on the target register on Intel Sandy Bridge, Haswell, and Skylake
2426     // (POPCNT only) processors, so insert a `XOR target, target` to break the dependency via XOR triggering register
2427     // renaming, but only if it's not an actual dependency.
2428
2429     GenTree*  op1        = node->gtGetOp1();
2430     regNumber sourceReg1 = REG_NA;
2431     regNumber sourceReg2 = REG_NA;
2432
2433     if (!op1->isContained())
2434     {
2435         sourceReg1 = op1->gtRegNum;
2436     }
2437     else if (op1->isIndir())
2438     {
2439         GenTreeIndir* indir   = op1->AsIndir();
2440         GenTree*      memBase = indir->Base();
2441
2442         if (memBase != nullptr)
2443         {
2444             sourceReg1 = memBase->gtRegNum;
2445         }
2446
2447         if (indir->HasIndex())
2448         {
2449             sourceReg2 = indir->Index()->gtRegNum;
2450         }
2451     }
2452
2453     regNumber targetReg = node->gtRegNum;
2454     if ((targetReg != sourceReg1) && (targetReg != sourceReg2))
2455     {
2456         getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
2457     }
2458     genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2459 }
2460
2461 #endif // FEATURE_HW_INTRINSICS