Correctly handle variable argument SIMDScalar (#26421) (#26778)
platform/upstream/coreclr.git: src/jit/hwintrinsiccodegenxarch.cpp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX               Intel hardware intrinsic Code Generator                     XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif

#ifdef FEATURE_HW_INTRINSICS

#include "emit.h"
#include "codegen.h"
#include "sideeffects.h"
#include "lower.h"
#include "gcinfo.h"
#include "gcinfoencoder.h"

//------------------------------------------------------------------------
// assertIsContainableHWIntrinsicOp: Asserts that op is containable by node
//
// Arguments:
//    lowering - The lowering phase from the compiler
//    node     - The HWIntrinsic node that has the contained node
//    op       - The op that is contained
//
static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op)
{
#if DEBUG
    // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register allocation
    // logic. However, this check is still important due to the various containment rules that SIMD intrinsics follow.
    //
    // We use isContainable to track the special HWIntrinsic node containment rules (for things like LoadAligned and
    // LoadUnaligned) and we use the supportsRegOptional check to support general-purpose loads (both from stack
    // spillage and for isUsedFromMemory contained nodes, in the case where the register allocator decided not to
    // allocate a register in the first place).

    bool supportsRegOptional = false;
    bool isContainable       = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional);
    assert(isContainable || supportsRegOptional);
#endif // DEBUG
}
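
// For example, a memory load intrinsic (such as LoadVector128) feeding another
// intrinsic can typically be contained so that codegen folds the load into the
// consumer's memory operand instead of emitting a separate load instruction; the
// assert above checks that any node actually marked as contained would also have
// been deemed containable (or at least reg-optional) by Lowering.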

//------------------------------------------------------------------------
// genIsTableDrivenHWIntrinsic: Checks whether a HW intrinsic can use the table-driven CodeGen path
//
// Arguments:
//    intrinsicId - the hardware intrinsic ID
//    category    - category of the HW intrinsic
//
// Return Value:
//    returns true if this intrinsic can be table-driven in CodeGen
//
static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicCategory category)
{
    // TODO - move more categories into the table-driven framework
    // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen
    const bool tableDrivenCategory =
        (category != HW_Category_Special) && (category != HW_Category_Scalar) && (category != HW_Category_Helper);
    const bool tableDrivenFlag =
        !HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId) && !HWIntrinsicInfo::HasSpecialCodegen(intrinsicId);
    return tableDrivenCategory && tableDrivenFlag;
}
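
// Roughly speaking, most simple one-, two-, and three-operand SIMD intrinsics map
// to a single instruction via the intrinsic table and are handled by the switch in
// genHWIntrinsic below, while Special/Scalar/Helper category intrinsics (and those
// flagged as multi-instruction or special codegen) fall through to the per-ISA
// handlers at the end of genHWIntrinsic.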

//------------------------------------------------------------------------
// genHWIntrinsic: Generates the code for a given hardware intrinsic node.
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic      intrinsicId = node->gtHWIntrinsicId;
    InstructionSet      isa         = HWIntrinsicInfo::lookupIsa(intrinsicId);
    HWIntrinsicCategory category    = HWIntrinsicInfo::lookupCategory(intrinsicId);
    int                 ival        = HWIntrinsicInfo::lookupIval(intrinsicId);
    int                 numArgs     = HWIntrinsicInfo::lookupNumArgs(node);

    assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));

    if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
    {
        GenTree*  op1        = node->gtGetOp1();
        GenTree*  op2        = node->gtGetOp2();
        regNumber targetReg  = node->gtRegNum;
        var_types targetType = node->TypeGet();
        var_types baseType   = node->gtSIMDBaseType;

        regNumber op1Reg = REG_NA;
        regNumber op2Reg = REG_NA;
        emitter*  emit   = getEmitter();

        assert(numArgs >= 0);
        instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
        assert(ins != INS_invalid);
        emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
        assert(simdSize != 0);

        switch (numArgs)
        {
            case 1:
            {
                if (node->OperIsMemoryLoad())
                {
                    genConsumeAddress(op1);
                    // Until we improve the handling of addressing modes in the emitter, we'll create a
                    // temporary GT_IND to generate code with.
                    GenTreeIndir load = indirForm(node->TypeGet(), op1);
                    emit->emitInsLoadInd(ins, simdSize, node->gtRegNum, &load);
                }
                else
                {
                    genConsumeRegs(op1);
                    op1Reg = op1->gtRegNum;

                    if ((ival != -1) && varTypeIsFloating(baseType))
                    {
                        assert((ival >= 0) && (ival <= 127));
                        if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
                        {
                            assert(!op1->isContained());
                            emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op1Reg,
                                                       static_cast<int8_t>(ival));
                        }
                        else
                        {
                            genHWIntrinsic_R_RM_I(node, ins, static_cast<int8_t>(ival));
                        }
                    }
                    else if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
                    {
                        emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
                    }
                    else
                    {
                        genHWIntrinsic_R_RM(node, ins, simdSize);
                    }
                }
                break;
            }

            case 2:
            {
                if (category == HW_Category_MemoryStore)
                {
                    genConsumeAddress(op1);
                    genConsumeReg(op2);
                    // Until we improve the handling of addressing modes in the emitter, we'll create a
                    // temporary GT_STORE_IND to generate code with.
                    GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
                    emit->emitInsStoreInd(ins, simdSize, &store);
                    break;
                }
                genConsumeRegs(op1);
                genConsumeRegs(op2);

                op1Reg = op1->gtRegNum;
                op2Reg = op2->gtRegNum;

                if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
                {
                    // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
                    //
                    // For non-commutative intrinsics, we should have ensured that op2 was marked
                    // delay free in order to prevent it from getting assigned the same register
                    // as target. However, for commutative intrinsics, we can just swap the operands
                    // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

                    noway_assert(node->OperIsCommutative());
                    op2Reg = op1Reg;
                    op1Reg = targetReg;
                }
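
                // As a sketch: for a commutative RMW op such as a packed add where op2
                // happened to get the target register, the swap above lets us emit
                // "addps targetReg, op1Reg" style code directly instead of needing an
                // extra move to preserve op2 first.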

                if ((ival != -1) && varTypeIsFloating(baseType))
                {
                    assert((ival >= 0) && (ival <= 127));
                    genHWIntrinsic_R_R_RM_I(node, ins, static_cast<int8_t>(ival));
                }
                else if (category == HW_Category_MemoryLoad)
                {
                    // Get the address and the 'other' register.
                    GenTree*  addr;
                    regNumber otherReg;
                    if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
                    {
                        addr     = op1;
                        otherReg = op2Reg;
                    }
                    else
                    {
                        addr     = op2;
                        otherReg = op1Reg;
                    }
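                    // (The MaskLoad intrinsics take the address as their first argument,
                    // while the other two-operand memory loads handled here take it as
                    // their second.)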
                    // Until we improve the handling of addressing modes in the emitter, we'll create a
                    // temporary GT_IND to generate code with.
                    GenTreeIndir load = indirForm(node->TypeGet(), addr);
                    genHWIntrinsic_R_R_RM(node, ins, simdSize, targetReg, otherReg, &load);
                }
                else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
                {
                    assert(ival == -1);

                    if (intrinsicId == NI_SSE2_Extract)
                    {
                        // The extract instructions return their result in a general-purpose
                        // register, so we need the int size as the emit size.
                        simdSize = emitTypeSize(TYP_INT);
                    }

                    auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };

                    if (op2->IsCnsIntOrI())
                    {
                        ssize_t ival = op2->AsIntCon()->IconValue();
                        assert((ival >= 0) && (ival <= 255));
                        emitSwCase((int8_t)ival);
                    }
                    else
                    {
                        // We emit a fallback case for the scenario when the imm-op is not a constant. This normally
                        // happens when the intrinsic is called indirectly, such as via Reflection. However, it can
                        // also occur if the consumer calls it directly and just doesn't pass a constant value.
                        regNumber baseReg = node->ExtractTempReg();
                        regNumber offsReg = node->GetSingleTempReg();
                        genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase);
                    }
                }
                else
                {
                    genHWIntrinsic_R_R_RM(node, ins, EA_ATTR(node->gtSIMDSize));
                }
                break;
            }

            case 3:
            {
                GenTreeArgList* argList = op1->AsArgList();
                op1                     = argList->Current();
                genConsumeRegs(op1);
                op1Reg = op1->gtRegNum;

                argList = argList->Rest();
                op2     = argList->Current();
                genConsumeRegs(op2);
                op2Reg = op2->gtRegNum;

                argList      = argList->Rest();
                GenTree* op3 = argList->Current();
                genConsumeRegs(op3);
                regNumber op3Reg = op3->gtRegNum;

                if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
                {
                    assert(ival == -1);

                    auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, i); };

                    if (op3->IsCnsIntOrI())
                    {
                        ssize_t ival = op3->AsIntCon()->IconValue();
                        assert((ival >= 0) && (ival <= 255));
                        emitSwCase((int8_t)ival);
                    }
                    else
                    {
                        // We emit a fallback case for the scenario when the imm-op is not a constant. This normally
                        // happens when the intrinsic is called indirectly, such as via Reflection. However, it can
                        // also occur if the consumer calls it directly and just doesn't pass a constant value.
                        regNumber baseReg = node->ExtractTempReg();
                        regNumber offsReg = node->GetSingleTempReg();
                        genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase);
                    }
                }
                else if (category == HW_Category_MemoryStore)
                {
                    // The Mask instructions do not currently support containment of the address.
                    assert(!op2->isContained());
                    if (intrinsicId == NI_AVX_MaskStore || intrinsicId == NI_AVX2_MaskStore)
                    {
                        emit->emitIns_AR_R_R(ins, simdSize, op2Reg, op3Reg, op1Reg, 0);
                    }
                    else
                    {
                        assert(intrinsicId == NI_SSE2_MaskMove);
                        assert(targetReg == REG_NA);

                        // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
                        if (op3Reg != REG_EDI)
                        {
                            emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
                        }
                        emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
                    }
                }
                else
                {
                    switch (intrinsicId)
                    {
                        case NI_SSE41_BlendVariable:
                        case NI_AVX_BlendVariable:
                        case NI_AVX2_BlendVariable:
                        {
                            genHWIntrinsic_R_R_RM_R(node, ins);
                            break;
                        }

                        default:
                        {
                            unreached();
                            break;
                        }
                    }
                }
                break;
            }

            default:
                unreached();
                break;
        }
        genProduceReg(node);
        return;
    }

    switch (isa)
    {
        case InstructionSet_Vector128:
        case InstructionSet_Vector256:
            genBaseIntrinsic(node);
            break;
        case InstructionSet_SSE:
        case InstructionSet_SSE_X64:
            genSSEIntrinsic(node);
            break;
        case InstructionSet_SSE2:
        case InstructionSet_SSE2_X64:
            genSSE2Intrinsic(node);
            break;
        case InstructionSet_SSE41:
        case InstructionSet_SSE41_X64:
            genSSE41Intrinsic(node);
            break;
        case InstructionSet_SSE42:
        case InstructionSet_SSE42_X64:
            genSSE42Intrinsic(node);
            break;
        case InstructionSet_AVX:
        case InstructionSet_AVX2:
            genAvxOrAvx2Intrinsic(node);
            break;
        case InstructionSet_AES:
            genAESIntrinsic(node);
            break;
        case InstructionSet_BMI1:
        case InstructionSet_BMI1_X64:
        case InstructionSet_BMI2:
        case InstructionSet_BMI2_X64:
            genBMI1OrBMI2Intrinsic(node);
            break;
        case InstructionSet_FMA:
            genFMAIntrinsic(node);
            break;
        case InstructionSet_LZCNT:
        case InstructionSet_LZCNT_X64:
            genLZCNTIntrinsic(node);
            break;
        case InstructionSet_PCLMULQDQ:
            genPCLMULQDQIntrinsic(node);
            break;
        case InstructionSet_POPCNT:
        case InstructionSet_POPCNT_X64:
            genPOPCNTIntrinsic(node);
            break;
        default:
            unreached();
            break;
    }
}

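// As a rough illustration of the table-driven path above: for a simple packed
// addition intrinsic, the table lookup yields a single instruction (e.g. addps for
// a float base type), numArgs == 2 and ival == -1, so the two-argument case reduces
// to a single genHWIntrinsic_R_R_RM emission.
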
//------------------------------------------------------------------------
// genHWIntrinsic_R_RM: Generates the code for a hardware intrinsic node that takes a
//                      register/memory operand and that returns a value in a register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    attr - The emit attribute for the instruction being generated
//
void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    GenTree*  op2        = node->gtGetOp2();
    emitter*  emit       = getEmitter();

    if (op2 != nullptr)
    {
        // The CompareScalarOrdered* and CompareScalarUnordered* intrinsics come down this
        // code path. They are all MultiIns, as the return value comes from the flags and
        // we have two operands instead.

        assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId));
        assert(targetReg != REG_NA);

        targetReg = op1->gtRegNum;
        op1       = op2;
        op2       = nullptr;
    }
    else
    {
        assert(!node->OperIsCommutative());
    }

    assert(targetReg != REG_NA);
    assert(op2 == nullptr);

    if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op1->isUsedFromSpillTemp())
        {
            assert(op1->IsRegOptional());

            tmpDsc = getSpillTempDsc(op1);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op1->isIndir() || op1->OperIsHWIntrinsic())
        {
            GenTree*      addr;
            GenTreeIndir* memIndir = nullptr;

            if (op1->isIndir())
            {
                memIndir = op1->AsIndir();
                addr     = memIndir->Addr();
            }
            else
            {
                assert(op1->AsHWIntrinsic()->OperIsMemoryLoad());
                assert(HWIntrinsicInfo::lookupNumArgs(op1->AsHWIntrinsic()) == 1);
                addr = op1->gtGetOp1();
            }

            switch (addr->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = addr->AsLclVarCommon()->GetLclNum();
                    offset = 0;
                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_R_C(ins, attr, targetReg, addr->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    if (memIndir == nullptr)
                    {
                        // This is the HW intrinsic load case.
                        // Until we improve the handling of addressing modes in the emitter, we'll create a
                        // temporary GT_IND to generate code with.
                        GenTreeIndir load = indirForm(op1->TypeGet(), addr);
                        memIndir          = &load;
                    }
                    emit->emitIns_R_A(ins, attr, targetReg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op1->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op1->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op1->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                {
                    unreached();
                    break;
                }
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_R_S(ins, attr, targetReg, varNum, offset);
    }
    else
    {
        regNumber op1Reg = op1->gtRegNum;
        emit->emitIns_R_R(ins, attr, targetReg, op1Reg);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_RM_I: Generates the code for a hardware intrinsic node that takes a register/memory operand,
//                        an immediate operand, and that returns a value in a register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    ival - The immediate value
//
void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
    emitter*  emit       = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    assert(targetReg != REG_NA);
    assert(!node->OperIsCommutative()); // One operand intrinsics cannot be commutative

    if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
    }
    inst_RV_TT_IV(ins, simdSize, targetReg, op1, ival);
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
//                        register/memory operand, and that returns a value in a register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    attr - The emit attribute for the instruction being generated
//
void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
{
    regNumber targetReg = node->gtRegNum;
    GenTree*  op1       = node->gtGetOp1();
    GenTree*  op2       = node->gtGetOp2();
    regNumber op1Reg    = op1->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2);
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
//                        register/memory operand, and that returns a value in a register
//
// Arguments:
//    node      - The hardware intrinsic node
//    ins       - The instruction being generated
//    attr      - The emit attribute for the instruction being generated
//    targetReg - The register allocated to the result
//    op1Reg    - The register allocated to the first operand
//    op2       - Another operand that may be in a register or in memory
//
void CodeGen::genHWIntrinsic_R_R_RM(
    GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2)
{
    emitter* emit = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->isIndir() || op2->OperIsHWIntrinsic())
        {
            GenTree*      addr;
            GenTreeIndir* memIndir = nullptr;

            if (op2->isIndir())
            {
                memIndir = op2->AsIndir();
                addr     = memIndir->Addr();
            }
            else
            {
                assert(op2->AsHWIntrinsic()->OperIsMemoryLoad());
                assert(HWIntrinsicInfo::lookupNumArgs(op2->AsHWIntrinsic()) == 1);
                addr = op2->gtGetOp1();
            }

            switch (addr->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = addr->AsLclVarCommon()->GetLclNum();
                    offset = 0;
                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C(ins, attr, targetReg, op1Reg, addr->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    if (memIndir == nullptr)
                    {
                        // This is the HW intrinsic load case.
                        // Until we improve the handling of addressing modes in the emitter, we'll create a
                        // temporary GT_IND to generate code with.
                        GenTreeIndir load = indirForm(op2->TypeGet(), addr);
                        memIndir          = &load;
                    }
                    emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S(ins, attr, targetReg, op1Reg, varNum, offset);
    }
    else
    {
        regNumber op2Reg = op2->gtRegNum;

        if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        {
            // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
            //
            // For non-commutative intrinsics, we should have ensured that op2 was marked
            // delay free in order to prevent it from getting assigned the same register
            // as target. However, for commutative intrinsics, we can just swap the operands
            // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

            noway_assert(node->OperIsCommutative());
            op2Reg = op1Reg;
            op1Reg = targetReg;
        }

        emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
//                          register/memory operand, an immediate operand, and that returns a value in a register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    ival - The immediate value
//
void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    GenTree*  op2        = node->gtGetOp2();
    emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
    emitter*  emit       = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    if (op1->OperIsList())
    {
        assert(op2 == nullptr);

        GenTreeArgList* argList = op1->AsArgList();

        op1     = argList->Current();
        argList = argList->Rest();

        op2     = argList->Current();
        argList = argList->Rest();

        assert(argList->Current() != nullptr);
        assert(argList->Rest() == nullptr);
    }
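
    // This list-unpacking path is how a three-operand node (operands carried as a
    // GT_LIST in op1) reaches this handler; per the commit title, this covers the
    // variable-argument SIMDScalar case. The third list entry is the immediate
    // operand, which the caller has already folded into ival, so only its presence
    // is asserted here.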

    regNumber op1Reg = op1->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->isIndir() || op2->OperIsHWIntrinsic())
        {
            GenTree*      addr;
            GenTreeIndir* memIndir = nullptr;

            if (op2->isIndir())
            {
                memIndir = op2->AsIndir();
                addr     = memIndir->Addr();
            }
            else
            {
                assert(op2->AsHWIntrinsic()->OperIsMemoryLoad());
                assert(HWIntrinsicInfo::lookupNumArgs(op2->AsHWIntrinsic()) == 1);
                addr = op2->gtGetOp1();
            }

            switch (addr->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = addr->AsLclVarCommon()->GetLclNum();
                    offset = 0;
                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, addr->gtClsVar.gtClsVarHnd, 0, ival);
                    return;
                }

                default:
                {
                    if (memIndir == nullptr)
                    {
                        // This is the HW intrinsic load case.
                        // Until we improve the handling of addressing modes in the emitter, we'll create a
                        // temporary GT_IND to generate code with.
                        GenTreeIndir load = indirForm(op2->TypeGet(), addr);
                        memIndir          = &load;
                    }
                    emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
    }
    else
    {
        regNumber op2Reg = op2->gtRegNum;

        if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        {
            // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
            //
            // For non-commutative intrinsics, we should have ensured that op2 was marked
            // delay free in order to prevent it from getting assigned the same register
            // as target. However, for commutative intrinsics, we can just swap the operands
            // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

            noway_assert(node->OperIsCommutative());
            op2Reg = op1Reg;
            op1Reg = targetReg;
        }

        emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM_R: Generates the code for a hardware intrinsic node that takes a register operand, a
//                          register/memory operand, another register operand, and that returns a value in a register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//
void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    GenTree*  op2        = node->gtGetOp2();
    GenTree*  op3        = nullptr;
    emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
    emitter*  emit       = getEmitter();

    assert(op1->OperIsList());
    assert(op2 == nullptr);

    GenTreeArgList* argList = op1->AsArgList();

    op1     = argList->Current();
    argList = argList->Rest();

    op2     = argList->Current();
    argList = argList->Rest();

    op3 = argList->Current();
    assert(argList->Rest() == nullptr);

    regNumber op1Reg = op1->gtRegNum;
    regNumber op3Reg = op3->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op3Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            //                     pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->isIndir() || op2->OperIsHWIntrinsic())
        {
            GenTree*      addr;
            GenTreeIndir* memIndir = nullptr;

            if (op2->isIndir())
            {
                memIndir = op2->AsIndir();
                addr     = memIndir->Addr();
            }
            else
            {
                assert(op2->AsHWIntrinsic()->OperIsMemoryLoad());
                assert(HWIntrinsicInfo::lookupNumArgs(op2->AsHWIntrinsic()) == 1);
                addr = op2->gtGetOp1();
            }

            switch (addr->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = addr->AsLclVarCommon()->GetLclNum();
                    offset = 0;
                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C_R(ins, simdSize, targetReg, op1Reg, op3Reg, addr->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    if (memIndir == nullptr)
                    {
                        // This is the HW intrinsic load case.
                        // Until we improve the handling of addressing modes in the emitter, we'll create a
                        // temporary GT_IND to generate code with.
                        GenTreeIndir load = indirForm(op2->TypeGet(), addr);
                        memIndir          = &load;
                    }
                    emit->emitIns_SIMD_R_R_A_R(ins, simdSize, targetReg, op1Reg, op3Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S_R(ins, simdSize, targetReg, op1Reg, op3Reg, varNum, offset);
    }
    else
    {
        emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, op3Reg);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
//                          a register/memory operand, and that returns a value in a register
//
// Arguments:
//    ins       - The instruction being generated
//    attr      - The emit attribute
//    targetReg - The target register
//    op1Reg    - The register of the first operand
//    op2Reg    - The register of the second operand
//    op3       - The third operand
//
void CodeGen::genHWIntrinsic_R_R_R_RM(
    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
{
    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op2Reg != REG_NA);

    emitter* emit = getEmitter();

    if (op3->isContained() || op3->isUsedFromSpillTemp())
    {
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op3->isUsedFromSpillTemp())
        {
            assert(op3->IsRegOptional());

            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            //                     pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op3);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op3->isIndir() || op3->OperIsHWIntrinsic())
        {
            GenTree*      addr;
            GenTreeIndir* memIndir = nullptr;
            if (op3->isIndir())
            {
                memIndir = op3->AsIndir();
                addr     = memIndir->Addr();
            }
            else
            {
                assert(op3->AsHWIntrinsic()->OperIsMemoryLoad());
                assert(HWIntrinsicInfo::lookupNumArgs(op3->AsHWIntrinsic()) == 1);
                addr = op3->gtGetOp1();
            }

            switch (addr->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = addr->AsLclVarCommon()->GetLclNum();
                    offset = 0;
                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, addr->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    if (memIndir == nullptr)
                    {
                        // This is the HW intrinsic load case.
                        // Until we improve the handling of addressing modes in the emitter, we'll create a
                        // temporary GT_IND to generate code with.
                        GenTreeIndir load = indirForm(op3->TypeGet(), addr);
                        memIndir          = &load;
                    }
                    emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op3->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op3->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op3->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
    }
    else
    {
        emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
//                                   with a non-constant argument
//
// Arguments:
//    intrinsic      - intrinsic ID
//    nonConstImmReg - the register that contains the non-constant imm8 argument
//    baseReg        - a register for the start of the switch table
//    offsReg        - a register for the offset into the switch table
//    emitSwCase     - the lambda to generate a switch case
//
// Note:
//    This function can be used for all imm-intrinsics (whether full-range or not).
//    The compiler front-end (i.e. the importer) is responsible for inserting a range-check IR
//    (GT_HW_INTRINSIC_CHK) for the imm8 argument, so this function does not need to do a range-check.
//
template <typename HWIntrinsicSwitchCaseBody>
void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsic,
                                              regNumber                 nonConstImmReg,
                                              regNumber                 baseReg,
                                              regNumber                 offsReg,
                                              HWIntrinsicSwitchCaseBody emitSwCase)
{
    assert(nonConstImmReg != REG_NA);
    // AVX2 Gather intrinsics use a managed non-const fallback since they have a discrete imm8
    // value range that does not work with the compiler-generated jump-table fallback here.
    assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
    emitter* emit = getEmitter();

    const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
    assert(maxByte <= 256);
    BasicBlock* jmpTable[256];

    unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
    unsigned jmpTableOffs = 0;

    // Emit the jump table
    for (unsigned i = 0; i < maxByte; i++)
    {
        jmpTable[i] = genCreateTempLabel();
        emit->emitDataGenData(i, jmpTable[i]);
    }

    emit->emitDataGenEnd();

    // Compute and jump to the appropriate offset in the switch table
    emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);

    emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
    emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
    emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
    emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);

    // Emit the switch table entries

    BasicBlock* switchTableBeg = genCreateTempLabel();
    BasicBlock* switchTableEnd = genCreateTempLabel();

    genDefineTempLabel(switchTableBeg);

    for (unsigned i = 0; i < maxByte; i++)
    {
        genDefineTempLabel(jmpTable[i]);
        emitSwCase((int8_t)i);
        emit->emitIns_J(INS_jmp, switchTableEnd);
    }

    genDefineTempLabel(switchTableEnd);
}
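
// A minimal sketch of the code shape produced above, assuming a three-entry table
// (illustrative only; actual labels and encodings are chosen by the emitter):
//
//       lea   offsReg, [jumpTableData]                        ; start of the offset table
//       mov   offsReg32, dword ptr [offsReg + nonConstImmReg * 4]
//       lea   baseReg, [methodBase]                           ; base the offsets are relative to
//       add   offsReg, baseReg
//       jmp   offsReg
//   L0: <emitSwCase(0)>
//       jmp   end
//   L1: <emitSwCase(1)>
//       jmp   end
//   L2: <emitSwCase(2)>
//       jmp   end
//   end: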

//------------------------------------------------------------------------
// genBaseIntrinsic: Generates the code for a base hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
// Note:
//    We currently assume that all base intrinsics have zero or one operand.
//
void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    var_types      targetType  = node->TypeGet();
    var_types      baseType    = node->gtSIMDBaseType;

    assert(compiler->compSupports(InstructionSet_SSE));
    assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));

    GenTree* op1 = node->gtGetOp1();

    genConsumeHWIntrinsicOperands(node);
    regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->gtRegNum;

    assert(node->gtGetOp2() == nullptr);

    emitter*    emit = getEmitter();
    emitAttr    attr = EA_ATTR(node->gtSIMDSize);
    instruction ins  = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

    switch (intrinsicId)
    {
        case NI_Vector128_CreateScalarUnsafe:
        case NI_Vector256_CreateScalarUnsafe:
        {
            if (varTypeIsIntegral(baseType))
            {
                genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType));
            }
            else
            {
                assert(varTypeIsFloating(baseType));

                attr = emitTypeSize(baseType);

                if (op1->isContained() || op1->isUsedFromSpillTemp())
                {
                    genHWIntrinsic_R_RM(node, ins, attr);
                }
                else if (targetReg != op1Reg)
                {
                    // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
                    emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
                }
            }
            break;
        }

        case NI_Vector128_ToScalar:
        case NI_Vector256_ToScalar:
        {
            assert(varTypeIsFloating(baseType));

            attr = emitTypeSize(TYP_SIMD16);

            if (op1->isContained() || op1->isUsedFromSpillTemp())
            {
                genHWIntrinsic_R_RM(node, ins, attr);
            }
            else if (targetReg != op1Reg)
            {
                // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
                emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
            }
            break;
        }

        case NI_Vector128_ToVector256:
        {
            // ToVector256 has zero-extend semantics in order to ensure it is deterministic.
            // We always emit a move to the target register, even when op1Reg == targetReg,
            // in order to ensure that bits MAXVL-1:128 are zeroed.

            attr = emitTypeSize(TYP_SIMD16);

            if (op1->isContained() || op1->isUsedFromSpillTemp())
            {
                genHWIntrinsic_R_RM(node, ins, attr);
            }
            else
            {
                // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
                emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
            }
            break;
        }
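
        // (Under the VEX encoding, a 128-bit register-to-register vmovaps zeroes bits
        // MAXVL-1:128 of the destination, which is why the case above emits the move
        // even when op1Reg == targetReg.)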
1341
1342         case NI_Vector128_ToVector256Unsafe:
1343         case NI_Vector256_GetLower:
1344         {
1345             if (op1->isContained() || op1->isUsedFromSpillTemp())
1346             {
1347                 genHWIntrinsic_R_RM(node, ins, attr);
1348             }
1349             else if (targetReg != op1Reg)
1350             {
1351                 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1352                 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1353             }
1354             break;
1355         }
1356
1357         case NI_Vector128_Zero:
1358         case NI_Vector256_Zero:
1359         {
1360             assert(op1 == nullptr);
1361             emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
1362             break;
1363         }
1364
1365         default:
1366         {
1367             unreached();
1368             break;
1369         }
1370     }
1371
1372     genProduceReg(node);
1373 }
1374
1375 //------------------------------------------------------------------------
1376 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
1377 //
1378 // Arguments:
1379 //    node - The hardware intrinsic node
1380 //
1381 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
1382 {
1383     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1384     GenTree*       op1         = node->gtGetOp1();
1385     GenTree*       op2         = node->gtGetOp2();
1386     GenTree*       op3         = nullptr;
1387     GenTree*       op4         = nullptr;
1388     regNumber      targetReg   = node->gtRegNum;
1389     var_types      targetType  = node->TypeGet();
1390     var_types      baseType    = node->gtSIMDBaseType;
1391
1392     regNumber op1Reg = REG_NA;
1393     regNumber op2Reg = REG_NA;
1394     regNumber op3Reg = REG_NA;
1395     regNumber op4Reg = REG_NA;
1396     emitter*  emit   = getEmitter();
1397
1398     genConsumeHWIntrinsicOperands(node);
1399
1400     switch (intrinsicId)
1401     {
1402         case NI_SSE_CompareScalarOrderedEqual:
1403         case NI_SSE_CompareScalarUnorderedEqual:
1404         {
1405             assert(baseType == TYP_FLOAT);
1406             regNumber   tmpReg = node->GetSingleTempReg();
1407             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1408
1409             // Ensure we aren't overwriting targetReg
1410             assert(tmpReg != targetReg);
1411
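                 // comiss/ucomiss set ZF=1, PF=0 for equal and ZF=PF=CF=1 for unordered (NaN),
                 // so the equality result is computed as (sete AND setnp).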
1412             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1413             emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1414             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1415             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1416             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1417             break;
1418         }
1419
1420         case NI_SSE_CompareScalarOrderedGreaterThan:
1421         case NI_SSE_CompareScalarUnorderedGreaterThan:
1422         {
1423             assert(baseType == TYP_FLOAT);
1424             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1425
1426             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1427             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1428             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1429             break;
1430         }
1431
1432         case NI_SSE_CompareScalarOrderedGreaterThanOrEqual:
1433         case NI_SSE_CompareScalarUnorderedGreaterThanOrEqual:
1434         {
1435             assert(baseType == TYP_FLOAT);
1436             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1437
1438             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1439             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1440             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1441             break;
1442         }
1443
1444         case NI_SSE_CompareScalarOrderedLessThan:
1445         case NI_SSE_CompareScalarUnorderedLessThan:
1446         {
1447             assert(baseType == TYP_FLOAT);
1448             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1449
1450             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1451             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1452             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1453             break;
1454         }
1455
1456         case NI_SSE_CompareScalarOrderedLessThanOrEqual:
1457         case NI_SSE_CompareScalarUnorderedLessThanOrEqual:
1458         {
1459             assert(baseType == TYP_FLOAT);
1460             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1461
1462             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1463             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1464             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1465             break;
1466         }
1467
1468         case NI_SSE_CompareScalarOrderedNotEqual:
1469         case NI_SSE_CompareScalarUnorderedNotEqual:
1470         {
1471             assert(baseType == TYP_FLOAT);
1472             regNumber   tmpReg = node->GetSingleTempReg();
1473             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1474
1475             // Ensure we aren't overwriting targetReg
1476             assert(tmpReg != targetReg);
1477
1478             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1479             emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1480             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1481             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1482             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1483             break;
1484         }
1485
1486         case NI_SSE_X64_ConvertToInt64:
1487         case NI_SSE_X64_ConvertToInt64WithTruncation:
1488         {
1489             assert(targetType == TYP_LONG);
1490             assert(op1 != nullptr);
1491             assert(op2 == nullptr);
1492             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1493             genHWIntrinsic_R_RM(node, ins, EA_8BYTE);
1494             break;
1495         }
1496
1497         case NI_SSE_X64_ConvertScalarToVector128Single:
1498         {
1499             assert(baseType == TYP_LONG);
1500             assert(op1 != nullptr);
1501             assert(op2 != nullptr);
1502             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1503             genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1504             break;
1505         }
1506
1507         case NI_SSE_Prefetch0:
1508         case NI_SSE_Prefetch1:
1509         case NI_SSE_Prefetch2:
1510         case NI_SSE_PrefetchNonTemporal:
1511         {
1512             assert(baseType == TYP_UBYTE);
1513             assert(op2 == nullptr);
1514
1515             // These do not support containment.
1516             assert(!op1->isContained());
1517             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1518             op1Reg          = op1->gtRegNum;
1519             emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
1520             break;
1521         }
1522
1523         case NI_SSE_StoreFence:
1524         {
1525             assert(baseType == TYP_VOID);
1526             assert(op1 == nullptr);
1527             assert(op2 == nullptr);
1528             emit->emitIns(INS_sfence);
1529             break;
1530         }
1531
1532         default:
1533             unreached();
1534             break;
1535     }
1536
1537     genProduceReg(node);
1538 }
1539
1540 //------------------------------------------------------------------------
1541 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
1542 //
1543 // Arguments:
1544 //    node - The hardware intrinsic node
1545 //
1546 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
1547 {
1548     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1549     GenTree*       op1         = node->gtGetOp1();
1550     GenTree*       op2         = node->gtGetOp2();
1551     regNumber      targetReg   = node->gtRegNum;
1552     var_types      targetType  = node->TypeGet();
1553     var_types      baseType    = node->gtSIMDBaseType;
1554     regNumber      op1Reg      = REG_NA;
1555     regNumber      op2Reg      = REG_NA;
1556     emitter*       emit        = getEmitter();
1557
1558     genConsumeHWIntrinsicOperands(node);
1559
1560     switch (intrinsicId)
1561     {
1562         // All integer overloads are handled by table codegen
1563         case NI_SSE2_CompareLessThan:
1564         {
1565             assert(op1 != nullptr);
1566             assert(op2 != nullptr);
1567
1568             assert(baseType == TYP_DOUBLE);
1569
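                 // The double overload is emitted as a compare instruction that takes its
                 // comparison predicate as an immediate (ival), looked up from the intrinsic table.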
1570             int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
1571             assert((ival >= 0) && (ival <= 127));
1572
1573             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1574             op1Reg          = op1->gtRegNum;
1575             op2Reg          = op2->gtRegNum;
1576             emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
1577
1578             break;
1579         }
1580
1581         case NI_SSE2_CompareScalarOrderedEqual:
1582         case NI_SSE2_CompareScalarUnorderedEqual:
1583         {
1584             assert(baseType == TYP_DOUBLE);
1585             regNumber   tmpReg = node->GetSingleTempReg();
1586             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1587
1588             // Ensure we aren't overwriting targetReg
1589             assert(tmpReg != targetReg);
1590
1591             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1592             emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1593             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1594             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1595             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1596             break;
1597         }
1598
1599         case NI_SSE2_CompareScalarOrderedGreaterThan:
1600         case NI_SSE2_CompareScalarUnorderedGreaterThan:
1601         {
1602             assert(baseType == TYP_DOUBLE);
1603             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1604
1605             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1606             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1607             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1608             break;
1609         }
1610
1611         case NI_SSE2_CompareScalarOrderedGreaterThanOrEqual:
1612         case NI_SSE2_CompareScalarUnorderedGreaterThanOrEqual:
1613         {
1614             assert(baseType == TYP_DOUBLE);
1615             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1616
1617             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1618             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1619             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1620             break;
1621         }
1622
1623         case NI_SSE2_CompareScalarOrderedLessThan:
1624         case NI_SSE2_CompareScalarUnorderedLessThan:
1625         {
1626             assert(baseType == TYP_DOUBLE);
1627             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1628
1629             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1630             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1631             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1632             break;
1633         }
1634
1635         case NI_SSE2_CompareScalarOrderedLessThanOrEqual:
1636         case NI_SSE2_CompareScalarUnorderedLessThanOrEqual:
1637         {
1638             assert(baseType == TYP_DOUBLE);
1639             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1640
1641             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1642             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1643             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1644             break;
1645         }
1646
1647         case NI_SSE2_CompareScalarOrderedNotEqual:
1648         case NI_SSE2_CompareScalarUnorderedNotEqual:
1649         {
1650             assert(baseType == TYP_DOUBLE);
1651             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1652             regNumber   tmpReg = node->GetSingleTempReg();
1653
1654             // Ensure we aren't overwriting targetReg
1655             assert(tmpReg != targetReg);
1656
1657             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1658             emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1659             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1660             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1661             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1662             break;
1663         }
1664
1665         case NI_SSE2_X64_ConvertScalarToVector128Double:
1666         {
1667             assert(baseType == TYP_LONG);
1668             assert(op1 != nullptr);
1669             assert(op2 != nullptr);
1670             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1671             genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1672             break;
1673         }
1674
1675         case NI_SSE2_X64_ConvertScalarToVector128Int64:
1676         case NI_SSE2_X64_ConvertScalarToVector128UInt64:
1677         {
1678             assert(baseType == TYP_LONG || baseType == TYP_ULONG);
1679             assert(op1 != nullptr);
1680             assert(op2 == nullptr);
1681             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1682             genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType));
1683             break;
1684         }
1685
1686         case NI_SSE2_ConvertToInt32:
1687         case NI_SSE2_ConvertToInt32WithTruncation:
1688         case NI_SSE2_ConvertToUInt32:
1689         case NI_SSE2_X64_ConvertToInt64:
1690         case NI_SSE2_X64_ConvertToInt64WithTruncation:
1691         case NI_SSE2_X64_ConvertToUInt64:
1692         {
1693             assert(op2 == nullptr);
1694             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1695
1696             if (varTypeIsIntegral(baseType))
1697             {
1698                 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1699                 op1Reg = op1->gtRegNum;
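                     // This emits a movd/movq out of the vector register; note that the emitter
                     // expects the xmm source as the first register operand for this instruction.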
1700                 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1701             }
1702             else
1703             {
1704                 assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT);
1705                 genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType));
1706             }
1707             break;
1708         }
1709
1710         case NI_SSE2_LoadFence:
1711         {
1712             assert(baseType == TYP_VOID);
1713             assert(op1 == nullptr);
1714             assert(op2 == nullptr);
1715             emit->emitIns(INS_lfence);
1716             break;
1717         }
1718
1719         case NI_SSE2_MemoryFence:
1720         {
1721             assert(baseType == TYP_VOID);
1722             assert(op1 == nullptr);
1723             assert(op2 == nullptr);
1724             emit->emitIns(INS_mfence);
1725             break;
1726         }
1727
1728         case NI_SSE2_StoreNonTemporal:
1729         case NI_SSE2_X64_StoreNonTemporal:
1730         {
1731             assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1732             assert(op1 != nullptr);
1733             assert(op2 != nullptr);
1734
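                 // Non-temporal stores need a memory destination, so build a temporary
                 // GT_STOREIND for the emitter, mirroring the GT_IND trick used for loads.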
1735             instruction     ins   = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1736             GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
1737             emit->emitInsStoreInd(ins, emitTypeSize(baseType), &store);
1738             break;
1739         }
1740
1741         default:
1742             unreached();
1743             break;
1744     }
1745
1746     genProduceReg(node);
1747 }
1748
1749 //------------------------------------------------------------------------
1750 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1751 //
1752 // Arguments:
1753 //    node - The hardware intrinsic node
1754 //
1755 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1756 {
1757     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1758     GenTree*       op1         = node->gtGetOp1();
1759     GenTree*       op2         = node->gtGetOp2();
1760     GenTree*       op3         = nullptr;
1761     GenTree*       op4         = nullptr;
1762     regNumber      targetReg   = node->gtRegNum;
1763     var_types      targetType  = node->TypeGet();
1764     var_types      baseType    = node->gtSIMDBaseType;
1765
1766     regNumber op1Reg = REG_NA;
1767     regNumber op2Reg = REG_NA;
1768     regNumber op3Reg = REG_NA;
1769     regNumber op4Reg = REG_NA;
1770     emitter*  emit   = getEmitter();
1771
1772     genConsumeHWIntrinsicOperands(node);
1773
1774     switch (intrinsicId)
1775     {
1776         case NI_SSE41_ConvertToVector128Int16:
1777         case NI_SSE41_ConvertToVector128Int32:
1778         case NI_SSE41_ConvertToVector128Int64:
1779         {
1780             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1781
1782             if (!varTypeIsSIMD(op1->gtType))
1783             {
1784                 // Until we improve the handling of addressing modes in the emitter, we'll create a
1785                 // temporary GT_IND to generate code with.
1786                 GenTreeIndir load = indirForm(node->TypeGet(), op1);
1787                 emit->emitInsLoadInd(ins, emitTypeSize(TYP_SIMD16), node->gtRegNum, &load);
1788             }
1789             else
1790             {
1791                 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1792             }
1793             break;
1794         }
1795
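             // For the three ptest-based helpers below: ptest sets ZF when the AND of its two
             // operands is zero and CF when the AND-NOT is zero, so TestZ, TestC, and
             // TestNotZAndNotC reduce to sete, setb, and seta on those flags, each followed by
             // a movzx to widen the byte result.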
1796         case NI_SSE41_TestZ:
1797         {
1798             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1799             genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1800             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1801             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1802             break;
1803         }
1804
1805         case NI_SSE41_TestC:
1806         {
1807             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1808             genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1809             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1810             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1811             break;
1812         }
1813
1814         case NI_SSE41_TestNotZAndNotC:
1815         {
1816             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1817             genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1818             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1819             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1820             break;
1821         }
1822
1823         case NI_SSE41_Extract:
1824         case NI_SSE41_X64_Extract:
1825         {
1826             regNumber   tmpTargetReg = REG_NA;
1827             instruction ins          = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1828             if (baseType == TYP_FLOAT)
1829             {
1830                 tmpTargetReg = node->ExtractTempReg();
1831             }
1832
1833             auto emitSwCase = [&](int8_t i) {
1834                 if (baseType == TYP_FLOAT)
1835                 {
1836                     // The extract instructions write to a GP register, so use the int size as the emit size
1837                     inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i);
1838                     emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
1839                 }
1840                 else
1841                 {
1842                     inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i);
1843                 }
1844             };
1845
1846             if (op2->IsCnsIntOrI())
1847             {
1848                 ssize_t ival = op2->AsIntCon()->IconValue();
1849                 assert((ival >= 0) && (ival <= 255));
1850                 emitSwCase((int8_t)ival);
1851             }
1852             else
1853             {
1854                 // We emit a fallback case for the scenario when the imm-op is not a constant. This typically
1855                 // happens when the intrinsic is called indirectly, such as via Reflection. However, it can
1856                 // also occur if the consumer calls it directly and just doesn't pass a constant value.
1857                 regNumber baseReg = node->ExtractTempReg();
1858                 regNumber offsReg = node->GetSingleTempReg();
1859                 genHWIntrinsicJumpTableFallback(intrinsicId, op2->gtRegNum, baseReg, offsReg, emitSwCase);
1860             }
1861             break;
1862         }
1863
1864         default:
1865             unreached();
1866             break;
1867     }
1868
1869     genProduceReg(node);
1870 }
1871
1872 //------------------------------------------------------------------------
1873 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1874 //
1875 // Arguments:
1876 //    node - The hardware intrinsic node
1877 //
1878 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
1879 {
1880     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1881     regNumber      targetReg   = node->gtRegNum;
1882     GenTree*       op1         = node->gtGetOp1();
1883     GenTree*       op2         = node->gtGetOp2();
1884     var_types      baseType    = node->gtSIMDBaseType;
1885     var_types      targetType  = node->TypeGet();
1886     emitter*       emit        = getEmitter();
1887
1888     genConsumeHWIntrinsicOperands(node);
1889     regNumber op1Reg = op1->gtRegNum;
1890
1891     assert(targetReg != REG_NA);
1892     assert(op1Reg != REG_NA);
1893     assert(op2 != nullptr);
1894     assert(!node->OperIsCommutative());
1895
1896     switch (intrinsicId)
1897     {
1898         case NI_SSE42_Crc32:
1899         case NI_SSE42_X64_Crc32:
1900         {
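                 // crc32 is read-modify-write on its destination register, so the running
                 // CRC (op1) must be in targetReg before op2 is folded in below.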
1901             if (op1Reg != targetReg)
1902             {
1903                 assert(op2->gtRegNum != targetReg);
1904                 emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg);
1905             }
1906
1907             // This makes the genHWIntrinsic_R_RM code much simpler, as we don't need an
1908             // overload that explicitly takes the operands.
1909             node->gtOp1 = op2;
1910             node->gtOp2 = nullptr;
1911
1912             if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument
1913             {
1914                 assert(targetType == TYP_INT);
1915                 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType));
1916             }
1917             else
1918             {
1919                 assert(op1->TypeGet() == op2->TypeGet());
1920                 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
1921                 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType));
1922             }
1923
1924             break;
1925         }
1926
1927         default:
1928         {
1929             unreached();
1930             break;
1931         }
1932     }
1933
1934     genProduceReg(node);
1935 }
1936
1937 //------------------------------------------------------------------------
1938 // genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
1939 //
1940 // Arguments:
1941 //    node - The hardware intrinsic node
1942 //
1943 void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
1944 {
1945     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1946     var_types      baseType    = node->gtSIMDBaseType;
1947     emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
1948     var_types      targetType  = node->TypeGet();
1949     instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1950     int            numArgs     = HWIntrinsicInfo::lookupNumArgs(node);
1951     GenTree*       op1         = node->gtGetOp1();
1952     GenTree*       op2         = node->gtGetOp2();
1953     regNumber      op1Reg      = REG_NA;
1954     regNumber      op2Reg      = REG_NA;
1955     regNumber      targetReg   = node->gtRegNum;
1956     emitter*       emit        = getEmitter();
1957
1958     genConsumeHWIntrinsicOperands(node);
1959
1960     switch (intrinsicId)
1961     {
1962         case NI_AVX2_ConvertToInt32:
1963         case NI_AVX2_ConvertToUInt32:
1964         {
1965             op1Reg = op1->gtRegNum;
1966             assert(numArgs == 1);
1967             assert((baseType == TYP_INT) || (baseType == TYP_UINT));
1968             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
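                 // As with the SSE2 converts, the emitter expects the xmm source as the
                 // first register operand here.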
1969             emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1970             break;
1971         }
1972
1973         case NI_AVX2_ConvertToVector256Int16:
1974         case NI_AVX2_ConvertToVector256Int32:
1975         case NI_AVX2_ConvertToVector256Int64:
1976         {
1977             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1978
1979             if (!varTypeIsSIMD(op1->gtType))
1980             {
1981                 // Until we improve the handling of addressing modes in the emitter, we'll create a
1982                 // temporary GT_IND to generate code with.
1983                 GenTreeIndir load = indirForm(node->TypeGet(), op1);
1984                 emit->emitInsLoadInd(ins, emitTypeSize(TYP_SIMD32), node->gtRegNum, &load);
1985             }
1986             else
1987             {
1988                 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD32));
1989             }
1990             break;
1991         }
1992
1993         case NI_AVX2_GatherVector128:
1994         case NI_AVX2_GatherVector256:
1995         case NI_AVX2_GatherMaskVector128:
1996         case NI_AVX2_GatherMaskVector256:
1997         {
1998             GenTreeArgList* list = op1->AsArgList();
1999             op1                  = list->Current();
2000             op1Reg               = op1->gtRegNum;
2001
2002             list   = list->Rest();
2003             op2    = list->Current();
2004             op2Reg = op2->gtRegNum;
2005
2006             list         = list->Rest();
2007             GenTree* op3 = list->Current();
2008
2009             list             = list->Rest();
2010             GenTree* op4     = nullptr;
2011             GenTree* lastOp  = nullptr;
2012             GenTree* indexOp = nullptr;
2013
2014             regNumber op3Reg       = REG_NA;
2015             regNumber op4Reg       = REG_NA;
2016             regNumber addrBaseReg  = REG_NA;
2017             regNumber addrIndexReg = REG_NA;
2018             regNumber maskReg      = node->ExtractTempReg(RBM_ALLFLOAT);
2019
2020             if (numArgs == 5)
2021             {
2022                 assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
2023                 op4          = list->Current();
2024                 list         = list->Rest();
2025                 lastOp       = list->Current();
2026                 op3Reg       = op3->gtRegNum;
2027                 op4Reg       = op4->gtRegNum;
2028                 addrBaseReg  = op2Reg;
2029                 addrIndexReg = op3Reg;
2030                 indexOp      = op3;
2031
2032                 // Copy op4Reg into the temporary mask register, since the gather
2033                 // instructions clear the mask register as they execute.
2034                 emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);
2035
2036                 if (targetReg != op1Reg)
2037                 {
2038                     // copy source vector to the target register for masking merge
2039                     emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
2040                 }
2041             }
2042             else
2043             {
2044                 assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
2045                 addrBaseReg  = op1Reg;
2046                 addrIndexReg = op2Reg;
2047                 indexOp      = op2;
2048                 lastOp       = op3;
2049
2050                 // generate an all-ones mask vector
2051                 emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
2052             }
2053
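                 // A Vector128 gather result can still take a Vector256 of long indices
                 // (e.g. vgatherqps); in that case the address-mode attr must be widened to
                 // SIMD32 below so that a YMM index register is encoded.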
2054             bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);
2055
2056             // hwintrinsiclistxarch.h uses dword-index instructions by default
2057             if (varTypeIsLong(node->gtIndexBaseType))
2058             {
2059                 switch (ins)
2060                 {
2061                     case INS_vpgatherdd:
2062                         ins = INS_vpgatherqd;
2063                         if (isVector128GatherWithVector256Index)
2064                         {
2065                             // YMM index in address mode
2066                             attr = emitTypeSize(TYP_SIMD32);
2067                         }
2068                         break;
2069                     case INS_vpgatherdq:
2070                         ins = INS_vpgatherqq;
2071                         break;
2072                     case INS_vgatherdps:
2073                         ins = INS_vgatherqps;
2074                         if (isVector128GatherWithVector256Index)
2075                         {
2076                             // YMM index in address mode
2077                             attr = emitTypeSize(TYP_SIMD32);
2078                         }
2079                         break;
2080                     case INS_vgatherdpd:
2081                         ins = INS_vgatherqpd;
2082                         break;
2083                     default:
2084                         unreached();
2085                 }
2086             }
2087
2088             assert(lastOp->IsCnsIntOrI());
2089             ssize_t ival = lastOp->AsIntCon()->IconValue();
2090             assert((ival >= 0) && (ival <= 255));
2091
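                 // The gather encoding requires the destination, mask, and index registers to
                 // be pairwise distinct (otherwise the instruction raises #UD).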
2092             assert(targetReg != maskReg);
2093             assert(targetReg != addrIndexReg);
2094             assert(maskReg != addrIndexReg);
2095             emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);
2096
2097             break;
2098         }
2099
2100         case NI_AVX_TestC:
2101         {
2102             genHWIntrinsic_R_RM(node, ins, attr);
2103             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
2104             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2105             break;
2106         }
2107
2108         case NI_AVX_TestNotZAndNotC:
2109         {
2110             genHWIntrinsic_R_RM(node, ins, attr);
2111             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
2112             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2113             break;
2114         }
2115
2116         case NI_AVX_TestZ:
2117         {
2118             genHWIntrinsic_R_RM(node, ins, attr);
2119             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
2120             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2121             break;
2122         }
2123
2124         default:
2125             unreached();
2126             break;
2127     }
2128
2129     genProduceReg(node);
2130 }
2131
2132 //------------------------------------------------------------------------
2133 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
2134 //
2135 // Arguments:
2136 //    node - The hardware intrinsic node
2137 //
2138 void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
2139 {
2140     NYI("Implement AES intrinsic code generation");
2141 }
2142
2143 //------------------------------------------------------------------------
2144 // genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 and BMI2 hardware intrinsic node
2145 //
2146 // Arguments:
2147 //    node - The hardware intrinsic node
2148 //
2149 void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
2150 {
2151     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2152     regNumber      targetReg   = node->gtRegNum;
2153     GenTree*       op1         = node->gtGetOp1();
2154     GenTree*       op2         = node->gtGetOp2();
2155     var_types      targetType  = node->TypeGet();
2156     instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);
2157     emitter*       emit        = getEmitter();
2158
2159     assert(targetReg != REG_NA);
2160     assert(op1 != nullptr);
2161
2162     genConsumeHWIntrinsicOperands(node);
2163
2164     switch (intrinsicId)
2165     {
2166         case NI_BMI1_AndNot:
2167         case NI_BMI1_X64_AndNot:
2168         case NI_BMI1_BitFieldExtract:
2169         case NI_BMI1_X64_BitFieldExtract:
2170         case NI_BMI2_ParallelBitDeposit:
2171         case NI_BMI2_ParallelBitExtract:
2172         case NI_BMI2_X64_ParallelBitDeposit:
2173         case NI_BMI2_X64_ParallelBitExtract:
2174         case NI_BMI2_ZeroHighBits:
2175         case NI_BMI2_X64_ZeroHighBits:
2176         {
2177             assert(op2 != nullptr);
2178             assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2179             genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2180             break;
2181         }
2182
2183         case NI_BMI1_ExtractLowestSetBit:
2184         case NI_BMI1_GetMaskUpToLowestSetBit:
2185         case NI_BMI1_ResetLowestSetBit:
2186         case NI_BMI1_X64_ExtractLowestSetBit:
2187         case NI_BMI1_X64_GetMaskUpToLowestSetBit:
2188         case NI_BMI1_X64_ResetLowestSetBit:
2189         {
2190             assert(op2 == nullptr);
2191             assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2192             genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2193             break;
2194         }
2195
2196         case NI_BMI1_TrailingZeroCount:
2197         case NI_BMI1_X64_TrailingZeroCount:
2198         {
2199             assert(op2 == nullptr);
2200             assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2201             genXCNTIntrinsic(node, ins);
2202             break;
2203         }
2204
2205         case NI_BMI2_MultiplyNoFlags:
2206         case NI_BMI2_X64_MultiplyNoFlags:
2207         {
2208             int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
2209             assert(numArgs == 2 || numArgs == 3);
2210
2211             regNumber op1Reg = REG_NA;
2212             regNumber op2Reg = REG_NA;
2213             regNumber op3Reg = REG_NA;
2214             regNumber lowReg = REG_NA;
2215
2216             if (numArgs == 2)
2217             {
2218                 op1Reg = op1->gtRegNum;
2219                 op2Reg = op2->gtRegNum;
2220                 lowReg = targetReg;
2221             }
2222             else
2223             {
2224                 GenTreeArgList* argList = op1->AsArgList();
2225                 op1                     = argList->Current();
2226                 op1Reg                  = op1->gtRegNum;
2227                 argList                 = argList->Rest();
2228                 op2                     = argList->Current();
2229                 op2Reg                  = op2->gtRegNum;
2230                 argList                 = argList->Rest();
2231                 GenTree* op3            = argList->Current();
2232                 op3Reg                  = op3->gtRegNum;
2233                 assert(!op3->isContained());
2234                 assert(op3Reg != op1Reg);
2235                 assert(op3Reg != targetReg);
2236                 assert(op3Reg != REG_EDX);
2237                 lowReg = node->GetSingleTempReg();
2238                 assert(op3Reg != lowReg);
2239                 assert(lowReg != targetReg);
2240             }
2241
2242             // These do not support containment.
2243             assert(!op2->isContained());
2244             emitAttr attr = emitTypeSize(targetType);
2245             // Move the first operand into the implicit source register EDX/RDX
2246             if (op1Reg != REG_EDX)
2247             {
2248                 assert(op2Reg != REG_EDX);
2249                 emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg);
2250             }
2251
2252             // Generate the MULX: EDX/RDX * op2, high half to targetReg, low half to lowReg; flags are unaffected
2253             genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2);
2254
2255             // If the lower half of the result is needed, store it to the memory pointed to by op3
2256             if (numArgs == 3)
2257             {
2258                 emit->emitIns_AR_R(INS_mov, attr, lowReg, op3Reg, 0);
2259             }
2260
2261             break;
2262         }
2263
2264         default:
2265         {
2266             unreached();
2267             break;
2268         }
2269     }
2270
2271     genProduceReg(node);
2272 }
2273
2274 //------------------------------------------------------------------------
2275 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
2276 //
2277 // Arguments:
2278 //    node - The hardware intrinsic node
2279 //
2280 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
2281 {
2282     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2283     var_types      baseType    = node->gtSIMDBaseType;
2284     emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
2285     instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
2286     GenTree*       op1         = node->gtGetOp1();
2287     regNumber      targetReg   = node->gtRegNum;
2288
2289     assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);
2290
2291     genConsumeHWIntrinsicOperands(node);
2292     GenTreeArgList* argList = op1->AsArgList();
2293     op1                     = argList->Current();
2294
2295     argList      = argList->Rest();
2296     GenTree* op2 = argList->Current();
2297
2298     argList      = argList->Rest();
2299     GenTree* op3 = argList->Current();
2300
2301     regNumber op1Reg;
2302     regNumber op2Reg;
2303
2304     bool       isCommutative   = false;
2305     const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);
2306
2307     // Intrinsics with CopyUpperBits semantics cannot have op1 contained
2308     assert(!copiesUpperBits || !op1->isContained());
2309
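     // The table entry for each FMA intrinsic is its 213 form, and the 132/213/231
     // variants are laid out contiguously in the instruction enum, so the 132 and 231
     // forms below are selected with ins - 1 and ins + 1 respectively.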
2310     if (op3->isContained() || op3->isUsedFromSpillTemp())
2311     {
2312         // 213 form: op1 = (op2 * op1) + [op3]
2313
2314         op1Reg = op1->gtRegNum;
2315         op2Reg = op2->gtRegNum;
2316
2317         isCommutative = !copiesUpperBits;
2318     }
2319     else if (op2->isContained() || op2->isUsedFromSpillTemp())
2320     {
2321         // 132 form: op1 = (op1 * op3) + [op2]
2322
2323         ins    = (instruction)(ins - 1);
2324         op1Reg = op1->gtRegNum;
2325         op2Reg = op3->gtRegNum;
2326         op3    = op2;
2327     }
2328     else if (op1->isContained() || op1->isUsedFromSpillTemp())
2329     {
2330         // 231 form: op3 = (op2 * op3) + [op1]
2331
2332         ins    = (instruction)(ins + 1);
2333         op1Reg = op3->gtRegNum;
2334         op2Reg = op2->gtRegNum;
2335         op3    = op1;
2336     }
2337     else
2338     {
2339         // 213 form: op1 = (op2 * op1) + op3
2340
2341         op1Reg = op1->gtRegNum;
2342         op2Reg = op2->gtRegNum;
2343
2344         isCommutative = !copiesUpperBits;
2345     }
2346
2347     if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
2348     {
2349         assert(node->isRMWHWIntrinsic(compiler));
2350
2351         // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
2352         //
2353         // For non-commutative intrinsics, we should have ensured that op2 was marked
2354         // delay free in order to prevent it from getting assigned the same register
2355         // as target. However, for commutative intrinsics, we can just swap the operands
2356         // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
2357
2358         op2Reg = op1Reg;
2359         op1Reg = targetReg;
2360     }
2361
2362     genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
2363     genProduceReg(node);
2364 }
2365
2366 //------------------------------------------------------------------------
2367 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
2368 //
2369 // Arguments:
2370 //    node - The hardware intrinsic node
2371 //
2372 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
2373 {
2374     assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount ||
2375            node->gtHWIntrinsicId == NI_LZCNT_X64_LeadingZeroCount);
2376
2377     genConsumeOperands(node);
2378     genXCNTIntrinsic(node, INS_lzcnt);
2379     genProduceReg(node);
2380 }
2381
2382 //------------------------------------------------------------------------
2383 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
2384 //
2385 // Arguments:
2386 //    node - The hardware intrinsic node
2387 //
2388 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
2389 {
2390     NYI("Implement PCLMULQDQ intrinsic code generation");
2391 }
2392
2393 //------------------------------------------------------------------------
2394 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
2395 //
2396 // Arguments:
2397 //    node - The hardware intrinsic node
2398 //
2399 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
2400 {
2401     assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount || node->gtHWIntrinsicId == NI_POPCNT_X64_PopCount);
2402
2403     genConsumeOperands(node);
2404     genXCNTIntrinsic(node, INS_popcnt);
2405     genProduceReg(node);
2406 }
2407
2408 //------------------------------------------------------------------------
2409 // genXCNTIntrinsic: Generates the code for a lzcnt/tzcnt/popcnt hardware intrinsic node and breaks false
2410 // dependencies on the target register
2411 //
2412 // Arguments:
2413 //    node - The hardware intrinsic node
2414 //    ins  - The instruction being generated
2415 //
2416 void CodeGen::genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins)
2417 {
2418     // LZCNT/TZCNT/POPCNT have a false dependency on the target register on Intel Sandy Bridge, Haswell, and Skylake
2419     // (POPCNT only) processors. Insert an `xor target, target` to break the dependency: the zeroing idiom is handled
2420     // during register renaming. Only do this when the target register is not also a source register.
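     // For example, for "popcnt eax, ecx" we first emit "xor eax, eax", since eax is
     // not a source of the popcnt.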
2421
2422     GenTree*  op1        = node->gtGetOp1();
2423     regNumber sourceReg1 = REG_NA;
2424     regNumber sourceReg2 = REG_NA;
2425
2426     if (!op1->isContained())
2427     {
2428         sourceReg1 = op1->gtRegNum;
2429     }
2430     else if (op1->isIndir())
2431     {
2432         GenTreeIndir* indir   = op1->AsIndir();
2433         GenTree*      memBase = indir->Base();
2434
2435         if (memBase != nullptr)
2436         {
2437             sourceReg1 = memBase->gtRegNum;
2438         }
2439
2440         if (indir->HasIndex())
2441         {
2442             sourceReg2 = indir->Index()->gtRegNum;
2443         }
2444     }
2445
2446     regNumber targetReg = node->gtRegNum;
2447     if ((targetReg != sourceReg1) && (targetReg != sourceReg2))
2448     {
2449         getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
2450     }
2451     genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2452 }
2453
2454 #endif // FEATURE_HW_INTRINSICS