// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX               Intel hardware intrinsic Code Generator                     XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif

#ifdef FEATURE_HW_INTRINSICS

#include "emit.h"
#include "codegen.h"
#include "sideeffects.h"
#include "lower.h"
#include "gcinfo.h"
#include "gcinfoencoder.h"

//------------------------------------------------------------------------
// assertIsContainableHWIntrinsicOp: Asserts that op is containable by node
//
// Arguments:
//    lowering - The lowering phase from the compiler
//    node     - The HWIntrinsic node that has the contained node
//    op       - The op that is contained
//
static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op)
{
#if DEBUG
    // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register allocation
    // logic. However, this check is still important due to the various containment rules that SIMD intrinsics follow.
    //
    // We use isContainable to track the special HWIntrinsic node containment rules (for things like LoadAligned and
    // LoadUnaligned) and we use the supportsRegOptional check to support general-purpose loads (both from stack
    // spillage and for isUsedFromMemory contained nodes, in the case where the register allocator decided to not
    // allocate a register in the first place).

    bool supportsRegOptional = false;
    bool isContainable       = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional);
    assert(isContainable || supportsRegOptional);
#endif // DEBUG
}

//------------------------------------------------------------------------
// genIsTableDrivenHWIntrinsic: Checks whether a hardware intrinsic can be handled by table-driven CodeGen
//
// Arguments:
//    intrinsicId - The hardware intrinsic ID
//    category    - The category of the hardware intrinsic
//
// Return Value:
//    returns true if this intrinsic can be table-driven in CodeGen
//
static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicCategory category)
{
    // TODO - add more categories to the table-driven framework
    // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen
    const bool tableDrivenCategory =
        (category != HW_Category_Special) && (category != HW_Category_Scalar) && (category != HW_Category_Helper);
    const bool tableDrivenFlag =
        !HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId) && !HWIntrinsicInfo::HasSpecialCodegen(intrinsicId);
    return tableDrivenCategory && tableDrivenFlag;
}

//------------------------------------------------------------------------
// genHWIntrinsic: Generates the code for a given hardware intrinsic node.
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic      intrinsicId = node->gtHWIntrinsicId;
    InstructionSet      isa         = HWIntrinsicInfo::lookupIsa(intrinsicId);
    HWIntrinsicCategory category    = HWIntrinsicInfo::lookupCategory(intrinsicId);
    int                 ival        = HWIntrinsicInfo::lookupIval(intrinsicId);
    int                 numArgs     = HWIntrinsicInfo::lookupNumArgs(node);

    assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));

    if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
    {
        GenTree*  op1        = node->gtGetOp1();
        GenTree*  op2        = node->gtGetOp2();
        regNumber targetReg  = node->gtRegNum;
        var_types targetType = node->TypeGet();
        var_types baseType   = node->gtSIMDBaseType;

        regNumber op1Reg = REG_NA;
        regNumber op2Reg = REG_NA;
        emitter*  emit   = getEmitter();

        assert(numArgs >= 0);
        instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
        assert(ins != INS_invalid);
        emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
        assert(simdSize != 0);

        switch (numArgs)
        {
            case 1:
            {
                if (node->OperIsMemoryLoad())
                {
                    genConsumeAddress(op1);
                    // Until we improve the handling of addressing modes in the emitter, we'll create a
                    // temporary GT_IND to generate code with.
                    GenTreeIndir load = indirForm(node->TypeGet(), op1);
                    emit->emitInsLoadInd(ins, simdSize, node->gtRegNum, &load);
                }
                else
                {
                    genConsumeRegs(op1);
                    op1Reg = op1->gtRegNum;

                    if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
                    {
                        emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
                    }
                    else if ((ival != -1) && varTypeIsFloating(baseType))
                    {
                        assert((ival >= 0) && (ival <= 127));
                        genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
                    }
                    else
                    {
                        genHWIntrinsic_R_RM(node, ins, simdSize);
                    }
                }
                break;
            }

            case 2:
            {
                if (category == HW_Category_MemoryStore)
                {
                    genConsumeAddress(op1);
                    genConsumeReg(op2);
                    // Until we improve the handling of addressing modes in the emitter, we'll create a
                    // temporary GT_STORE_IND to generate code with.
                    GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
                    emit->emitInsStoreInd(ins, simdSize, &store);
                    break;
                }
                genConsumeRegs(op1);
                genConsumeRegs(op2);

                op1Reg = op1->gtRegNum;
                op2Reg = op2->gtRegNum;

                if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
                {
                    // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
                    //
                    // For non-commutative intrinsics, we should have ensured that op2 was marked
                    // delay free in order to prevent it from getting assigned the same register
                    // as target. However, for commutative intrinsics, we can just swap the operands
                    // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
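                    //
                    // As an illustration (assuming a commutative op like paddd): "paddd reg2, reg1"
                    // computes the same value as "paddd reg1, reg2", so swapping op1Reg and op2Reg
                    // here preserves the result while keeping the destination in targetReg.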

                    noway_assert(node->OperIsCommutative());
                    op2Reg = op1Reg;
                    op1Reg = targetReg;
                }

                if ((ival != -1) && varTypeIsFloating(baseType))
                {
                    assert((ival >= 0) && (ival <= 127));
                    genHWIntrinsic_R_R_RM_I(node, ins, (int8_t)ival);
                }
                else if (category == HW_Category_MemoryLoad)
                {
                    // Get the address and the 'other' register.
                    GenTree*  addr;
                    regNumber otherReg;
                    if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
                    {
                        addr     = op1;
                        otherReg = op2Reg;
                    }
                    else
                    {
                        addr     = op2;
                        otherReg = op1Reg;
                    }
                    // Until we improve the handling of addressing modes in the emitter, we'll create a
                    // temporary GT_IND to generate code with.
                    GenTreeIndir load = indirForm(node->TypeGet(), addr);
                    genHWIntrinsic_R_R_RM(node, ins, simdSize, targetReg, otherReg, &load);
                }
                else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
                {
                    assert(ival == -1);

                    if (intrinsicId == NI_SSE2_Extract)
                    {
                        // The extract instructions return their result in a general-purpose register,
                        // so we need the int size as the emit size.
                        simdSize = emitTypeSize(TYP_INT);
                    }

                    auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };

                    if (op2->IsCnsIntOrI())
                    {
                        ssize_t ival = op2->AsIntCon()->IconValue();
                        assert((ival >= 0) && (ival <= 255));
                        emitSwCase((int8_t)ival);
                    }
                    else
                    {
                        // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                        // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                        // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                        regNumber baseReg = node->ExtractTempReg();
                        regNumber offsReg = node->GetSingleTempReg();
                        genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase);
                    }
                }
                else
                {
                    genHWIntrinsic_R_R_RM(node, ins, EA_ATTR(node->gtSIMDSize));
                }
                break;
            }

            case 3:
            {
                GenTreeArgList* argList = op1->AsArgList();
                op1                     = argList->Current();
                genConsumeRegs(op1);
                op1Reg = op1->gtRegNum;

                argList = argList->Rest();
                op2     = argList->Current();
                genConsumeRegs(op2);
                op2Reg = op2->gtRegNum;

                argList      = argList->Rest();
                GenTree* op3 = argList->Current();
                genConsumeRegs(op3);
                regNumber op3Reg = op3->gtRegNum;

                if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
                {
                    assert(ival == -1);

                    auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, i); };

                    if (op3->IsCnsIntOrI())
                    {
                        ssize_t ival = op3->AsIntCon()->IconValue();
                        assert((ival >= 0) && (ival <= 255));
                        emitSwCase((int8_t)ival);
                    }
                    else
                    {
                        // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                        // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                        // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                        regNumber baseReg = node->ExtractTempReg();
                        regNumber offsReg = node->GetSingleTempReg();
                        genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase);
                    }
                }
                else if (category == HW_Category_MemoryStore)
                {
                    if (intrinsicId == NI_AVX_MaskStore || intrinsicId == NI_AVX2_MaskStore)
                    {
                        emit->emitIns_AR_R_R(ins, simdSize, op2Reg, op3Reg, op1Reg, 0);
                    }
                    else
                    {
                        assert(intrinsicId == NI_SSE2_MaskMove);
                        assert(targetReg == REG_NA);

                        // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
                        if (op3Reg != REG_EDI)
                        {
                            emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
                        }
                        emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
                    }
                }
                else
                {
                    switch (intrinsicId)
                    {
                        case NI_SSE41_BlendVariable:
                        case NI_AVX_BlendVariable:
                        case NI_AVX2_BlendVariable:
                        {
                            genHWIntrinsic_R_R_RM_R(node, ins);
                            break;
                        }

                        default:
                        {
                            unreached();
                            break;
                        }
                    }
                }
                break;
            }

            default:
                unreached();
                break;
        }
        genProduceReg(node);
        return;
    }

    switch (isa)
    {
        case InstructionSet_Base:
            genBaseIntrinsic(node);
            break;
        case InstructionSet_SSE:
        case InstructionSet_SSE_X64:
            genSSEIntrinsic(node);
            break;
        case InstructionSet_SSE2:
        case InstructionSet_SSE2_X64:
            genSSE2Intrinsic(node);
            break;
        case InstructionSet_SSE41:
        case InstructionSet_SSE41_X64:
            genSSE41Intrinsic(node);
            break;
        case InstructionSet_SSE42:
        case InstructionSet_SSE42_X64:
            genSSE42Intrinsic(node);
            break;
        case InstructionSet_AVX:
        case InstructionSet_AVX2:
            genAvxOrAvx2Intrinsic(node);
            break;
        case InstructionSet_AES:
            genAESIntrinsic(node);
            break;
        case InstructionSet_BMI1:
        case InstructionSet_BMI1_X64:
        case InstructionSet_BMI2:
        case InstructionSet_BMI2_X64:
            genBMI1OrBMI2Intrinsic(node);
            break;
        case InstructionSet_FMA:
            genFMAIntrinsic(node);
            break;
        case InstructionSet_LZCNT:
        case InstructionSet_LZCNT_X64:
            genLZCNTIntrinsic(node);
            break;
        case InstructionSet_PCLMULQDQ:
            genPCLMULQDQIntrinsic(node);
            break;
        case InstructionSet_POPCNT:
        case InstructionSet_POPCNT_X64:
            genPOPCNTIntrinsic(node);
            break;
        default:
            unreached();
            break;
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_RM: Generates the code for a hardware intrinsic node that takes a
//                      register/memory operand and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    attr - The emit attribute for the instruction being generated
//
void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    GenTree*  op2        = node->gtGetOp2();
    emitter*  emit       = getEmitter();

    if (op2 != nullptr)
    {
        // The Compare*OrderedScalar and Compare*UnorderedScalar intrinsics come down this
        // code path. They are all MultiIns, as the return value comes from the flags and
        // we have two operands instead.

        assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId));
        assert(targetReg != REG_NA);

        targetReg = op1->gtRegNum;
        op1       = op2;
        op2       = nullptr;
    }
    else
    {
        assert(!node->OperIsCommutative());
    }

    assert(targetReg != REG_NA);
    assert(op2 == nullptr);

    if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op1->isUsedFromSpillTemp())
        {
            assert(op1->IsRegOptional());

            tmpDsc = getSpillTempDsc(op1);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op1->OperIsHWIntrinsic())
        {
            emit->emitIns_R_AR(ins, attr, targetReg, op1->gtGetOp1()->gtRegNum, 0);
            return;
        }
        else if (op1->isIndir())
        {
            GenTreeIndir* memIndir = op1->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_R_C(ins, attr, targetReg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    emit->emitIns_R_A(ins, attr, targetReg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op1->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op1->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op1->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                {
                    unreached();
                    break;
                }
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_R_S(ins, attr, targetReg, varNum, offset);
    }
    else
    {
        regNumber op1Reg = op1->gtRegNum;
        emit->emitIns_R_R(ins, attr, targetReg, op1Reg);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_RM_I: Generates the code for a hardware intrinsic node that takes a register/memory operand,
//                        an immediate operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    ival - The immediate value
//
void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
    emitter*  emit       = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    assert(targetReg != REG_NA);
    assert(!node->OperIsCommutative()); // One operand intrinsics cannot be commutative

    if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
    }
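    // inst_RV_TT_IV handles the operand-form dispatch itself, picking the reg/reg, reg/[mem], or
    // reg/local encoding (plus the imm8) based on where op1 actually lives.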
    inst_RV_TT_IV(ins, simdSize, targetReg, op1, ival);
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
//                        register/memory operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    attr - The emit attribute for the instruction being generated
//
void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
{
    regNumber targetReg = node->gtRegNum;
    GenTree*  op1       = node->gtGetOp1();
    GenTree*  op2       = node->gtGetOp2();
    regNumber op1Reg    = op1->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2);
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
//                        register/memory operand, and that returns a value in register
//
// Arguments:
//    node      - The hardware intrinsic node
//    ins       - The instruction being generated
//    attr      - The emit attribute for the instruction being generated
//    targetReg - The register allocated to the result
//    op1Reg    - The register allocated to the first operand
//    op2       - The second operand, which may be in a register or in memory
//
void CodeGen::genHWIntrinsic_R_R_RM(
    GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2)
{
    emitter* emit = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->OperIsHWIntrinsic())
        {
            GenTree* addr = op2->gtGetOp1();
            // Until we improve the handling of addressing modes in the emitter, we'll create a
            // temporary GT_IND to generate code with.
            GenTreeIndir load = indirForm(node->TypeGet(), addr);
            emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, &load);
            return;
        }
        else if (op2->isIndir())
        {
            GenTreeIndir* memIndir = op2->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C(ins, attr, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S(ins, attr, targetReg, op1Reg, varNum, offset);
    }
    else
    {
        regNumber op2Reg = op2->gtRegNum;

        if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        {
            // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
            //
            // For non-commutative intrinsics, we should have ensured that op2 was marked
            // delay free in order to prevent it from getting assigned the same register
            // as target. However, for commutative intrinsics, we can just swap the operands
            // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

            noway_assert(node->OperIsCommutative());
            op2Reg = op1Reg;
            op1Reg = targetReg;
        }

        emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
//                          register/memory operand, an immediate operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    ival - The immediate value
//
void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    GenTree*  op2        = node->gtGetOp2();
    emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
    emitter*  emit       = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    if (op1->OperIsList())
    {
        assert(op2 == nullptr);

        GenTreeArgList* argList = op1->AsArgList();

        op1     = argList->Current();
        argList = argList->Rest();

        op2     = argList->Current();
        argList = argList->Rest();

        assert(argList->Current() != nullptr);
        assert(argList->Rest() == nullptr);
    }

    regNumber op1Reg = op1->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->OperIsHWIntrinsic())
        {
            emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
            return;
        }
        else if (op2->isIndir())
        {
            GenTreeIndir* memIndir = op2->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
                                               ival);
                    return;
                }

                default:
                {
                    emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
    }
    else
    {
        regNumber op2Reg = op2->gtRegNum;

        if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        {
            // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
            //
            // For non-commutative intrinsics, we should have ensured that op2 was marked
            // delay free in order to prevent it from getting assigned the same register
            // as target. However, for commutative intrinsics, we can just swap the operands
            // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

            noway_assert(node->OperIsCommutative());
            op2Reg = op1Reg;
            op1Reg = targetReg;
        }

        emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM_R: Generates the code for a hardware intrinsic node that takes a register operand, a
//                          register/memory operand, another register operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//
void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins)
{
    var_types targetType = node->TypeGet();
    regNumber targetReg  = node->gtRegNum;
    GenTree*  op1        = node->gtGetOp1();
    GenTree*  op2        = node->gtGetOp2();
    GenTree*  op3        = nullptr;
    emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
    emitter*  emit       = getEmitter();

    assert(op1->OperIsList());
    assert(op2 == nullptr);

    GenTreeArgList* argList = op1->AsArgList();

    op1     = argList->Current();
    argList = argList->Rest();

    op2     = argList->Current();
    argList = argList->Rest();

    op3 = argList->Current();
    assert(argList->Rest() == nullptr);

    regNumber op1Reg = op1->gtRegNum;
    regNumber op3Reg = op3->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op3Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            //                     pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->OperIsHWIntrinsic())
        {
            emit->emitIns_SIMD_R_R_AR_R(ins, simdSize, targetReg, op1Reg, op3Reg, op2->gtGetOp1()->gtRegNum);
            return;
        }
        else if (op2->isIndir())
        {
            GenTreeIndir* memIndir = op2->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C_R(ins, simdSize, targetReg, op1Reg, op3Reg, memBase->gtClsVar.gtClsVarHnd,
                                               0);
                    return;
                }

                default:
                {
                    emit->emitIns_SIMD_R_R_A_R(ins, simdSize, targetReg, op1Reg, op3Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S_R(ins, simdSize, targetReg, op1Reg, op3Reg, varNum, offset);
    }
    else
    {
        emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, op3Reg);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
//                          a register/memory operand, and that returns a value in register
//
// Arguments:
//    ins       - The instruction being generated
//    attr      - The emit attribute
//    targetReg - The target register
//    op1Reg    - The register of the first operand
//    op2Reg    - The register of the second operand
//    op3       - The third operand
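//
// Note:
//    A sketch of a typical consumer: the FMA codegen (dispatched via genFMAIntrinsic above) uses
//    this shape so that op3 can be folded from memory into instructions such as vfmadd213ps.
//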
void CodeGen::genHWIntrinsic_R_R_R_RM(
    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
{
    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op2Reg != REG_NA);

    emitter* emit = getEmitter();

    if (op3->isContained() || op3->isUsedFromSpillTemp())
    {
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op3->isUsedFromSpillTemp())
        {
            assert(op3->IsRegOptional());

            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            //                     pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op3);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op3->OperIsHWIntrinsic())
        {
            emit->emitIns_SIMD_R_R_R_AR(ins, attr, targetReg, op1Reg, op2Reg, op3->gtGetOp1()->gtRegNum);
            return;
        }
        else if (op3->isIndir())
        {
            GenTreeIndir* memIndir = op3->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op3->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op3->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op3->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
    }
    else
    {
        emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
    }
}

//------------------------------------------------------------------------
// genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
//                                   with a non-constant argument
//
// Arguments:
//    intrinsic      - intrinsic ID
//    nonConstImmReg - the register containing the non-constant imm8 argument
//    baseReg        - a register for the start of the switch table
//    offsReg        - a register for the offset into the switch table
//    emitSwCase     - the lambda to generate a switch case
//
// Note:
//    This function can be used for all imm-intrinsics (whether full-range or not).
//    The compiler front-end (i.e. the importer) is responsible for inserting a range-check IR
//    (GT_HW_INTRINSIC_CHK) for the imm8 argument, so this function does not need to perform a range check.
//
template <typename HWIntrinsicSwitchCaseBody>
void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsic,
                                              regNumber                 nonConstImmReg,
                                              regNumber                 baseReg,
                                              regNumber                 offsReg,
                                              HWIntrinsicSwitchCaseBody emitSwCase)
{
    assert(nonConstImmReg != REG_NA);
    // AVX2 Gather intrinsics use a managed non-const fallback since they have a discrete imm8 value range
    // that does not work with the current compiler-generated jump-table fallback
    assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
    emitter* emit = getEmitter();

    const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
    assert(maxByte <= 256);
    BasicBlock* jmpTable[256];

    unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
    unsigned jmpTableOffs = 0;

    // Emit the jump table
    for (unsigned i = 0; i < maxByte; i++)
    {
        jmpTable[i] = genCreateTempLabel();
        emit->emitDataGenData(i, jmpTable[i]);
    }

    emit->emitDataGenEnd();

    // Compute and jump to the appropriate offset in the switch table
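    //
    // A sketch of the five instructions emitted below:
    //
    //     lea  offsReg, [jumpTable]                    ; address of the table in the data section
    //     mov  offsReg, dword ptr [offsReg + imm * 4]  ; 32-bit method-relative offset of the case
    //     lea  baseReg, [methodBase]                   ; reloc'd address of the method start
    //     add  offsReg, baseReg                        ; absolute address of the selected case
    //     jmp  offsReg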
    emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);

    emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
    emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
    emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
    emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);

    // Emit the switch table entries

    BasicBlock* switchTableBeg = genCreateTempLabel();
    BasicBlock* switchTableEnd = genCreateTempLabel();

    genDefineTempLabel(switchTableBeg);

    for (unsigned i = 0; i < maxByte; i++)
    {
        genDefineTempLabel(jmpTable[i]);
        emitSwCase((int8_t)i);
        emit->emitIns_J(INS_jmp, switchTableEnd);
    }

    genDefineTempLabel(switchTableEnd);
}

//------------------------------------------------------------------------
// genBaseIntrinsic: Generates the code for a base hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
// Note:
//    We currently assume that all base intrinsics have zero or one operand.
//
void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    var_types      targetType  = node->TypeGet();
    var_types      baseType    = node->gtSIMDBaseType;

    assert(compiler->compSupports(InstructionSet_SSE));
    assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));

    GenTree* op1 = node->gtGetOp1();

    genConsumeHWIntrinsicOperands(node);
    regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->gtRegNum;

    assert(node->gtGetOp2() == nullptr);

    emitter*    emit = getEmitter();
    emitAttr    attr = EA_ATTR(node->gtSIMDSize);
    instruction ins  = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

    switch (intrinsicId)
    {
        case NI_Base_Vector128_CreateScalarUnsafe:
        case NI_Base_Vector256_CreateScalarUnsafe:
        {
            if (varTypeIsIntegral(baseType))
            {
                genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType));
            }
            else
            {
                assert(varTypeIsFloating(baseType));

                attr = emitTypeSize(baseType);

                if (op1->isContained() || op1->isUsedFromSpillTemp())
                {
                    genHWIntrinsic_R_RM(node, ins, attr);
                }
                else if (targetReg != op1Reg)
                {
                    // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
                    emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
                }
            }
            break;
        }

        case NI_Base_Vector128_ToScalar:
        case NI_Base_Vector256_ToScalar:
        {
            assert(varTypeIsFloating(baseType));

            attr = emitTypeSize(TYP_SIMD16);

            if (op1->isContained() || op1->isUsedFromSpillTemp())
            {
                genHWIntrinsic_R_RM(node, ins, attr);
            }
            else if (targetReg != op1Reg)
            {
                // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
                emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
            }
            break;
        }

        case NI_Base_Vector128_ToVector256:
        {
            // ToVector256 has zero-extend semantics in order to ensure it is deterministic
            // We always emit a move to the target register, even when op1Reg == targetReg,
            // in order to ensure that Bits MAXVL-1:128 are zeroed.

            attr = emitTypeSize(TYP_SIMD16);

            if (op1->isContained() || op1->isUsedFromSpillTemp())
            {
                genHWIntrinsic_R_RM(node, ins, attr);
            }
            else
            {
                // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
                emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
            }
            break;
        }

        case NI_Base_Vector128_ToVector256Unsafe:
        case NI_Base_Vector256_GetLower:
        {
            if (op1->isContained() || op1->isUsedFromSpillTemp())
            {
                genHWIntrinsic_R_RM(node, ins, attr);
            }
            else if (targetReg != op1Reg)
            {
                // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
                emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
            }
            break;
        }

        case NI_Base_Vector128_Zero:
        case NI_Base_Vector256_Zero:
        {
            assert(op1 == nullptr);
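            // xor-ing a register with itself (typically xorps/vxorps via the intrinsic table) is
            // the standard dependency-breaking idiom for zeroing a SIMD register.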
            emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
            break;
        }

        default:
        {
            unreached();
            break;
        }
    }

    genProduceReg(node);
}

//------------------------------------------------------------------------
// genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
//
// Arguments:
//    node - The hardware intrinsic node
//
void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    GenTree*       op3         = nullptr;
    GenTree*       op4         = nullptr;
    regNumber      targetReg   = node->gtRegNum;
    var_types      targetType  = node->TypeGet();
    var_types      baseType    = node->gtSIMDBaseType;

    regNumber op1Reg = REG_NA;
    regNumber op2Reg = REG_NA;
    regNumber op3Reg = REG_NA;
    regNumber op4Reg = REG_NA;
    emitter*  emit   = getEmitter();

    genConsumeHWIntrinsicOperands(node);

    switch (intrinsicId)
    {
        case NI_SSE_CompareEqualOrderedScalar:
        case NI_SSE_CompareEqualUnorderedScalar:
        {
            assert(baseType == TYP_FLOAT);
            regNumber   tmpReg = node->GetSingleTempReg();
            instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);

            // Ensure we aren't overwriting targetReg
            assert(tmpReg != targetReg);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
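            // comiss/ucomiss set ZF, PF, and CF, and an unordered (NaN) compare sets all three,
            // so ZF == 1 alone cannot distinguish "equal" from "unordered". Requiring PF == 0
            // (setnp) as well as ZF == 1 (sete) makes the result true only for a real equality.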
1363             emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1364             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1365             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1366             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1367             break;
1368         }
1369
1370         case NI_SSE_CompareGreaterThanOrderedScalar:
1371         case NI_SSE_CompareGreaterThanUnorderedScalar:
1372         {
1373             assert(baseType == TYP_FLOAT);
1374             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1375
1376             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1377             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1378             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1379             break;
1380         }
1381
1382         case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
1383         case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
1384         {
1385             assert(baseType == TYP_FLOAT);
1386             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1387
1388             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1389             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1390             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1391             break;
1392         }
1393
1394         case NI_SSE_CompareLessThanOrderedScalar:
1395         case NI_SSE_CompareLessThanUnorderedScalar:
1396         {
1397             assert(baseType == TYP_FLOAT);
1398             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1399
1400             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1401             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1402             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1403             break;
1404         }
1405
1406         case NI_SSE_CompareLessThanOrEqualOrderedScalar:
1407         case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
1408         {
1409             assert(baseType == TYP_FLOAT);
1410             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1411
1412             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1413             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1414             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1415             break;
1416         }
1417
1418         case NI_SSE_CompareNotEqualOrderedScalar:
1419         case NI_SSE_CompareNotEqualUnorderedScalar:
1420         {
1421             assert(baseType == TYP_FLOAT);
1422             regNumber   tmpReg = node->GetSingleTempReg();
1423             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1424
1425             // Ensure we aren't overwriting targetReg
1426             assert(tmpReg != targetReg);
1427
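                 // The mirror image of the equality case: the result is true when the
                 // compare is unordered (PF set, NaN operand) or the values differ
                 // (ZF clear), hence setp/setne combined with `or` instead of `and`.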
1428             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1429             emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1430             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1431             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1432             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1433             break;
1434         }
1435
1436         case NI_SSE_X64_ConvertToInt64:
1437         case NI_SSE_X64_ConvertToInt64WithTruncation:
1438         {
1439             assert(targetType == TYP_LONG);
1440             assert(op1 != nullptr);
1441             assert(op2 == nullptr);
1442             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1443             genHWIntrinsic_R_RM(node, ins, EA_8BYTE);
1444             break;
1445         }
1446
1447         case NI_SSE_X64_ConvertScalarToVector128Single:
1448         {
1449             assert(baseType == TYP_LONG);
1450             assert(op1 != nullptr);
1451             assert(op2 != nullptr);
1452             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
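             // This emits roughly `vcvtsi2ss xmm, xmm, r/m64`; op1 supplies the upper
             // vector elements, which is why the three-operand R_R_RM form is used.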
1453             genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1454             break;
1455         }
1456
1457         case NI_SSE_Prefetch0:
1458         case NI_SSE_Prefetch1:
1459         case NI_SSE_Prefetch2:
1460         case NI_SSE_PrefetchNonTemporal:
1461         {
1462             assert(baseType == TYP_UBYTE);
1463             assert(op2 == nullptr);
1464
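             // The prefetch is emitted with a simple base-register address mode, e.g.
             // roughly `prefetcht0 byte ptr [rax]`; the hint variant (t0/t1/t2/nta)
             // comes from the instruction table lookup below.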
1465             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1466             op1Reg          = op1->gtRegNum;
1467             emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
1468             break;
1469         }
1470
1471         case NI_SSE_StoreFence:
1472         {
1473             assert(baseType == TYP_VOID);
1474             assert(op1 == nullptr);
1475             assert(op2 == nullptr);
1476             emit->emitIns(INS_sfence);
1477             break;
1478         }
1479
1480         default:
1481             unreached();
1482             break;
1483     }
1484
1485     genProduceReg(node);
1486 }
1487
1488 //------------------------------------------------------------------------
1489 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
1490 //
1491 // Arguments:
1492 //    node - The hardware intrinsic node
1493 //
1494 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
1495 {
1496     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1497     GenTree*       op1         = node->gtGetOp1();
1498     GenTree*       op2         = node->gtGetOp2();
1499     regNumber      targetReg   = node->gtRegNum;
1500     var_types      targetType  = node->TypeGet();
1501     var_types      baseType    = node->gtSIMDBaseType;
1502     regNumber      op1Reg      = REG_NA;
1503     regNumber      op2Reg      = REG_NA;
1504     emitter*       emit        = getEmitter();
1505
1506     genConsumeHWIntrinsicOperands(node);
1507
1508     switch (intrinsicId)
1509     {
1510         // All integer overloads are handled by table-driven codegen; only the TYP_DOUBLE overload gets this far
1511         case NI_SSE2_CompareLessThan:
1512         {
1513             assert(op1 != nullptr);
1514             assert(op2 != nullptr);
1515
1516             assert(baseType == TYP_DOUBLE);
1517
1518             int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
1519             assert((ival >= 0) && (ival <= 127));
1520
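             // For TYP_DOUBLE this emits the immediate-encoded packed compare, roughly
             // `cmppd target, op1, op2, imm8`, where imm8 == 1 selects the less-than
             // predicate.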
1521             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1522             op1Reg          = op1->gtRegNum;
1523             op2Reg          = op2->gtRegNum;
1524             emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
1525
1526             break;
1527         }
1528
1529         case NI_SSE2_CompareEqualOrderedScalar:
1530         case NI_SSE2_CompareEqualUnorderedScalar:
1531         {
1532             assert(baseType == TYP_DOUBLE);
1533             regNumber   tmpReg = node->GetSingleTempReg();
1534             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1535
1536             // Ensure we aren't overwriting targetReg
1537             assert(tmpReg != targetReg);
1538
1539             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1540             emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1541             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1542             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1543             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1544             break;
1545         }
1546
1547         case NI_SSE2_CompareGreaterThanOrderedScalar:
1548         case NI_SSE2_CompareGreaterThanUnorderedScalar:
1549         {
1550             assert(baseType == TYP_DOUBLE);
1551             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1552
1553             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1554             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1555             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1556             break;
1557         }
1558
1559         case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
1560         case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
1561         {
1562             assert(baseType == TYP_DOUBLE);
1563             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1564
1565             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1566             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1567             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1568             break;
1569         }
1570
1571         case NI_SSE2_CompareLessThanOrderedScalar:
1572         case NI_SSE2_CompareLessThanUnorderedScalar:
1573         {
1574             assert(baseType == TYP_DOUBLE);
1575             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1576
1577             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1578             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1579             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1580             break;
1581         }
1582
1583         case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
1584         case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
1585         {
1586             assert(baseType == TYP_DOUBLE);
1587             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1588
1589             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1590             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1591             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1592             break;
1593         }
1594
1595         case NI_SSE2_CompareNotEqualOrderedScalar:
1596         case NI_SSE2_CompareNotEqualUnorderedScalar:
1597         {
1598             assert(baseType == TYP_DOUBLE);
1599             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1600             regNumber   tmpReg = node->GetSingleTempReg();
1601
1602             // Ensure we aren't overwriting targetReg
1603             assert(tmpReg != targetReg);
1604
1605             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1606             emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1607             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1608             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1609             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1610             break;
1611         }
1612
1613         case NI_SSE2_X64_ConvertScalarToVector128Double:
1614         {
1615             assert(baseType == TYP_LONG);
1616             assert(op1 != nullptr);
1617             assert(op2 != nullptr);
1618             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1619             genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1620             break;
1621         }
1622
1623         case NI_SSE2_X64_ConvertScalarToVector128Int64:
1624         case NI_SSE2_X64_ConvertScalarToVector128UInt64:
1625         {
1626             assert(baseType == TYP_LONG || baseType == TYP_ULONG);
1627             assert(op1 != nullptr);
1628             assert(op2 == nullptr);
1629             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1630             genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType));
1631             break;
1632         }
1633
1634         case NI_SSE2_ConvertToInt32:
1635         case NI_SSE2_ConvertToInt32WithTruncation:
1636         case NI_SSE2_ConvertToUInt32:
1637         case NI_SSE2_X64_ConvertToInt64:
1638         case NI_SSE2_X64_ConvertToInt64WithTruncation:
1639         case NI_SSE2_X64_ConvertToUInt64:
1640         {
1641             assert(op2 == nullptr);
1642             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1643
1644             if (varTypeIsIntegral(baseType))
1645             {
1646                 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
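                 // For the integral overloads the low vector element is moved directly
                 // to the general-purpose target (a movd/movq-style instruction); note
                 // that op1Reg, the vector source, is passed first here.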
1647                 op1Reg = op1->gtRegNum;
1648                 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1649             }
1650             else
1651             {
1652                 assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT);
1653                 genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType));
1654             }
1655             break;
1656         }
1657
1658         case NI_SSE2_LoadFence:
1659         {
1660             assert(baseType == TYP_VOID);
1661             assert(op1 == nullptr);
1662             assert(op2 == nullptr);
1663             emit->emitIns(INS_lfence);
1664             break;
1665         }
1666
1667         case NI_SSE2_MemoryFence:
1668         {
1669             assert(baseType == TYP_VOID);
1670             assert(op1 == nullptr);
1671             assert(op2 == nullptr);
1672             emit->emitIns(INS_mfence);
1673             break;
1674         }
1675
1676         case NI_SSE2_StoreNonTemporal:
1677         case NI_SSE2_X64_StoreNonTemporal:
1678         {
1679             assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1680             assert(op1 != nullptr);
1681             assert(op2 != nullptr);
1682
1683             op1Reg          = op1->gtRegNum;
1684             op2Reg          = op2->gtRegNum;
1685             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1686             emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
1687             break;
1688         }
1689
1690         default:
1691             unreached();
1692             break;
1693     }
1694
1695     genProduceReg(node);
1696 }
1697
1698 //------------------------------------------------------------------------
1699 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1700 //
1701 // Arguments:
1702 //    node - The hardware intrinsic node
1703 //
1704 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1705 {
1706     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1707     GenTree*       op1         = node->gtGetOp1();
1708     GenTree*       op2         = node->gtGetOp2();
1709     GenTree*       op3         = nullptr;
1710     GenTree*       op4         = nullptr;
1711     regNumber      targetReg   = node->gtRegNum;
1712     var_types      targetType  = node->TypeGet();
1713     var_types      baseType    = node->gtSIMDBaseType;
1714
1715     regNumber op1Reg = REG_NA;
1716     regNumber op2Reg = REG_NA;
1717     regNumber op3Reg = REG_NA;
1718     regNumber op4Reg = REG_NA;
1719     emitter*  emit   = getEmitter();
1720
1721     genConsumeHWIntrinsicOperands(node);
1722
1723     switch (intrinsicId)
1724     {
1725         case NI_SSE41_TestAllOnes:
1726         {
1727             op1Reg           = op1->gtRegNum;
1728             regNumber tmpReg = node->GetSingleTempReg();
1729             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
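             // Comparing a register with itself yields an all-ones vector to ptest
             // against: CF = ((NOT op1) AND mask) == 0, so CF is set (and setb produces
             // true) only when op1 is all ones. Roughly:
             //
             //   pcmpeqd xmm1, xmm1      ; all-ones mask
             //   ptest   xmm0, xmm1      ; CF = ((NOT xmm0) AND xmm1) == 0
             //   setb    al
             //   movzx   eax, al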
1730             emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
1731             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
1732             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1733             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1734             break;
1735         }
1736
1737         case NI_SSE41_TestAllZeros:
1738         case NI_SSE41_TestZ:
1739         {
1740             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1741             genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1742             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1743             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1744             break;
1745         }
1746
1747         case NI_SSE41_TestC:
1748         {
1749             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1750             genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1751             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1752             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1753             break;
1754         }
1755
1756         case NI_SSE41_TestMixOnesZeros:
1757         case NI_SSE41_TestNotZAndNotC:
1758         {
1759             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1760             genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1761             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1762             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1763             break;
1764         }
1765
1766         case NI_SSE41_Extract:
1767         case NI_SSE41_X64_Extract:
1768         {
1769             regNumber   tmpTargetReg = REG_NA;
1770             instruction ins          = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1771             if (baseType == TYP_FLOAT)
1772             {
1773                 tmpTargetReg = node->ExtractTempReg();
1774             }
1775
1776             auto emitSwCase = [&](int8_t i) {
1777                 if (baseType == TYP_FLOAT)
1778                 {
1779                     // The extract instructions write to a general-purpose register, so the emit size must be the int size
1780                     inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i);
1781                     emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
1782                 }
1783                 else
1784                 {
1785                     inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i);
1786                 }
1787             };
1788
1789             if (op2->IsCnsIntOrI())
1790             {
1791                 ssize_t ival = op2->AsIntCon()->IconValue();
1792                 assert((ival >= 0) && (ival <= 255));
1793                 emitSwCase((int8_t)ival);
1794             }
1795             else
1796             {
1797                 // We emit a fallback case for the scenario when the imm-op is not a constant. This normally
1798                 // happens when the intrinsic is invoked indirectly, such as via Reflection, but it can also
1799                 // occur if the caller passes a non-constant value directly.
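                 // The fallback builds a jump table with one emitSwCase-generated sequence
                 // per possible immediate value and dispatches on the runtime value held in
                 // op2's register.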
1800                 regNumber baseReg = node->ExtractTempReg();
1801                 regNumber offsReg = node->GetSingleTempReg();
1802                 genHWIntrinsicJumpTableFallback(intrinsicId, op2->gtRegNum, baseReg, offsReg, emitSwCase);
1803             }
1804             break;
1805         }
1806
1807         default:
1808             unreached();
1809             break;
1810     }
1811
1812     genProduceReg(node);
1813 }
1814
1815 //------------------------------------------------------------------------
1816 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1817 //
1818 // Arguments:
1819 //    node - The hardware intrinsic node
1820 //
1821 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
1822 {
1823     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1824     regNumber      targetReg   = node->gtRegNum;
1825     GenTree*       op1         = node->gtGetOp1();
1826     GenTree*       op2         = node->gtGetOp2();
1827     var_types      baseType    = node->gtSIMDBaseType;
1828     var_types      targetType  = node->TypeGet();
1829     emitter*       emit        = getEmitter();
1830
1831     genConsumeHWIntrinsicOperands(node);
1832     regNumber op1Reg = op1->gtRegNum;
1833
1834     assert(targetReg != REG_NA);
1835     assert(op1Reg != REG_NA);
1836     assert(op2 != nullptr);
1837     assert(!node->OperIsCommutative());
1838
1839     switch (intrinsicId)
1840     {
1841         case NI_SSE42_Crc32:
1842         case NI_SSE42_X64_Crc32:
1843         {
1844             if (op1Reg != targetReg)
1845             {
1846                 assert(op2->gtRegNum != targetReg);
1847                 emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg);
1848             }
1849
1850             // This makes the genHWIntrinsic_R_RM code much simpler, as we don't need an
1851             // overload that explicitly takes the operands.
1852             node->gtOp1 = op2;
1853             node->gtOp2 = nullptr;
1854
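             // The emitted form is roughly `crc32 target, op2`, e.g. `crc32 eax, dl`
             // for the byte overload, with the operand size chosen below.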
1855             if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument
1856             {
1857                 assert(targetType == TYP_INT);
1858                 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType));
1859             }
1860             else
1861             {
1862                 assert(op1->TypeGet() == op2->TypeGet());
1863                 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
1864                 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType));
1865             }
1866
1867             break;
1868         }
1869
1870         default:
1871         {
1872             unreached();
1873             break;
1874         }
1875     }
1876
1877     genProduceReg(node);
1878 }
1879
1880 //------------------------------------------------------------------------
1881 // genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
1882 //
1883 // Arguments:
1884 //    node - The hardware intrinsic node
1885 //
1886 void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
1887 {
1888     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1889     var_types      baseType    = node->gtSIMDBaseType;
1890     emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
1891     var_types      targetType  = node->TypeGet();
1892     instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1893     int            numArgs     = HWIntrinsicInfo::lookupNumArgs(node);
1894     GenTree*       op1         = node->gtGetOp1();
1895     GenTree*       op2         = node->gtGetOp2();
1896     regNumber      op1Reg      = REG_NA;
1897     regNumber      op2Reg      = REG_NA;
1898     regNumber      targetReg   = node->gtRegNum;
1899     emitter*       emit        = getEmitter();
1900
1901     genConsumeHWIntrinsicOperands(node);
1902
1903     switch (intrinsicId)
1904     {
1905         case NI_AVX2_ConvertToInt32:
1906         case NI_AVX2_ConvertToUInt32:
1907         {
1908             op1Reg = op1->gtRegNum;
1909             assert(numArgs == 1);
1910             assert((baseType == TYP_INT) || (baseType == TYP_UINT));
1911             // ins was already looked up above for this intrinsic and base type
1912             emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1913             break;
1914         }
1915
1916         case NI_AVX2_GatherVector128:
1917         case NI_AVX2_GatherVector256:
1918         case NI_AVX2_GatherMaskVector128:
1919         case NI_AVX2_GatherMaskVector256:
1920         {
1921             GenTreeArgList* list = op1->AsArgList();
1922             op1                  = list->Current();
1923             op1Reg               = op1->gtRegNum;
1924
1925             list   = list->Rest();
1926             op2    = list->Current();
1927             op2Reg = op2->gtRegNum;
1928
1929             list         = list->Rest();
1930             GenTree* op3 = list->Current();
1931
1932             list             = list->Rest();
1933             GenTree* op4     = nullptr;
1934             GenTree* lastOp  = nullptr;
1935             GenTree* indexOp = nullptr;
1936
1937             regNumber op3Reg       = REG_NA;
1938             regNumber op4Reg       = REG_NA;
1939             regNumber addrBaseReg  = REG_NA;
1940             regNumber addrIndexReg = REG_NA;
1941             regNumber maskReg      = node->ExtractTempReg(RBM_ALLFLOAT);
1942
1943             if (numArgs == 5)
1944             {
1945                 assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
1946                 op4          = list->Current();
1947                 list         = list->Rest();
1948                 lastOp       = list->Current();
1949                 op3Reg       = op3->gtRegNum;
1950                 op4Reg       = op4->gtRegNum;
1951                 addrBaseReg  = op2Reg;
1952                 addrIndexReg = op3Reg;
1953                 indexOp      = op3;
1954
1955                 // Copy op4Reg into the temporary mask register; the gather instruction
1956                 // clears the mask register as it executes
1957                 emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);
1958
1959                 if (targetReg != op1Reg)
1960                 {
1961                     // Copy the source vector into the target register for the masked merge
1962                     emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1963                 }
1964             }
1965             else
1966             {
1967                 assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
1968                 addrBaseReg  = op1Reg;
1969                 addrIndexReg = op2Reg;
1970                 indexOp      = op2;
1971                 lastOp       = op3;
1972
1973                 // Generate an all-ones mask vector
1974                 emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
1975             }
1976
1977             bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);
1978
1979             // hwintrinsiclistxarch.h defaults to the dword-index gather forms; switch to the qword-index forms here
1980             if (varTypeIsLong(node->gtIndexBaseType))
1981             {
1982                 switch (ins)
1983                 {
1984                     case INS_vpgatherdd:
1985                         ins = INS_vpgatherqd;
1986                         if (isVector128GatherWithVector256Index)
1987                         {
1988                             // YMM index in address mode
1989                             attr = emitTypeSize(TYP_SIMD32);
1990                         }
1991                         break;
1992                     case INS_vpgatherdq:
1993                         ins = INS_vpgatherqq;
1994                         break;
1995                     case INS_vgatherdps:
1996                         ins = INS_vgatherqps;
1997                         if (isVector128GatherWithVector256Index)
1998                         {
1999                             // YMM index in address mode
2000                             attr = emitTypeSize(TYP_SIMD32);
2001                         }
2002                         break;
2003                     case INS_vgatherdpd:
2004                         ins = INS_vgatherqpd;
2005                         break;
2006                     default:
2007                         unreached();
2008                 }
2009             }
2010
2011             assert(lastOp->IsCnsIntOrI());
2012             ssize_t ival = lastOp->AsIntCon()->IconValue();
2013             assert((ival >= 0) && (ival <= 255));
2014
2015             assert(targetReg != maskReg);
2016             assert(targetReg != addrIndexReg);
2017             assert(maskReg != addrIndexReg);
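             // The emitted form is a VSIB-addressed gather, roughly
             // `vpgatherdd target, [base + index*scale], mask`, where the scale is the
             // immediate taken from the intrinsic's last operand and the mask register
             // is zeroed as elements complete.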
2018             emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);
2019
2020             break;
2021         }
2022
2023         case NI_AVX_TestC:
2024         {
2025             genHWIntrinsic_R_RM(node, ins, attr);
2026             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
2027             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2028             break;
2029         }
2030
2031         case NI_AVX_TestNotZAndNotC:
2032         {
2033             genHWIntrinsic_R_RM(node, ins, attr);
2034             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
2035             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2036             break;
2037         }
2038
2039         case NI_AVX_TestZ:
2040         {
2041             genHWIntrinsic_R_RM(node, ins, attr);
2042             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
2043             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2044             break;
2045         }
2046
2047         default:
2048             unreached();
2049             break;
2050     }
2051
2052     genProduceReg(node);
2053 }
2054
2055 //------------------------------------------------------------------------
2056 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
2057 //
2058 // Arguments:
2059 //    node - The hardware intrinsic node
2060 //
2061 void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
2062 {
2063     NYI("Implement AES intrinsic code generation");
2064 }
2065
2066 //------------------------------------------------------------------------
2067 // genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 and BMI2 hardware intrinsic node
2068 //
2069 // Arguments:
2070 //    node - The hardware intrinsic node
2071 //
2072 void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
2073 {
2074     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2075     regNumber      targetReg   = node->gtRegNum;
2076     GenTree*       op1         = node->gtGetOp1();
2077     GenTree*       op2         = node->gtGetOp2();
2078     var_types      targetType  = node->TypeGet();
2079     instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);
2080     emitter*       emit        = getEmitter();
2081
2082     assert(targetReg != REG_NA);
2083     assert(op1 != nullptr);
2084
2085     genConsumeHWIntrinsicOperands(node);
2086
2087     switch (intrinsicId)
2088     {
2089         case NI_BMI1_AndNot:
2090         case NI_BMI1_X64_AndNot:
2091         case NI_BMI1_BitFieldExtract:
2092         case NI_BMI1_X64_BitFieldExtract:
2093         case NI_BMI2_ParallelBitDeposit:
2094         case NI_BMI2_ParallelBitExtract:
2095         case NI_BMI2_X64_ParallelBitDeposit:
2096         case NI_BMI2_X64_ParallelBitExtract:
2097         case NI_BMI2_ZeroHighBits:
2098         case NI_BMI2_X64_ZeroHighBits:
2099         {
2100             assert(op2 != nullptr);
2101             assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2102             genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2103             break;
2104         }
2105
2106         case NI_BMI1_ExtractLowestSetBit:
2107         case NI_BMI1_GetMaskUpToLowestSetBit:
2108         case NI_BMI1_ResetLowestSetBit:
2109         case NI_BMI1_X64_ExtractLowestSetBit:
2110         case NI_BMI1_X64_GetMaskUpToLowestSetBit:
2111         case NI_BMI1_X64_ResetLowestSetBit:
2112         {
2113             assert(op2 == nullptr);
2114             assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2115             genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2116             break;
2117         }
2118
2119         case NI_BMI1_TrailingZeroCount:
2120         case NI_BMI1_X64_TrailingZeroCount:
2121         {
2122             assert(op2 == nullptr);
2123             assert((targetType == TYP_INT) || (targetType == TYP_LONG));
2124             genXCNTIntrinsic(node, ins);
2125             break;
2126         }
2127
2128         case NI_BMI2_MultiplyNoFlags:
2129         case NI_BMI2_X64_MultiplyNoFlags:
2130         {
2131             int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
2132             assert(numArgs == 2 || numArgs == 3);
2133
2134             regNumber op1Reg = REG_NA;
2135             regNumber op2Reg = REG_NA;
2136             regNumber op3Reg = REG_NA;
2137             regNumber lowReg = REG_NA;
2138
2139             if (numArgs == 2)
2140             {
2141                 op1Reg = op1->gtRegNum;
2142                 op2Reg = op2->gtRegNum;
2143                 lowReg = targetReg;
2144             }
2145             else
2146             {
2147                 GenTreeArgList* argList = op1->AsArgList();
2148                 op1                     = argList->Current();
2149                 op1Reg                  = op1->gtRegNum;
2150                 argList                 = argList->Rest();
2151                 op2                     = argList->Current();
2152                 op2Reg                  = op2->gtRegNum;
2153                 argList                 = argList->Rest();
2154                 GenTree* op3            = argList->Current();
2155                 op3Reg                  = op3->gtRegNum;
2156                 assert(op3Reg != op1Reg);
2157                 assert(op3Reg != targetReg);
2158                 assert(op3Reg != REG_EDX);
2159                 lowReg = node->GetSingleTempReg();
2160                 assert(op3Reg != lowReg);
2161                 assert(lowReg != targetReg);
2162             }
2163
2164             emitAttr attr = emitTypeSize(targetType);
2165             // Move the first operand into the implicit source register EDX/RDX
2166             if (op1Reg != REG_EDX)
2167             {
2168                 assert(op2Reg != REG_EDX);
2169                 emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg);
2170             }
2171
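             // MULX multiplies the implicit EDX/RDX source by the r/m operand, writes
             // the high half to its first destination and the low half to its second,
             // and leaves the flags untouched: roughly `mulx rcx, rbx, rsi` computes
             // rcx:rbx = rdx * rsi.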
2172             // generate code for MULX
2173             genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2);
2174
2175             // If the lower half of the result is required, store it to the memory location pointed to by op3
2176             if (numArgs == 3)
2177             {
2178                 emit->emitIns_AR_R(INS_mov, attr, lowReg, op3Reg, 0);
2179             }
2180
2181             break;
2182         }
2183
2184         default:
2185         {
2186             unreached();
2187             break;
2188         }
2189     }
2190
2191     genProduceReg(node);
2192 }
2193
2194 //------------------------------------------------------------------------
2195 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
2196 //
2197 // Arguments:
2198 //    node - The hardware intrinsic node
2199 //
2200 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
2201 {
2202     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2203     var_types      baseType    = node->gtSIMDBaseType;
2204     emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
2205     instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
2206     GenTree*       op1         = node->gtGetOp1();
2207     regNumber      targetReg   = node->gtRegNum;
2208
2209     assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);
2210
2211     genConsumeHWIntrinsicOperands(node);
2212     GenTreeArgList* argList = op1->AsArgList();
2213     op1                     = argList->Current();
2214
2215     argList      = argList->Rest();
2216     GenTree* op2 = argList->Current();
2217
2218     argList      = argList->Rest();
2219     GenTree* op3 = argList->Current();
2220
2221     regNumber op1Reg;
2222     regNumber op2Reg;
2223
2224     bool       isCommutative   = false;
2225     const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);
2226
2227     // Intrinsics with CopyUpperBits semantics cannot have op1 contained
2228     assert(!copiesUpperBits || !op1->isContained());
2229
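     // The three FMA encodings differ in which operand may come from memory. For a
     // packed-float multiply-add the forms are roughly (registers illustrative):
     //
     //   vfmadd132ps xmm1, xmm2, [mem]   ; xmm1 = (xmm1 * [mem]) + xmm2
     //   vfmadd213ps xmm1, xmm2, [mem]   ; xmm1 = (xmm2 * xmm1) + [mem]
     //   vfmadd231ps xmm1, xmm2, [mem]   ; xmm1 = (xmm2 * [mem]) + xmm1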
2230     if (op3->isContained() || op3->isUsedFromSpillTemp())
2231     {
2232         // 213 form: op1 = (op2 * op1) + [op3]
2233
2234         op1Reg = op1->gtRegNum;
2235         op2Reg = op2->gtRegNum;
2236
2237         isCommutative = !copiesUpperBits;
2238     }
2239     else if (op2->isContained() || op2->isUsedFromSpillTemp())
2240     {
2241         // 132 form: op1 = (op1 * op3) + [op2]
2242
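         // The 132 form is assumed to immediately precede the 213 form returned by
         // lookupIns in the instruction table, and the 231 form to immediately follow
         // it; the +/- 1 adjustments here and below rely on that ordering.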
2243         ins    = (instruction)(ins - 1);
2244         op1Reg = op1->gtRegNum;
2245         op2Reg = op3->gtRegNum;
2246         op3    = op2;
2247     }
2248     else if (op1->isContained() || op1->isUsedFromSpillTemp())
2249     {
2250         // 231 form: op3 = (op2 * op3) + [op1]
2251
2252         ins    = (instruction)(ins + 1);
2253         op1Reg = op3->gtRegNum;
2254         op2Reg = op2->gtRegNum;
2255         op3    = op1;
2256     }
2257     else
2258     {
2259         // 213 form: op1 = (op2 * op1) + op3
2260
2261         op1Reg = op1->gtRegNum;
2262         op2Reg = op2->gtRegNum;
2263
2264         isCommutative = !copiesUpperBits;
2265     }
2266
2267     if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
2268     {
2269         assert(node->isRMWHWIntrinsic(compiler));
2270
2271         // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
2272         //
2273         // For non-commutative intrinsics, we should have ensured that op2 was marked
2274         // delay free in order to prevent it from getting assigned the same register
2275         // as target. However, for commutative intrinsics, we can just swap the operands
2276         // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
2277
2278         op2Reg = op1Reg;
2279         op1Reg = targetReg;
2280     }
2281
2282     genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
2283     genProduceReg(node);
2284 }
2285
2286 //------------------------------------------------------------------------
2287 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
2288 //
2289 // Arguments:
2290 //    node - The hardware intrinsic node
2291 //
2292 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
2293 {
2294     assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount ||
2295            node->gtHWIntrinsicId == NI_LZCNT_X64_LeadingZeroCount);
2296
2297     genConsumeOperands(node);
2298     genXCNTIntrinsic(node, INS_lzcnt);
2299     genProduceReg(node);
2300 }
2301
2302 //------------------------------------------------------------------------
2303 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
2304 //
2305 // Arguments:
2306 //    node - The hardware intrinsic node
2307 //
2308 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
2309 {
2310     NYI("Implement PCLMULQDQ intrinsic code generation");
2311 }
2312
2313 //------------------------------------------------------------------------
2314 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
2315 //
2316 // Arguments:
2317 //    node - The hardware intrinsic node
2318 //
2319 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
2320 {
2321     assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount || node->gtHWIntrinsicId == NI_POPCNT_X64_PopCount);
2322
2323     genConsumeOperands(node);
2324     genXCNTIntrinsic(node, INS_popcnt);
2325     genProduceReg(node);
2326 }
2327
2328 //------------------------------------------------------------------------
2329 // genXCNTIntrinsic: Generates the code for a lzcnt/tzcnt/popcnt hardware intrinsic node, breaking any false
2330 // dependency on the target register
2331 //
2332 // Arguments:
2333 //    node - The hardware intrinsic node
2334 //    ins  - The instruction being generated
2335 //
2336 void CodeGen::genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins)
2337 {
2338     // LZCNT/TZCNT/POPCNT have a false dependency on the target register on Intel Sandy Bridge, Haswell, and Skylake
2339     // (POPCNT only) processors. Insert an `xor target, target` to break the dependency: the zeroing idiom is handled
2340     // during register renaming, but it is only safe to emit when the target register is not also a source register.
2341
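     // For example, `popcnt eax, ecx` carries a false output dependency on eax on the
     // affected microarchitectures; a preceding `xor eax, eax` is recognized as a
     // zeroing idiom during renaming and makes the write independent of eax's old value.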
2342     GenTree*  op1        = node->gtGetOp1();
2343     regNumber sourceReg1 = REG_NA;
2344     regNumber sourceReg2 = REG_NA;
2345
2346     if (!op1->isContained())
2347     {
2348         sourceReg1 = op1->gtRegNum;
2349     }
2350     else if (op1->isIndir())
2351     {
2352         GenTreeIndir* indir   = op1->AsIndir();
2353         GenTree*      memBase = indir->Base();
2354
2355         if (memBase != nullptr)
2356         {
2357             sourceReg1 = memBase->gtRegNum;
2358         }
2359
2360         if (indir->HasIndex())
2361         {
2362             sourceReg2 = indir->Index()->gtRegNum;
2363         }
2364     }
2365
2366     regNumber targetReg = node->gtRegNum;
2367     if ((targetReg != sourceReg1) && (targetReg != sourceReg2))
2368     {
2369         getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
2370     }
2371     genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2372 }
2373
2374 #endif // FEATURE_HW_INTRINSICS