Delete unused variables in jit. Part 2. (#23481)
[platform/upstream/coreclr.git] / src / jit / hwintrinsiccodegenxarch.cpp
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7 XX                                                                           XX
8 XX               Intel hardware intrinsic Code Generator                     XX
9 XX                                                                           XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
12 */
13 #include "jitpch.h"
14 #ifdef _MSC_VER
15 #pragma hdrstop
16 #endif
17
18 #ifdef FEATURE_HW_INTRINSICS
19
20 #include "emit.h"
21 #include "codegen.h"
22 #include "sideeffects.h"
23 #include "lower.h"
24 #include "gcinfo.h"
25 #include "gcinfoencoder.h"
26
27 //------------------------------------------------------------------------
28 // assertIsContainableHWIntrinsicOp: Asserts that op is containable by node
29 //
30 // Arguments:
31 //    lowering - The lowering phase from the compiler
32 //    node     - The HWIntrinsic node that has the contained node
33 //    op       - The op that is contained
34 //
35 static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op)
36 {
37 #if DEBUG
38     // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register allocation
39     // logic. However, this check is still important due to the various containment rules that SIMD intrinsics follow.
40     //
41     // We use isContainable to track the special HWIntrinsic node containment rules (for things like LoadAligned and
42     // LoadUnaligned) and we use the supportsRegOptional check to support general-purpose loads (both from stack
43     // spillage
44     // and for isUsedFromMemory contained nodes, in the case where the register allocator decided to not allocate a
45     // register
46     // in the first place).
47
48     bool supportsRegOptional = false;
49     bool isContainable       = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional);
50     assert(isContainable || supportsRegOptional);
51 #endif // DEBUG
52 }
53
54 //------------------------------------------------------------------------
55 // genIsTableDrivenHWIntrinsic:
56 //
57 // Arguments:
58 //    category - category of a HW intrinsic
59 //
60 // Return Value:
61 //    returns true if this category can be table-driven in CodeGen
62 //
63 static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicCategory category)
64 {
65     // TODO - make more categories to the table-driven framework
66     // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen
67     const bool tableDrivenCategory =
68         (category != HW_Category_Special) && (category != HW_Category_Scalar) && (category != HW_Category_Helper);
69     const bool tableDrivenFlag =
70         !HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId) && !HWIntrinsicInfo::HasSpecialCodegen(intrinsicId);
71     return tableDrivenCategory && tableDrivenFlag;
72 }
73
//------------------------------------------------------------------------
// genHWIntrinsic: Generates the code for a given hardware intrinsic node.
//
// Arguments:
//    node - The hardware intrinsic node
//
// Notes:
//    Intrinsics that qualify for the table-driven framework (see genIsTableDrivenHWIntrinsic) are
//    emitted directly here, dispatched on operand count. Everything else falls through to the
//    per-ISA helper methods at the bottom.
//
void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic      intrinsicId = node->gtHWIntrinsicId;
    InstructionSet      isa         = HWIntrinsicInfo::lookupIsa(intrinsicId);
    HWIntrinsicCategory category    = HWIntrinsicInfo::lookupCategory(intrinsicId);
    int                 ival        = HWIntrinsicInfo::lookupIval(intrinsicId);
    int                 numArgs     = HWIntrinsicInfo::lookupNumArgs(node);

    assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));

    if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
    {
        GenTree*  op1       = node->gtGetOp1();
        GenTree*  op2       = node->gtGetOp2();
        regNumber targetReg = node->gtRegNum;
        var_types baseType  = node->gtSIMDBaseType;

        regNumber op1Reg = REG_NA;
        regNumber op2Reg = REG_NA;
        emitter*  emit   = getEmitter();

        assert(numArgs >= 0);
        instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
        assert(ins != INS_invalid);
        emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
        assert(simdSize != 0);

        switch (numArgs)
        {
            case 1:
            {
                if (node->OperIsMemoryLoad())
                {
                    genConsumeAddress(op1);
                    // Until we improve the handling of addressing modes in the emitter, we'll create a
                    // temporary GT_IND to generate code with.
                    GenTreeIndir load = indirForm(node->TypeGet(), op1);
                    emit->emitInsLoadInd(ins, simdSize, node->gtRegNum, &load);
                }
                else
                {
                    genConsumeRegs(op1);
                    op1Reg = op1->gtRegNum;

                    if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
                    {
                        // The source register is passed twice so the upper elements of the
                        // destination are taken from op1 as well.
                        emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
                    }
                    else if ((ival != -1) && varTypeIsFloating(baseType))
                    {
                        // ival != -1 here means the intrinsic table supplies a fixed immediate.
                        assert((ival >= 0) && (ival <= 127));
                        genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
                    }
                    else
                    {
                        genHWIntrinsic_R_RM(node, ins, simdSize);
                    }
                }
                break;
            }

            case 2:
            {
                if (category == HW_Category_MemoryStore)
                {
                    genConsumeAddress(op1);
                    genConsumeReg(op2);
                    // Until we improve the handling of addressing modes in the emitter, we'll create a
                    // temporary GT_STORE_IND to generate code with.
                    GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
                    emit->emitInsStoreInd(ins, simdSize, &store);
                    break;
                }
                genConsumeRegs(op1);
                genConsumeRegs(op2);

                op1Reg = op1->gtRegNum;
                op2Reg = op2->gtRegNum;

                if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
                {
                    // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
                    //
                    // For non-commutative intrinsics, we should have ensured that op2 was marked
                    // delay free in order to prevent it from getting assigned the same register
                    // as target. However, for commutative intrinsics, we can just swap the operands
                    // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

                    noway_assert(node->OperIsCommutative());
                    op2Reg = op1Reg;
                    op1Reg = targetReg;
                }

                if ((ival != -1) && varTypeIsFloating(baseType))
                {
                    // The table supplied a fixed immediate for this intrinsic.
                    assert((ival >= 0) && (ival <= 127));
                    genHWIntrinsic_R_R_RM_I(node, ins, ival);
                }
                else if (category == HW_Category_MemoryLoad)
                {
                    // Get the address and the 'other' register.
                    // MaskLoad takes (address, mask); every other memory load here is (other, address).
                    GenTree*  addr;
                    regNumber otherReg;
                    if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
                    {
                        addr     = op1;
                        otherReg = op2Reg;
                    }
                    else
                    {
                        addr     = op2;
                        otherReg = op1Reg;
                    }
                    // Until we improve the handling of addressing modes in the emitter, we'll create a
                    // temporary GT_IND to generate code with.
                    GenTreeIndir load = indirForm(node->TypeGet(), addr);
                    genHWIntrinsic_R_R_RM(node, ins, simdSize, targetReg, otherReg, &load);
                }
                else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
                {
                    // The immediate comes from the user (op2), not the table.
                    assert(ival == -1);

                    if (intrinsicId == NI_SSE2_Extract)
                    {
                        // extract instructions return to GP-registers, so it needs int size as the emitsize
                        simdSize = emitTypeSize(TYP_INT);
                    }

                    auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };

                    if (op2->IsCnsIntOrI())
                    {
                        // Note: this 'ival' intentionally shadows the table-driven 'ival' above;
                        // it holds the user-supplied immediate.
                        ssize_t ival = op2->AsIntCon()->IconValue();
                        assert((ival >= 0) && (ival <= 255));
                        emitSwCase((int8_t)ival);
                    }
                    else
                    {
                        // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                        // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                        // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                        regNumber baseReg = node->ExtractTempReg();
                        regNumber offsReg = node->GetSingleTempReg();
                        genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase);
                    }
                }
                else
                {
                    genHWIntrinsic_R_R_RM(node, ins, EA_ATTR(node->gtSIMDSize));
                }
                break;
            }

            case 3:
            {
                // Three operands arrive as a GT_LIST in op1; walk it to get op1/op2/op3.
                GenTreeArgList* argList = op1->AsArgList();
                op1                     = argList->Current();
                genConsumeRegs(op1);
                op1Reg = op1->gtRegNum;

                argList = argList->Rest();
                op2     = argList->Current();
                genConsumeRegs(op2);
                op2Reg = op2->gtRegNum;

                argList      = argList->Rest();
                GenTree* op3 = argList->Current();
                genConsumeRegs(op3);
                regNumber op3Reg = op3->gtRegNum;

                if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
                {
                    // The immediate comes from the user (op3), not the table.
                    assert(ival == -1);

                    auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, i); };

                    if (op3->IsCnsIntOrI())
                    {
                        // Shadows the table-driven 'ival' above; holds the user-supplied immediate.
                        ssize_t ival = op3->AsIntCon()->IconValue();
                        assert((ival >= 0) && (ival <= 255));
                        emitSwCase((int8_t)ival);
                    }
                    else
                    {
                        // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                        // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                        // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                        regNumber baseReg = node->ExtractTempReg();
                        regNumber offsReg = node->GetSingleTempReg();
                        genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase);
                    }
                }
                else if (category == HW_Category_MemoryStore)
                {
                    if (intrinsicId == NI_AVX_MaskStore || intrinsicId == NI_AVX2_MaskStore)
                    {
                        emit->emitIns_AR_R_R(ins, simdSize, op2Reg, op3Reg, op1Reg, 0);
                    }
                    else
                    {
                        assert(intrinsicId == NI_SSE2_MaskMove);
                        assert(targetReg == REG_NA);

                        // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
                        if (op3Reg != REG_EDI)
                        {
                            emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
                        }
                        emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
                    }
                }
                else
                {
                    switch (intrinsicId)
                    {
                        case NI_SSE41_BlendVariable:
                        case NI_AVX_BlendVariable:
                        case NI_AVX2_BlendVariable:
                        {
                            genHWIntrinsic_R_R_RM_R(node, ins);
                            break;
                        }

                        default:
                        {
                            unreached();
                            break;
                        };
                    }
                }
                break;
            }

            default:
                unreached();
                break;
        }
        genProduceReg(node);
        return;
    }

    // Non-table-driven intrinsics: dispatch to the per-ISA codegen helper.
    switch (isa)
    {
        case InstructionSet_Base:
            genBaseIntrinsic(node);
            break;
        case InstructionSet_SSE:
        case InstructionSet_SSE_X64:
            genSSEIntrinsic(node);
            break;
        case InstructionSet_SSE2:
        case InstructionSet_SSE2_X64:
            genSSE2Intrinsic(node);
            break;
        case InstructionSet_SSE41:
        case InstructionSet_SSE41_X64:
            genSSE41Intrinsic(node);
            break;
        case InstructionSet_SSE42:
        case InstructionSet_SSE42_X64:
            genSSE42Intrinsic(node);
            break;
        case InstructionSet_AVX:
        case InstructionSet_AVX2:
            genAvxOrAvx2Intrinsic(node);
            break;
        case InstructionSet_AES:
            genAESIntrinsic(node);
            break;
        case InstructionSet_BMI1:
        case InstructionSet_BMI1_X64:
        case InstructionSet_BMI2:
        case InstructionSet_BMI2_X64:
            genBMI1OrBMI2Intrinsic(node);
            break;
        case InstructionSet_FMA:
            genFMAIntrinsic(node);
            break;
        case InstructionSet_LZCNT:
        case InstructionSet_LZCNT_X64:
            genLZCNTIntrinsic(node);
            break;
        case InstructionSet_PCLMULQDQ:
            genPCLMULQDQIntrinsic(node);
            break;
        case InstructionSet_POPCNT:
        case InstructionSet_POPCNT_X64:
            genPOPCNTIntrinsic(node);
            break;
        default:
            unreached();
            break;
    }
}
374
//------------------------------------------------------------------------
// genHWIntrinsic_R_RM: Generates the code for a hardware intrinsic node that takes a
//                      register/memory operand and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    attr - The emit attribute for the instruction being generated
//
void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
{
    regNumber targetReg = node->gtRegNum;
    GenTree*  op1       = node->gtGetOp1();
    GenTree*  op2       = node->gtGetOp2();
    emitter*  emit      = getEmitter();

    if (op2 != nullptr)
    {
        // The Compare*OrderedScalar and Compare*UnorderedScalar intrinsics come down this
        // code path. They are all MultiIns, as the return value comes from the flags and
        // we have two operands instead.

        assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId));
        assert(targetReg != REG_NA);

        // Rewrite (op1, op2) into the single reg/mem form: op1's register becomes the
        // instruction "target" and op2 becomes the lone source operand.
        targetReg = op1->gtRegNum;
        op1       = op2;
        op2       = nullptr;
    }
    else
    {
        assert(!node->OperIsCommutative());
    }

    assert(targetReg != REG_NA);
    assert(op2 == nullptr);

    if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op1->isUsedFromSpillTemp())
        {
            assert(op1->IsRegOptional());

            tmpDsc = getSpillTempDsc(op1);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op1->OperIsHWIntrinsic())
        {
            // Contained HWIntrinsic: presumably a load-style intrinsic (e.g. LoadAligned/LoadUnaligned)
            // whose op1 register holds the address — TODO confirm against containment rules.
            emit->emitIns_R_AR(ins, attr, targetReg, op1->gtGetOp1()->gtRegNum, 0);
            return;
        }
        else if (op1->isIndir())
        {
            GenTreeIndir* memIndir = op1->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    // Indirection off a local's address: fall through to the stack-based emit below.
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_R_C(ins, attr, targetReg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    // General addressing mode: let the emitter deal with it.
                    emit->emitIns_R_A(ins, attr, targetReg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op1->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op1->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    // A contained lcl var must either be reg-optional or not a register candidate.
                    assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op1->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                {
                    unreached();
                    break;
                }
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_R_S(ins, attr, targetReg, varNum, offset);
    }
    else
    {
        // op1 is in a register: simple reg, reg form.
        regNumber op1Reg = op1->gtRegNum;
        emit->emitIns_R_R(ins, attr, targetReg, op1Reg);
    }
}
512
513 //------------------------------------------------------------------------
514 // genHWIntrinsic_R_RM_I: Generates the code for a hardware intrinsic node that takes a register/memory operand,
515 //                        an immediate operand, and that returns a value in register
516 //
517 // Arguments:
518 //    node - The hardware intrinsic node
519 //    ins  - The instruction being generated
520 //    ival - The immediate value
521 //
522 void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
523 {
524     regNumber targetReg = node->gtRegNum;
525     GenTree*  op1       = node->gtGetOp1();
526     emitAttr  simdSize  = EA_ATTR(node->gtSIMDSize);
527     emitter*  emit      = getEmitter();
528
529     // TODO-XArch-CQ: Commutative operations can have op1 be contained
530     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
531
532     assert(targetReg != REG_NA);
533     assert(!node->OperIsCommutative()); // One operand intrinsics cannot be commutative
534
535     if (op1->isContained() || op1->isUsedFromSpillTemp())
536     {
537         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
538         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
539     }
540     inst_RV_TT_IV(ins, simdSize, targetReg, op1, ival);
541 }
542
543 //------------------------------------------------------------------------
544 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
545 //                        register/memory operand, and that returns a value in register
546 //
547 // Arguments:
548 //    node - The hardware intrinsic node
549 //    ins  - The instruction being generated
550 //    attr - The emit attribute for the instruciton being generated
551 //
552 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
553 {
554     regNumber targetReg = node->gtRegNum;
555     GenTree*  op1       = node->gtGetOp1();
556     GenTree*  op2       = node->gtGetOp2();
557     regNumber op1Reg    = op1->gtRegNum;
558
559     assert(targetReg != REG_NA);
560     assert(op1Reg != REG_NA);
561
562     genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2);
563 }
564
//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
//                        register/memory operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    attr - The emit attribute for the instruction being generated
//    targetReg - The register allocated to the result
//    op1Reg    - The register allocated to the first operand
//    op2       - Another operand that maybe in register or memory
//
void CodeGen::genHWIntrinsic_R_R_RM(
    GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2)
{
    emitter* emit = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        // op2 comes from memory or a spill temp; verify the containment is legal.
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->OperIsHWIntrinsic())
        {
            // Contained HWIntrinsic: presumably a load-style intrinsic whose op1 holds the
            // address — TODO confirm against containment rules.
            GenTree* addr = op2->gtGetOp1();
            // Until we improve the handling of addressing modes in the emitter, we'll create a
            // temporary GT_IND to generate code with.
            GenTreeIndir load = indirForm(node->TypeGet(), addr);
            emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, &load);
            return;
        }
        else if (op2->isIndir())
        {
            GenTreeIndir* memIndir = op2->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    // Indirection off a local's address: fall through to the stack-based emit below.
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_C(ins, attr, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    // General addressing mode: let the emitter deal with it.
                    emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    // A contained lcl var must either be reg-optional or not a register candidate.
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S(ins, attr, targetReg, op1Reg, varNum, offset);
    }
    else
    {
        regNumber op2Reg = op2->gtRegNum;

        if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        {
            // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
            //
            // For non-commutative intrinsics, we should have ensured that op2 was marked
            // delay free in order to prevent it from getting assigned the same register
            // as target. However, for commutative intrinsics, we can just swap the operands
            // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

            noway_assert(node->OperIsCommutative());
            op2Reg = op1Reg;
            op1Reg = targetReg;
        }

        emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg);
    }
}
705
706 //------------------------------------------------------------------------
707 // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
708 //                        register/memory operand, an immediate operand, and that returns a value in register
709 //
710 // Arguments:
711 //    node - The hardware intrinsic node
712 //    ins  - The instruction being generated
713 //    ival - The immediate value
714 //
715 void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
716 {
717     regNumber targetReg = node->gtRegNum;
718     GenTree*  op1       = node->gtGetOp1();
719     GenTree*  op2       = node->gtGetOp2();
720     emitAttr  simdSize  = EA_ATTR(node->gtSIMDSize);
721     emitter*  emit      = getEmitter();
722
723     // TODO-XArch-CQ: Commutative operations can have op1 be contained
724     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
725
726     if (op1->OperIsList())
727     {
728         assert(op2 == nullptr);
729
730         GenTreeArgList* argList = op1->AsArgList();
731
732         op1     = argList->Current();
733         argList = argList->Rest();
734
735         op2     = argList->Current();
736         argList = argList->Rest();
737
738         assert(argList->Current() != nullptr);
739         assert(argList->Rest() == nullptr);
740     }
741
742     regNumber op1Reg = op1->gtRegNum;
743
744     assert(targetReg != REG_NA);
745     assert(op1Reg != REG_NA);
746
747     if (op2->isContained() || op2->isUsedFromSpillTemp())
748     {
749         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
750         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
751
752         TempDsc* tmpDsc = nullptr;
753         unsigned varNum = BAD_VAR_NUM;
754         unsigned offset = (unsigned)-1;
755
756         if (op2->isUsedFromSpillTemp())
757         {
758             assert(op2->IsRegOptional());
759
760             tmpDsc = getSpillTempDsc(op2);
761             varNum = tmpDsc->tdTempNum();
762             offset = 0;
763
764             regSet.tmpRlsTemp(tmpDsc);
765         }
766         else if (op2->OperIsHWIntrinsic())
767         {
768             emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
769             return;
770         }
771         else if (op2->isIndir())
772         {
773             GenTreeIndir* memIndir = op2->AsIndir();
774             GenTree*      memBase  = memIndir->gtOp1;
775
776             switch (memBase->OperGet())
777             {
778                 case GT_LCL_VAR_ADDR:
779                 {
780                     varNum = memBase->AsLclVarCommon()->GetLclNum();
781                     offset = 0;
782
783                     // Ensure that all the GenTreeIndir values are set to their defaults.
784                     assert(!memIndir->HasIndex());
785                     assert(memIndir->Scale() == 1);
786                     assert(memIndir->Offset() == 0);
787
788                     break;
789                 }
790
791                 case GT_CLS_VAR_ADDR:
792                 {
793                     emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
794                                                ival);
795                     return;
796                 }
797
798                 default:
799                 {
800                     emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
801                     return;
802                 }
803             }
804         }
805         else
806         {
807             switch (op2->OperGet())
808             {
809                 case GT_LCL_FLD:
810                 {
811                     GenTreeLclFld* lclField = op2->AsLclFld();
812
813                     varNum = lclField->GetLclNum();
814                     offset = lclField->gtLclFld.gtLclOffs;
815                     break;
816                 }
817
818                 case GT_LCL_VAR:
819                 {
820                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
821                     varNum = op2->AsLclVar()->GetLclNum();
822                     offset = 0;
823                     break;
824                 }
825
826                 default:
827                     unreached();
828                     break;
829             }
830         }
831
832         // Ensure we got a good varNum and offset.
833         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
834         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
835         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
836         assert(offset != (unsigned)-1);
837
838         emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
839     }
840     else
841     {
842         regNumber op2Reg = op2->gtRegNum;
843
844         if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
845         {
846             // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
847             //
848             // For non-commutative intrinsics, we should have ensured that op2 was marked
849             // delay free in order to prevent it from getting assigned the same register
850             // as target. However, for commutative intrinsics, we can just swap the operands
851             // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
852
853             noway_assert(node->OperIsCommutative());
854             op2Reg = op1Reg;
855             op1Reg = targetReg;
856         }
857
858         emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
859     }
860 }
861
862 //------------------------------------------------------------------------
863 // genHWIntrinsic_R_R_RM_R: Generates the code for a hardware intrinsic node that takes a register operand, a
864 //                          register/memory operand, another register operand, and that returns a value in register
865 //
866 // Arguments:
867 //    node - The hardware intrinsic node
868 //    ins  - The instruction being generated
869 //
void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins)
{
    regNumber targetReg = node->gtRegNum;
    GenTree*  op1       = node->gtGetOp1();
    GenTree*  op2       = node->gtGetOp2();
    GenTree*  op3       = nullptr;
    emitAttr  simdSize  = EA_ATTR(node->gtSIMDSize);
    emitter*  emit      = getEmitter();

    // Ternary intrinsics carry their three operands as an arg list in op1; op2 of the node is unused.
    assert(op1->OperIsList());
    assert(op2 == nullptr);

    GenTreeArgList* argList = op1->AsArgList();

    // Unpack the three real operands from the list.
    op1     = argList->Current();
    argList = argList->Rest();

    op2     = argList->Current();
    argList = argList->Rest();

    op3 = argList->Current();
    assert(argList->Rest() == nullptr);

    regNumber op1Reg = op1->gtRegNum;
    regNumber op3Reg = op3->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op3Reg != REG_NA);

    // op2 is the register/memory operand: when contained (or spilled to a temp) we emit a
    // memory-operand form; otherwise we fall through to the all-register form at the bottom.
    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        // varNum/offset describe a stack location (local var or spill temp); they remain at their
        // sentinel values on the paths that emit directly and return early.
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            //                     pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->OperIsHWIntrinsic())
        {
            // A contained HW intrinsic is consumed as an indirection off the register holding
            // its first operand (e.g. a contained LoadVector128).
            emit->emitIns_SIMD_R_R_AR_R(ins, simdSize, targetReg, op1Reg, op3Reg, op2->gtGetOp1()->gtRegNum);
            return;
        }
        else if (op2->isIndir())
        {
            GenTreeIndir* memIndir = op2->AsIndir();
            GenTree*      memBase  = memIndir->gtOp1;

            switch (memBase->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    // [local address]: fold into a stack-based (varNum/offset) access below.
                    varNum = memBase->AsLclVarCommon()->GetLclNum();
                    offset = 0;

                    // Ensure that all the GenTreeIndir values are set to their defaults.
                    assert(!memIndir->HasIndex());
                    assert(memIndir->Scale() == 1);
                    assert(memIndir->Offset() == 0);

                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    // Static field: emit with a class-variable handle operand.
                    emit->emitIns_SIMD_R_R_C_R(ins, simdSize, targetReg, op1Reg, op3Reg, memBase->gtClsVar.gtClsVarHnd,
                                               0);
                    return;
                }

                default:
                {
                    // General address mode: emit with the indirection itself.
                    emit->emitIns_SIMD_R_R_A_R(ins, simdSize, targetReg, op1Reg, op3Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            // Contained local variable or local field, accessed from its home stack slot.
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    // A register candidate must be reg-optional to be consumed from memory here.
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        // Stack-based form: op2 comes from [varNum + offset].
        emit->emitIns_SIMD_R_R_S_R(ins, simdSize, targetReg, op1Reg, op3Reg, varNum, offset);
    }
    else
    {
        // All-register form.
        emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, op3Reg);
    }
}
1000
1001 //------------------------------------------------------------------------
1002 // genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
1003 //                          a register/memory operand, and that returns a value in register
1004 //
1005 // Arguments:
1006 //    ins       - The instruction being generated
1007 //    attr      - The emit attribute
1008 //    targetReg - The target register
1009 //    op1Reg    - The register of the first operand
1010 //    op2Reg    - The register of the second operand
1011 //    op3       - The third operand
1012 //
1013 void CodeGen::genHWIntrinsic_R_R_R_RM(
1014     instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
1015 {
1016     assert(targetReg != REG_NA);
1017     assert(op1Reg != REG_NA);
1018     assert(op2Reg != REG_NA);
1019
1020     emitter* emit = getEmitter();
1021
1022     if (op3->isContained() || op3->isUsedFromSpillTemp())
1023     {
1024         TempDsc* tmpDsc = nullptr;
1025         unsigned varNum = BAD_VAR_NUM;
1026         unsigned offset = (unsigned)-1;
1027
1028         if (op3->isUsedFromSpillTemp())
1029         {
1030             assert(op3->IsRegOptional());
1031
1032             // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
1033             //                     pattern. It could probably be extracted to its own method.
1034             tmpDsc = getSpillTempDsc(op3);
1035             varNum = tmpDsc->tdTempNum();
1036             offset = 0;
1037
1038             regSet.tmpRlsTemp(tmpDsc);
1039         }
1040         else if (op3->OperIsHWIntrinsic())
1041         {
1042             emit->emitIns_SIMD_R_R_R_AR(ins, attr, targetReg, op1Reg, op2Reg, op3->gtGetOp1()->gtRegNum);
1043             return;
1044         }
1045         else if (op3->isIndir())
1046         {
1047             GenTreeIndir* memIndir = op3->AsIndir();
1048             GenTree*      memBase  = memIndir->gtOp1;
1049
1050             switch (memBase->OperGet())
1051             {
1052                 case GT_LCL_VAR_ADDR:
1053                 {
1054                     varNum = memBase->AsLclVarCommon()->GetLclNum();
1055                     offset = 0;
1056
1057                     // Ensure that all the GenTreeIndir values are set to their defaults.
1058                     assert(!memIndir->HasIndex());
1059                     assert(memIndir->Scale() == 1);
1060                     assert(memIndir->Offset() == 0);
1061
1062                     break;
1063                 }
1064
1065                 case GT_CLS_VAR_ADDR:
1066                 {
1067                     emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, memBase->gtClsVar.gtClsVarHnd, 0);
1068                     return;
1069                 }
1070
1071                 default:
1072                 {
1073                     emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
1074                     return;
1075                 }
1076             }
1077         }
1078         else
1079         {
1080             switch (op3->OperGet())
1081             {
1082                 case GT_LCL_FLD:
1083                 {
1084                     GenTreeLclFld* lclField = op3->AsLclFld();
1085
1086                     varNum = lclField->GetLclNum();
1087                     offset = lclField->gtLclFld.gtLclOffs;
1088                     break;
1089                 }
1090
1091                 case GT_LCL_VAR:
1092                 {
1093                     assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
1094                     varNum = op3->AsLclVar()->GetLclNum();
1095                     offset = 0;
1096                     break;
1097                 }
1098
1099                 default:
1100                     unreached();
1101                     break;
1102             }
1103         }
1104
1105         // Ensure we got a good varNum and offset.
1106         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
1107         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
1108         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
1109         assert(offset != (unsigned)-1);
1110
1111         emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
1112     }
1113     else
1114     {
1115         emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
1116     }
1117 }
1118
1119 // genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
1120 //                       with non-constant argument
1121 //
1122 // Arguments:
1123 //    intrinsic      - intrinsic ID
1124 //    nonConstImmReg - the register contains non-constant imm8 argument
1125 //    baseReg        - a register for the start of the switch table
1126 //    offsReg        - a register for the offset into the switch table
1127 //    emitSwCase     - the lambda to generate a switch case
1128 //
1129 // Return Value:
1130 //    generate the jump-table fallback for imm-intrinsics with non-constant argument.
1131 // Note:
1132 //    This function can be used for all imm-intrinsics (whether full-range or not),
1133 //    The compiler front-end (i.e. importer) is responsible to insert a range-check IR
1134 //    (GT_HW_INTRINSIC_CHK) for imm8 argument, so this function does not need to do range-check.
1135 //
template <typename HWIntrinsicSwitchCaseBody>
void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsic,
                                              regNumber                 nonConstImmReg,
                                              regNumber                 baseReg,
                                              regNumber                 offsReg,
                                              HWIntrinsicSwitchCaseBody emitSwCase)
{
    assert(nonConstImmReg != REG_NA);
    // AVX2 Gather intrinsics use managed non-const fallback since they have discrete imm8 value range
    // that does work with the current compiler generated jump-table fallback
    assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
    emitter* emit = getEmitter();

    // One table entry per possible imm8 value; the upper bound is inclusive, hence the +1.
    const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
    assert(maxByte <= 256);
    BasicBlock* jmpTable[256];

    unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);

    // Emit the jump table: one fresh temp label per imm8 value.
    for (unsigned i = 0; i < maxByte; i++)
    {
        jmpTable[i] = genCreateTempLabel();
        emit->emitDataGenData(i, jmpTable[i]);
    }

    emit->emitDataGenEnd();

    // Compute and jump to the appropriate offset in the switch table
    emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);

    // Load the 4-byte table entry at [offsReg + nonConstImmReg * 4] ...
    emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
    // ... rebase it against the method start (entries appear to be method-relative given the
    // lea of fgFirstBB here — see emitBBTableDataGenBeg), then jump indirectly through it.
    emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
    emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
    emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);

    // Emit the switch table entries

    BasicBlock* switchTableBeg = genCreateTempLabel();
    BasicBlock* switchTableEnd = genCreateTempLabel();

    genDefineTempLabel(switchTableBeg);

    // Each case emits the intrinsic with imm8 == i, then jumps past the end of the table.
    for (unsigned i = 0; i < maxByte; i++)
    {
        genDefineTempLabel(jmpTable[i]);
        emitSwCase((int8_t)i);
        emit->emitIns_J(INS_jmp, switchTableEnd);
    }

    genDefineTempLabel(switchTableEnd);
}
1188
1189 //------------------------------------------------------------------------
1190 // genBaseIntrinsic: Generates the code for a base hardware intrinsic node
1191 //
1192 // Arguments:
1193 //    node - The hardware intrinsic node
1194 //
1195 // Note:
1196 //    We currently assume that all base intrinsics have zero or one operand.
1197 //
1198 void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
1199 {
1200     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1201     regNumber      targetReg   = node->gtRegNum;
1202     var_types      targetType  = node->TypeGet();
1203     var_types      baseType    = node->gtSIMDBaseType;
1204
1205     assert(compiler->compSupports(InstructionSet_SSE));
1206     assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));
1207
1208     GenTree* op1 = node->gtGetOp1();
1209
1210     genConsumeHWIntrinsicOperands(node);
1211     regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->gtRegNum;
1212
1213     assert(node->gtGetOp2() == nullptr);
1214
1215     emitter*    emit = getEmitter();
1216     emitAttr    attr = EA_ATTR(node->gtSIMDSize);
1217     instruction ins  = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1218
1219     switch (intrinsicId)
1220     {
1221         case NI_Base_Vector128_CreateScalarUnsafe:
1222         case NI_Base_Vector256_CreateScalarUnsafe:
1223         {
1224             if (varTypeIsIntegral(baseType))
1225             {
1226                 genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType));
1227             }
1228             else
1229             {
1230                 assert(varTypeIsFloating(baseType));
1231
1232                 attr = emitTypeSize(baseType);
1233
1234                 if (op1->isContained() || op1->isUsedFromSpillTemp())
1235                 {
1236                     genHWIntrinsic_R_RM(node, ins, attr);
1237                 }
1238                 else if (targetReg != op1Reg)
1239                 {
1240                     // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1241                     emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1242                 }
1243             }
1244             break;
1245         }
1246
1247         case NI_Base_Vector128_ToScalar:
1248         case NI_Base_Vector256_ToScalar:
1249         {
1250             assert(varTypeIsFloating(baseType));
1251
1252             attr = emitTypeSize(TYP_SIMD16);
1253
1254             if (op1->isContained() || op1->isUsedFromSpillTemp())
1255             {
1256                 genHWIntrinsic_R_RM(node, ins, attr);
1257             }
1258             else if (targetReg != op1Reg)
1259             {
1260                 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1261                 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1262             }
1263             break;
1264         }
1265
1266         case NI_Base_Vector128_ToVector256:
1267         {
1268             // ToVector256 has zero-extend semantics in order to ensure it is deterministic
1269             // We always emit a move to the target register, even when op1Reg == targetReg,
1270             // in order to ensure that Bits MAXVL-1:128 are zeroed.
1271
1272             attr = emitTypeSize(TYP_SIMD16);
1273
1274             if (op1->isContained() || op1->isUsedFromSpillTemp())
1275             {
1276                 genHWIntrinsic_R_RM(node, ins, attr);
1277             }
1278             else
1279             {
1280                 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1281                 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1282             }
1283             break;
1284         }
1285
1286         case NI_Base_Vector128_ToVector256Unsafe:
1287         case NI_Base_Vector256_GetLower:
1288         {
1289             if (op1->isContained() || op1->isUsedFromSpillTemp())
1290             {
1291                 genHWIntrinsic_R_RM(node, ins, attr);
1292             }
1293             else if (targetReg != op1Reg)
1294             {
1295                 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1296                 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1297             }
1298             break;
1299         }
1300
1301         case NI_Base_Vector128_Zero:
1302         case NI_Base_Vector256_Zero:
1303         {
1304             assert(op1 == nullptr);
1305             emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
1306             break;
1307         }
1308
1309         default:
1310         {
1311             unreached();
1312             break;
1313         }
1314     }
1315
1316     genProduceReg(node);
1317 }
1318
1319 //------------------------------------------------------------------------
1320 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
1321 //
1322 // Arguments:
1323 //    node - The hardware intrinsic node
1324 //
1325 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
1326 {
1327     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1328     GenTree*       op1         = node->gtGetOp1();
1329     GenTree*       op2         = node->gtGetOp2();
1330     regNumber      targetReg   = node->gtRegNum;
1331     var_types      baseType    = node->gtSIMDBaseType;
1332
1333     regNumber op1Reg = REG_NA;
1334     emitter*  emit   = getEmitter();
1335
1336     genConsumeHWIntrinsicOperands(node);
1337
1338     switch (intrinsicId)
1339     {
1340         case NI_SSE_CompareEqualOrderedScalar:
1341         case NI_SSE_CompareEqualUnorderedScalar:
1342         {
1343             assert(baseType == TYP_FLOAT);
1344             regNumber   tmpReg = node->GetSingleTempReg();
1345             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1346
1347             // Ensure we aren't overwriting targetReg
1348             assert(tmpReg != targetReg);
1349
1350             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1351             emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1352             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1353             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1354             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1355             break;
1356         }
1357
1358         case NI_SSE_CompareGreaterThanOrderedScalar:
1359         case NI_SSE_CompareGreaterThanUnorderedScalar:
1360         {
1361             assert(baseType == TYP_FLOAT);
1362             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1363
1364             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1365             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1366             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1367             break;
1368         }
1369
1370         case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
1371         case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
1372         {
1373             assert(baseType == TYP_FLOAT);
1374             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1375
1376             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1377             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1378             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1379             break;
1380         }
1381
1382         case NI_SSE_CompareLessThanOrderedScalar:
1383         case NI_SSE_CompareLessThanUnorderedScalar:
1384         {
1385             assert(baseType == TYP_FLOAT);
1386             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1387
1388             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1389             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1390             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1391             break;
1392         }
1393
1394         case NI_SSE_CompareLessThanOrEqualOrderedScalar:
1395         case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
1396         {
1397             assert(baseType == TYP_FLOAT);
1398             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1399
1400             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1401             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1402             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1403             break;
1404         }
1405
1406         case NI_SSE_CompareNotEqualOrderedScalar:
1407         case NI_SSE_CompareNotEqualUnorderedScalar:
1408         {
1409             assert(baseType == TYP_FLOAT);
1410             regNumber   tmpReg = node->GetSingleTempReg();
1411             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1412
1413             // Ensure we aren't overwriting targetReg
1414             assert(tmpReg != targetReg);
1415
1416             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1417             emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1418             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1419             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1420             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1421             break;
1422         }
1423
1424         case NI_SSE_X64_ConvertToInt64:
1425         case NI_SSE_X64_ConvertToInt64WithTruncation:
1426         {
1427             assert(targetType == TYP_LONG);
1428             assert(op1 != nullptr);
1429             assert(op2 == nullptr);
1430             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1431             genHWIntrinsic_R_RM(node, ins, EA_8BYTE);
1432             break;
1433         }
1434
1435         case NI_SSE_X64_ConvertScalarToVector128Single:
1436         {
1437             assert(baseType == TYP_LONG);
1438             assert(op1 != nullptr);
1439             assert(op2 != nullptr);
1440             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1441             genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1442             break;
1443         }
1444
1445         case NI_SSE_Prefetch0:
1446         case NI_SSE_Prefetch1:
1447         case NI_SSE_Prefetch2:
1448         case NI_SSE_PrefetchNonTemporal:
1449         {
1450             assert(baseType == TYP_UBYTE);
1451             assert(op2 == nullptr);
1452
1453             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1454             op1Reg          = op1->gtRegNum;
1455             emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
1456             break;
1457         }
1458
1459         case NI_SSE_StoreFence:
1460         {
1461             assert(baseType == TYP_VOID);
1462             assert(op1 == nullptr);
1463             assert(op2 == nullptr);
1464             emit->emitIns(INS_sfence);
1465             break;
1466         }
1467
1468         default:
1469             unreached();
1470             break;
1471     }
1472
1473     genProduceReg(node);
1474 }
1475
1476 //------------------------------------------------------------------------
1477 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
1478 //
1479 // Arguments:
1480 //    node - The hardware intrinsic node
1481 //
1482 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
1483 {
1484     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1485     GenTree*       op1         = node->gtGetOp1();
1486     GenTree*       op2         = node->gtGetOp2();
1487     regNumber      targetReg   = node->gtRegNum;
1488     var_types      targetType  = node->TypeGet();
1489     var_types      baseType    = node->gtSIMDBaseType;
1490     regNumber      op1Reg      = REG_NA;
1491     regNumber      op2Reg      = REG_NA;
1492     emitter*       emit        = getEmitter();
1493
1494     genConsumeHWIntrinsicOperands(node);
1495
1496     switch (intrinsicId)
1497     {
1498         // All integer overloads are handled by table codegen
1499         case NI_SSE2_CompareLessThan:
1500         {
1501             assert(op1 != nullptr);
1502             assert(op2 != nullptr);
1503
1504             assert(baseType == TYP_DOUBLE);
1505
1506             int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
1507             assert((ival >= 0) && (ival <= 127));
1508
1509             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1510             op1Reg          = op1->gtRegNum;
1511             op2Reg          = op2->gtRegNum;
1512             emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
1513
1514             break;
1515         }
1516
1517         case NI_SSE2_CompareEqualOrderedScalar:
1518         case NI_SSE2_CompareEqualUnorderedScalar:
1519         {
1520             assert(baseType == TYP_DOUBLE);
1521             regNumber   tmpReg = node->GetSingleTempReg();
1522             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1523
1524             // Ensure we aren't overwriting targetReg
1525             assert(tmpReg != targetReg);
1526
1527             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1528             emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1529             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1530             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1531             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1532             break;
1533         }
1534
1535         case NI_SSE2_CompareGreaterThanOrderedScalar:
1536         case NI_SSE2_CompareGreaterThanUnorderedScalar:
1537         {
1538             assert(baseType == TYP_DOUBLE);
1539             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1540
1541             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1542             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1543             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1544             break;
1545         }
1546
1547         case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
1548         case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
1549         {
1550             assert(baseType == TYP_DOUBLE);
1551             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1552
1553             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1554             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1555             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1556             break;
1557         }
1558
1559         case NI_SSE2_CompareLessThanOrderedScalar:
1560         case NI_SSE2_CompareLessThanUnorderedScalar:
1561         {
1562             assert(baseType == TYP_DOUBLE);
1563             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1564
1565             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1566             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1567             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1568             break;
1569         }
1570
1571         case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
1572         case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
1573         {
1574             assert(baseType == TYP_DOUBLE);
1575             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1576
1577             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1578             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1579             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1580             break;
1581         }
1582
1583         case NI_SSE2_CompareNotEqualOrderedScalar:
1584         case NI_SSE2_CompareNotEqualUnorderedScalar:
1585         {
1586             assert(baseType == TYP_DOUBLE);
1587             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1588             regNumber   tmpReg = node->GetSingleTempReg();
1589
1590             // Ensure we aren't overwriting targetReg
1591             assert(tmpReg != targetReg);
1592
1593             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1594             emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1595             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1596             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1597             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1598             break;
1599         }
1600
1601         case NI_SSE2_X64_ConvertScalarToVector128Double:
1602         {
1603             assert(baseType == TYP_LONG);
1604             assert(op1 != nullptr);
1605             assert(op2 != nullptr);
1606             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1607             genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1608             break;
1609         }
1610
1611         case NI_SSE2_X64_ConvertScalarToVector128Int64:
1612         case NI_SSE2_X64_ConvertScalarToVector128UInt64:
1613         {
1614             assert(baseType == TYP_LONG || baseType == TYP_ULONG);
1615             assert(op1 != nullptr);
1616             assert(op2 == nullptr);
1617             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1618             genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType));
1619             break;
1620         }
1621
1622         case NI_SSE2_ConvertToInt32:
1623         case NI_SSE2_ConvertToInt32WithTruncation:
1624         case NI_SSE2_ConvertToUInt32:
1625         case NI_SSE2_X64_ConvertToInt64:
1626         case NI_SSE2_X64_ConvertToInt64WithTruncation:
1627         case NI_SSE2_X64_ConvertToUInt64:
1628         {
1629             assert(op2 == nullptr);
1630             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1631
1632             if (varTypeIsIntegral(baseType))
1633             {
1634                 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1635                 op1Reg = op1->gtRegNum;
1636                 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1637             }
1638             else
1639             {
1640                 assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT);
1641                 genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType));
1642             }
1643             break;
1644         }
1645
1646         case NI_SSE2_LoadFence:
1647         {
1648             assert(baseType == TYP_VOID);
1649             assert(op1 == nullptr);
1650             assert(op2 == nullptr);
1651             emit->emitIns(INS_lfence);
1652             break;
1653         }
1654
1655         case NI_SSE2_MemoryFence:
1656         {
1657             assert(baseType == TYP_VOID);
1658             assert(op1 == nullptr);
1659             assert(op2 == nullptr);
1660             emit->emitIns(INS_mfence);
1661             break;
1662         }
1663
1664         case NI_SSE2_StoreNonTemporal:
1665         case NI_SSE2_X64_StoreNonTemporal:
1666         {
1667             assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1668             assert(op1 != nullptr);
1669             assert(op2 != nullptr);
1670
1671             op2Reg          = op2->gtRegNum;
1672             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1673             op1Reg          = op1->gtRegNum;
1674             emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
1675             break;
1676         }
1677
1678         default:
1679             unreached();
1680             break;
1681     }
1682
1683     genProduceReg(node);
1684 }
1685
1686 //------------------------------------------------------------------------
1687 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1688 //
1689 // Arguments:
1690 //    node - The hardware intrinsic node
1691 //
void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    regNumber      targetReg   = node->gtRegNum;
    var_types      baseType    = node->gtSIMDBaseType;

    regNumber op1Reg = REG_NA;
    emitter*  emit   = getEmitter();

    genConsumeHWIntrinsicOperands(node);

    switch (intrinsicId)
    {
        case NI_SSE41_TestAllOnes:
        {
            // Materialize an all-ones vector in tmpReg (pcmpeqd reg,reg sets every bit),
            // then use ptest: CF is set iff (NOT op1) AND tmpReg == 0, i.e. iff op1 is
            // all ones. setb captures CF; movzx widens the byte result to the full register.
            op1Reg           = op1->gtRegNum;
            regNumber tmpReg = node->GetSingleTempReg();
            assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
            emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
            emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
            emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE41_TestAllZeros:
        case NI_SSE41_TestZ:
        {
            // ptest sets ZF iff op1 AND op2 == 0; sete captures ZF as the boolean result.
            assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
            genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE41_TestC:
        {
            // ptest sets CF iff (NOT op1) AND op2 == 0; setb captures CF.
            assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
            genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE41_TestMixOnesZeros:
        case NI_SSE41_TestNotZAndNotC:
        {
            // seta is (CF == 0 && ZF == 0), i.e. neither "all zeros" nor "all ones" held.
            assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
            genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE41_Extract:
        case NI_SSE41_X64_Extract:
        {
            // For TYP_FLOAT the extract lands in a GP register first and is then moved
            // back into the XMM target. Note the temp registers must be extracted in a
            // fixed order: tmpTargetReg (float case) first, then baseReg/offsReg below.
            regNumber   tmpTargetReg = REG_NA;
            instruction ins          = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            if (baseType == TYP_FLOAT)
            {
                tmpTargetReg = node->ExtractTempReg();
            }

            // Emits a single extract for the immediate value 'i'; used directly for a
            // constant imm-op, or once per possible value via the jump-table fallback.
            auto emitSwCase = [&](int8_t i) {
                if (baseType == TYP_FLOAT)
                {
                    // extract instructions return to GP-registers, so it needs int size as the emitsize
                    inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i);
                    emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
                }
                else
                {
                    inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i);
                }
            };

            if (op2->IsCnsIntOrI())
            {
                ssize_t ival = op2->AsIntCon()->IconValue();
                assert((ival >= 0) && (ival <= 255));
                emitSwCase((int8_t)ival);
            }
            else
            {
                // We emit a fallback case for the scenario when the imm-op is not a constant. This should
                // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
                // can also occur if the consumer calls it directly and just doesn't pass a constant value.
                regNumber baseReg = node->ExtractTempReg();
                regNumber offsReg = node->GetSingleTempReg();
                genHWIntrinsicJumpTableFallback(intrinsicId, op2->gtRegNum, baseReg, offsReg, emitSwCase);
            }
            break;
        }

        default:
            unreached();
            break;
    }

    genProduceReg(node);
}
1796
1797 //------------------------------------------------------------------------
1798 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1799 //
1800 // Arguments:
1801 //    node - The hardware intrinsic node
1802 //
1803 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
1804 {
1805     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1806     regNumber      targetReg   = node->gtRegNum;
1807     GenTree*       op1         = node->gtGetOp1();
1808     GenTree*       op2         = node->gtGetOp2();
1809     var_types      baseType    = node->gtSIMDBaseType;
1810     var_types      targetType  = node->TypeGet();
1811     emitter*       emit        = getEmitter();
1812
1813     genConsumeHWIntrinsicOperands(node);
1814     regNumber op1Reg = op1->gtRegNum;
1815
1816     assert(targetReg != REG_NA);
1817     assert(op1Reg != REG_NA);
1818     assert(op2 != nullptr);
1819     assert(!node->OperIsCommutative());
1820
1821     switch (intrinsicId)
1822     {
1823         case NI_SSE42_Crc32:
1824         case NI_SSE42_X64_Crc32:
1825         {
1826             if (op1Reg != targetReg)
1827             {
1828                 assert(op2->gtRegNum != targetReg);
1829                 emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg);
1830             }
1831
1832             // This makes the genHWIntrinsic_R_RM code much simpler, as we don't need an
1833             // overload that explicitly takes the operands.
1834             node->gtOp1 = op2;
1835             node->gtOp2 = nullptr;
1836
1837             if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument
1838             {
1839                 assert(targetType == TYP_INT);
1840                 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType));
1841             }
1842             else
1843             {
1844                 assert(op1->TypeGet() == op2->TypeGet());
1845                 assert((targetType == TYP_INT) || (targetType == TYP_LONG));
1846                 genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType));
1847             }
1848
1849             break;
1850         }
1851
1852         default:
1853         {
1854             unreached();
1855             break;
1856         }
1857     }
1858
1859     genProduceReg(node);
1860 }
1861
1862 //------------------------------------------------------------------------
1863 // genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
1864 //
1865 // Arguments:
1866 //    node - The hardware intrinsic node
1867 //
1868 void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
1869 {
1870     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1871     var_types      baseType    = node->gtSIMDBaseType;
1872     emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
1873     var_types      targetType  = node->TypeGet();
1874     instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1875     int            numArgs     = HWIntrinsicInfo::lookupNumArgs(node);
1876     GenTree*       op1         = node->gtGetOp1();
1877     GenTree*       op2         = node->gtGetOp2();
1878     regNumber      op1Reg      = REG_NA;
1879     regNumber      op2Reg      = REG_NA;
1880     regNumber      targetReg   = node->gtRegNum;
1881     emitter*       emit        = getEmitter();
1882
1883     genConsumeHWIntrinsicOperands(node);
1884
1885     switch (intrinsicId)
1886     {
1887         case NI_AVX2_ConvertToInt32:
1888         case NI_AVX2_ConvertToUInt32:
1889         {
1890             op1Reg = op1->gtRegNum;
1891             assert(numArgs == 1);
1892             assert((baseType == TYP_INT) || (baseType == TYP_UINT));
1893             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1894             emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1895             break;
1896         }
1897
1898         case NI_AVX2_GatherVector128:
1899         case NI_AVX2_GatherVector256:
1900         case NI_AVX2_GatherMaskVector128:
1901         case NI_AVX2_GatherMaskVector256:
1902         {
1903             GenTreeArgList* list = op1->AsArgList();
1904             op1                  = list->Current();
1905             op1Reg               = op1->gtRegNum;
1906
1907             list   = list->Rest();
1908             op2    = list->Current();
1909             op2Reg = op2->gtRegNum;
1910
1911             list         = list->Rest();
1912             GenTree* op3 = list->Current();
1913
1914             list             = list->Rest();
1915             GenTree* op4     = nullptr;
1916             GenTree* lastOp  = nullptr;
1917             GenTree* indexOp = nullptr;
1918
1919             regNumber op3Reg       = REG_NA;
1920             regNumber op4Reg       = REG_NA;
1921             regNumber addrBaseReg  = REG_NA;
1922             regNumber addrIndexReg = REG_NA;
1923             regNumber maskReg      = node->ExtractTempReg(RBM_ALLFLOAT);
1924
1925             if (numArgs == 5)
1926             {
1927                 assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
1928                 op4          = list->Current();
1929                 list         = list->Rest();
1930                 lastOp       = list->Current();
1931                 op3Reg       = op3->gtRegNum;
1932                 op4Reg       = op4->gtRegNum;
1933                 addrBaseReg  = op2Reg;
1934                 addrIndexReg = op3Reg;
1935                 indexOp      = op3;
1936
1937                 // copy op4Reg into the tmp mask register,
1938                 // the mask register will be cleared by gather instructions
1939                 emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);
1940
1941                 if (targetReg != op1Reg)
1942                 {
1943                     // copy source vector to the target register for masking merge
1944                     emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1945                 }
1946             }
1947             else
1948             {
1949                 assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
1950                 addrBaseReg  = op1Reg;
1951                 addrIndexReg = op2Reg;
1952                 indexOp      = op2;
1953                 lastOp       = op3;
1954
1955                 // generate all-one mask vector
1956                 emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
1957             }
1958
1959             bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);
1960
1961             // hwintrinsiclistxarch.h uses Dword index instructions in default
1962             if (varTypeIsLong(node->gtIndexBaseType))
1963             {
1964                 switch (ins)
1965                 {
1966                     case INS_vpgatherdd:
1967                         ins = INS_vpgatherqd;
1968                         if (isVector128GatherWithVector256Index)
1969                         {
1970                             // YMM index in address mode
1971                             attr = emitTypeSize(TYP_SIMD32);
1972                         }
1973                         break;
1974                     case INS_vpgatherdq:
1975                         ins = INS_vpgatherqq;
1976                         break;
1977                     case INS_vgatherdps:
1978                         ins = INS_vgatherqps;
1979                         if (isVector128GatherWithVector256Index)
1980                         {
1981                             // YMM index in address mode
1982                             attr = emitTypeSize(TYP_SIMD32);
1983                         }
1984                         break;
1985                     case INS_vgatherdpd:
1986                         ins = INS_vgatherqpd;
1987                         break;
1988                     default:
1989                         unreached();
1990                 }
1991             }
1992
1993             assert(lastOp->IsCnsIntOrI());
1994             ssize_t ival = lastOp->AsIntCon()->IconValue();
1995             assert((ival >= 0) && (ival <= 255));
1996
1997             assert(targetReg != maskReg);
1998             assert(targetReg != addrIndexReg);
1999             assert(maskReg != addrIndexReg);
2000             emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);
2001
2002             break;
2003         }
2004
2005         case NI_AVX_TestC:
2006         {
2007             genHWIntrinsic_R_RM(node, ins, attr);
2008             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
2009             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2010             break;
2011         }
2012
2013         case NI_AVX_TestNotZAndNotC:
2014         {
2015             genHWIntrinsic_R_RM(node, ins, attr);
2016             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
2017             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2018             break;
2019         }
2020
2021         case NI_AVX_TestZ:
2022         {
2023             genHWIntrinsic_R_RM(node, ins, attr);
2024             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
2025             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
2026             break;
2027         }
2028
2029         default:
2030             unreached();
2031             break;
2032     }
2033
2034     genProduceReg(node);
2035 }
2036
2037 //------------------------------------------------------------------------
2038 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
2039 //
2040 // Arguments:
2041 //    node - The hardware intrinsic node
2042 //
void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
{
    // AES codegen is not implemented yet; NYI reports a not-yet-implemented failure.
    NYI("Implement AES intrinsic code generation");
}
2047
2048 //------------------------------------------------------------------------
2049 // genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 and BMI2 hardware intrinsic node
2050 //
2051 // Arguments:
2052 //    node - The hardware intrinsic node
2053 //
void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    var_types      targetType  = node->TypeGet();
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);
    emitter*       emit        = getEmitter();

    assert(targetReg != REG_NA);
    assert(op1 != nullptr);

    genConsumeHWIntrinsicOperands(node);

    switch (intrinsicId)
    {
        // Two-operand (plus destination) forms: emit as reg = op(reg, reg/mem).
        case NI_BMI1_AndNot:
        case NI_BMI1_X64_AndNot:
        case NI_BMI1_BitFieldExtract:
        case NI_BMI1_X64_BitFieldExtract:
        case NI_BMI2_ParallelBitDeposit:
        case NI_BMI2_ParallelBitExtract:
        case NI_BMI2_X64_ParallelBitDeposit:
        case NI_BMI2_X64_ParallelBitExtract:
        case NI_BMI2_ZeroHighBits:
        case NI_BMI2_X64_ZeroHighBits:
        {
            assert(op2 != nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet()));
            break;
        }

        // Single-operand forms: emit as reg = op(reg/mem).
        case NI_BMI1_ExtractLowestSetBit:
        case NI_BMI1_GetMaskUpToLowestSetBit:
        case NI_BMI1_ResetLowestSetBit:
        case NI_BMI1_X64_ExtractLowestSetBit:
        case NI_BMI1_X64_GetMaskUpToLowestSetBit:
        case NI_BMI1_X64_ResetLowestSetBit:
        {
            assert(op2 == nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
            break;
        }

        case NI_BMI1_TrailingZeroCount:
        case NI_BMI1_X64_TrailingZeroCount:
        {
            // tzcnt goes through genXCNTIntrinsic so the false dependency on the
            // target register can be broken first.
            assert(op2 == nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genXCNTIntrinsic(node, ins);
            break;
        }

        case NI_BMI2_MultiplyNoFlags:
        case NI_BMI2_X64_MultiplyNoFlags:
        {
            // MULX: implicit source is EDX/RDX; produces a high half (targetReg) and a
            // low half (lowReg). The 3-arg form additionally stores the low half through
            // the address in op3.
            int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
            assert(numArgs == 2 || numArgs == 3);

            regNumber op1Reg = REG_NA;
            regNumber op2Reg = REG_NA;
            regNumber op3Reg = REG_NA;
            regNumber lowReg = REG_NA;

            if (numArgs == 2)
            {
                op1Reg = op1->gtRegNum;
                op2Reg = op2->gtRegNum;
                lowReg = targetReg;
            }
            else
            {
                // 3-arg form: operands arrive as an arg list; a temp holds the low half
                // since targetReg is reserved for the high half.
                GenTreeArgList* argList = op1->AsArgList();
                op1                     = argList->Current();
                op1Reg                  = op1->gtRegNum;
                argList                 = argList->Rest();
                op2                     = argList->Current();
                op2Reg                  = op2->gtRegNum;
                argList                 = argList->Rest();
                GenTree* op3            = argList->Current();
                op3Reg                  = op3->gtRegNum;
                assert(op3Reg != op1Reg);
                assert(op3Reg != targetReg);
                assert(op3Reg != REG_EDX);
                lowReg = node->GetSingleTempReg();
                assert(op3Reg != lowReg);
                assert(lowReg != targetReg);
            }

            emitAttr attr = emitTypeSize(targetType);
            // mov the first operand into implicit source operand EDX/RDX
            if (op1Reg != REG_EDX)
            {
                assert(op2Reg != REG_EDX);
                emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg);
            }

            // generate code for MULX
            genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2);

            // If the lower half of the result is required, store it to the memory
            // pointed to by op3
            if (numArgs == 3)
            {
                emit->emitIns_AR_R(INS_mov, attr, lowReg, op3Reg, 0);
            }

            break;
        }

        default:
        {
            unreached();
            break;
        }
    }

    genProduceReg(node);
}
2175
2176 //------------------------------------------------------------------------
2177 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
2178 //
2179 // Arguments:
2180 //    node - The hardware intrinsic node
2181 //
void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    var_types      baseType    = node->gtSIMDBaseType;
    emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
    GenTree*       op1         = node->gtGetOp1();
    regNumber      targetReg   = node->gtRegNum;

    assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);

    genConsumeHWIntrinsicOperands(node);
    GenTreeArgList* argList = op1->AsArgList();
    op1                     = argList->Current();

    argList      = argList->Rest();
    GenTree* op2 = argList->Current();

    argList      = argList->Rest();
    GenTree* op3 = argList->Current();

    regNumber op1Reg;
    regNumber op2Reg;

    bool       isCommutative   = false;
    const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);

    // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
    assert(!copiesUpperBits || !op1->isContained());

    // FMA has three encodings (132, 213, 231) that differ in which operand may come
    // from memory; pick the one matching the contained operand. lookupIns returns
    // the 213 form, and the ins-1/ins+1 adjustments below rely on the 132/213/231
    // forms being adjacent in the instruction table.
    if (op3->isContained() || op3->isUsedFromSpillTemp())
    {
        // 213 form: op1 = (op2 * op1) + [op3]

        op1Reg = op1->gtRegNum;
        op2Reg = op2->gtRegNum;

        isCommutative = !copiesUpperBits;
    }
    else if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        // 132 form: op1 = (op1 * op3) + [op2]

        ins    = (instruction)(ins - 1);
        op1Reg = op1->gtRegNum;
        op2Reg = op3->gtRegNum;
        op3    = op2;
    }
    else if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        // 231 form: op3 = (op2 * op3) + [op1]

        ins    = (instruction)(ins + 1);
        op1Reg = op3->gtRegNum;
        op2Reg = op2->gtRegNum;
        op3    = op1;
    }
    else
    {
        // 213 form: op1 = (op2 * op1) + op3

        op1Reg = op1->gtRegNum;
        op2Reg = op2->gtRegNum;

        isCommutative = !copiesUpperBits;
    }

    if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
    {
        assert(node->isRMWHWIntrinsic(compiler));

        // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
        //
        // For non-commutative intrinsics, we should have ensured that op2 was marked
        // delay free in order to prevent it from getting assigned the same register
        // as target. However, for commutative intrinsics, we can just swap the operands
        // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

        op2Reg = op1Reg;
        op1Reg = targetReg;
    }

    genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
    genProduceReg(node);
}
2267
2268 //------------------------------------------------------------------------
2269 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
2270 //
2271 // Arguments:
2272 //    node - The hardware intrinsic node
2273 //
2274 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
2275 {
2276     assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount ||
2277            node->gtHWIntrinsicId == NI_LZCNT_X64_LeadingZeroCount);
2278
2279     genConsumeOperands(node);
2280     genXCNTIntrinsic(node, INS_lzcnt);
2281     genProduceReg(node);
2282 }
2283
2284 //------------------------------------------------------------------------
2285 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
2286 //
2287 // Arguments:
2288 //    node - The hardware intrinsic node
2289 //
void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
{
    // PCLMULQDQ codegen is not implemented yet; NYI reports a not-yet-implemented failure.
    NYI("Implement PCLMULQDQ intrinsic code generation");
}
2294
2295 //------------------------------------------------------------------------
2296 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
2297 //
2298 // Arguments:
2299 //    node - The hardware intrinsic node
2300 //
2301 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
2302 {
2303     assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount || node->gtHWIntrinsicId == NI_POPCNT_X64_PopCount);
2304
2305     genConsumeOperands(node);
2306     genXCNTIntrinsic(node, INS_popcnt);
2307     genProduceReg(node);
2308 }
2309
2310 //------------------------------------------------------------------------
2311 // genXCNTIntrinsic: Generates the code for a lzcnt/tzcnt/popcnt hardware intrinsic node, breaks false dependencies on
2312 // the target register
2313 //
2314 // Arguments:
2315 //    node - The hardware intrinsic node
2316 //    ins  - The instruction being generated
2317 //
2318 void CodeGen::genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins)
2319 {
2320     // LZCNT/TZCNT/POPCNT have a false dependency on the target register on Intel Sandy Bridge, Haswell, and Skylake
2321     // (POPCNT only) processors, so insert a `XOR target, target` to break the dependency via XOR triggering register
2322     // renaming, but only if it's not an actual dependency.
2323
2324     GenTree*  op1        = node->gtGetOp1();
2325     regNumber sourceReg1 = REG_NA;
2326     regNumber sourceReg2 = REG_NA;
2327
2328     if (!op1->isContained())
2329     {
2330         sourceReg1 = op1->gtRegNum;
2331     }
2332     else if (op1->isIndir())
2333     {
2334         GenTreeIndir* indir   = op1->AsIndir();
2335         GenTree*      memBase = indir->Base();
2336
2337         if (memBase != nullptr)
2338         {
2339             sourceReg1 = memBase->gtRegNum;
2340         }
2341
2342         if (indir->HasIndex())
2343         {
2344             sourceReg2 = indir->Index()->gtRegNum;
2345         }
2346     }
2347
2348     regNumber targetReg = node->gtRegNum;
2349     if ((targetReg != sourceReg1) && (targetReg != sourceReg2))
2350     {
2351         getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
2352     }
2353     genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2354 }
2355
2356 #endif // FEATURE_HW_INTRINSICS