42dba8250a5df8287ec69971325dae438d6f05f7
[platform/upstream/coreclr.git] / src / jit / hwintrinsiccodegenxarch.cpp
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7 XX                                                                           XX
8 XX               Intel hardware intrinsic Code Generator                     XX
9 XX                                                                           XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
12 */
13 #include "jitpch.h"
14 #ifdef _MSC_VER
15 #pragma hdrstop
16 #endif
17
18 #ifdef FEATURE_HW_INTRINSICS
19
20 #include "emit.h"
21 #include "codegen.h"
22 #include "sideeffects.h"
23 #include "lower.h"
24 #include "gcinfo.h"
25 #include "gcinfoencoder.h"
26
27 //------------------------------------------------------------------------
28 // assertIsContainableHWIntrinsicOp: Asserts that op is containable by node
29 //
30 // Arguments:
31 //    lowering - The lowering phase from the compiler
32 //    node     - The HWIntrinsic node that has the contained node
33 //    op       - The op that is contained
34 //
35 static void assertIsContainableHWIntrinsicOp(Lowering* lowering, GenTreeHWIntrinsic* node, GenTree* op)
36 {
37 #if DEBUG
38     // The Lowering::IsContainableHWIntrinsicOp call is not quite right, since it follows pre-register allocation
39     // logic. However, this check is still important due to the various containment rules that SIMD intrinsics follow.
40     //
41     // We use isContainable to track the special HWIntrinsic node containment rules (for things like LoadAligned and
42     // LoadUnaligned) and we use the supportsRegOptional check to support general-purpose loads (both from stack
43     // spillage
44     // and for isUsedFromMemory contained nodes, in the case where the register allocator decided to not allocate a
45     // register
46     // in the first place).
47
48     bool supportsRegOptional = false;
49     bool isContainable       = lowering->IsContainableHWIntrinsicOp(node, op, &supportsRegOptional);
50     assert(isContainable || supportsRegOptional);
51 #endif // DEBUG
52 }
53
54 //------------------------------------------------------------------------
55 // genIsTableDrivenHWIntrinsic:
56 //
57 // Arguments:
58 //    category - category of a HW intrinsic
59 //
60 // Return Value:
61 //    returns true if this category can be table-driven in CodeGen
62 //
63 static bool genIsTableDrivenHWIntrinsic(NamedIntrinsic intrinsicId, HWIntrinsicCategory category)
64 {
65     // TODO - make more categories to the table-driven framework
66     // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen
67     const bool tableDrivenCategory =
68         (category != HW_Category_Special) && (category != HW_Category_Scalar) && (category != HW_Category_Helper);
69     const bool tableDrivenFlag =
70         !HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId) && !HWIntrinsicInfo::HasSpecialCodegen(intrinsicId);
71     return tableDrivenCategory && tableDrivenFlag;
72 }
73
74 //------------------------------------------------------------------------
75 // genHWIntrinsic: Generates the code for a given hardware intrinsic node.
76 //
77 // Arguments:
78 //    node - The hardware intrinsic node
79 //
80 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
81 {
82     NamedIntrinsic      intrinsicId = node->gtHWIntrinsicId;
83     InstructionSet      isa         = HWIntrinsicInfo::lookupIsa(intrinsicId);
84     HWIntrinsicCategory category    = HWIntrinsicInfo::lookupCategory(intrinsicId);
85     int                 ival        = HWIntrinsicInfo::lookupIval(intrinsicId);
86     int                 numArgs     = HWIntrinsicInfo::lookupNumArgs(node);
87
88     assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));
89
90     if (genIsTableDrivenHWIntrinsic(intrinsicId, category))
91     {
92         GenTree*  op1        = node->gtGetOp1();
93         GenTree*  op2        = node->gtGetOp2();
94         regNumber targetReg  = node->gtRegNum;
95         var_types targetType = node->TypeGet();
96         var_types baseType   = node->gtSIMDBaseType;
97
98         regNumber op1Reg = REG_NA;
99         regNumber op2Reg = REG_NA;
100         emitter*  emit   = getEmitter();
101
102         assert(numArgs >= 0);
103         instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
104         assert(ins != INS_invalid);
105         emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
106         assert(simdSize != 0);
107
108         switch (numArgs)
109         {
110             case 1:
111             {
112                 if (node->OperIsMemoryLoad())
113                 {
114                     genConsumeAddress(op1);
115                     // Until we improve the handling of addressing modes in the emitter, we'll create a
116                     // temporary GT_IND to generate code with.
117                     GenTreeIndir load = indirForm(node->TypeGet(), op1);
118                     emit->emitInsLoadInd(ins, simdSize, node->gtRegNum, &load);
119                 }
120                 else
121                 {
122                     genConsumeRegs(op1);
123                     op1Reg = op1->gtRegNum;
124
125                     if ((ival != -1) && varTypeIsFloating(baseType))
126                     {
127                         assert((ival >= 0) && (ival <= 127));
128                         genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
129                     }
130                     else if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
131                     {
132                         emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
133                     }
134                     else
135                     {
136                         genHWIntrinsic_R_RM(node, ins, simdSize);
137                     }
138                 }
139                 break;
140             }
141
142             case 2:
143             {
144                 if (category == HW_Category_MemoryStore)
145                 {
146                     genConsumeAddress(op1);
147                     genConsumeReg(op2);
148                     // Until we improve the handling of addressing modes in the emitter, we'll create a
149                     // temporary GT_STORE_IND to generate code with.
150                     GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
151                     emit->emitInsStoreInd(ins, simdSize, &store);
152                     break;
153                 }
154                 genConsumeRegs(op1);
155                 genConsumeRegs(op2);
156
157                 op1Reg = op1->gtRegNum;
158                 op2Reg = op2->gtRegNum;
159
160                 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
161                 {
162                     // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
163                     //
164                     // For non-commutative intrinsics, we should have ensured that op2 was marked
165                     // delay free in order to prevent it from getting assigned the same register
166                     // as target. However, for commutative intrinsics, we can just swap the operands
167                     // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
168
169                     noway_assert(node->OperIsCommutative());
170                     op2Reg = op1Reg;
171                     op1Reg = targetReg;
172                 }
173
174                 if ((ival != -1) && varTypeIsFloating(baseType))
175                 {
176                     assert((ival >= 0) && (ival <= 127));
177                     genHWIntrinsic_R_R_RM_I(node, ins, static_cast<int8_t>(ival));
178                 }
179                 else if (category == HW_Category_MemoryLoad)
180                 {
181                     // Get the address and the 'other' register.
182                     GenTree*  addr;
183                     regNumber otherReg;
184                     if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
185                     {
186                         addr     = op1;
187                         otherReg = op2Reg;
188                     }
189                     else
190                     {
191                         addr     = op2;
192                         otherReg = op1Reg;
193                     }
194                     // Until we improve the handling of addressing modes in the emitter, we'll create a
195                     // temporary GT_IND to generate code with.
196                     GenTreeIndir load = indirForm(node->TypeGet(), addr);
197                     genHWIntrinsic_R_R_RM(node, ins, simdSize, targetReg, otherReg, &load);
198                 }
199                 else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
200                 {
201                     assert(ival == -1);
202
203                     if (intrinsicId == NI_SSE2_Extract)
204                     {
205                         // extract instructions return to GP-registers, so it needs int size as the emitsize
206                         simdSize = emitTypeSize(TYP_INT);
207                     }
208
209                     auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };
210
211                     if (op2->IsCnsIntOrI())
212                     {
213                         ssize_t ival = op2->AsIntCon()->IconValue();
214                         assert((ival >= 0) && (ival <= 255));
215                         emitSwCase((int8_t)ival);
216                     }
217                     else
218                     {
219                         // We emit a fallback case for the scenario when the imm-op is not a constant. This should
220                         // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
221                         // can also occur if the consumer calls it directly and just doesn't pass a constant value.
222                         regNumber baseReg = node->ExtractTempReg();
223                         regNumber offsReg = node->GetSingleTempReg();
224                         genHWIntrinsicJumpTableFallback(intrinsicId, op2Reg, baseReg, offsReg, emitSwCase);
225                     }
226                 }
227                 else
228                 {
229                     genHWIntrinsic_R_R_RM(node, ins, EA_ATTR(node->gtSIMDSize));
230                 }
231                 break;
232             }
233
234             case 3:
235             {
236                 GenTreeArgList* argList = op1->AsArgList();
237                 op1                     = argList->Current();
238                 genConsumeRegs(op1);
239                 op1Reg = op1->gtRegNum;
240
241                 argList = argList->Rest();
242                 op2     = argList->Current();
243                 genConsumeRegs(op2);
244                 op2Reg = op2->gtRegNum;
245
246                 argList      = argList->Rest();
247                 GenTree* op3 = argList->Current();
248                 genConsumeRegs(op3);
249                 regNumber op3Reg = op3->gtRegNum;
250
251                 if (HWIntrinsicInfo::isImmOp(intrinsicId, op3))
252                 {
253                     assert(ival == -1);
254
255                     auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_R_RM_I(node, ins, i); };
256
257                     if (op3->IsCnsIntOrI())
258                     {
259                         ssize_t ival = op3->AsIntCon()->IconValue();
260                         assert((ival >= 0) && (ival <= 255));
261                         emitSwCase((int8_t)ival);
262                     }
263                     else
264                     {
265                         // We emit a fallback case for the scenario when the imm-op is not a constant. This should
266                         // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
267                         // can also occur if the consumer calls it directly and just doesn't pass a constant value.
268                         regNumber baseReg = node->ExtractTempReg();
269                         regNumber offsReg = node->GetSingleTempReg();
270                         genHWIntrinsicJumpTableFallback(intrinsicId, op3Reg, baseReg, offsReg, emitSwCase);
271                     }
272                 }
273                 else if (category == HW_Category_MemoryStore)
274                 {
275                     // The Mask instructions do not currently support containment of the address.
276                     assert(!op2->isContained());
277                     if (intrinsicId == NI_AVX_MaskStore || intrinsicId == NI_AVX2_MaskStore)
278                     {
279                         emit->emitIns_AR_R_R(ins, simdSize, op2Reg, op3Reg, op1Reg, 0);
280                     }
281                     else
282                     {
283                         assert(intrinsicId == NI_SSE2_MaskMove);
284                         assert(targetReg == REG_NA);
285
286                         // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
287                         if (op3Reg != REG_EDI)
288                         {
289                             emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
290                         }
291                         emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
292                     }
293                 }
294                 else
295                 {
296                     switch (intrinsicId)
297                     {
298                         case NI_SSE41_BlendVariable:
299                         case NI_AVX_BlendVariable:
300                         case NI_AVX2_BlendVariable:
301                         {
302                             genHWIntrinsic_R_R_RM_R(node, ins);
303                             break;
304                         }
305
306                         default:
307                         {
308                             unreached();
309                             break;
310                         };
311                     }
312                 }
313                 break;
314             }
315
316             default:
317                 unreached();
318                 break;
319         }
320         genProduceReg(node);
321         return;
322     }
323
324     switch (isa)
325     {
326         case InstructionSet_Vector128:
327         case InstructionSet_Vector256:
328             genBaseIntrinsic(node);
329             break;
330         case InstructionSet_SSE:
331         case InstructionSet_SSE_X64:
332             genSSEIntrinsic(node);
333             break;
334         case InstructionSet_SSE2:
335         case InstructionSet_SSE2_X64:
336             genSSE2Intrinsic(node);
337             break;
338         case InstructionSet_SSE41:
339         case InstructionSet_SSE41_X64:
340             genSSE41Intrinsic(node);
341             break;
342         case InstructionSet_SSE42:
343         case InstructionSet_SSE42_X64:
344             genSSE42Intrinsic(node);
345             break;
346         case InstructionSet_AVX:
347         case InstructionSet_AVX2:
348             genAvxOrAvx2Intrinsic(node);
349             break;
350         case InstructionSet_AES:
351             genAESIntrinsic(node);
352             break;
353         case InstructionSet_BMI1:
354         case InstructionSet_BMI1_X64:
355         case InstructionSet_BMI2:
356         case InstructionSet_BMI2_X64:
357             genBMI1OrBMI2Intrinsic(node);
358             break;
359         case InstructionSet_FMA:
360             genFMAIntrinsic(node);
361             break;
362         case InstructionSet_LZCNT:
363         case InstructionSet_LZCNT_X64:
364             genLZCNTIntrinsic(node);
365             break;
366         case InstructionSet_PCLMULQDQ:
367             genPCLMULQDQIntrinsic(node);
368             break;
369         case InstructionSet_POPCNT:
370         case InstructionSet_POPCNT_X64:
371             genPOPCNTIntrinsic(node);
372             break;
373         default:
374             unreached();
375             break;
376     }
377 }
378
379 //------------------------------------------------------------------------
380 // genHWIntrinsic_R_RM: Generates the code for a hardware intrinsic node that takes a
381 //                      register/memory operand and that returns a value in register
382 //
383 // Arguments:
384 //    node - The hardware intrinsic node
385 //    ins  - The instruction being generated
386 //    attr - The emit attribute for the instruciton being generated
387 //
388 void CodeGen::genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
389 {
390     var_types targetType = node->TypeGet();
391     regNumber targetReg  = node->gtRegNum;
392     GenTree*  op1        = node->gtGetOp1();
393     GenTree*  op2        = node->gtGetOp2();
394     emitter*  emit       = getEmitter();
395
396     if (op2 != nullptr)
397     {
398         // The CompareScalarOrdered* and CompareScalarUnordered* intrinsics come down this
399         // code path. They are all MultiIns, as the return value comes from the flags and
400         // we have two operands instead.
401
402         assert(HWIntrinsicInfo::GeneratesMultipleIns(node->gtHWIntrinsicId));
403         assert(targetReg != REG_NA);
404
405         targetReg = op1->gtRegNum;
406         op1       = op2;
407         op2       = nullptr;
408     }
409     else
410     {
411         assert(!node->OperIsCommutative());
412     }
413
414     assert(targetReg != REG_NA);
415     assert(op2 == nullptr);
416
417     if (op1->isContained() || op1->isUsedFromSpillTemp())
418     {
419         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
420         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
421
422         TempDsc* tmpDsc = nullptr;
423         unsigned varNum = BAD_VAR_NUM;
424         unsigned offset = (unsigned)-1;
425
426         if (op1->isUsedFromSpillTemp())
427         {
428             assert(op1->IsRegOptional());
429
430             tmpDsc = getSpillTempDsc(op1);
431             varNum = tmpDsc->tdTempNum();
432             offset = 0;
433
434             regSet.tmpRlsTemp(tmpDsc);
435         }
436         else if (op1->isIndir() || op1->OperIsHWIntrinsic())
437         {
438             GenTree*      addr;
439             GenTreeIndir* memIndir = nullptr;
440
441             if (op1->isIndir())
442             {
443                 memIndir = op1->AsIndir();
444                 addr     = memIndir->Addr();
445             }
446             else
447             {
448                 assert(op1->AsHWIntrinsic()->OperIsMemoryLoad());
449                 assert(HWIntrinsicInfo::lookupNumArgs(op1->AsHWIntrinsic()) == 1);
450                 addr = op1->gtGetOp1();
451             }
452
453             switch (addr->OperGet())
454             {
455                 case GT_LCL_VAR_ADDR:
456                 {
457                     varNum = addr->AsLclVarCommon()->GetLclNum();
458                     offset = 0;
459                     break;
460                 }
461
462                 case GT_CLS_VAR_ADDR:
463                 {
464                     emit->emitIns_R_C(ins, attr, targetReg, addr->gtClsVar.gtClsVarHnd, 0);
465                     return;
466                 }
467
468                 default:
469                 {
470                     if (memIndir == nullptr)
471                     {
472                         // This is the HW intrinsic load case.
473                         // Until we improve the handling of addressing modes in the emitter, we'll create a
474                         // temporary GT_IND to generate code with.
475                         GenTreeIndir load = indirForm(op1->TypeGet(), addr);
476                         memIndir          = &load;
477                     }
478                     emit->emitIns_R_A(ins, attr, targetReg, memIndir);
479                     return;
480                 }
481             }
482         }
483         else
484         {
485             switch (op1->OperGet())
486             {
487                 case GT_LCL_FLD:
488                 {
489                     GenTreeLclFld* lclField = op1->AsLclFld();
490
491                     varNum = lclField->GetLclNum();
492                     offset = lclField->gtLclFld.gtLclOffs;
493                     break;
494                 }
495
496                 case GT_LCL_VAR:
497                 {
498                     assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
499                     varNum = op1->AsLclVar()->GetLclNum();
500                     offset = 0;
501                     break;
502                 }
503
504                 default:
505                 {
506                     unreached();
507                     break;
508                 }
509             }
510         }
511
512         // Ensure we got a good varNum and offset.
513         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
514         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
515         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
516         assert(offset != (unsigned)-1);
517
518         emit->emitIns_R_S(ins, attr, targetReg, varNum, offset);
519     }
520     else
521     {
522         regNumber op1Reg = op1->gtRegNum;
523         emit->emitIns_R_R(ins, attr, targetReg, op1Reg);
524     }
525 }
526
527 //------------------------------------------------------------------------
528 // genHWIntrinsic_R_RM_I: Generates the code for a hardware intrinsic node that takes a register/memory operand,
529 //                        an immediate operand, and that returns a value in register
530 //
531 // Arguments:
532 //    node - The hardware intrinsic node
533 //    ins  - The instruction being generated
534 //    ival - The immediate value
535 //
536 void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
537 {
538     var_types targetType = node->TypeGet();
539     regNumber targetReg  = node->gtRegNum;
540     GenTree*  op1        = node->gtGetOp1();
541     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
542     emitter*  emit       = getEmitter();
543
544     // TODO-XArch-CQ: Commutative operations can have op1 be contained
545     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
546
547     assert(targetReg != REG_NA);
548     assert(!node->OperIsCommutative()); // One operand intrinsics cannot be commutative
549
550     if (op1->isContained() || op1->isUsedFromSpillTemp())
551     {
552         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
553         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
554     }
555     inst_RV_TT_IV(ins, simdSize, targetReg, op1, ival);
556 }
557
558 //------------------------------------------------------------------------
559 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
560 //                        register/memory operand, and that returns a value in register
561 //
562 // Arguments:
563 //    node - The hardware intrinsic node
564 //    ins  - The instruction being generated
565 //    attr - The emit attribute for the instruciton being generated
566 //
567 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr)
568 {
569     regNumber targetReg = node->gtRegNum;
570     GenTree*  op1       = node->gtGetOp1();
571     GenTree*  op2       = node->gtGetOp2();
572     regNumber op1Reg    = op1->gtRegNum;
573
574     assert(targetReg != REG_NA);
575     assert(op1Reg != REG_NA);
576
577     genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, op1Reg, op2);
578 }
579
//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
//                        register/memory operand, and that returns a value in register
//
// Arguments:
//    node - The hardware intrinsic node
//    ins  - The instruction being generated
//    attr - The emit attribute for the instruction being generated
//    targetReg - The register allocated to the result
//    op1Reg    - The register allocated to the first operand
//    op2       - Another operand that maybe in register or memory
//
void CodeGen::genHWIntrinsic_R_R_RM(
    GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2)
{
    emitter* emit = getEmitter();

    // TODO-XArch-CQ: Commutative operations can have op1 be contained
    // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);

    if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        // op2 is consumed directly from memory: a spill temp, an indirection,
        // a contained memory-load intrinsic, or a stack-based local.
        assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
        assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);

        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op2->isUsedFromSpillTemp())
        {
            assert(op2->IsRegOptional());

            tmpDsc = getSpillTempDsc(op2);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            // Done with the spill temp; release it back to the temp pool.
            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op2->isIndir() || op2->OperIsHWIntrinsic())
        {
            GenTree*      addr;
            GenTreeIndir* memIndir = nullptr;

            if (op2->isIndir())
            {
                memIndir = op2->AsIndir();
                addr     = memIndir->Addr();
            }
            else
            {
                // A contained HW intrinsic here must be a single-operand memory load;
                // its op1 is the address.
                assert(op2->AsHWIntrinsic()->OperIsMemoryLoad());
                assert(HWIntrinsicInfo::lookupNumArgs(op2->AsHWIntrinsic()) == 1);
                addr = op2->gtGetOp1();
            }

            switch (addr->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    // Local address: fall through to the frame-based emit below.
                    varNum = addr->AsLclVarCommon()->GetLclNum();
                    offset = 0;
                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    // Static field address: emit directly against the class var handle.
                    emit->emitIns_SIMD_R_R_C(ins, attr, targetReg, op1Reg, addr->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    if (memIndir == nullptr)
                    {
                        // This is the HW intrinsic load case.
                        // Until we improve the handling of addressing modes in the emitter, we'll create a
                        // temporary GT_IND to generate code with.
                        GenTreeIndir load = indirForm(op2->TypeGet(), addr);
                        memIndir          = &load;
                    }
                    emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op2->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op2->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    // A contained local must either be reg-optional or not a register candidate.
                    assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op2->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_S(ins, attr, targetReg, op1Reg, varNum, offset);
    }
    else
    {
        regNumber op2Reg = op2->gtRegNum;

        if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
        {
            // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
            //
            // For non-commutative intrinsics, we should have ensured that op2 was marked
            // delay free in order to prevent it from getting assigned the same register
            // as target. However, for commutative intrinsics, we can just swap the operands
            // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

            noway_assert(node->OperIsCommutative());
            op2Reg = op1Reg;
            op1Reg = targetReg;
        }

        emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, op1Reg, op2Reg);
    }
}
725
726 //------------------------------------------------------------------------
727 // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
728 //                        register/memory operand, an immediate operand, and that returns a value in register
729 //
730 // Arguments:
731 //    node - The hardware intrinsic node
732 //    ins  - The instruction being generated
733 //    ival - The immediate value
734 //
735 void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, int8_t ival)
736 {
737     var_types targetType = node->TypeGet();
738     regNumber targetReg  = node->gtRegNum;
739     GenTree*  op1        = node->gtGetOp1();
740     GenTree*  op2        = node->gtGetOp2();
741     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
742     emitter*  emit       = getEmitter();
743
744     // TODO-XArch-CQ: Commutative operations can have op1 be contained
745     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
746
747     if (op1->OperIsList())
748     {
749         assert(op2 == nullptr);
750
751         GenTreeArgList* argList = op1->AsArgList();
752
753         op1     = argList->Current();
754         argList = argList->Rest();
755
756         op2     = argList->Current();
757         argList = argList->Rest();
758
759         assert(argList->Current() != nullptr);
760         assert(argList->Rest() == nullptr);
761     }
762
763     regNumber op1Reg = op1->gtRegNum;
764
765     assert(targetReg != REG_NA);
766     assert(op1Reg != REG_NA);
767
768     if (op2->isContained() || op2->isUsedFromSpillTemp())
769     {
770         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
771         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
772
773         TempDsc* tmpDsc = nullptr;
774         unsigned varNum = BAD_VAR_NUM;
775         unsigned offset = (unsigned)-1;
776
777         if (op2->isUsedFromSpillTemp())
778         {
779             assert(op2->IsRegOptional());
780
781             tmpDsc = getSpillTempDsc(op2);
782             varNum = tmpDsc->tdTempNum();
783             offset = 0;
784
785             regSet.tmpRlsTemp(tmpDsc);
786         }
787         else if (op2->isIndir() || op2->OperIsHWIntrinsic())
788         {
789             GenTree*      addr;
790             GenTreeIndir* memIndir = nullptr;
791
792             if (op2->isIndir())
793             {
794                 memIndir = op2->AsIndir();
795                 addr     = memIndir->Addr();
796             }
797             else
798             {
799                 assert(op2->AsHWIntrinsic()->OperIsMemoryLoad());
800                 assert(HWIntrinsicInfo::lookupNumArgs(op2->AsHWIntrinsic()) == 1);
801                 addr = op2->gtGetOp1();
802             }
803
804             switch (addr->OperGet())
805             {
806                 case GT_LCL_VAR_ADDR:
807                 {
808                     varNum = addr->AsLclVarCommon()->GetLclNum();
809                     offset = 0;
810                     break;
811                 }
812
813                 case GT_CLS_VAR_ADDR:
814                 {
815                     emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, addr->gtClsVar.gtClsVarHnd, 0, ival);
816                     return;
817                 }
818
819                 default:
820                 {
821                     if (memIndir == nullptr)
822                     {
823                         // This is the HW intrinsic load case.
824                         // Until we improve the handling of addressing modes in the emitter, we'll create a
825                         // temporary GT_IND to generate code with.
826                         GenTreeIndir load = indirForm(op2->TypeGet(), addr);
827                         memIndir          = &load;
828                     }
829                     emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
830                     return;
831                 }
832             }
833         }
834         else
835         {
836             switch (op2->OperGet())
837             {
838                 case GT_LCL_FLD:
839                 {
840                     GenTreeLclFld* lclField = op2->AsLclFld();
841
842                     varNum = lclField->GetLclNum();
843                     offset = lclField->gtLclFld.gtLclOffs;
844                     break;
845                 }
846
847                 case GT_LCL_VAR:
848                 {
849                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
850                     varNum = op2->AsLclVar()->GetLclNum();
851                     offset = 0;
852                     break;
853                 }
854
855                 default:
856                     unreached();
857                     break;
858             }
859         }
860
861         // Ensure we got a good varNum and offset.
862         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
863         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
864         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
865         assert(offset != (unsigned)-1);
866
867         emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
868     }
869     else
870     {
871         regNumber op2Reg = op2->gtRegNum;
872
873         if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
874         {
875             // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
876             //
877             // For non-commutative intrinsics, we should have ensured that op2 was marked
878             // delay free in order to prevent it from getting assigned the same register
879             // as target. However, for commutative intrinsics, we can just swap the operands
880             // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
881
882             noway_assert(node->OperIsCommutative());
883             op2Reg = op1Reg;
884             op1Reg = targetReg;
885         }
886
887         emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
888     }
889 }
890
891 //------------------------------------------------------------------------
892 // genHWIntrinsic_R_R_RM_R: Generates the code for a hardware intrinsic node that takes a register operand, a
893 //                          register/memory operand, another register operand, and that returns a value in register
894 //
895 // Arguments:
896 //    node - The hardware intrinsic node
897 //    ins  - The instruction being generated
898 //
899 void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins)
900 {
901     var_types targetType = node->TypeGet();
902     regNumber targetReg  = node->gtRegNum;
903     GenTree*  op1        = node->gtGetOp1();
904     GenTree*  op2        = node->gtGetOp2();
905     GenTree*  op3        = nullptr;
906     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
907     emitter*  emit       = getEmitter();
908
909     assert(op1->OperIsList());
910     assert(op2 == nullptr);
911
912     GenTreeArgList* argList = op1->AsArgList();
913
914     op1     = argList->Current();
915     argList = argList->Rest();
916
917     op2     = argList->Current();
918     argList = argList->Rest();
919
920     op3 = argList->Current();
921     assert(argList->Rest() == nullptr);
922
923     regNumber op1Reg = op1->gtRegNum;
924     regNumber op3Reg = op3->gtRegNum;
925
926     assert(targetReg != REG_NA);
927     assert(op1Reg != REG_NA);
928     assert(op3Reg != REG_NA);
929
930     if (op2->isContained() || op2->isUsedFromSpillTemp())
931     {
932         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
933         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2);
934
935         TempDsc* tmpDsc = nullptr;
936         unsigned varNum = BAD_VAR_NUM;
937         unsigned offset = (unsigned)-1;
938
939         if (op2->isUsedFromSpillTemp())
940         {
941             assert(op2->IsRegOptional());
942
943             // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
944             //                     pattern. It could probably be extracted to its own method.
945             tmpDsc = getSpillTempDsc(op2);
946             varNum = tmpDsc->tdTempNum();
947             offset = 0;
948
949             regSet.tmpRlsTemp(tmpDsc);
950         }
951         else if (op2->isIndir() || op2->OperIsHWIntrinsic())
952         {
953             GenTree*      addr;
954             GenTreeIndir* memIndir = nullptr;
955
956             if (op2->isIndir())
957             {
958                 memIndir = op2->AsIndir();
959                 addr     = memIndir->Addr();
960             }
961             else
962             {
963                 assert(op2->AsHWIntrinsic()->OperIsMemoryLoad());
964                 assert(HWIntrinsicInfo::lookupNumArgs(op2->AsHWIntrinsic()) == 1);
965                 addr = op2->gtGetOp1();
966             }
967
968             switch (addr->OperGet())
969             {
970                 case GT_LCL_VAR_ADDR:
971                 {
972                     varNum = addr->AsLclVarCommon()->GetLclNum();
973                     offset = 0;
974                     break;
975                 }
976
977                 case GT_CLS_VAR_ADDR:
978                 {
979                     emit->emitIns_SIMD_R_R_C_R(ins, simdSize, targetReg, op1Reg, op3Reg, addr->gtClsVar.gtClsVarHnd, 0);
980                     return;
981                 }
982
983                 default:
984                 {
985                     if (memIndir == nullptr)
986                     {
987                         // This is the HW intrinsic load case.
988                         // Until we improve the handling of addressing modes in the emitter, we'll create a
989                         // temporary GT_IND to generate code with.
990                         GenTreeIndir load = indirForm(op2->TypeGet(), addr);
991                         memIndir          = &load;
992                     }
993                     emit->emitIns_SIMD_R_R_A_R(ins, simdSize, targetReg, op1Reg, op3Reg, memIndir);
994                     return;
995                 }
996             }
997         }
998         else
999         {
1000             switch (op2->OperGet())
1001             {
1002                 case GT_LCL_FLD:
1003                 {
1004                     GenTreeLclFld* lclField = op2->AsLclFld();
1005
1006                     varNum = lclField->GetLclNum();
1007                     offset = lclField->gtLclFld.gtLclOffs;
1008                     break;
1009                 }
1010
1011                 case GT_LCL_VAR:
1012                 {
1013                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
1014                     varNum = op2->AsLclVar()->GetLclNum();
1015                     offset = 0;
1016                     break;
1017                 }
1018
1019                 default:
1020                     unreached();
1021                     break;
1022             }
1023         }
1024
1025         // Ensure we got a good varNum and offset.
1026         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
1027         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
1028         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
1029         assert(offset != (unsigned)-1);
1030
1031         emit->emitIns_SIMD_R_R_S_R(ins, simdSize, targetReg, op1Reg, op3Reg, varNum, offset);
1032     }
1033     else
1034     {
1035         emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, op3Reg);
1036     }
1037 }
1038
//------------------------------------------------------------------------
// genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
//                          a register/memory operand, and that returns a value in register
//
// Arguments:
//    ins       - The instruction being generated
//    attr      - The emit attribute
//    targetReg - The target register
//    op1Reg    - The register of the first operand
//    op2Reg    - The register of the second operand
//    op3       - The third operand
//
// Note:
//    Unlike the other genHWIntrinsic_* helpers, this one receives the first two
//    operands already resolved to registers; only op3 may be contained (folded
//    into a memory operand) or used from a spill temp.
//
void CodeGen::genHWIntrinsic_R_R_R_RM(
    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
{
    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op2Reg != REG_NA);

    emitter* emit = getEmitter();

    if (op3->isContained() || op3->isUsedFromSpillTemp())
    {
        // Resolve the contained operand to one of: a (varNum, offset) frame location,
        // a class-variable handle, or a general address mode.
        TempDsc* tmpDsc = nullptr;
        unsigned varNum = BAD_VAR_NUM;
        unsigned offset = (unsigned)-1;

        if (op3->isUsedFromSpillTemp())
        {
            assert(op3->IsRegOptional());

            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
            //                     pattern. It could probably be extracted to its own method.
            tmpDsc = getSpillTempDsc(op3);
            varNum = tmpDsc->tdTempNum();
            offset = 0;

            regSet.tmpRlsTemp(tmpDsc);
        }
        else if (op3->isIndir() || op3->OperIsHWIntrinsic())
        {
            GenTree*      addr;
            GenTreeIndir* memIndir = nullptr;
            if (op3->isIndir())
            {
                memIndir = op3->AsIndir();
                addr     = memIndir->Addr();
            }
            else
            {
                // A contained HW intrinsic here must be a single-argument memory load;
                // its operand is the address to load from.
                assert(op3->AsHWIntrinsic()->OperIsMemoryLoad());
                assert(HWIntrinsicInfo::lookupNumArgs(op3->AsHWIntrinsic()) == 1);
                addr = op3->gtGetOp1();
            }

            switch (addr->OperGet())
            {
                case GT_LCL_VAR_ADDR:
                {
                    varNum = addr->AsLclVarCommon()->GetLclNum();
                    offset = 0;
                    break;
                }

                case GT_CLS_VAR_ADDR:
                {
                    emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, addr->gtClsVar.gtClsVarHnd, 0);
                    return;
                }

                default:
                {
                    if (memIndir == nullptr)
                    {
                        // This is the HW intrinsic load case.
                        // Until we improve the handling of addressing modes in the emitter, we'll create a
                        // temporary GT_IND to generate code with.
                        GenTreeIndir load = indirForm(op3->TypeGet(), addr);
                        memIndir          = &load;
                    }
                    emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
                    return;
                }
            }
        }
        else
        {
            switch (op3->OperGet())
            {
                case GT_LCL_FLD:
                {
                    GenTreeLclFld* lclField = op3->AsLclFld();

                    varNum = lclField->GetLclNum();
                    offset = lclField->gtLclFld.gtLclOffs;
                    break;
                }

                case GT_LCL_VAR:
                {
                    // A contained lcl var must either be reg-optional or not enregistered.
                    assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
                    varNum = op3->AsLclVar()->GetLclNum();
                    offset = 0;
                    break;
                }

                default:
                    unreached();
                    break;
            }
        }

        // Ensure we got a good varNum and offset.
        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
        assert(offset != (unsigned)-1);

        emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
    }
    else
    {
        emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
    }
}
1164
1165 // genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
1166 //                       with non-constant argument
1167 //
1168 // Arguments:
1169 //    intrinsic      - intrinsic ID
1170 //    nonConstImmReg - the register contains non-constant imm8 argument
1171 //    baseReg        - a register for the start of the switch table
1172 //    offsReg        - a register for the offset into the switch table
1173 //    emitSwCase     - the lambda to generate a switch case
1174 //
1175 // Return Value:
1176 //    generate the jump-table fallback for imm-intrinsics with non-constant argument.
1177 // Note:
1178 //    This function can be used for all imm-intrinsics (whether full-range or not),
1179 //    The compiler front-end (i.e. importer) is responsible to insert a range-check IR
1180 //    (GT_HW_INTRINSIC_CHK) for imm8 argument, so this function does not need to do range-check.
1181 //
1182 template <typename HWIntrinsicSwitchCaseBody>
1183 void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsic,
1184                                               regNumber                 nonConstImmReg,
1185                                               regNumber                 baseReg,
1186                                               regNumber                 offsReg,
1187                                               HWIntrinsicSwitchCaseBody emitSwCase)
1188 {
1189     assert(nonConstImmReg != REG_NA);
1190     // AVX2 Gather intrinsics use managed non-const fallback since they have discrete imm8 value range
1191     // that does work with the current compiler generated jump-table fallback
1192     assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
1193     emitter* emit = getEmitter();
1194
1195     const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
1196     assert(maxByte <= 256);
1197     BasicBlock* jmpTable[256];
1198
1199     unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
1200     unsigned jmpTableOffs = 0;
1201
1202     // Emit the jump table
1203     for (unsigned i = 0; i < maxByte; i++)
1204     {
1205         jmpTable[i] = genCreateTempLabel();
1206         emit->emitDataGenData(i, jmpTable[i]);
1207     }
1208
1209     emit->emitDataGenEnd();
1210
1211     // Compute and jump to the appropriate offset in the switch table
1212     emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
1213
1214     emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
1215     emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
1216     emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
1217     emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
1218
1219     // Emit the switch table entries
1220
1221     BasicBlock* switchTableBeg = genCreateTempLabel();
1222     BasicBlock* switchTableEnd = genCreateTempLabel();
1223
1224     genDefineTempLabel(switchTableBeg);
1225
1226     for (unsigned i = 0; i < maxByte; i++)
1227     {
1228         genDefineTempLabel(jmpTable[i]);
1229         emitSwCase((int8_t)i);
1230         emit->emitIns_J(INS_jmp, switchTableEnd);
1231     }
1232
1233     genDefineTempLabel(switchTableEnd);
1234 }
1235
1236 //------------------------------------------------------------------------
1237 // genBaseIntrinsic: Generates the code for a base hardware intrinsic node
1238 //
1239 // Arguments:
1240 //    node - The hardware intrinsic node
1241 //
1242 // Note:
1243 //    We currently assume that all base intrinsics have zero or one operand.
1244 //
1245 void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
1246 {
1247     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1248     regNumber      targetReg   = node->gtRegNum;
1249     var_types      targetType  = node->TypeGet();
1250     var_types      baseType    = node->gtSIMDBaseType;
1251
1252     assert(compiler->compSupports(InstructionSet_SSE));
1253     assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));
1254
1255     GenTree* op1 = node->gtGetOp1();
1256
1257     genConsumeHWIntrinsicOperands(node);
1258     regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->gtRegNum;
1259
1260     assert(node->gtGetOp2() == nullptr);
1261
1262     emitter*    emit = getEmitter();
1263     emitAttr    attr = EA_ATTR(node->gtSIMDSize);
1264     instruction ins  = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1265
1266     switch (intrinsicId)
1267     {
1268         case NI_Vector128_CreateScalarUnsafe:
1269         case NI_Vector256_CreateScalarUnsafe:
1270         {
1271             if (varTypeIsIntegral(baseType))
1272             {
1273                 genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType));
1274             }
1275             else
1276             {
1277                 assert(varTypeIsFloating(baseType));
1278
1279                 attr = emitTypeSize(baseType);
1280
1281                 if (op1->isContained() || op1->isUsedFromSpillTemp())
1282                 {
1283                     genHWIntrinsic_R_RM(node, ins, attr);
1284                 }
1285                 else if (targetReg != op1Reg)
1286                 {
1287                     // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1288                     emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1289                 }
1290             }
1291             break;
1292         }
1293
1294         case NI_Vector128_ToScalar:
1295         case NI_Vector256_ToScalar:
1296         {
1297             assert(varTypeIsFloating(baseType));
1298
1299             attr = emitTypeSize(TYP_SIMD16);
1300
1301             if (op1->isContained() || op1->isUsedFromSpillTemp())
1302             {
1303                 genHWIntrinsic_R_RM(node, ins, attr);
1304             }
1305             else if (targetReg != op1Reg)
1306             {
1307                 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1308                 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1309             }
1310             break;
1311         }
1312
1313         case NI_Vector128_ToVector256:
1314         {
1315             // ToVector256 has zero-extend semantics in order to ensure it is deterministic
1316             // We always emit a move to the target register, even when op1Reg == targetReg,
1317             // in order to ensure that Bits MAXVL-1:128 are zeroed.
1318
1319             attr = emitTypeSize(TYP_SIMD16);
1320
1321             if (op1->isContained() || op1->isUsedFromSpillTemp())
1322             {
1323                 genHWIntrinsic_R_RM(node, ins, attr);
1324             }
1325             else
1326             {
1327                 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1328                 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1329             }
1330             break;
1331         }
1332
1333         case NI_Vector128_ToVector256Unsafe:
1334         case NI_Vector256_GetLower:
1335         {
1336             if (op1->isContained() || op1->isUsedFromSpillTemp())
1337             {
1338                 genHWIntrinsic_R_RM(node, ins, attr);
1339             }
1340             else if (targetReg != op1Reg)
1341             {
1342                 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
1343                 emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
1344             }
1345             break;
1346         }
1347
1348         case NI_Vector128_Zero:
1349         case NI_Vector256_Zero:
1350         {
1351             assert(op1 == nullptr);
1352             emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
1353             break;
1354         }
1355
1356         default:
1357         {
1358             unreached();
1359             break;
1360         }
1361     }
1362
1363     genProduceReg(node);
1364 }
1365
1366 //------------------------------------------------------------------------
1367 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
1368 //
1369 // Arguments:
1370 //    node - The hardware intrinsic node
1371 //
1372 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
1373 {
1374     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1375     GenTree*       op1         = node->gtGetOp1();
1376     GenTree*       op2         = node->gtGetOp2();
1377     GenTree*       op3         = nullptr;
1378     GenTree*       op4         = nullptr;
1379     regNumber      targetReg   = node->gtRegNum;
1380     var_types      targetType  = node->TypeGet();
1381     var_types      baseType    = node->gtSIMDBaseType;
1382
1383     regNumber op1Reg = REG_NA;
1384     regNumber op2Reg = REG_NA;
1385     regNumber op3Reg = REG_NA;
1386     regNumber op4Reg = REG_NA;
1387     emitter*  emit   = getEmitter();
1388
1389     genConsumeHWIntrinsicOperands(node);
1390
1391     switch (intrinsicId)
1392     {
1393         case NI_SSE_CompareScalarOrderedEqual:
1394         case NI_SSE_CompareScalarUnorderedEqual:
1395         {
1396             assert(baseType == TYP_FLOAT);
1397             regNumber   tmpReg = node->GetSingleTempReg();
1398             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1399
1400             // Ensure we aren't overwriting targetReg
1401             assert(tmpReg != targetReg);
1402
1403             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1404             emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
1405             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
1406             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
1407             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1408             break;
1409         }
1410
1411         case NI_SSE_CompareScalarOrderedGreaterThan:
1412         case NI_SSE_CompareScalarUnorderedGreaterThan:
1413         {
1414             assert(baseType == TYP_FLOAT);
1415             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1416
1417             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1418             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1419             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1420             break;
1421         }
1422
1423         case NI_SSE_CompareScalarOrderedGreaterThanOrEqual:
1424         case NI_SSE_CompareScalarUnorderedGreaterThanOrEqual:
1425         {
1426             assert(baseType == TYP_FLOAT);
1427             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1428
1429             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1430             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1431             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1432             break;
1433         }
1434
1435         case NI_SSE_CompareScalarOrderedLessThan:
1436         case NI_SSE_CompareScalarUnorderedLessThan:
1437         {
1438             assert(baseType == TYP_FLOAT);
1439             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1440
1441             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1442             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1443             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1444             break;
1445         }
1446
1447         case NI_SSE_CompareScalarOrderedLessThanOrEqual:
1448         case NI_SSE_CompareScalarUnorderedLessThanOrEqual:
1449         {
1450             assert(baseType == TYP_FLOAT);
1451             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1452
1453             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1454             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
1455             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1456             break;
1457         }
1458
1459         case NI_SSE_CompareScalarOrderedNotEqual:
1460         case NI_SSE_CompareScalarUnorderedNotEqual:
1461         {
1462             assert(baseType == TYP_FLOAT);
1463             regNumber   tmpReg = node->GetSingleTempReg();
1464             instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1465
1466             // Ensure we aren't overwriting targetReg
1467             assert(tmpReg != targetReg);
1468
1469             genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1470             emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
1471             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
1472             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
1473             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
1474             break;
1475         }
1476
1477         case NI_SSE_X64_ConvertToInt64:
1478         case NI_SSE_X64_ConvertToInt64WithTruncation:
1479         {
1480             assert(targetType == TYP_LONG);
1481             assert(op1 != nullptr);
1482             assert(op2 == nullptr);
1483             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1484             genHWIntrinsic_R_RM(node, ins, EA_8BYTE);
1485             break;
1486         }
1487
1488         case NI_SSE_X64_ConvertScalarToVector128Single:
1489         {
1490             assert(baseType == TYP_LONG);
1491             assert(op1 != nullptr);
1492             assert(op2 != nullptr);
1493             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1494             genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
1495             break;
1496         }
1497
1498         case NI_SSE_Prefetch0:
1499         case NI_SSE_Prefetch1:
1500         case NI_SSE_Prefetch2:
1501         case NI_SSE_PrefetchNonTemporal:
1502         {
1503             assert(baseType == TYP_UBYTE);
1504             assert(op2 == nullptr);
1505
1506             // These do not support containment.
1507             assert(!op1->isContained());
1508             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
1509             op1Reg          = op1->gtRegNum;
1510             emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
1511             break;
1512         }
1513
1514         case NI_SSE_StoreFence:
1515         {
1516             assert(baseType == TYP_VOID);
1517             assert(op1 == nullptr);
1518             assert(op2 == nullptr);
1519             emit->emitIns(INS_sfence);
1520             break;
1521         }
1522
1523         default:
1524             unreached();
1525             break;
1526     }
1527
1528     genProduceReg(node);
1529 }
1530
1531 //------------------------------------------------------------------------
1532 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
1533 //
1534 // Arguments:
1535 //    node - The hardware intrinsic node
1536 //
void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    regNumber      targetReg   = node->gtRegNum;
    var_types      targetType  = node->TypeGet();
    var_types      baseType    = node->gtSIMDBaseType;
    regNumber      op1Reg      = REG_NA;
    regNumber      op2Reg      = REG_NA;
    emitter*       emit        = getEmitter();

    genConsumeHWIntrinsicOperands(node);

    switch (intrinsicId)
    {
        // All integer overloads are handled by table codegen
        case NI_SSE2_CompareLessThan:
        {
            assert(op1 != nullptr);
            assert(op2 != nullptr);

            assert(baseType == TYP_DOUBLE);

            // The comparison kind is encoded as an immediate operand, looked up from the intrinsic table.
            int ival = HWIntrinsicInfo::lookupIval(intrinsicId);
            assert((ival >= 0) && (ival <= 127));

            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            op1Reg          = op1->gtRegNum;
            op2Reg          = op2->gtRegNum;
            emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);

            break;
        }

        case NI_SSE2_CompareScalarOrderedEqual:
        case NI_SSE2_CompareScalarUnorderedEqual:
        {
            assert(baseType == TYP_DOUBLE);
            regNumber   tmpReg = node->GetSingleTempReg();
            instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            // Ensure we aren't overwriting targetReg
            assert(tmpReg != targetReg);

            // Equality needs ZF == 1 AND PF == 0 (PF == 1 signals an unordered result, i.e. NaN input),
            // so materialize both flag checks into bytes, AND them, and zero-extend into targetReg.
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setnp, EA_1BYTE, targetReg);
            emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
            emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
            break;
        }

        case NI_SSE2_CompareScalarOrderedGreaterThan:
        case NI_SSE2_CompareScalarUnorderedGreaterThan:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            // GreaterThan maps directly to the unsigned-above flag condition after the compare.
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE2_CompareScalarOrderedGreaterThanOrEqual:
        case NI_SSE2_CompareScalarUnorderedGreaterThanOrEqual:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE2_CompareScalarOrderedLessThan:
        case NI_SSE2_CompareScalarUnorderedLessThan:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            // NOTE(review): the less-than forms emit 'seta' (above), same as greater-than — this
            // presumably relies on the operands having been swapped during lowering; confirm in lowering.
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE2_CompareScalarOrderedLessThanOrEqual:
        case NI_SSE2_CompareScalarUnorderedLessThanOrEqual:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            // NOTE(review): like the less-than case above, this uses the "above-or-equal" condition,
            // presumably with swapped operands from lowering — confirm.
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_SSE2_CompareScalarOrderedNotEqual:
        case NI_SSE2_CompareScalarUnorderedNotEqual:
        {
            assert(baseType == TYP_DOUBLE);
            instruction ins    = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            regNumber   tmpReg = node->GetSingleTempReg();

            // Ensure we aren't overwriting targetReg
            assert(tmpReg != targetReg);

            // NotEqual is the dual of the Equal case above: true when ZF == 0 OR PF == 1
            // (an unordered compare counts as "not equal"), hence setp/setne OR'ed together.
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
            emit->emitIns_R(INS_setp, EA_1BYTE, targetReg);
            emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
            emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
            break;
        }

        case NI_SSE2_X64_ConvertScalarToVector128Double:
        {
            // 64-bit-only conversion: integer op2 converted into the scalar slot of vector op1.
            assert(baseType == TYP_LONG);
            assert(op1 != nullptr);
            assert(op2 != nullptr);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            genHWIntrinsic_R_R_RM(node, ins, EA_8BYTE);
            break;
        }

        case NI_SSE2_X64_ConvertScalarToVector128Int64:
        case NI_SSE2_X64_ConvertScalarToVector128UInt64:
        {
            assert(baseType == TYP_LONG || baseType == TYP_ULONG);
            assert(op1 != nullptr);
            assert(op2 == nullptr);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType));
            break;
        }

        case NI_SSE2_ConvertToInt32:
        case NI_SSE2_ConvertToInt32WithTruncation:
        case NI_SSE2_ConvertToUInt32:
        case NI_SSE2_X64_ConvertToInt64:
        case NI_SSE2_X64_ConvertToInt64WithTruncation:
        case NI_SSE2_X64_ConvertToUInt64:
        {
            assert(op2 == nullptr);
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            if (varTypeIsIntegral(baseType))
            {
                // Integral base type: a plain vector-to-GPR move; note the (op1Reg, targetReg)
                // operand order expected by emitIns_R_R for this instruction form.
                assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
                op1Reg = op1->gtRegNum;
                emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
            }
            else
            {
                // Floating-point base type: op1 may be a contained memory operand, so use the R_RM helper.
                assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT);
                genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType));
            }
            break;
        }

        case NI_SSE2_LoadFence:
        {
            assert(baseType == TYP_VOID);
            assert(op1 == nullptr);
            assert(op2 == nullptr);
            emit->emitIns(INS_lfence);
            break;
        }

        case NI_SSE2_MemoryFence:
        {
            assert(baseType == TYP_VOID);
            assert(op1 == nullptr);
            assert(op2 == nullptr);
            emit->emitIns(INS_mfence);
            break;
        }

        case NI_SSE2_StoreNonTemporal:
        case NI_SSE2_X64_StoreNonTemporal:
        {
            assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
            assert(op1 != nullptr);
            assert(op2 != nullptr);

            // Until addressing-mode handling in the emitter improves, build a temporary
            // GT_STOREIND wrapper (op1 = address, op2 = value) to emit the store through.
            instruction     ins   = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
            emit->emitInsStoreInd(ins, emitTypeSize(baseType), &store);
            break;
        }

        default:
            unreached();
            break;
    }

    genProduceReg(node);
}
1739
1740 //------------------------------------------------------------------------
1741 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1742 //
1743 // Arguments:
1744 //    node - The hardware intrinsic node
1745 //
1746 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1747 {
1748     NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
1749     GenTree*       op1         = node->gtGetOp1();
1750     GenTree*       op2         = node->gtGetOp2();
1751     GenTree*       op3         = nullptr;
1752     GenTree*       op4         = nullptr;
1753     regNumber      targetReg   = node->gtRegNum;
1754     var_types      targetType  = node->TypeGet();
1755     var_types      baseType    = node->gtSIMDBaseType;
1756
1757     regNumber op1Reg = REG_NA;
1758     regNumber op2Reg = REG_NA;
1759     regNumber op3Reg = REG_NA;
1760     regNumber op4Reg = REG_NA;
1761     emitter*  emit   = getEmitter();
1762
1763     genConsumeHWIntrinsicOperands(node);
1764
1765     switch (intrinsicId)
1766     {
1767         case NI_SSE41_ConvertToVector128Int16:
1768         case NI_SSE41_ConvertToVector128Int32:
1769         case NI_SSE41_ConvertToVector128Int64:
1770         {
1771             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1772
1773             if (!varTypeIsSIMD(op1->gtType))
1774             {
1775                 // Until we improve the handling of addressing modes in the emitter, we'll create a
1776                 // temporary GT_IND to generate code with.
1777                 GenTreeIndir load = indirForm(node->TypeGet(), op1);
1778                 emit->emitInsLoadInd(ins, emitTypeSize(TYP_SIMD16), node->gtRegNum, &load);
1779             }
1780             else
1781             {
1782                 genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD16));
1783             }
1784             break;
1785         }
1786
1787         case NI_SSE41_TestZ:
1788         {
1789             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1790             genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1791             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1792             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1793             break;
1794         }
1795
1796         case NI_SSE41_TestC:
1797         {
1798             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1799             genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1800             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1801             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1802             break;
1803         }
1804
1805         case NI_SSE41_TestNotZAndNotC:
1806         {
1807             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
1808             genHWIntrinsic_R_RM(node, INS_ptest, emitTypeSize(TYP_SIMD16));
1809             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1810             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
1811             break;
1812         }
1813
1814         case NI_SSE41_Extract:
1815         case NI_SSE41_X64_Extract:
1816         {
1817             regNumber   tmpTargetReg = REG_NA;
1818             instruction ins          = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
1819             if (baseType == TYP_FLOAT)
1820             {
1821                 tmpTargetReg = node->ExtractTempReg();
1822             }
1823
1824             auto emitSwCase = [&](int8_t i) {
1825                 if (baseType == TYP_FLOAT)
1826                 {
1827                     // extract instructions return to GP-registers, so it needs int size as the emitsize
1828                     inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i);
1829                     emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
1830                 }
1831                 else
1832                 {
1833                     inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i);
1834                 }
1835             };
1836
1837             if (op2->IsCnsIntOrI())
1838             {
1839                 ssize_t ival = op2->AsIntCon()->IconValue();
1840                 assert((ival >= 0) && (ival <= 255));
1841                 emitSwCase((int8_t)ival);
1842             }
1843             else
1844             {
1845                 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
1846                 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
1847                 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
1848                 regNumber baseReg = node->ExtractTempReg();
1849                 regNumber offsReg = node->GetSingleTempReg();
1850                 genHWIntrinsicJumpTableFallback(intrinsicId, op2->gtRegNum, baseReg, offsReg, emitSwCase);
1851             }
1852             break;
1853         }
1854
1855         default:
1856             unreached();
1857             break;
1858     }
1859
1860     genProduceReg(node);
1861 }
1862
1863 //------------------------------------------------------------------------
1864 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1865 //
1866 // Arguments:
1867 //    node - The hardware intrinsic node
1868 //
void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    var_types      baseType    = node->gtSIMDBaseType;
    var_types      targetType  = node->TypeGet();
    emitter*       emit        = getEmitter();

    genConsumeHWIntrinsicOperands(node);
    regNumber op1Reg = op1->gtRegNum;

    assert(targetReg != REG_NA);
    assert(op1Reg != REG_NA);
    assert(op2 != nullptr);
    assert(!node->OperIsCommutative());

    switch (intrinsicId)
    {
        case NI_SSE42_Crc32:
        case NI_SSE42_X64_Crc32:
        {
            // crc32 is read-modify-write on its destination, so the accumulator (op1) must
            // already be in targetReg before the instruction executes.
            if (op1Reg != targetReg)
            {
                assert(op2->gtRegNum != targetReg);
                emit->emitIns_R_R(INS_mov, emitTypeSize(targetType), targetReg, op1Reg);
            }

            // This makes the genHWIntrinsic_R_RM code much simpler, as we don't need an
            // overload that explicitly takes the operands. After this rewrite, op2 (the data
            // operand) is the node's sole operand, and targetReg already holds the accumulator.
            node->gtOp1 = op2;
            node->gtOp2 = nullptr;

            if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument
            {
                // Byte/ushort data overloads always produce a 32-bit result.
                assert(targetType == TYP_INT);
                genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType));
            }
            else
            {
                assert(op1->TypeGet() == op2->TypeGet());
                assert((targetType == TYP_INT) || (targetType == TYP_LONG));
                genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType));
            }

            break;
        }

        default:
        {
            unreached();
            break;
        }
    }

    genProduceReg(node);
}
1927
1928 //------------------------------------------------------------------------
1929 // genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
1930 //
1931 // Arguments:
1932 //    node - The hardware intrinsic node
1933 //
void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    var_types      baseType    = node->gtSIMDBaseType;
    emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
    var_types      targetType  = node->TypeGet();
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
    int            numArgs     = HWIntrinsicInfo::lookupNumArgs(node);
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    regNumber      op1Reg      = REG_NA;
    regNumber      op2Reg      = REG_NA;
    regNumber      targetReg   = node->gtRegNum;
    emitter*       emit        = getEmitter();

    genConsumeHWIntrinsicOperands(node);

    switch (intrinsicId)
    {
        case NI_AVX2_ConvertToInt32:
        case NI_AVX2_ConvertToUInt32:
        {
            op1Reg = op1->gtRegNum;
            assert(numArgs == 1);
            assert((baseType == TYP_INT) || (baseType == TYP_UINT));
            // Note: this local 'ins' shadows the outer 'ins'; both come from the same table lookup.
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
            emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
            break;
        }

        case NI_AVX2_ConvertToVector256Int16:
        case NI_AVX2_ConvertToVector256Int32:
        case NI_AVX2_ConvertToVector256Int64:
        {
            // Note: this local 'ins' shadows the outer 'ins'; both come from the same table lookup.
            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

            if (!varTypeIsSIMD(op1->gtType))
            {
                // Until we improve the handling of addressing modes in the emitter, we'll create a
                // temporary GT_IND to generate code with.
                GenTreeIndir load = indirForm(node->TypeGet(), op1);
                emit->emitInsLoadInd(ins, emitTypeSize(TYP_SIMD32), node->gtRegNum, &load);
            }
            else
            {
                genHWIntrinsic_R_RM(node, ins, emitTypeSize(TYP_SIMD32));
            }
            break;
        }

        case NI_AVX2_GatherVector128:
        case NI_AVX2_GatherVector256:
        case NI_AVX2_GatherMaskVector128:
        case NI_AVX2_GatherMaskVector256:
        {
            // Gather operands arrive as a GT_LIST:
            //   GatherVector:     (baseAddr, index, scale)
            //   GatherMaskVector: (source, baseAddr, index, mask, scale)
            GenTreeArgList* list = op1->AsArgList();
            op1                  = list->Current();
            op1Reg               = op1->gtRegNum;

            list   = list->Rest();
            op2    = list->Current();
            op2Reg = op2->gtRegNum;

            list         = list->Rest();
            GenTree* op3 = list->Current();

            list             = list->Rest();
            GenTree* op4     = nullptr;
            GenTree* lastOp  = nullptr;
            GenTree* indexOp = nullptr;

            regNumber op3Reg       = REG_NA;
            regNumber op4Reg       = REG_NA;
            regNumber addrBaseReg  = REG_NA;
            regNumber addrIndexReg = REG_NA;
            regNumber maskReg      = node->ExtractTempReg(RBM_ALLFLOAT);

            if (numArgs == 5)
            {
                assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
                op4          = list->Current();
                list         = list->Rest();
                lastOp       = list->Current();
                op3Reg       = op3->gtRegNum;
                op4Reg       = op4->gtRegNum;
                addrBaseReg  = op2Reg;
                addrIndexReg = op3Reg;
                indexOp      = op3;

                // copy op4Reg into the tmp mask register,
                // the mask register will be cleared by gather instructions
                emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);

                if (targetReg != op1Reg)
                {
                    // copy source vector to the target register for masking merge
                    emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
                }
            }
            else
            {
                assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
                addrBaseReg  = op1Reg;
                addrIndexReg = op2Reg;
                indexOp      = op2;
                lastOp       = op3;

                // generate all-one mask vector
                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
            }

            // A 64-bit index with a 32-bit element type gathers only 4/2 elements into a
            // 128-bit destination even though the index vector itself is 256 bits wide.
            bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);

            // hwintrinsiclistxarch.h uses Dword index instructions in default
            if (varTypeIsLong(node->gtIndexBaseType))
            {
                // Remap each Dword-index gather to its Qword-index counterpart.
                switch (ins)
                {
                    case INS_vpgatherdd:
                        ins = INS_vpgatherqd;
                        if (isVector128GatherWithVector256Index)
                        {
                            // YMM index in address mode
                            attr = emitTypeSize(TYP_SIMD32);
                        }
                        break;
                    case INS_vpgatherdq:
                        ins = INS_vpgatherqq;
                        break;
                    case INS_vgatherdps:
                        ins = INS_vgatherqps;
                        if (isVector128GatherWithVector256Index)
                        {
                            // YMM index in address mode
                            attr = emitTypeSize(TYP_SIMD32);
                        }
                        break;
                    case INS_vgatherdpd:
                        ins = INS_vgatherqpd;
                        break;
                    default:
                        unreached();
                }
            }

            // The scale operand must be a constant in [0, 255].
            assert(lastOp->IsCnsIntOrI());
            ssize_t ival = lastOp->AsIntCon()->IconValue();
            assert((ival >= 0) && (ival <= 255));

            // Gather requires destination, mask, and index registers to be pairwise distinct.
            assert(targetReg != maskReg);
            assert(targetReg != addrIndexReg);
            assert(maskReg != addrIndexReg);
            emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);

            break;
        }

        case NI_AVX_TestC:
        {
            // vptest/vtestps-style compare: materialize CF as a 0/1 result.
            genHWIntrinsic_R_RM(node, ins, attr);
            emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_AVX_TestNotZAndNotC:
        {
            // "Above" is CF == 0 && ZF == 0 after the test instruction.
            genHWIntrinsic_R_RM(node, ins, attr);
            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        case NI_AVX_TestZ:
        {
            // Materialize ZF as a 0/1 result.
            genHWIntrinsic_R_RM(node, ins, attr);
            emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
            break;
        }

        default:
            unreached();
            break;
    }

    genProduceReg(node);
}
2122
2123 //------------------------------------------------------------------------
2124 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
2125 //
2126 // Arguments:
2127 //    node - The hardware intrinsic node
2128 //
void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
{
    // Not implemented yet: reaching this path for an AES intrinsic is currently fatal.
    NYI("Implement AES intrinsic code generation");
}
2133
2134 //------------------------------------------------------------------------
2135 // genBMI1OrBMI2Intrinsic: Generates the code for a BMI1 and BMI2 hardware intrinsic node
2136 //
2137 // Arguments:
2138 //    node - The hardware intrinsic node
2139 //
void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    regNumber      targetReg   = node->gtRegNum;
    GenTree*       op1         = node->gtGetOp1();
    GenTree*       op2         = node->gtGetOp2();
    var_types      targetType  = node->TypeGet();
    // BMI instructions are looked up by the node's own type (TYP_INT/TYP_LONG), not a SIMD base type.
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);
    emitter*       emit        = getEmitter();

    assert(targetReg != REG_NA);
    assert(op1 != nullptr);

    genConsumeHWIntrinsicOperands(node);

    switch (intrinsicId)
    {
        // Two-operand forms: target = ins(op1, op2/mem).
        case NI_BMI1_AndNot:
        case NI_BMI1_X64_AndNot:
        case NI_BMI1_BitFieldExtract:
        case NI_BMI1_X64_BitFieldExtract:
        case NI_BMI2_ParallelBitDeposit:
        case NI_BMI2_ParallelBitExtract:
        case NI_BMI2_X64_ParallelBitDeposit:
        case NI_BMI2_X64_ParallelBitExtract:
        case NI_BMI2_ZeroHighBits:
        case NI_BMI2_X64_ZeroHighBits:
        {
            assert(op2 != nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genHWIntrinsic_R_R_RM(node, ins, emitTypeSize(node->TypeGet()));
            break;
        }

        // Single-operand forms: target = ins(op1/mem).
        case NI_BMI1_ExtractLowestSetBit:
        case NI_BMI1_GetMaskUpToLowestSetBit:
        case NI_BMI1_ResetLowestSetBit:
        case NI_BMI1_X64_ExtractLowestSetBit:
        case NI_BMI1_X64_GetMaskUpToLowestSetBit:
        case NI_BMI1_X64_ResetLowestSetBit:
        {
            assert(op2 == nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
            break;
        }

        case NI_BMI1_TrailingZeroCount:
        case NI_BMI1_X64_TrailingZeroCount:
        {
            assert(op2 == nullptr);
            assert((targetType == TYP_INT) || (targetType == TYP_LONG));
            // tzcnt goes through the shared helper that also breaks the false target-register dependency.
            genXCNTIntrinsic(node, ins);
            break;
        }

        case NI_BMI2_MultiplyNoFlags:
        case NI_BMI2_X64_MultiplyNoFlags:
        {
            // MULX: one implicit source in EDX/RDX, two results (high and low halves of the product).
            // With 2 args only the high half is returned; with 3 args the low half is also stored
            // through the pointer in op3.
            int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
            assert(numArgs == 2 || numArgs == 3);

            regNumber op1Reg = REG_NA;
            regNumber op2Reg = REG_NA;
            regNumber op3Reg = REG_NA;
            regNumber lowReg = REG_NA;

            if (numArgs == 2)
            {
                op1Reg = op1->gtRegNum;
                op2Reg = op2->gtRegNum;
                lowReg = targetReg;
            }
            else
            {
                // Three args arrive as a GT_LIST: (multiplier, multiplicand, lowResultAddress).
                GenTreeArgList* argList = op1->AsArgList();
                op1                     = argList->Current();
                op1Reg                  = op1->gtRegNum;
                argList                 = argList->Rest();
                op2                     = argList->Current();
                op2Reg                  = op2->gtRegNum;
                argList                 = argList->Rest();
                GenTree* op3            = argList->Current();
                op3Reg                  = op3->gtRegNum;
                assert(!op3->isContained());
                assert(op3Reg != op1Reg);
                assert(op3Reg != targetReg);
                assert(op3Reg != REG_EDX);
                lowReg = node->GetSingleTempReg();
                assert(op3Reg != lowReg);
                assert(lowReg != targetReg);
            }

            // These do not support containment
            assert(!op2->isContained());
            emitAttr attr = emitTypeSize(targetType);
            // mov the first operand into implicit source operand EDX/RDX
            if (op1Reg != REG_EDX)
            {
                assert(op2Reg != REG_EDX);
                emit->emitIns_R_R(INS_mov, attr, REG_EDX, op1Reg);
            }

            // generate code for MULX
            genHWIntrinsic_R_R_RM(node, ins, attr, targetReg, lowReg, op2);

            // If requires the lower half result, store in the memory pointed to by op3
            if (numArgs == 3)
            {
                emit->emitIns_AR_R(INS_mov, attr, lowReg, op3Reg, 0);
            }

            break;
        }

        default:
        {
            unreached();
            break;
        }
    }

    genProduceReg(node);
}
2264
2265 //------------------------------------------------------------------------
2266 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
2267 //
2268 // Arguments:
2269 //    node - The hardware intrinsic node
2270 //
void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
{
    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
    var_types      baseType    = node->gtSIMDBaseType;
    emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
    // 'ins' starts as the 213 form; it is bumped to the adjacent 132/231 entries below.
    instruction    ins         = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
    GenTree*       op1         = node->gtGetOp1();
    regNumber      targetReg   = node->gtRegNum;

    assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);

    genConsumeHWIntrinsicOperands(node);
    GenTreeArgList* argList = op1->AsArgList();
    op1                     = argList->Current();

    argList      = argList->Rest();
    GenTree* op2 = argList->Current();

    argList      = argList->Rest();
    GenTree* op3 = argList->Current();

    regNumber op1Reg;
    regNumber op2Reg;

    bool       isCommutative   = false;
    const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);

    // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
    assert(!copiesUpperBits || !op1->isContained());

    // Pick the FMA form (132/213/231) based on which operand, if any, is the memory operand,
    // so that the contained operand always lands in the RM position.
    // NOTE(review): the (instruction)(ins - 1) / (ins + 1) arithmetic assumes the 132/213/231
    // variants are declared adjacently and in that order in the instruction table — confirm in
    // the xarch instruction definitions before reordering that table.
    if (op3->isContained() || op3->isUsedFromSpillTemp())
    {
        // 213 form: op1 = (op2 * op1) + [op3]

        op1Reg = op1->gtRegNum;
        op2Reg = op2->gtRegNum;

        isCommutative = !copiesUpperBits;
    }
    else if (op2->isContained() || op2->isUsedFromSpillTemp())
    {
        // 132 form: op1 = (op1 * op3) + [op2]

        ins    = (instruction)(ins - 1);
        op1Reg = op1->gtRegNum;
        op2Reg = op3->gtRegNum;
        op3    = op2;
    }
    else if (op1->isContained() || op1->isUsedFromSpillTemp())
    {
        // 231 form: op3 = (op2 * op3) + [op1]

        ins    = (instruction)(ins + 1);
        op1Reg = op3->gtRegNum;
        op2Reg = op2->gtRegNum;
        op3    = op1;
    }
    else
    {
        // 213 form: op1 = (op2 * op1) + op3

        op1Reg = op1->gtRegNum;
        op2Reg = op2->gtRegNum;

        isCommutative = !copiesUpperBits;
    }

    if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
    {
        assert(node->isRMWHWIntrinsic(compiler));

        // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
        //
        // For non-commutative intrinsics, we should have ensured that op2 was marked
        // delay free in order to prevent it from getting assigned the same register
        // as target. However, for commutative intrinsics, we can just swap the operands
        // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.

        op2Reg = op1Reg;
        op1Reg = targetReg;
    }

    genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
    genProduceReg(node);
}
2356
2357 //------------------------------------------------------------------------
2358 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
2359 //
2360 // Arguments:
2361 //    node - The hardware intrinsic node
2362 //
2363 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
2364 {
2365     assert(node->gtHWIntrinsicId == NI_LZCNT_LeadingZeroCount ||
2366            node->gtHWIntrinsicId == NI_LZCNT_X64_LeadingZeroCount);
2367
2368     genConsumeOperands(node);
2369     genXCNTIntrinsic(node, INS_lzcnt);
2370     genProduceReg(node);
2371 }
2372
2373 //------------------------------------------------------------------------
2374 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
2375 //
2376 // Arguments:
2377 //    node - The hardware intrinsic node
2378 //
2379 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
2380 {
2381     NYI("Implement PCLMULQDQ intrinsic code generation");
2382 }
2383
2384 //------------------------------------------------------------------------
2385 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
2386 //
2387 // Arguments:
2388 //    node - The hardware intrinsic node
2389 //
2390 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
2391 {
2392     assert(node->gtHWIntrinsicId == NI_POPCNT_PopCount || node->gtHWIntrinsicId == NI_POPCNT_X64_PopCount);
2393
2394     genConsumeOperands(node);
2395     genXCNTIntrinsic(node, INS_popcnt);
2396     genProduceReg(node);
2397 }
2398
2399 //------------------------------------------------------------------------
2400 // genXCNTIntrinsic: Generates the code for a lzcnt/tzcnt/popcnt hardware intrinsic node, breaks false dependencies on
2401 // the target register
2402 //
2403 // Arguments:
2404 //    node - The hardware intrinsic node
2405 //    ins  - The instruction being generated
2406 //
2407 void CodeGen::genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins)
2408 {
2409     // LZCNT/TZCNT/POPCNT have a false dependency on the target register on Intel Sandy Bridge, Haswell, and Skylake
2410     // (POPCNT only) processors, so insert a `XOR target, target` to break the dependency via XOR triggering register
2411     // renaming, but only if it's not an actual dependency.
2412
2413     GenTree*  op1        = node->gtGetOp1();
2414     regNumber sourceReg1 = REG_NA;
2415     regNumber sourceReg2 = REG_NA;
2416
2417     if (!op1->isContained())
2418     {
2419         sourceReg1 = op1->gtRegNum;
2420     }
2421     else if (op1->isIndir())
2422     {
2423         GenTreeIndir* indir   = op1->AsIndir();
2424         GenTree*      memBase = indir->Base();
2425
2426         if (memBase != nullptr)
2427         {
2428             sourceReg1 = memBase->gtRegNum;
2429         }
2430
2431         if (indir->HasIndex())
2432         {
2433             sourceReg2 = indir->Index()->gtRegNum;
2434         }
2435     }
2436
2437     regNumber targetReg = node->gtRegNum;
2438     if ((targetReg != sourceReg1) && (targetReg != sourceReg2))
2439     {
2440         getEmitter()->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
2441     }
2442     genHWIntrinsic_R_RM(node, ins, emitTypeSize(node->TypeGet()));
2443 }
2444
2445 #endif // FEATURE_HW_INTRINSICS