Merge pull request #18504 from mikedn/comp-small
[platform/upstream/coreclr.git] / src / jit / lsraxarch.cpp
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7 XX                                                                           XX
8 XX                    Register Requirements for AMD64                        XX
9 XX                                                                           XX
10 XX  This encapsulates all the logic for setting register requirements for    XX
11 XX  the AMD64 architecture.                                                  XX
12 XX                                                                           XX
13 XX                                                                           XX
14 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
15 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
16 */
17
18 #include "jitpch.h"
19 #ifdef _MSC_VER
20 #pragma hdrstop
21 #endif
22
23 #ifdef _TARGET_XARCH_
24
25 #include "jit.h"
26 #include "sideeffects.h"
27 #include "lower.h"
28
29 //------------------------------------------------------------------------
30 // BuildNode: Build the RefPositions for a node
31 //
32 // Arguments:
33 //    treeNode - the node of interest
34 //
35 // Return Value:
36 //    The number of sources consumed by this node.
37 //
38 // Notes:
39 // Preconditions:
40 //    LSRA has been initialized.
41 //
42 // Postconditions:
43 //    RefPositions have been built for all the register defs and uses required
44 //    for this node.
45 //
46 int LinearScan::BuildNode(GenTree* tree)
47 {
48     assert(!tree->isContained());
49     Interval* prefSrcInterval = nullptr;
50     int       srcCount;
51     int       dstCount      = 0;
52     regMaskTP dstCandidates = RBM_NONE;
53     regMaskTP killMask      = RBM_NONE;
54     bool      isLocalDefUse = false;
55
56     // Reset the build-related members of LinearScan.
57     clearBuildState();
58
59     // Set the default dstCount. This may be modified below.
60     if (tree->IsValue())
61     {
62         dstCount = 1;
63         if (tree->IsUnusedValue())
64         {
65             isLocalDefUse = true;
66         }
67     }
68     else
69     {
70         dstCount = 0;
71     }
72
73     // Floating-point types generate AVX instructions (vmovss etc.), so set the flag.
74     SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet()));
75
76     switch (tree->OperGet())
77     {
78         default:
79             srcCount = BuildSimple(tree);
80             break;
81
82         case GT_LCL_VAR:
83             // Because we do containment analysis before we redo dataflow and identify register
84             // candidates, the containment analysis only uses !lvDoNotEnregister to estimate register
85             // candidates.
86             // If a lclVar that was estimated to be a register candidate turns out not to be one,
87             // and it was marked regOptional, it should now be marked contained instead.
88             // TODO-XArch-CQ: When this is being called while RefPositions are being created,
89             // use lvLRACandidate here instead.
90             if (tree->IsRegOptional())
91             {
92                 if (!compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvTracked ||
93                     compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvDoNotEnregister)
94                 {
95                     tree->ClearRegOptional();
96                     tree->SetContained();
97                     INDEBUG(dumpNodeInfo(tree, dstCandidates, 0, 0));
98                     return 0;
99                 }
100             }
101             __fallthrough;
102
103         case GT_LCL_FLD:
104         {
105             // We handle tracked variables differently from non-tracked ones.  If it is tracked,
106             // we will simply add a use of the tracked variable at its parent/consumer.
107             // Otherwise, for a use we need to actually add the appropriate references for loading
108             // or storing the variable.
109             //
110             // A tracked variable won't actually get used until the appropriate ancestor tree node
111             // is processed, unless this is marked "isLocalDefUse" because it is a stack-based argument
112             // to a call or an orphaned dead node.
113             //
114             LclVarDsc* const varDsc = &compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum];
115             if (isCandidateVar(varDsc))
116             {
117                 INDEBUG(dumpNodeInfo(tree, dstCandidates, 0, 1));
118                 return 0;
119             }
120             srcCount = 0;
121 #ifdef FEATURE_SIMD
122             // Need an additional register to read upper 4 bytes of Vector3.
123             if (tree->TypeGet() == TYP_SIMD12)
124             {
125                 // We need an internal register different from targetReg in which 'tree' produces its result
126                 // because both targetReg and internal reg will be in use at the same time.
127                 buildInternalFloatRegisterDefForNode(tree, allSIMDRegs());
128                 setInternalRegsDelayFree = true;
129                 buildInternalRegisterUses();
130             }
131 #endif
132             BuildDef(tree);
133         }
134         break;
135
136         case GT_STORE_LCL_FLD:
137         case GT_STORE_LCL_VAR:
138             srcCount = BuildStoreLoc(tree->AsLclVarCommon());
139             break;
140
141         case GT_FIELD_LIST:
142             // These should always be contained. We don't correctly allocate or
143             // generate code for a non-contained GT_FIELD_LIST.
144             noway_assert(!"Non-contained GT_FIELD_LIST");
145             srcCount = 0;
146             break;
147
148         case GT_LIST:
149         case GT_ARGPLACE:
150         case GT_NO_OP:
151         case GT_START_NONGC:
152             srcCount = 0;
153             assert(dstCount == 0);
154             break;
155
156         case GT_PROF_HOOK:
157             srcCount = 0;
158             assert(dstCount == 0);
159             killMask = getKillSetForProfilerHook();
160             BuildDefsWithKills(tree, 0, RBM_NONE, killMask);
161             break;
162
163         case GT_CNS_INT:
164         case GT_CNS_LNG:
165         case GT_CNS_DBL:
166         {
167             srcCount = 0;
168             assert(dstCount == 1);
169             assert(!tree->IsReuseRegVal());
170             RefPosition* def               = BuildDef(tree);
171             def->getInterval()->isConstant = true;
172         }
173         break;
174
175 #if !defined(_TARGET_64BIT_)
176
177         case GT_LONG:
178             assert(tree->IsUnusedValue()); // Contained nodes are already processed, only unused GT_LONG can reach here.
179             // An unused GT_LONG node needs to consume its sources, but need not produce a register.
180             tree->gtType = TYP_VOID;
181             tree->ClearUnusedValue();
182             isLocalDefUse = false;
183             srcCount      = 2;
184             dstCount      = 0;
185             BuildUse(tree->gtGetOp1());
186             BuildUse(tree->gtGetOp2());
187             break;
188
189 #endif // !defined(_TARGET_64BIT_)
190
191         case GT_BOX:
192         case GT_COMMA:
193         case GT_QMARK:
194         case GT_COLON:
195             srcCount = 0;
196             unreached();
197             break;
198
199         case GT_RETURN:
200             srcCount = BuildReturn(tree);
201             killMask = getKillSetForReturn();
202             BuildDefsWithKills(tree, 0, RBM_NONE, killMask);
203             break;
204
205         case GT_RETFILT:
206             assert(dstCount == 0);
207             if (tree->TypeGet() == TYP_VOID)
208             {
209                 srcCount = 0;
210             }
211             else
212             {
213                 assert(tree->TypeGet() == TYP_INT);
214                 srcCount = 1;
215                 BuildUse(tree->gtGetOp1(), RBM_INTRET);
216             }
217             break;
218
219         // A GT_NOP is a passthrough if it is void or has a child, but it must
220         // be considered to produce a dummy value if it has a type and no
221         // child.
222         case GT_NOP:
223             srcCount = 0;
224             assert((tree->gtGetOp1() == nullptr) || tree->isContained());
225             if (tree->TypeGet() != TYP_VOID && tree->gtGetOp1() == nullptr)
226             {
227                 assert(dstCount == 1);
228                 // There is no child here (op1 is null), so there is no use to build; just build the def.
229                 BuildDef(tree);
230             }
231             else
232             {
233                 assert(dstCount == 0);
234             }
235             break;
236
237         case GT_JTRUE:
238         {
239             srcCount = 0;
240             assert(dstCount == 0);
241             GenTree* cmp = tree->gtGetOp1();
242             assert(!cmp->IsValue());
243         }
244         break;
245
246         case GT_JCC:
247             srcCount = 0;
248             assert(dstCount == 0);
249             break;
250
251         case GT_SETCC:
252             srcCount = 0;
253             assert(dstCount == 1);
254             // This defines a byte value (note that on x64 allByteRegs() is defined as RBM_ALLINT).
255             BuildDef(tree, allByteRegs());
256             break;
257
258         case GT_JMP:
259             srcCount = 0;
260             assert(dstCount == 0);
261             break;
262
263         case GT_SWITCH:
264             // This should never occur since switch nodes must not be visible at this
265             // point in the JIT.
266             srcCount = 0;
267             noway_assert(!"Switch must be lowered at this point");
268             break;
269
270         case GT_JMPTABLE:
271             srcCount = 0;
272             assert(dstCount == 1);
273             BuildDef(tree);
274             break;
275
276         case GT_SWITCH_TABLE:
277         {
278             assert(dstCount == 0);
279             buildInternalIntRegisterDefForNode(tree);
280             srcCount = BuildBinaryUses(tree->AsOp());
281             buildInternalRegisterUses();
282             assert(srcCount == 2);
283         }
284         break;
285
286         case GT_ASG:
287             noway_assert(!"We should never hit any assignment operator in lowering");
288             srcCount = 0;
289             break;
290
291 #if !defined(_TARGET_64BIT_)
292         case GT_ADD_LO:
293         case GT_ADD_HI:
294         case GT_SUB_LO:
295         case GT_SUB_HI:
296 #endif
297         case GT_ADD:
298         case GT_SUB:
299         case GT_AND:
300         case GT_OR:
301         case GT_XOR:
302             srcCount = BuildBinaryUses(tree->AsOp());
303             assert(dstCount == 1);
304             BuildDef(tree);
305             break;
306
307         case GT_BT:
308             srcCount = BuildBinaryUses(tree->AsOp());
309             assert(dstCount == 0);
310             break;
311
312         case GT_RETURNTRAP:
313         {
314             // This just turns into a compare of its child with an int + a conditional call.
315             RefPosition* internalDef = buildInternalIntRegisterDefForNode(tree);
316             srcCount                 = BuildOperandUses(tree->gtGetOp1());
317             buildInternalRegisterUses();
318             killMask = compiler->compHelperCallKillSet(CORINFO_HELP_STOP_FOR_GC);
319             BuildDefsWithKills(tree, 0, RBM_NONE, killMask);
320         }
321         break;
322
323         case GT_MOD:
324         case GT_DIV:
325         case GT_UMOD:
326         case GT_UDIV:
327             srcCount = BuildModDiv(tree->AsOp());
328             break;
329
330         case GT_MUL:
331         case GT_MULHI:
332 #if defined(_TARGET_X86_)
333         case GT_MUL_LONG:
334 #endif
335             srcCount = BuildMul(tree->AsOp());
336             break;
337
338         case GT_INTRINSIC:
339             srcCount = BuildIntrinsic(tree->AsOp());
340             break;
341
342 #ifdef FEATURE_SIMD
343         case GT_SIMD:
344             srcCount = BuildSIMD(tree->AsSIMD());
345             break;
346 #endif // FEATURE_SIMD
347
348 #ifdef FEATURE_HW_INTRINSICS
349         case GT_HWIntrinsic:
350             srcCount = BuildHWIntrinsic(tree->AsHWIntrinsic());
351             break;
352 #endif // FEATURE_HW_INTRINSICS
353
354         case GT_CAST:
355             srcCount = BuildCast(tree);
356             break;
357
358         case GT_BITCAST:
359         {
360             assert(dstCount == 1);
361             tgtPrefUse = BuildUse(tree->gtGetOp1());
362             BuildDef(tree);
363             srcCount = 1;
364         }
365         break;
366
367         case GT_NEG:
368             // TODO-XArch-CQ:
369             // SSE instruction set doesn't have an instruction to negate a number.
370             // The recommended way is to xor the float/double number with a bitmask.
371             // The only way to xor is using xorps or xorpd both of which operate on
372             // 128-bit operands.  To hold the bit-mask we would need another xmm
373             // register or a 16-byte aligned 128-bit data constant. Right now the emitter
374             // lacks support for emitting such constants or an instruction with a memory
375             // addressing mode referring to a 128-bit operand. For now we use an
376             // internal xmm register to load 32/64-bit bitmask from data section.
377             // Note that by trading additional data section memory (128-bit) we can
378             // save on the need for an internal register and also a memory-to-reg
379             // move.
380             //
381             // Note: another option to avoid internal register requirement is by
382             // lowering as GT_SUB(0, src).  This will generate code different from
383             // Jit64 and could possibly result in compat issues (?).
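            //
            // Illustrative only: with the current approach, negating a double ends up as roughly
            //     movsd xmm1, qword ptr [@NegDblMask]   ; load 0x8000000000000000 into an internal xmm reg
            //     xorpd xmm0, xmm1                      ; flip the sign bit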
384             if (varTypeIsFloating(tree))
385             {
386
387                 RefPosition* internalDef = buildInternalFloatRegisterDefForNode(tree, internalFloatRegCandidates());
388                 srcCount                 = BuildOperandUses(tree->gtGetOp1());
389                 buildInternalRegisterUses();
390             }
391             else
392             {
393                 srcCount = BuildOperandUses(tree->gtGetOp1());
394             }
395             BuildDef(tree);
396             break;
397
398         case GT_NOT:
399             srcCount = BuildOperandUses(tree->gtGetOp1());
400             BuildDef(tree);
401             break;
402
403         case GT_LSH:
404         case GT_RSH:
405         case GT_RSZ:
406         case GT_ROL:
407         case GT_ROR:
408 #ifdef _TARGET_X86_
409         case GT_LSH_HI:
410         case GT_RSH_LO:
411 #endif
412             srcCount = BuildShiftRotate(tree);
413             break;
414
415         case GT_EQ:
416         case GT_NE:
417         case GT_LT:
418         case GT_LE:
419         case GT_GE:
420         case GT_GT:
421         case GT_TEST_EQ:
422         case GT_TEST_NE:
423         case GT_CMP:
424             srcCount = BuildCmp(tree);
425             break;
426
427         case GT_CKFINITE:
428         {
429             assert(dstCount == 1);
430             RefPosition* internalDef = buildInternalIntRegisterDefForNode(tree);
431             srcCount                 = BuildOperandUses(tree->gtGetOp1());
432             buildInternalRegisterUses();
433             BuildDef(tree);
434         }
435         break;
436
437         case GT_CMPXCHG:
438         {
439             srcCount = 3;
440             assert(dstCount == 1);
441
442             // The comparand must be in RAX.
443             // The remaining two operands can be in any reg other than RAX.
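            // Illustrative codegen: "lock cmpxchg [rLocation], rValue" implicitly compares RAX with
            // the memory operand and leaves the original memory value in RAX, which is why both the
            // comparand and the def are fixed to RAX.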
444             BuildUse(tree->gtCmpXchg.gtOpLocation, allRegs(TYP_INT) & ~RBM_RAX);
445             BuildUse(tree->gtCmpXchg.gtOpValue, allRegs(TYP_INT) & ~RBM_RAX);
446             BuildUse(tree->gtCmpXchg.gtOpComparand, RBM_RAX);
447             BuildDef(tree, RBM_RAX);
448         }
449         break;
450
451         case GT_XADD:
452         case GT_XCHG:
453         {
454             // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
455             // to special case them.
456             // These tree nodes will have their op1 marked as isDelayFree=true.
457             // That is, op1's reg remains in use until the subsequent instruction.
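            // Illustrative codegen: "lock xadd [rAddr], rData" (or xchg); rAddr is still needed when
            // the result is produced, hence the delayFree marking below.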
458             GenTree* addr = tree->gtGetOp1();
459             GenTree* data = tree->gtGetOp2();
460             assert(!addr->isContained());
461             RefPosition* addrUse = BuildUse(addr);
462             setDelayFree(addrUse);
463             tgtPrefUse = addrUse;
464             assert(!data->isContained());
465             BuildUse(data);
466             srcCount = 2;
467             assert(dstCount == 1);
468             BuildDef(tree);
469         }
470         break;
471
472         case GT_PUTARG_REG:
473             srcCount = BuildPutArgReg(tree->AsUnOp());
474             break;
475
476         case GT_CALL:
477             srcCount = BuildCall(tree->AsCall());
478             if (tree->AsCall()->HasMultiRegRetVal())
479             {
480                 dstCount = tree->AsCall()->GetReturnTypeDesc()->GetReturnRegCount();
481             }
482             break;
483
484         case GT_ADDR:
485         {
486             // For a GT_ADDR, the child node should not be evaluated into a register
487             GenTree* child = tree->gtGetOp1();
488             assert(!isCandidateLocalRef(child));
489             assert(child->isContained());
490             assert(dstCount == 1);
491             srcCount = 0;
492         }
493         break;
494
495 #if !defined(FEATURE_PUT_STRUCT_ARG_STK)
496         case GT_OBJ:
497 #endif
498         case GT_BLK:
499         case GT_DYN_BLK:
500             // These should all be eliminated prior to Lowering.
501             assert(!"Non-store block node in Lowering");
502             srcCount = 0;
503             break;
504
505 #ifdef FEATURE_PUT_STRUCT_ARG_STK
506         case GT_PUTARG_STK:
507             srcCount = BuildPutArgStk(tree->AsPutArgStk());
508             break;
509 #endif // FEATURE_PUT_STRUCT_ARG_STK
510
511         case GT_STORE_BLK:
512         case GT_STORE_OBJ:
513         case GT_STORE_DYN_BLK:
514             srcCount = BuildBlockStore(tree->AsBlk());
515             break;
516
517         case GT_INIT_VAL:
518             // Always a passthrough of its child's value.
519             assert(!"INIT_VAL should always be contained");
520             srcCount = 0;
521             break;
522
523         case GT_LCLHEAP:
524             srcCount = BuildLclHeap(tree);
525             break;
526
527         case GT_ARR_BOUNDS_CHECK:
528 #ifdef FEATURE_SIMD
529         case GT_SIMD_CHK:
530 #endif // FEATURE_SIMD
531 #ifdef FEATURE_HW_INTRINSICS
532         case GT_HW_INTRINSIC_CHK:
533 #endif // FEATURE_HW_INTRINSICS
534             // Consumes arrLen & index - has no result
535             srcCount = 2;
536             assert(dstCount == 0);
537             srcCount = BuildOperandUses(tree->AsBoundsChk()->gtIndex);
538             srcCount += BuildOperandUses(tree->AsBoundsChk()->gtArrLen);
539             break;
540
541         case GT_ARR_ELEM:
542             // These must have been lowered to GT_ARR_INDEX
543             noway_assert(!"We should never see a GT_ARR_ELEM after Lowering.");
544             srcCount = 0;
545             break;
546
547         case GT_ARR_INDEX:
548         {
549             srcCount = 2;
550             assert(dstCount == 1);
551             assert(!tree->AsArrIndex()->ArrObj()->isContained());
552             assert(!tree->AsArrIndex()->IndexExpr()->isContained());
553             // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
554             // times while the result is being computed.
555             RefPosition* arrObjUse = BuildUse(tree->AsArrIndex()->ArrObj());
556             setDelayFree(arrObjUse);
557             BuildUse(tree->AsArrIndex()->IndexExpr());
558             BuildDef(tree);
559         }
560         break;
561
562         case GT_ARR_OFFSET:
563         {
564             // This consumes the offset, if any, the arrObj and the effective index,
565             // and produces the flattened offset for this dimension.
566             assert(dstCount == 1);
567             srcCount                 = 0;
568             RefPosition* internalDef = nullptr;
569             if (tree->gtArrOffs.gtOffset->isContained())
570             {
571                 srcCount = 2;
572             }
573             else
574             {
575                 // Here we simply need an internal register, which must be different
576                 // from any of the operand's registers, but may be the same as targetReg.
577                 srcCount    = 3;
578                 internalDef = buildInternalIntRegisterDefForNode(tree);
579                 BuildUse(tree->AsArrOffs()->gtOffset);
580             }
581             BuildUse(tree->AsArrOffs()->gtIndex);
582             BuildUse(tree->AsArrOffs()->gtArrObj);
583             if (internalDef != nullptr)
584             {
585                 buildInternalRegisterUses();
586             }
587             BuildDef(tree);
588         }
589         break;
590
591         case GT_LEA:
592             // The LEA usually passes its operands through to the GT_IND, in which case it will
593             // be contained, but we may be instantiating an address, in which case we set them here.
594             srcCount = 0;
595             assert(dstCount == 1);
596             if (tree->AsAddrMode()->HasBase())
597             {
598                 srcCount++;
599                 BuildUse(tree->AsAddrMode()->Base());
600             }
601             if (tree->AsAddrMode()->HasIndex())
602             {
603                 srcCount++;
604                 BuildUse(tree->AsAddrMode()->Index());
605             }
606             BuildDef(tree);
607             break;
608
609         case GT_STOREIND:
610             if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
611             {
612                 srcCount = BuildGCWriteBarrier(tree);
613                 break;
614             }
615             srcCount = BuildIndir(tree->AsIndir());
616             break;
617
618         case GT_NULLCHECK:
619         {
620             assert(dstCount == 0);
621             regMaskTP indirCandidates = RBM_NONE;
622             BuildUse(tree->gtGetOp1(), indirCandidates);
623             srcCount = 1;
624             break;
625         }
626
627         case GT_IND:
628             srcCount = BuildIndir(tree->AsIndir());
629             assert(dstCount == 1);
630             break;
631
632         case GT_CATCH_ARG:
633             srcCount = 0;
634             assert(dstCount == 1);
635             BuildDef(tree, RBM_EXCEPTION_OBJECT);
636             break;
637
638 #if !FEATURE_EH_FUNCLETS
639         case GT_END_LFIN:
640             srcCount = 0;
641             assert(dstCount == 0);
642             break;
643 #endif
644
645         case GT_CLS_VAR:
646             // These nodes are eliminated by rationalizer.
647             JITDUMP("Unexpected node %s in Lower.\n", GenTree::OpName(tree->OperGet()));
648             unreached();
649             break;
650
651         case GT_INDEX_ADDR:
652         {
653             assert(dstCount == 1);
654             RefPosition* internalDef = nullptr;
655             if (tree->AsIndexAddr()->Index()->TypeGet() == TYP_I_IMPL)
656             {
657                 internalDef = buildInternalIntRegisterDefForNode(tree);
658             }
659             else
660             {
661                 switch (tree->AsIndexAddr()->gtElemSize)
662                 {
663                     case 1:
664                     case 2:
665                     case 4:
666                     case 8:
667                         break;
668
669                     default:
670                         internalDef = buildInternalIntRegisterDefForNode(tree);
671                         break;
672                 }
673             }
674             srcCount = BuildBinaryUses(tree->AsOp());
675             if (internalDef != nullptr)
676             {
677                 buildInternalRegisterUses();
678             }
679             BuildDef(tree);
680         }
681         break;
682
683     } // end switch (tree->OperGet())
684
685     // We need to be sure that we've set srcCount and dstCount appropriately
686     assert((dstCount < 2) || (tree->IsMultiRegCall() && dstCount == MAX_RET_REG_COUNT));
687     assert(isLocalDefUse == (tree->IsValue() && tree->IsUnusedValue()));
688     assert(!tree->IsUnusedValue() || (dstCount != 0));
689     assert(dstCount == tree->GetRegisterDstCount());
690     INDEBUG(dumpNodeInfo(tree, dstCandidates, srcCount, dstCount));
691     return srcCount;
692 }
693
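//------------------------------------------------------------------------
// getTgtPrefOperand: Identify the operand to be preferenced to this node's target register.
//
// Arguments:
//    tree      - The binary node of interest
//
// Return Value:
//    The operand that should be preferenced to the target register, or nullptr if the node
//    is not a read-modify-write operation or the candidate operand is contained.
//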
694 GenTree* LinearScan::getTgtPrefOperand(GenTreeOp* tree)
695 {
696     // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
697     // Even then we would like to set isTgtPref on Op1.
698     if (tree->OperIsBinary() && isRMWRegOper(tree))
699     {
700         GenTree* op1 = tree->gtGetOp1();
701         GenTree* op2 = tree->gtGetOp2();
702
703         // Commutative opers like add/mul/and/or/xor could reverse the order of
704         // operands if it is safe to do so.  In such a case we would like op2 to be
705         // target preferenced instead of op1.
706         if (tree->OperIsCommutative() && op1->isContained() && op2 != nullptr)
707         {
708             op1 = op2;
709             op2 = tree->gtGetOp1();
710         }
711
712         // If we have a read-modify-write operation, we want to preference op1 to the target,
713         // if it is not contained.
714         if (!op1->isContained() && !op1->OperIs(GT_LIST))
715         {
716             return op1;
717         }
718     }
719     return nullptr;
720 }
721
722 //------------------------------------------------------------------------------
723 // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format
724 //
725 // Arguments:
726 //    tree      - a binary tree node
727 //
728 // Return Value:
729 //    Returns true if we can use the read-modify-write instruction form
730 //
731 // Notes:
732 //    This is used to determine whether to preference the source to the destination register.
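//    For example (illustrative), "x = x + y" maps to a two-operand form such as "add rax, rcx",
//    where the destination register is also the first source, so preferencing that source to the
//    target register can avoid an extra "mov".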
733 //
734 bool LinearScan::isRMWRegOper(GenTree* tree)
735 {
736     // TODO-XArch-CQ: Make this more accurate.
737     // For now, we assume that most binary operators are of the RMW form.
738     assert(tree->OperIsBinary());
739
740     if (tree->OperIsCompare() || tree->OperIs(GT_CMP) || tree->OperIs(GT_BT))
741     {
742         return false;
743     }
744
745     switch (tree->OperGet())
746     {
747         // These opers either support a three-op form (e.g. GT_LEA) or do not read/write their first operand
748         case GT_LEA:
749         case GT_STOREIND:
750         case GT_ARR_INDEX:
751         case GT_STORE_BLK:
752         case GT_STORE_OBJ:
753         case GT_SWITCH_TABLE:
754         case GT_LOCKADD:
755 #ifdef _TARGET_X86_
756         case GT_LONG:
757 #endif
758             return false;
759
760         // x86/x64 does support a three-op multiply when either op1 or op2 is a contained immediate
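        // (Illustrative: "imul eax, ecx, 12" reads ecx and an immediate and writes eax without
        // modifying a source register, so no read-modify-write constraint applies in that case.)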
761         case GT_MUL:
762             return (!tree->gtGetOp2()->isContainedIntOrIImmed() && !tree->gtGetOp1()->isContainedIntOrIImmed());
763
764 #ifdef FEATURE_HW_INTRINSICS
765         case GT_HWIntrinsic:
766             return tree->isRMWHWIntrinsic(compiler);
767 #endif // FEATURE_HW_INTRINSICS
768
769         default:
770             return true;
771     }
772 }
773
774 //------------------------------------------------------------------------
// BuildRMWUses: Build the uses (RefPositions) for a read-modify-write binary node.
//
// Arguments:
//    node       - The RMW binary node of interest
//    candidates - The set of candidate registers for the operands, or RBM_NONE
//
// Return Value:
//    The number of sources consumed by this node.
//
775 int LinearScan::BuildRMWUses(GenTreeOp* node, regMaskTP candidates)
776 {
777     int       srcCount      = 0;
778     GenTree*  op1           = node->gtOp1;
779     GenTree*  op2           = node->gtGetOp2IfPresent();
780     bool      isReverseOp   = node->IsReverseOp();
781     regMaskTP op1Candidates = candidates;
782     regMaskTP op2Candidates = candidates;
783
784 #ifdef _TARGET_X86_
785     if (varTypeIsByte(node))
786     {
787         regMaskTP byteCandidates = (candidates == RBM_NONE) ? allByteRegs() : (candidates & allByteRegs());
788         if (!op1->isContained())
789         {
790             assert(byteCandidates != RBM_NONE);
791             op1Candidates = byteCandidates;
792         }
793         if (node->OperIsCommutative() && !op2->isContained())
794         {
795             assert(byteCandidates != RBM_NONE);
796             op2Candidates = byteCandidates;
797         }
798     }
799 #endif // _TARGET_X86_
800
801     GenTree* tgtPrefOperand = getTgtPrefOperand(node);
802     assert((tgtPrefOperand == nullptr) || (tgtPrefOperand == op1) || node->OperIsCommutative());
803     assert(!isReverseOp || node->OperIsCommutative());
804
805     // Determine which operand, if any, should be delayRegFree. Normally, this would be op2,
806     // but if we have a commutative operator and op1 is a contained memory op, it would be op1.
807     // We need to make the delayRegFree operand remain live until the op is complete, by marking
808     // the source(s) associated with op2 as "delayFree".
809     // Note that if op2 of a binary RMW operator is a memory op, even if the operator
810     // is commutative, codegen cannot reverse them.
811     // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
812     // more work to be done to correctly reverse the operands if they involve memory
813     // operands.  Also, we may need to handle more cases than GT_IND, especially once
814     // we've modified the register allocator to not require all nodes to be assigned
815     // a register (e.g. a spilled lclVar can often be referenced directly from memory).
816     // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
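    // For example (illustrative), "x = x + [rcx]" is emitted as something like "add rax, [rcx]";
    // rcx must not be reused for the def of x, so the memory operand's sources are marked delayFree.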
817     GenTree* delayUseOperand = op2;
818     if (node->OperIsCommutative())
819     {
820         if (op1->isContained() && op2 != nullptr)
821         {
822             delayUseOperand = op1;
823         }
824         else if (!op2->isContained() || op2->IsCnsIntOrI())
825         {
826             // If we have a commutative operator and op2 is not a memory op, we don't need
827             // to set delayRegFree on either operand because codegen can swap them.
828             delayUseOperand = nullptr;
829         }
830     }
831     else if (op1->isContained())
832     {
833         delayUseOperand = nullptr;
834     }
835     if (delayUseOperand != nullptr)
836     {
837         assert(delayUseOperand != tgtPrefOperand);
838     }
839
840     if (isReverseOp)
841     {
842         op1 = op2;
843         op2 = node->gtOp1;
844     }
845
846     // Build first use
847     if (tgtPrefOperand == op1)
848     {
849         assert(!op1->isContained());
850         tgtPrefUse = BuildUse(op1, op1Candidates);
851         srcCount++;
852     }
853     else if (delayUseOperand == op1)
854     {
855         srcCount += BuildDelayFreeUses(op1, op1Candidates);
856     }
857     else
858     {
859         srcCount += BuildOperandUses(op1, op1Candidates);
860     }
861     // Build second use
862     if (op2 != nullptr)
863     {
864         if (tgtPrefOperand == op2)
865         {
866             assert(!op2->isContained());
867             tgtPrefUse = BuildUse(op2, op2Candidates);
868             srcCount++;
869         }
870         else if (delayUseOperand == op2)
871         {
872             srcCount += BuildDelayFreeUses(op2, op2Candidates);
873         }
874         else
875         {
876             srcCount += BuildOperandUses(op2, op2Candidates);
877         }
878     }
879     return srcCount;
880 }
881
882 //------------------------------------------------------------------------
883 // BuildShiftRotate: Set the NodeInfo for a shift or rotate.
884 //
885 // Arguments:
886 //    tree      - The node of interest
887 //
888 // Return Value:
889 //    The number of sources consumed by this node.
890 //
891 int LinearScan::BuildShiftRotate(GenTree* tree)
892 {
893     // For shift operations, the number of bits to shift by must be
894     // in CL if it is not a constant, because variable shift and
895     // rotate instructions take their count in CL.
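    // (Illustrative: a variable shift is emitted as "shl eax, cl", so the non-constant shift count
    // is constrained to RCX and the value being shifted is kept out of RCX.)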
896     int       srcCount      = 0;
897     GenTree*  shiftBy       = tree->gtGetOp2();
898     GenTree*  source        = tree->gtGetOp1();
899     regMaskTP srcCandidates = RBM_NONE;
900     regMaskTP dstCandidates = RBM_NONE;
901
902     // x64 can encode 8 bits of shift count, but the hardware uses only the low 5 bits
903     // (6 for 64-bit operands); the rest are masked off. We will allow whatever can be encoded.
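    // (Illustrative: a contained constant shift count of 35 on a 32-bit operand is encoded as-is,
    // and the hardware shifts by 35 & 31 == 3.)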
904     if (shiftBy->isContained())
905     {
906         assert(shiftBy->OperIsConst());
907     }
908     else
909     {
910         srcCandidates = allRegs(TYP_INT) & ~RBM_RCX;
911         dstCandidates = allRegs(TYP_INT) & ~RBM_RCX;
912     }
913
914     // Note that Rotate Left/Right instructions don't set ZF and SF flags.
915     //
916     // If the operand being shifted is 32-bits then upper three bits are masked
917     // by hardware to get actual shift count.  Similarly for 64-bit operands
918     // shift count is narrowed to [0..63].  If the resulting shift count is zero,
919     // then shift operation won't modify flags.
920     //
921     // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
922     // if the shift count is known to be non-zero and in the range depending on the
923     // operand size.
924     CLANG_FORMAT_COMMENT_ANCHOR;
925
926 #ifdef _TARGET_X86_
927     // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
928     // we can have a three operand form.
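    // (Illustrative: GT_LSH_HI becomes "shld hiReg, loReg, cl", which reads both halves of the
    // GT_LONG; GT_RSH_LO similarly becomes "shrd loReg, hiReg, cl".)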
929     if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
930     {
931         assert((source->OperGet() == GT_LONG) && source->isContained());
932
933         GenTree* sourceLo = source->gtGetOp1();
934         GenTree* sourceHi = source->gtGetOp2();
935         assert(!sourceLo->isContained() && !sourceHi->isContained());
936         RefPosition* sourceLoUse = BuildUse(sourceLo, srcCandidates);
937         RefPosition* sourceHiUse = BuildUse(sourceHi, srcCandidates);
938
939         if (!tree->isContained())
940         {
941             if (tree->OperGet() == GT_LSH_HI)
942             {
943                 setDelayFree(sourceLoUse);
944             }
945             else
946             {
947                 setDelayFree(sourceHiUse);
948             }
949         }
950     }
951     else
952 #endif
953         if (!source->isContained())
954     {
955         tgtPrefUse = BuildUse(source, srcCandidates);
956         srcCount++;
957     }
958     else
959     {
960         srcCount += BuildOperandUses(source, srcCandidates);
961     }
962     if (!tree->isContained())
963     {
964         if (!shiftBy->isContained())
965         {
966             srcCount += BuildDelayFreeUses(shiftBy, RBM_RCX);
967         }
968         BuildDef(tree, dstCandidates);
969     }
970     else
971     {
972         if (!shiftBy->isContained())
973         {
974             srcCount += BuildOperandUses(shiftBy, RBM_RCX);
975         }
976     }
977     return srcCount;
978 }
979
980 //------------------------------------------------------------------------
981 // BuildCall: Set the NodeInfo for a call.
982 //
983 // Arguments:
984 //    call      - The call node of interest
985 //
986 // Return Value:
987 //    The number of sources consumed by this node.
988 //
989 int LinearScan::BuildCall(GenTreeCall* call)
990 {
991     bool            hasMultiRegRetVal = false;
992     ReturnTypeDesc* retTypeDesc       = nullptr;
993     int             srcCount          = 0;
994     int             dstCount          = 0;
995     regMaskTP       dstCandidates     = RBM_NONE;
996
997     assert(!call->isContained());
998     if (call->TypeGet() != TYP_VOID)
999     {
1000         hasMultiRegRetVal = call->HasMultiRegRetVal();
1001         if (hasMultiRegRetVal)
1002         {
1003             // dst count = number of registers in which the value is returned by call
1004             retTypeDesc = call->GetReturnTypeDesc();
1005             dstCount    = retTypeDesc->GetReturnRegCount();
1006         }
1007         else
1008         {
1009             dstCount = 1;
1010         }
1011     }
1012
1013     GenTree* ctrlExpr = call->gtControlExpr;
1014     if (call->gtCallType == CT_INDIRECT)
1015     {
1016         ctrlExpr = call->gtCallAddr;
1017     }
1018
1019     RegisterType registerType = call->TypeGet();
1020
1021     // Set destination candidates for return value of the call.
1022     CLANG_FORMAT_COMMENT_ANCHOR;
1023
1024 #ifdef _TARGET_X86_
1025     if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
1026     {
1027         // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
1028         // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
1029         // correct argument registers.
1030         dstCandidates = RBM_PINVOKE_TCB;
1031     }
1032     else
1033 #endif // _TARGET_X86_
1034         if (hasMultiRegRetVal)
1035     {
1036         assert(retTypeDesc != nullptr);
1037         dstCandidates = retTypeDesc->GetABIReturnRegs();
1038         assert((int)genCountBits(dstCandidates) == dstCount);
1039     }
1040     else if (varTypeIsFloating(registerType))
1041     {
1042 #ifdef _TARGET_X86_
1043         // The return value will be on the X87 stack, and we will need to move it.
1044         dstCandidates = allRegs(registerType);
1045 #else  // !_TARGET_X86_
1046         dstCandidates              = RBM_FLOATRET;
1047 #endif // !_TARGET_X86_
1048     }
1049     else if (registerType == TYP_LONG)
1050     {
1051         dstCandidates = RBM_LNGRET;
1052     }
1053     else
1054     {
1055         dstCandidates = RBM_INTRET;
1056     }
1057
1058     // Number of args to a call =
1059     //    callRegArgs + (callArgs - placeholders, setup, etc.)
1060     // There is an explicit thisPtr, but it is redundant.
1061
1062     bool callHasFloatRegArgs = false;
1063     bool isVarArgs           = call->IsVarargs();
1064
1065     // First, determine internal registers.
1066     // We will need one for any float arguments to a varArgs call.
1067     for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
1068     {
1069         GenTree* argNode = list->Current();
1070         if (argNode->OperIsPutArgReg())
1071         {
1072             HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs);
1073         }
1074         else if (argNode->OperGet() == GT_FIELD_LIST)
1075         {
1076             for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1077             {
1078                 assert(entry->Current()->OperIsPutArgReg());
1079                 HandleFloatVarArgs(call, entry->Current(), &callHasFloatRegArgs);
1080             }
1081         }
1082     }
1083
1084     // Now, count reg args
1085     for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
1086     {
1087         // By this point, lowering has ensured that all call arguments are one of the following:
1088         // - an arg setup store
1089         // - an arg placeholder
1090         // - a nop
1091         // - a copy blk
1092         // - a field list
1093         // - a put arg
1094         //
1095         // Note that this property is statically checked by LinearScan::CheckBlock.
1096         GenTree* argNode = list->Current();
1097
1098         // Each register argument corresponds to one source.
1099         if (argNode->OperIsPutArgReg())
1100         {
1101             srcCount++;
1102             BuildUse(argNode, genRegMask(argNode->gtRegNum));
1103         }
1104 #ifdef UNIX_AMD64_ABI
1105         else if (argNode->OperGet() == GT_FIELD_LIST)
1106         {
1107             for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1108             {
1109                 assert(entry->Current()->OperIsPutArgReg());
1110                 srcCount++;
1111                 BuildUse(entry->Current(), genRegMask(entry->Current()->gtRegNum));
1112             }
1113         }
1114 #endif // UNIX_AMD64_ABI
1115
1116 #ifdef DEBUG
1117         // In DEBUG only, check validity with respect to the arg table entry.
1118
1119         fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
1120         assert(curArgTabEntry);
1121
1122         if (curArgTabEntry->regNum == REG_STK)
1123         {
1124             // late arg that is not passed in a register
1125             assert(argNode->gtOper == GT_PUTARG_STK);
1126
1127 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1128             // If the node is TYP_STRUCT and it is put on stack with
1129             // putarg_stk operation, we consume and produce no registers.
1130             // In this case the embedded Obj node should not produce
1131             // registers either, since it is contained.
1132             // Note that if it is a SIMD type the argument will be in a register.
1133             if (argNode->TypeGet() == TYP_STRUCT)
1134             {
1135                 assert(argNode->gtGetOp1() != nullptr && argNode->gtGetOp1()->OperGet() == GT_OBJ);
1136                 assert(argNode->gtGetOp1()->isContained());
1137             }
1138 #endif // FEATURE_PUT_STRUCT_ARG_STK
1139             continue;
1140         }
1141 #ifdef UNIX_AMD64_ABI
1142         if (argNode->OperGet() == GT_FIELD_LIST)
1143         {
1144             assert(argNode->isContained());
1145             assert(varTypeIsStruct(argNode) || curArgTabEntry->isStruct);
1146
1147             int i = 0;
1148             for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1149             {
1150                 const regNumber argReg = (i == 0) ? curArgTabEntry->regNum : curArgTabEntry->otherRegNum;
1151                 assert(entry->Current()->gtRegNum == argReg);
1152                 assert(i < 2);
1153                 i++;
1154             }
1155         }
1156         else
1157 #endif // UNIX_AMD64_ABI
1158         {
1159             const regNumber argReg = curArgTabEntry->regNum;
1160             assert(argNode->gtRegNum == argReg);
1161         }
1162 #endif // DEBUG
1163     }
1164
1165     // Now, count stack args
1166     // Note that these need to be computed into a register, but then
1167     // they're just stored to the stack - so the reg doesn't
1168     // need to remain live until the call.  In fact, it must not
1169     // because the code generator doesn't actually consider it live,
1170     // so it can't be spilled.
1171
1172     GenTree* args = call->gtCallArgs;
1173     while (args)
1174     {
1175         GenTree* arg = args->gtGetOp1();
1176         if (!(arg->gtFlags & GTF_LATE_ARG))
1177         {
1178             if (arg->IsValue() && !arg->isContained())
1179             {
1180                 assert(arg->IsUnusedValue());
1181             }
1182         }
1183         args = args->gtGetOp2();
1184     }
1185
1186     // set reg requirements on call target represented as control sequence.
1187     if (ctrlExpr != nullptr)
1188     {
1189         regMaskTP ctrlExprCandidates = RBM_NONE;
1190
1191         // In case of fast tail implemented as jmp, make sure that gtControlExpr is
1192         // computed into a register.
1193         if (call->IsFastTailCall())
1194         {
1195             assert(!ctrlExpr->isContained());
1196             // Fast tail call - make sure that call target is always computed in RAX
1197             // so that epilog sequence can generate "jmp rax" to achieve fast tail call.
1198             ctrlExprCandidates = RBM_RAX;
1199         }
1200 #ifdef _TARGET_X86_
1201         else if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
1202         {
1203             // On x86, we need to generate a very specific pattern for indirect VSD calls:
1204             //
1205             //    3-byte nop
1206             //    call dword ptr [eax]
1207             //
1208             // Where EAX is also used as an argument to the stub dispatch helper. Make
1209             // sure that the call target address is computed into EAX in this case.
1210             assert(ctrlExpr->isIndir() && ctrlExpr->isContained());
1211             ctrlExprCandidates = RBM_VIRTUAL_STUB_TARGET;
1212         }
1213 #endif // _TARGET_X86_
1214
1215 #if FEATURE_VARARG
1216         // If it is a fast tail call, it is already preferenced to use RAX.
1217         // Therefore, there is no need to set src candidates on the call target again.
1218         if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall())
1219         {
1220             // Don't assign the call target to any of the argument registers because
1221             // we will use them to also pass floating point arguments as required
1222             // by Amd64 ABI.
1223             ctrlExprCandidates = allRegs(TYP_INT) & ~(RBM_ARG_REGS);
1224         }
1225 #endif // FEATURE_VARARG
1226         srcCount += BuildOperandUses(ctrlExpr, ctrlExprCandidates);
1227     }
1228
1229     buildInternalRegisterUses();
1230
1231     // Now generate defs and kills.
1232     regMaskTP killMask = getKillSetForCall(call);
1233     BuildDefsWithKills(call, dstCount, dstCandidates, killMask);
1234     return srcCount;
1235 }
1236
1237 //------------------------------------------------------------------------
1238 // BuildBlockStore: Set the NodeInfo for a block store.
1239 //
1240 // Arguments:
1241 //    blkNode       - The block store node of interest
1242 //
1243 // Return Value:
1244 //    The number of sources consumed by this node.
1245 //
1246 int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
1247 {
1248     GenTree* dstAddr  = blkNode->Addr();
1249     unsigned size     = blkNode->gtBlkSize;
1250     GenTree* source   = blkNode->Data();
1251     int      srcCount = 0;
1252
1253     GenTree* srcAddrOrFill = nullptr;
1254     bool     isInitBlk     = blkNode->OperIsInitBlkOp();
1255
1256     regMaskTP dstAddrRegMask = RBM_NONE;
1257     regMaskTP sourceRegMask  = RBM_NONE;
1258     regMaskTP blkSizeRegMask = RBM_NONE;
1259
1260     if (isInitBlk)
1261     {
1262         GenTree* initVal = source;
1263         if (initVal->OperIsInitVal())
1264         {
1265             assert(initVal->isContained());
1266             initVal = initVal->gtGetOp1();
1267         }
1268         srcAddrOrFill = initVal;
1269
1270         switch (blkNode->gtBlkOpKind)
1271         {
1272             case GenTreeBlk::BlkOpKindUnroll:
1273                 assert(initVal->IsCnsIntOrI());
1274                 if (size >= XMM_REGSIZE_BYTES)
1275                 {
1276                     // Reserve an XMM register to fill it with a pack of 16 init value constants.
1277                     buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
1278                     // Using an XMM register to fill with constants may generate AVX instructions, so set the flag.
1279                     SetContainsAVXFlags();
1280                 }
1281 #ifdef _TARGET_X86_
1282                 if ((size & 1) != 0)
1283                 {
1284                     // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
1285                     // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
1286                     // when unrolling, so only allow byteable registers as the source value. (We could
1287                     // consider just using BlkOpKindRepInstr instead.)
1288                     sourceRegMask = allByteRegs();
1289                 }
1290 #endif // _TARGET_X86_
1291                 break;
1292
1293             case GenTreeBlk::BlkOpKindRepInstr:
1294                 // rep stos has the following register requirements:
1295                 // a) The memory address to be in RDI.
1296                 // a) The memory address has to be in RDI.
1297                 // c) The buffer size will go in RCX.
1298                 dstAddrRegMask = RBM_RDI;
1299                 sourceRegMask  = RBM_RAX;
1300                 blkSizeRegMask = RBM_RCX;
1301                 break;
1302
1303             case GenTreeBlk::BlkOpKindHelper:
1304 #ifdef _TARGET_AMD64_
1305                 // The helper follows the regular AMD64 ABI.
1306                 dstAddrRegMask = RBM_ARG_0;
1307                 sourceRegMask  = RBM_ARG_1;
1308                 blkSizeRegMask = RBM_ARG_2;
1309 #else  // !_TARGET_AMD64_
1310                 dstAddrRegMask     = RBM_RDI;
1311                 sourceRegMask      = RBM_RAX;
1312                 blkSizeRegMask     = RBM_RCX;
1313 #endif // !_TARGET_AMD64_
1314                 break;
1315
1316             default:
1317                 unreached();
1318         }
1319     }
1320     else
1321     {
1322         // CopyObj or CopyBlk
1323         if (source->gtOper == GT_IND)
1324         {
1325             assert(source->isContained());
1326             srcAddrOrFill = source->gtGetOp1();
1327         }
1328         if (blkNode->OperGet() == GT_STORE_OBJ)
1329         {
1330             if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr)
1331             {
1332                 // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
1333                 blkSizeRegMask = RBM_RCX;
1334             }
1335             // The srcAddr must be in a register.  If it was under a GT_IND, we need to subsume all of its
1336             // sources.
1337             sourceRegMask  = RBM_RSI;
1338             dstAddrRegMask = RBM_RDI;
1339         }
1340         else
1341         {
1342             switch (blkNode->gtBlkOpKind)
1343             {
1344                 case GenTreeBlk::BlkOpKindUnroll:
1345                     // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1346                     //
1347                     // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1348                     // But on x86 only RBM_BYTE_REGS could be used as byte registers.  Therefore, exclude
1349                     // RBM_NON_BYTE_REGS from internal candidates.
1350                     if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
1351                     {
1352                         regMaskTP regMask = allRegs(TYP_INT);
1353
1354 #ifdef _TARGET_X86_
1355                         if ((size & 1) != 0)
1356                         {
1357                             regMask &= ~RBM_NON_BYTE_REGS;
1358                         }
1359 #endif
1360                         buildInternalIntRegisterDefForNode(blkNode, regMask);
1361                     }
1362
1363                     if (size >= XMM_REGSIZE_BYTES)
1364                     {
1365                         // If we have a buffer larger than XMM_REGSIZE_BYTES,
1366                         // reserve an XMM register to use it for a
1367                         // series of 16-byte loads and stores.
1368                         buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
1369                         // This uses an XMM reg for the loads and stores, which may be encoded as AVX
1370                         // instructions, so set the ContainsAVX flag.
1371                         SetContainsAVXFlags();
1372                     }
1373                     break;
1374
1375                 case GenTreeBlk::BlkOpKindRepInstr:
1376                     // rep movs has the following register requirements:
1377                     // a) The dest address has to be in RDI.
1378                     // b) The src address has to be in RSI.
1379                     // c) The buffer size will go in RCX.
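                    // (Illustrative sequence: "mov rdi, dst; mov rsi, src; mov rcx, size; rep movsb".)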
1380                     dstAddrRegMask = RBM_RDI;
1381                     sourceRegMask  = RBM_RSI;
1382                     blkSizeRegMask = RBM_RCX;
1383                     break;
1384
1385                 case GenTreeBlk::BlkOpKindHelper:
1386 #ifdef _TARGET_AMD64_
1387                     // The helper follows the regular AMD64 ABI.
1388                     dstAddrRegMask = RBM_ARG_0;
1389                     sourceRegMask  = RBM_ARG_1;
1390                     blkSizeRegMask = RBM_ARG_2;
1391 #else  // !_TARGET_AMD64_
1392                     dstAddrRegMask = RBM_RDI;
1393                     sourceRegMask  = RBM_RAX;
1394                     blkSizeRegMask = RBM_RCX;
1395 #endif // !_TARGET_AMD64_
1396                     break;
1397
1398                 default:
1399                     unreached();
1400             }
1401         }
1402         if ((srcAddrOrFill == nullptr) && (sourceRegMask != RBM_NONE))
1403         {
1404             // This is a local source; we'll use a temp register for its address.
1405             assert(source->isContained() && source->OperIsLocal());
1406             buildInternalIntRegisterDefForNode(blkNode, sourceRegMask);
1407         }
1408     }
1409
1410     if ((size != 0) && (blkSizeRegMask != RBM_NONE))
1411     {
1412         // Reserve a temp register for the block size argument.
1413         buildInternalIntRegisterDefForNode(blkNode, blkSizeRegMask);
1414     }
1415
1416     if (!dstAddr->isContained() && !blkNode->IsReverseOp())
1417     {
1418         srcCount++;
1419         BuildUse(dstAddr, dstAddrRegMask);
1420     }
1421     if ((srcAddrOrFill != nullptr) && !srcAddrOrFill->isContained())
1422     {
1423         srcCount++;
1424         BuildUse(srcAddrOrFill, sourceRegMask);
1425     }
1426     if (!dstAddr->isContained() && blkNode->IsReverseOp())
1427     {
1428         srcCount++;
1429         BuildUse(dstAddr, dstAddrRegMask);
1430     }
1431
1432     if (size == 0)
1433     {
1434         assert(blkNode->OperIs(GT_STORE_DYN_BLK));
1435         // The block size argument is a third argument to GT_STORE_DYN_BLK
1436         srcCount++;
1437         GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
1438         BuildUse(blockSize, blkSizeRegMask);
1439     }
1440     buildInternalRegisterUses();
1441     regMaskTP killMask = getKillSetForBlockStore(blkNode);
1442     BuildDefsWithKills(blkNode, 0, RBM_NONE, killMask);
1443     return srcCount;
1444 }
1445
1446 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1447 //------------------------------------------------------------------------
1448 // BuildPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
1449 //
1450 // Arguments:
1451 //    tree      - The node of interest
1452 //
1453 // Return Value:
1454 //    The number of sources consumed by this node.
1455 //
1456 int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
1457 {
1458     int srcCount = 0;
1459     if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
1460     {
1461         assert(putArgStk->gtOp1->isContained());
1462
1463         RefPosition* simdTemp   = nullptr;
1464         RefPosition* intTemp    = nullptr;
1465         unsigned     prevOffset = putArgStk->getArgSize();
1466         // We need to iterate over the fields twice; once to determine the need for internal temps,
1467         // and once to actually build the uses.
1468         for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
1469         {
1470             GenTree* const  fieldNode   = current->Current();
1471             const var_types fieldType   = fieldNode->TypeGet();
1472             const unsigned  fieldOffset = current->gtFieldOffset;
1473
1474 #ifdef _TARGET_X86_
1475             assert(fieldType != TYP_LONG);
1476 #endif // _TARGET_X86_
1477
1478 #if defined(FEATURE_SIMD)
1479             // Note that we need to check the GT_FIELD_LIST type, not 'fieldType'. This is because the
1480             // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where
1481             // we "round up" to 16.
1482             if ((current->gtFieldType == TYP_SIMD12) && (simdTemp == nullptr))
1483             {
1484                 simdTemp = buildInternalFloatRegisterDefForNode(putArgStk);
1485             }
1486 #endif // defined(FEATURE_SIMD)
1487
1488 #ifdef _TARGET_X86_
1489             if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push)
1490             {
1491                 // We can treat as a slot any field that is stored at a slot boundary, where the previous
1492                 // field is not in the same slot. (Note that we store the fields in reverse order.)
1493                 const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
1494                 if (intTemp == nullptr)
1495                 {
1496                     intTemp = buildInternalIntRegisterDefForNode(putArgStk);
1497                 }
1498                 if (!fieldIsSlot && varTypeIsByte(fieldType))
1499                 {
1500                     // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
1501                     // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
1502                     // need a byte-addressable register for the store. We will enforce this requirement on an internal
1503                     // register, which we can use to copy multiple byte values.
1504                     intTemp->registerAssignment &= allByteRegs();
1505                 }
1506             }
1507 #endif // _TARGET_X86_
1508
1509             if (varTypeIsGC(fieldType))
1510             {
1511                 putArgStk->gtNumberReferenceSlots++;
1512             }
1513             prevOffset = fieldOffset;
1514         }
1515
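        // Second pass: build a use for each non-contained field.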
1516         for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
1517         {
1518             GenTree* const fieldNode = current->Current();
1519             if (!fieldNode->isContained())
1520             {
1521                 BuildUse(fieldNode);
1522                 srcCount++;
1523             }
1524         }
1525         buildInternalRegisterUses();
1526
1527         return srcCount;
1528     }
1529
1530     GenTree*  src  = putArgStk->gtOp1;
1531     var_types type = src->TypeGet();
1532
1533 #if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1534     // For PutArgStk of a TYP_SIMD12, we need an extra register.
1535     if (putArgStk->isSIMD12())
1536     {
1537         buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
1538         BuildUse(putArgStk->gtOp1);
1539         srcCount = 1;
1540         buildInternalRegisterUses();
1541         return srcCount;
1542     }
1543 #endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1544
1545     if (type != TYP_STRUCT)
1546     {
1547         return BuildSimple(putArgStk);
1548     }
1549
1550     GenTree* dst     = putArgStk;
1551     GenTree* srcAddr = nullptr;
1552
1553     // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
1554     // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
1555     // our framework assemblies, so this is the main code generation scheme we'll use.
1556     ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
1557     switch (putArgStk->gtPutArgStkKind)
1558     {
1559         case GenTreePutArgStk::Kind::Push:
1560         case GenTreePutArgStk::Kind::PushAllSlots:
1561         case GenTreePutArgStk::Kind::Unroll:
1562             // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1563             //
1564             // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1565             // But on x86 only RBM_BYTE_REGS can be used as byte registers. Therefore, exclude
1566             // RBM_NON_BYTE_REGS from internal candidates.
1567             if ((putArgStk->gtNumberReferenceSlots == 0) && (size & (XMM_REGSIZE_BYTES - 1)) != 0)
1568             {
1569                 regMaskTP regMask = allRegs(TYP_INT);
1570
1571 #ifdef _TARGET_X86_
1572                 if ((size % 2) != 0)
1573                 {
1574                     regMask &= ~RBM_NON_BYTE_REGS;
1575                 }
1576 #endif
1577                 buildInternalIntRegisterDefForNode(putArgStk, regMask);
1578             }
1579
1580 #ifdef _TARGET_X86_
1581             if (size >= 8)
1582 #else  // !_TARGET_X86_
1583             if (size >= XMM_REGSIZE_BYTES)
1584 #endif // !_TARGET_X86_
1585             {
1586                 // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
1587                 // or larger than or equal to 8 bytes on x86, reserve an XMM register to use for a
1588                 // series of 16-byte loads and stores.
1589                 buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
1590                 SetContainsAVXFlags();
1591             }
1592             break;
1593
1594         case GenTreePutArgStk::Kind::RepInstr:
1595             buildInternalIntRegisterDefForNode(putArgStk, RBM_RDI);
1596             buildInternalIntRegisterDefForNode(putArgStk, RBM_RCX);
1597             buildInternalIntRegisterDefForNode(putArgStk, RBM_RSI);
1598             break;
1599
1600         default:
1601             unreached();
1602     }
1603
1604     srcCount = BuildOperandUses(src);
1605     buildInternalRegisterUses();
1606     return srcCount;
1607 }
1608 #endif // FEATURE_PUT_STRUCT_ARG_STK
1609
1610 //------------------------------------------------------------------------
1611 // BuildLclHeap: Set the NodeInfo for a GT_LCLHEAP.
1612 //
1613 // Arguments:
1614 //    tree      - The node of interest
1615 //
1616 // Return Value:
1617 //    The number of sources consumed by this node.
1618 //
1619 int LinearScan::BuildLclHeap(GenTree* tree)
1620 {
1621     int srcCount = 1;
1622
1623     // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
1624     // Here '-' means don't care.
1625     //
1626     //     Size?                    Init Memory?         # temp regs
1627     //      0                            -                  0 (returns 0)
1628     //      const and <=6 reg words      -                  0 (pushes '0')
1629     //      const and >6 reg words       Yes                0 (pushes '0')
1630     //      const and <PageSize          No                 0 (amd64) 1 (x86)
1631     //                                                        (x86:tmpReg for subtracting from esp)
1632     //      const and >=PageSize         No                 2 (regCnt and tmpReg for subtracting from sp)
1633     //      Non-const                    Yes                0 (regCnt=targetReg and pushes '0')
1634     //      Non-const                    No                 2 (regCnt and tmpReg for subtracting from sp)
1635     //
1636     // Note: Here we don't need the internal register to be different from targetReg.
1637     // Rather, we require it to be different from the operand's reg.
1638
1639     GenTree* size = tree->gtGetOp1();
1640     if (size->IsCnsIntOrI())
1641     {
1642         assert(size->isContained());
1643         srcCount       = 0;
1644         size_t sizeVal = size->gtIntCon.gtIconVal;
1645
1646         if (sizeVal == 0)
1647         {
1648             buildInternalIntRegisterDefForNode(tree);
1649         }
1650         else
1651         {
1652             // Compute the stack-aligned amount of memory to allocate.
1653             // Note: The GenTree node is not updated here as it is cheap to recompute the stack-aligned size.
1654             // This should also help in debugging as we can examine the original size specified with localloc.
1655             sizeVal = AlignUp(sizeVal, STACK_ALIGN);
1656
1657             // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
1658             // we will generate 'push 0'.
1659             assert((sizeVal % REGSIZE_BYTES) == 0);
1660             size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
1661             if (cntRegSizedWords > 6)
1662             {
1663                 if (!compiler->info.compInitMem)
1664                 {
1665                     // No need to initialize allocated stack space.
1666                     if (sizeVal < compiler->eeGetPageSize())
1667                     {
1668 #ifdef _TARGET_X86_
1669                         // x86 needs a register here to avoid generating "sub" on ESP.
1670                         buildInternalIntRegisterDefForNode(tree);
1671 #endif
1672                     }
1673                     else
1674                     {
1675                         // We need two registers: regCnt and tmpReg
1676                         buildInternalIntRegisterDefForNode(tree);
1677                         buildInternalIntRegisterDefForNode(tree);
1678                     }
1679                 }
1680             }
1681         }
1682     }
1683     else
1684     {
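        // Non-constant size (see the table above): when zero-initializing, no temps are needed
        // since regCnt can reuse the target register; otherwise we need regCnt plus a scratch
        // register for subtracting from SP.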
1685         if (!compiler->info.compInitMem)
1686         {
1687             buildInternalIntRegisterDefForNode(tree);
1688             buildInternalIntRegisterDefForNode(tree);
1689         }
1690         BuildUse(size);
1691     }
1692     buildInternalRegisterUses();
1693     BuildDef(tree);
1694     return srcCount;
1695 }
1696
1697 //------------------------------------------------------------------------
1698 // BuildModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
1699 //
1700 // Arguments:
1701 //    tree      - The node of interest
1702 //
1703 // Return Value:
1704 //    The number of sources consumed by this node.
1705 //
1706 int LinearScan::BuildModDiv(GenTree* tree)
1707 {
1708     GenTree*     op1           = tree->gtGetOp1();
1709     GenTree*     op2           = tree->gtGetOp2();
1710     regMaskTP    dstCandidates = RBM_NONE;
1711     RefPosition* internalDef   = nullptr;
1712     int          srcCount      = 0;
1713
1714     if (varTypeIsFloating(tree->TypeGet()))
1715     {
1716         return BuildSimple(tree);
1717     }
1718
1719     // Amd64 Div/Idiv instruction:
1720     //    Dividend in RDX:RAX and computes
1721     //    Quotient in RAX, Remainder in RDX
1722
1723     if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
1724     {
1725         // We are interested in just the remainder.
1726         // RAX is used as a trashable register during computation of remainder.
1727         dstCandidates = RBM_RDX;
1728     }
1729     else
1730     {
1731         // We are interested in just the quotient.
1732         // RDX gets used as a trashable register during computation of the quotient.
1733         dstCandidates = RBM_RAX;
1734     }
1735
1736 #ifdef _TARGET_X86_
1737     if (op1->OperGet() == GT_LONG)
1738     {
1739         assert(op1->isContained());
1740
1741         // To avoid a register move, we would like to have op1's low part in RAX and its high part in RDX.
1742         GenTree* loVal = op1->gtGetOp1();
1743         GenTree* hiVal = op1->gtGetOp2();
1744         assert(!loVal->isContained() && !hiVal->isContained());
1745
1746         assert(op2->IsCnsIntOrI());
1747         assert(tree->OperGet() == GT_UMOD);
1748
1749         // This situation also requires an internal register.
1750         buildInternalIntRegisterDefForNode(tree);
1751
1752         BuildUse(loVal, RBM_EAX);
1753         BuildUse(hiVal, RBM_EDX);
1754         srcCount = 2;
1755     }
1756     else
1757 #endif
1758     {
1759         // If possible, we would like to have op1 in RAX to avoid a register move.
1760         RefPosition* op1Use = BuildUse(op1, RBM_EAX);
1761         tgtPrefUse          = op1Use;
1762         srcCount            = 1;
1763     }
1764
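    // The divisor must not end up in RAX or RDX: both are implicitly used (and written) by
    // the div/idiv instruction, so exclude them from op2's candidates.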
1765     srcCount += BuildDelayFreeUses(op2, allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
1766
1767     buildInternalRegisterUses();
1768
1769     regMaskTP killMask = getKillSetForModDiv(tree->AsOp());
1770     BuildDefsWithKills(tree, 1, dstCandidates, killMask);
1771     return srcCount;
1772 }
1773
1774 //------------------------------------------------------------------------
1775 // BuildIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
1776 //
1777 // Arguments:
1778 //    tree      - The node of interest
1779 //
1780 // Return Value:
1781 //    The number of sources consumed by this node.
1782 //
1783 int LinearScan::BuildIntrinsic(GenTree* tree)
1784 {
1785     // Both operand and its result must be of floating point type.
1786     GenTree* op1 = tree->gtGetOp1();
1787     assert(varTypeIsFloating(op1));
1788     assert(op1->TypeGet() == tree->TypeGet());
1789     RefPosition* internalFloatDef = nullptr;
1790
1791     switch (tree->gtIntrinsic.gtIntrinsicId)
1792     {
1793         case CORINFO_INTRINSIC_Sqrt:
1794             break;
1795
1796         case CORINFO_INTRINSIC_Abs:
1797             // Abs(float x) = x & 0x7fffffff
1798             // Abs(double x) = x & 0x7fffffffffffffff
1799
1800             // In case of Abs we need an internal register to hold the mask.
1801
1802             // TODO-XArch-CQ: avoid using an internal register for the mask.
1803             // Andps or andpd both will operate on 128-bit operands.
1804             // The data section constant that holds the mask is 64 bits in size.
1805             // Therefore, we need both the operand and the mask to be in
1806             // an xmm register. When we add support in the emitter to emit 128-bit
1807             // data constants and instructions that operate on 128-bit
1808             // memory operands we can avoid the need for an internal register.
1809             if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs)
1810             {
1811                 internalFloatDef = buildInternalFloatRegisterDefForNode(tree, internalFloatRegCandidates());
1812             }
1813             break;
1814
1815 #ifdef _TARGET_X86_
1816         case CORINFO_INTRINSIC_Cos:
1817         case CORINFO_INTRINSIC_Sin:
1818             NYI_X86("Math intrinsics Cos and Sin");
1819             break;
1820 #endif // _TARGET_X86_
1821
1822         case CORINFO_INTRINSIC_Round:
1823         case CORINFO_INTRINSIC_Ceiling:
1824         case CORINFO_INTRINSIC_Floor:
1825             break;
1826
1827         default:
1828             // Right now only Sqrt/Abs/Round/Ceiling/Floor are treated as math intrinsics
1829             noway_assert(!"Unsupported math intrinsic");
1830             unreached();
1831             break;
1832     }
1833     assert(tree->gtGetOp2IfPresent() == nullptr);
1834     int srcCount;
1835     if (op1->isContained())
1836     {
1837         srcCount = BuildOperandUses(op1);
1838     }
1839     else
1840     {
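        // Prefer op1's register for the target register; if they end up the same, no copy is needed.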
1841         tgtPrefUse = BuildUse(op1);
1842         srcCount   = 1;
1843     }
1844     if (internalFloatDef != nullptr)
1845     {
1846         buildInternalRegisterUses();
1847     }
1848     BuildDef(tree);
1849     return srcCount;
1850 }
1851
1852 #ifdef FEATURE_SIMD
1853 //------------------------------------------------------------------------
1854 // BuildSIMD: Set the NodeInfo for a GT_SIMD tree.
1855 //
1856 // Arguments:
1857 //    tree       - The GT_SIMD node of interest
1858 //
1859 // Return Value:
1860 //    The number of sources consumed by this node.
1861 //
1862 int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
1863 {
1864     // Only SIMDIntrinsicInit can be contained. Other than that,
1865     // only SIMDIntrinsicOpEquality and SIMDIntrinsicOpInEquality can have 0 dstCount.
1866     int       dstCount      = simdTree->IsValue() ? 1 : 0;
1867     bool      buildUses     = true;
1868     regMaskTP dstCandidates = RBM_NONE;
1869
1870     if (simdTree->isContained())
1871     {
1872         assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicInit);
1873     }
1874     else if (dstCount != 1)
1875     {
1876         assert((simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ||
1877                (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality));
1878     }
1879     SetContainsAVXFlags(true, simdTree->gtSIMDSize);
1880     GenTree* op1      = simdTree->gtGetOp1();
1881     GenTree* op2      = simdTree->gtGetOp2();
1882     int      srcCount = 0;
1883
1884     switch (simdTree->gtSIMDIntrinsicID)
1885     {
1886         case SIMDIntrinsicInit:
1887         {
1888             // This sets all fields of a SIMD struct to the given value.
1889             // Mark op1 as contained if it is either zero or an int constant of all 1's,
1890             // or a float constant with a 16- or 32-byte simdType (AVX case).
1891             //
1892             // Should never see small int base type vectors except for zero initialization.
1893             assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0));
1894
1895 #if !defined(_TARGET_64BIT_)
1896             if (op1->OperGet() == GT_LONG)
1897             {
1898                 assert(op1->isContained());
1899                 GenTree* op1lo = op1->gtGetOp1();
1900                 GenTree* op1hi = op1->gtGetOp2();
1901
1902                 if (op1lo->isContained())
1903                 {
1904                     srcCount = 0;
1905                     assert(op1hi->isContained());
1906                     assert((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
1907                            (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)));
1908                 }
1909                 else
1910                 {
1911                     srcCount = 2;
1912                     buildInternalFloatRegisterDefForNode(simdTree);
1913                     setInternalRegsDelayFree = true;
1914                 }
1915
1916                 if (srcCount == 2)
1917                 {
1918                     BuildUse(op1lo, RBM_EAX);
1919                     BuildUse(op1hi, RBM_EDX);
1920                 }
1921                 buildUses = false;
1922             }
1923 #endif // !defined(_TARGET_64BIT_)
1924         }
1925         break;
1926
1927         case SIMDIntrinsicInitN:
1928         {
1929             var_types baseType = simdTree->gtSIMDBaseType;
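            // One source per vector element.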
1930             srcCount           = (short)(simdTree->gtSIMDSize / genTypeSize(baseType));
1931             // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
1932             buildInternalFloatRegisterDefForNode(simdTree);
1933             int initCount = 0;
1934             for (GenTree* list = op1; list != nullptr; list = list->gtGetOp2())
1935             {
1936                 assert(list->OperGet() == GT_LIST);
1937                 GenTree* listItem = list->gtGetOp1();
1938                 assert(listItem->TypeGet() == baseType);
1939                 assert(!listItem->isContained());
1940                 BuildUse(listItem);
1941                 initCount++;
1942             }
1943             assert(initCount == srcCount);
1944             buildUses = false;
1945         }
1946         break;
1947
1948         case SIMDIntrinsicInitArray:
1949             // We have an array and an index, which may be contained.
1950             break;
1951
1952         case SIMDIntrinsicDiv:
1953             // SSE2 has no instruction support for division on integer vectors
1954             noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
1955             break;
1956
1957         case SIMDIntrinsicAbs:
1958             // float/double vectors: This gets implemented as a bitwise-AND operation
1959             // with a mask and hence should never be seen here.
1960             //
1961             // Must be a Vector<int>, Vector<short>, or Vector<sbyte>.
1962             assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT ||
1963                    simdTree->gtSIMDBaseType == TYP_BYTE);
1964             assert(compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
1965             break;
1966
1967         case SIMDIntrinsicSqrt:
1968             // SSE2 has no instruction support for sqrt on integer vectors.
1969             noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
1970             break;
1971
1972         case SIMDIntrinsicAdd:
1973         case SIMDIntrinsicSub:
1974         case SIMDIntrinsicMul:
1975         case SIMDIntrinsicBitwiseAnd:
1976         case SIMDIntrinsicBitwiseAndNot:
1977         case SIMDIntrinsicBitwiseOr:
1978         case SIMDIntrinsicBitwiseXor:
1979         case SIMDIntrinsicMin:
1980         case SIMDIntrinsicMax:
1981             // SSE2 32-bit integer multiplication requires two temp regs
1982             if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
1983                 compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
1984             {
1985                 buildInternalFloatRegisterDefForNode(simdTree);
1986                 buildInternalFloatRegisterDefForNode(simdTree);
1987             }
1988             break;
1989
1990         case SIMDIntrinsicEqual:
1991             break;
1992
1993         // SSE2 doesn't support < and <= directly on int vectors.
1994         // Instead we need to use > and >= with swapped operands.
1995         case SIMDIntrinsicLessThan:
1996         case SIMDIntrinsicLessThanOrEqual:
1997             noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
1998             break;
1999
2000         // SIMDIntrinsicGreaterThan is supported only on non-floating point base type vectors.
2001         // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors.
2002         // Instead we need to use < and <= with swapped operands.
2003         case SIMDIntrinsicGreaterThan:
2004             noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
2005             break;
2006
2007         case SIMDIntrinsicOpEquality:
2008         case SIMDIntrinsicOpInEquality:
2009             if (simdTree->gtGetOp2()->isContained())
2010             {
2011                 // If the second operand is contained then ContainCheckSIMD has determined
2012                 // that PTEST can be used. We only need a single source register and no
2013                 // internal registers.
2014             }
2015             else
2016             {
2017                 // Can't use PTEST so we need 2 source registers, 1 internal SIMD register
2018                 // (to hold the result of PCMPEQD or other similar SIMD compare instruction)
2019                 // and one internal INT register (to hold the result of PMOVMSKB).
2020                 buildInternalIntRegisterDefForNode(simdTree);
2021                 buildInternalFloatRegisterDefForNode(simdTree);
2022             }
2023             // These SIMD nodes only set the condition flags.
2024             dstCount = 0;
2025             break;
2026
2027         case SIMDIntrinsicDotProduct:
2028             // Float/Double vectors:
2029             // For SSE, or AVX with 32-byte vectors, we also need an internal register
2030             // as scratch. Further we need the targetReg and internal reg to be distinct
2031             // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
2032             // don't need a tmpReg.
2033             //
2034             // 32-byte integer vector on SSE4/AVX:
2035             // will take advantage of phaddd, which operates only on 128-bit xmm reg.
2036             // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
2037             // registers since targetReg is an int type register.
2038             //
2039             // See genSIMDIntrinsicDotProduct() for details on code sequence generated
2040             // and the need for scratch registers.
2041             if (varTypeIsFloating(simdTree->gtSIMDBaseType))
2042             {
2043                 if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) ||
2044                     (simdTree->gtGetOp1()->TypeGet() == TYP_SIMD32))
2045                 {
2046                     buildInternalFloatRegisterDefForNode(simdTree);
2047                     setInternalRegsDelayFree = true;
2048                 }
2049                 // else don't need scratch reg(s).
2050             }
2051             else
2052             {
2053                 assert(simdTree->gtSIMDBaseType == TYP_INT && compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
2054
2055                 // No need to setInternalRegsDelayFree since targetReg is
2056                 // an int type reg and guaranteed to be different from xmm/ymm
2057                 // regs.
2058                 buildInternalFloatRegisterDefForNode(simdTree);
2059                 if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
2060                 {
2061                     buildInternalFloatRegisterDefForNode(simdTree);
2062                 }
2063             }
2064             break;
2065
2066         case SIMDIntrinsicGetItem:
2067         {
2068             // This implements get_Item method. The sources are:
2069             //  - the source SIMD struct
2070             //  - index (which element to get)
2071             // The result is baseType of SIMD struct.
2072             // op1 may be a contained memory op, but if so we will consume its address.
2073             // op2 may be a contained constant.
2074             op1 = simdTree->gtGetOp1();
2075             op2 = simdTree->gtGetOp2();
2076
2077             if (!op1->isContained())
2078             {
2079                 // If the index is not a constant, we will use the SIMD temp location to store the vector.
2080                 // Otherwise, if the baseType is floating point, the targetReg will be an xmm reg and we
2081                 // can use that in the process of extracting the element.
2082                 //
2083                 // If the index is a constant and base type is a small int we can use pextrw, but on AVX
2084                 // we will need a temp if we are indexing into the upper half of the AVX register.
2085                 // In all other cases with a constant index, we need a temp xmm register to extract the
2086                 // element if the index is other than zero.
2087
2088                 if (!op2->IsCnsIntOrI())
2089                 {
2090                     (void)compiler->getSIMDInitTempVarNum();
2091                 }
2092                 else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
2093                 {
2094                     bool needFloatTemp;
2095                     if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
2096                         (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported))
2097                     {
2098                         int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
2099                         needFloatTemp    = (byteShiftCnt >= 16);
2100                     }
2101                     else
2102                     {
2103                         needFloatTemp = !op2->IsIntegralConst(0);
2104                     }
2105
2106                     if (needFloatTemp)
2107                     {
2108                         buildInternalFloatRegisterDefForNode(simdTree);
2109                     }
2110                 }
2111 #ifdef _TARGET_X86_
2112                 // This logic is duplicated from genSIMDIntrinsicGetItem().
2113                 // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
2114                 // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
2115                 // cases will require this, so the non-byteable registers can be excluded.
2116
2117                 var_types baseType = simdTree->gtSIMDBaseType;
2118                 if (op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
2119                 {
2120                     bool     ZeroOrSignExtnReqd = true;
2121                     unsigned baseSize           = genTypeSize(baseType);
2122                     if (baseSize == 1)
2123                     {
2124                         if ((op2->gtIntCon.gtIconVal % 2) == 1)
2125                         {
2126                             ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
2127                         }
2128                     }
2129                     else
2130                     {
2131                         assert(baseSize == 2);
2132                         ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
2133                     }
2134                     if (ZeroOrSignExtnReqd)
2135                     {
2136                         dstCandidates = allByteRegs();
2137                     }
2138                 }
2139 #endif // _TARGET_X86_
2140             }
2141         }
2142         break;
2143
2144         case SIMDIntrinsicSetX:
2145         case SIMDIntrinsicSetY:
2146         case SIMDIntrinsicSetZ:
2147         case SIMDIntrinsicSetW:
2148             // We need an internal integer register for SSE2 codegen
2149             if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
2150             {
2151                 buildInternalIntRegisterDefForNode(simdTree);
2152             }
2153
2154             break;
2155
2156         case SIMDIntrinsicCast:
2157             break;
2158
2159         case SIMDIntrinsicConvertToSingle:
2160             if (simdTree->gtSIMDBaseType == TYP_UINT)
2161             {
2162                 // We need an internal register different from targetReg.
2163                 setInternalRegsDelayFree = true;
2164                 buildInternalFloatRegisterDefForNode(simdTree);
2165                 buildInternalFloatRegisterDefForNode(simdTree);
2166                 // We also need an integer register.
2167                 buildInternalIntRegisterDefForNode(simdTree);
2168             }
2169             break;
2170
2171         case SIMDIntrinsicConvertToInt32:
2172             break;
2173
2174         case SIMDIntrinsicWidenLo:
2175         case SIMDIntrinsicWidenHi:
2176             if (varTypeIsIntegral(simdTree->gtSIMDBaseType))
2177             {
2178                 // We need an internal register different from targetReg.
2179                 setInternalRegsDelayFree = true;
2180                 buildInternalFloatRegisterDefForNode(simdTree);
2181             }
2182             break;
2183
2184         case SIMDIntrinsicConvertToInt64:
2185             // We need an internal register different from targetReg.
2186             setInternalRegsDelayFree = true;
2187             buildInternalFloatRegisterDefForNode(simdTree);
2188             if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
2189             {
2190                 buildInternalFloatRegisterDefForNode(simdTree);
2191             }
2192             // We also need an integer register.
2193             buildInternalIntRegisterDefForNode(simdTree);
2194             break;
2195
2196         case SIMDIntrinsicConvertToDouble:
2197             // We need an internal register different from targetReg.
2198             setInternalRegsDelayFree = true;
2199             buildInternalFloatRegisterDefForNode(simdTree);
2200 #ifdef _TARGET_X86_
2201             if (simdTree->gtSIMDBaseType == TYP_LONG)
2202             {
2203                 buildInternalFloatRegisterDefForNode(simdTree);
2204                 buildInternalFloatRegisterDefForNode(simdTree);
2205             }
2206             else
2207 #endif
2208                 if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) || (simdTree->gtSIMDBaseType == TYP_ULONG))
2209             {
2210                 buildInternalFloatRegisterDefForNode(simdTree);
2211             }
2212             // We also need an integer register.
2213             buildInternalIntRegisterDefForNode(simdTree);
2214             break;
2215
2216         case SIMDIntrinsicNarrow:
2217             // We need an internal register different from targetReg.
2218             setInternalRegsDelayFree = true;
2219             buildInternalFloatRegisterDefForNode(simdTree);
2220             if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->gtSIMDBaseType != TYP_DOUBLE))
2221             {
2222                 buildInternalFloatRegisterDefForNode(simdTree);
2223             }
2224             break;
2225
2226         case SIMDIntrinsicShuffleSSE2:
2227             // Second operand is an integer constant and marked as contained.
2228             assert(simdTree->gtGetOp2()->isContainedIntOrIImmed());
2229             break;
2230
2231         case SIMDIntrinsicGetX:
2232         case SIMDIntrinsicGetY:
2233         case SIMDIntrinsicGetZ:
2234         case SIMDIntrinsicGetW:
2235         case SIMDIntrinsicGetOne:
2236         case SIMDIntrinsicGetZero:
2237         case SIMDIntrinsicGetCount:
2238         case SIMDIntrinsicGetAllOnes:
2239             assert(!"Get intrinsics should not be seen during Lowering.");
2240             unreached();
2241
2242         default:
2243             noway_assert(!"Unimplemented SIMD node type.");
2244             unreached();
2245     }
2246     if (buildUses)
2247     {
2248         assert(!op1->OperIs(GT_LIST));
2249         assert(srcCount == 0);
2250         // This is overly conservative, but is here for zero diffs.
2251         srcCount = BuildRMWUses(simdTree);
2252     }
2253     buildInternalRegisterUses();
2254     if (dstCount == 1)
2255     {
2256         BuildDef(simdTree, dstCandidates);
2257     }
2258     else
2259     {
2260         assert(dstCount == 0);
2261     }
2262     return srcCount;
2263 }
2264 #endif // FEATURE_SIMD
2265
2266 #ifdef FEATURE_HW_INTRINSICS
2267 //------------------------------------------------------------------------
2268 // BuildHWIntrinsic: Set the NodeInfo for a GT_HWIntrinsic tree.
2269 //
2270 // Arguments:
2271 //    tree       - The GT_HWIntrinsic node of interest
2272 //
2273 // Return Value:
2274 //    The number of sources consumed by this node.
2275 //
2276 int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
2277 {
2278     NamedIntrinsic      intrinsicId = intrinsicTree->gtHWIntrinsicId;
2279     var_types           baseType    = intrinsicTree->gtSIMDBaseType;
2280     InstructionSet      isa         = HWIntrinsicInfo::lookupIsa(intrinsicId);
2281     HWIntrinsicCategory category    = HWIntrinsicInfo::lookupCategory(intrinsicId);
2282     int                 numArgs     = HWIntrinsicInfo::lookupNumArgs(intrinsicTree);
2283
2284     if ((isa == InstructionSet_AVX) || (isa == InstructionSet_AVX2))
2285     {
2286         SetContainsAVXFlags(true, 32);
2287     }
2288
2289     GenTree* op1    = intrinsicTree->gtGetOp1();
2290     GenTree* op2    = intrinsicTree->gtGetOp2();
2291     GenTree* op3    = nullptr;
2292     GenTree* lastOp = nullptr;
2293
2294     int srcCount = 0;
2295     int dstCount = intrinsicTree->IsValue() ? 1 : 0;
2296
2297     regMaskTP dstCandidates = RBM_NONE;
2298
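    // Intrinsics with no operands (numArgs == 0) have no uses to build; any def is built below.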
2299     if (op1 == nullptr)
2300     {
2301         assert(op2 == nullptr);
2302         assert(numArgs == 0);
2303     }
2304     else
2305     {
2306         if (op1->OperIsList())
2307         {
2308             assert(op2 == nullptr);
2309             assert(numArgs == 3);
2310
2311             GenTreeArgList* argList = op1->AsArgList();
2312
2313             op1     = argList->Current();
2314             argList = argList->Rest();
2315
2316             op2     = argList->Current();
2317             argList = argList->Rest();
2318
2319             op3     = argList->Current();
2320             argList = argList->Rest();
2321
2322             lastOp = op3;
2323             assert(argList == nullptr);
2324         }
2325         else if (op2 != nullptr)
2326         {
2327             assert(numArgs == 2);
2328             lastOp = op2;
2329         }
2330         else
2331         {
2332             assert(numArgs == 1);
2333             lastOp = op1;
2334         }
2335
2336         assert(lastOp != nullptr);
2337
2338         bool buildUses = true;
2339
2340         if ((category == HW_Category_IMM) && !HWIntrinsicInfo::NoJmpTableImm(intrinsicId))
2341         {
2342             if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && !lastOp->isContainedIntOrIImmed())
2343             {
2344                 assert(!lastOp->IsCnsIntOrI());
2345
2346                 // We need two extra registers when lastOp isn't a constant so
2347                 // the offset into the jump table for the fallback path
2348                 // can be computed.
2349                 buildInternalIntRegisterDefForNode(intrinsicTree);
2350                 buildInternalIntRegisterDefForNode(intrinsicTree);
2351             }
2352         }
2353
2354         // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it
2355         // is not allocated the same register as the target.
2356         bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler);
2357
2358         // Create internal temps, and handle any other special requirements.
2359         // Note that the default case for building uses will handle the RMW flag, but if the uses
2360         // are built in the individual cases, buildUses is set to false, and any RMW handling (delayFree)
2361         // must be handled within the case.
2362         switch (intrinsicId)
2363         {
2364             case NI_SSE_CompareEqualOrderedScalar:
2365             case NI_SSE_CompareEqualUnorderedScalar:
2366             case NI_SSE_CompareNotEqualOrderedScalar:
2367             case NI_SSE_CompareNotEqualUnorderedScalar:
2368             case NI_SSE2_CompareEqualOrderedScalar:
2369             case NI_SSE2_CompareEqualUnorderedScalar:
2370             case NI_SSE2_CompareNotEqualOrderedScalar:
2371             case NI_SSE2_CompareNotEqualUnorderedScalar:
2372             {
2373                 buildInternalIntRegisterDefForNode(intrinsicTree, allByteRegs());
2374                 setInternalRegsDelayFree = true;
2375                 break;
2376             }
2377
2378             case NI_SSE_SetScalarVector128:
2379             case NI_SSE2_SetScalarVector128:
2380             {
2381                 buildInternalFloatRegisterDefForNode(intrinsicTree);
2382                 setInternalRegsDelayFree = true;
2383                 break;
2384             }
2385
2386             case NI_SSE_ConvertToSingle:
2387             case NI_SSE2_ConvertToDouble:
2388             case NI_AVX_ExtendToVector256:
2389             case NI_AVX_GetLowerHalf:
2390             {
2391                 assert(numArgs == 1);
2392                 assert(!isRMW);
2393                 assert(dstCount == 1);
2394
2395                 if (!op1->isContained())
2396                 {
2397                     tgtPrefUse = BuildUse(op1);
2398                     srcCount   = 1;
2399                 }
2400                 else
2401                 {
2402                     srcCount += BuildOperandUses(op1);
2403                 }
2404
2405                 buildUses = false;
2406                 break;
2407             }
2408
2409             case NI_AVX_SetAllVector256:
2410             {
2411                 if (varTypeIsIntegral(baseType))
2412                 {
2413                     buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs());
2414                     if (!compiler->compSupports(InstructionSet_AVX2) && varTypeIsByte(baseType))
2415                     {
2416                         buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs());
2417                     }
2418                 }
2419                 break;
2420             }
2421
2422             case NI_SSE2_MaskMove:
2423             {
2424                 assert(numArgs == 3);
2425                 assert(!isRMW);
2426
2427                 // MaskMove hardcodes the destination (op3) in DI/EDI/RDI
2428                 srcCount += BuildOperandUses(op1);
2429                 srcCount += BuildOperandUses(op2);
2430                 srcCount += BuildOperandUses(op3, RBM_EDI);
2431
2432                 buildUses = false;
2433                 break;
2434             }
2435
2436             case NI_SSE41_BlendVariable:
2437             {
2438                 assert(numArgs == 3);
2439
2440                 if (!compiler->canUseVexEncoding())
2441                 {
2442                     assert(isRMW);
2443
2444                     // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
2445                     srcCount += BuildOperandUses(op1);
2446                     srcCount += BuildDelayFreeUses(op2);
2447                     srcCount += BuildDelayFreeUses(op3, RBM_XMM0);
2448
2449                     buildUses = false;
2450                 }
2451                 break;
2452             }
2453
2454             case NI_SSE41_TestAllOnes:
2455             {
2456                 buildInternalFloatRegisterDefForNode(intrinsicTree);
2457                 break;
2458             }
2459
2460             case NI_SSE41_Extract:
2461             {
2462                 if (baseType == TYP_FLOAT)
2463                 {
2464                     buildInternalIntRegisterDefForNode(intrinsicTree);
2465                 }
2466 #ifdef _TARGET_X86_
2467                 else if (varTypeIsByte(baseType))
2468                 {
2469                     dstCandidates = allByteRegs();
2470                 }
2471 #endif
2472                 break;
2473             }
2474
2475 #ifdef _TARGET_X86_
2476             case NI_SSE42_Crc32:
2477             {
2478                 // TODO-XArch-Cleanup: Currently we use the BaseType to bring the type of the second argument
2479                 // to the code generator. We may want to encode the overload info in another way.
2480
2481                 assert(numArgs == 2);
2482                 assert(isRMW);
2483
2484                 // CRC32 may operate over "byte" but on x86 only RBM_BYTE_REGS can be used as byte registers.
2485                 srcCount += BuildOperandUses(op1);
2486                 srcCount += BuildDelayFreeUses(op2, varTypeIsByte(baseType) ? allByteRegs() : RBM_NONE);
2487
2488                 buildUses = false;
2489                 break;
2490             }
2491 #endif // _TARGET_X86_
2492
2493             case NI_FMA_MultiplyAdd:
2494             case NI_FMA_MultiplyAddNegated:
2495             case NI_FMA_MultiplyAddNegatedScalar:
2496             case NI_FMA_MultiplyAddScalar:
2497             case NI_FMA_MultiplyAddSubtract:
2498             case NI_FMA_MultiplySubtract:
2499             case NI_FMA_MultiplySubtractAdd:
2500             case NI_FMA_MultiplySubtractNegated:
2501             case NI_FMA_MultiplySubtractNegatedScalar:
2502             case NI_FMA_MultiplySubtractScalar:
2503             {
2504                 assert(numArgs == 3);
2505                 assert(isRMW);
2506
2507                 const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);
2508
2509                 // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
2510                 // Intrinsics with CopyUpperBits semantics cannot have op1 contained.
2511
2512                 if (op3->isContained())
2513                 {
2514                     // 213 form: op1 = (op2 * op1) + [op3]
2515
2516                     if (copiesUpperBits)
2517                     {
2518                         tgtPrefUse = BuildUse(op1);
2519
2520                         srcCount += 1;
2521                         srcCount += BuildDelayFreeUses(op2);
2522                     }
2523                     else
2524                     {
2525                         // op1 and op2 are commutative, so don't
2526                         // set either to be tgtPref or delayFree
2527
2528                         srcCount += BuildOperandUses(op1);
2529                         srcCount += BuildOperandUses(op2);
2530                     }
2531
2532                     srcCount += BuildOperandUses(op3);
2533                 }
2534                 else if (op2->isContained())
2535                 {
2536                     // 132 form: op1 = (op1 * op3) + [op2]
2537
2538                     tgtPrefUse = BuildUse(op1);
2539
2540                     srcCount += 1;
2541                     srcCount += BuildOperandUses(op2);
2542                     srcCount += BuildDelayFreeUses(op3);
2543                 }
2544                 else if (op1->isContained())
2545                 {
2546                     // 231 form: op3 = (op2 * op3) + [op1]
2547
2548                     tgtPrefUse = BuildUse(op3);
2549
2550                     srcCount += BuildOperandUses(op1);
2551                     srcCount += BuildDelayFreeUses(op2);
2552                     srcCount += 1;
2553                 }
2554                 else
2555                 {
2556                     // 213 form: op1 = (op2 * op1) + op3
2557
2558                     if (copiesUpperBits)
2559                     {
2560                         tgtPrefUse = BuildUse(op1);
2561
2562                         srcCount += 1;
2563                         srcCount += BuildDelayFreeUses(op2);
2564                     }
2565                     else
2566                     {
2567                         // op1 and op2 are commutative, so don't
2568                         // set either to be tgtPref or delayFree
2569
2570                         srcCount += BuildOperandUses(op1);
2571                         srcCount += BuildOperandUses(op2);
2572                     }
2573
2574                     srcCount += BuildDelayFreeUses(op3);
2575                 }
2576
2577                 buildUses = false;
2578                 break;
2579             }
2580
2581             default:
2582             {
2583                 assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END));
2584                 break;
2585             }
2586         }
2587
2588         if (buildUses)
2589         {
2590             assert((numArgs > 0) && (numArgs < 4));
2591
2592             srcCount += BuildOperandUses(op1);
2593
2594             if (op2 != nullptr)
2595             {
2596                 srcCount += (isRMW) ? BuildDelayFreeUses(op2) : BuildOperandUses(op2);
2597
2598                 if (op3 != nullptr)
2599                 {
2600                     srcCount += (isRMW) ? BuildDelayFreeUses(op3) : BuildOperandUses(op3);
2601                 }
2602             }
2603         }
2604
2605         buildInternalRegisterUses();
2606     }
2607
2608     if (dstCount == 1)
2609     {
2610         BuildDef(intrinsicTree, dstCandidates);
2611     }
2612     else
2613     {
2614         assert(dstCount == 0);
2615     }
2616
2617     return srcCount;
2618 }
2619 #endif
2620
2621 //------------------------------------------------------------------------
2622 // BuildCast: Set the NodeInfo for a GT_CAST.
2623 //
2624 // Arguments:
2625 //    tree      - The node of interest
2626 //
2627 // Return Value:
2628 //    The number of sources consumed by this node.
2629 //
2630 int LinearScan::BuildCast(GenTree* tree)
2631 {
2632     // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register.
2633     //         see CodeGen::genIntToIntCast()
2634
2635     // Non-overflow casts to/from float/double are done using SSE2 instructions,
2636     // which allow the source operand to be either a reg or a memop. Given that
2637     // casts from small int to float/double are done as two-level casts,
2638     // the source operand is always guaranteed to be of size 4 or 8 bytes.
2639     var_types castToType = tree->CastToType();
2640     GenTree*  castOp     = tree->gtCast.CastOp();
2641     var_types castOpType = castOp->TypeGet();
2642     regMaskTP candidates = RBM_NONE;
2643
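    // For casts, GTF_UNSIGNED describes the source operand, so switch castOpType to its unsigned counterpart.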
2644     if (tree->gtFlags & GTF_UNSIGNED)
2645     {
2646         castOpType = genUnsignedType(castOpType);
2647     }
2648
2649 #ifdef _TARGET_X86_
2650     if (varTypeIsByte(castToType))
2651     {
2652         candidates = allByteRegs();
2653     }
2654 #endif // _TARGET_X86_
2655
2656     // some overflow checks need a temp reg:
2657     //  - GT_CAST from INT64/UINT64 to UINT32
2658     RefPosition* internalDef = nullptr;
2659     if (tree->gtOverflow() && (castToType == TYP_UINT))
2660     {
2661         if (genTypeSize(castOpType) == 8)
2662         {
2663             // Here we don't need the internal register to be different from targetReg;
2664             // rather, we require it to be different from the operand's reg.
2665             buildInternalIntRegisterDefForNode(tree);
2666         }
2667     }
2668     int srcCount = BuildOperandUses(castOp, candidates);
2669     buildInternalRegisterUses();
2670     BuildDef(tree, candidates);
2671     return srcCount;
2672 }
2673
2674 //-----------------------------------------------------------------------------------------
2675 // BuildIndir: Specify register requirements for address expression of an indirection operation.
2676 //
2677 // Arguments:
2678 //    indirTree    -   GT_IND or GT_STOREIND gentree node
2679 //
2680 // Return Value:
2681 //    The number of sources consumed by this node.
2682 //
2683 int LinearScan::BuildIndir(GenTreeIndir* indirTree)
2684 {
2685     // If this is the rhs of a block copy (i.e. non-enregisterable struct),
2686     // it has no register requirements.
2687     if (indirTree->TypeGet() == TYP_STRUCT)
2688     {
2689         return 0;
2690     }
2691
2692 #ifdef FEATURE_SIMD
2693     RefPosition* internalFloatDef = nullptr;
2694     if (indirTree->TypeGet() == TYP_SIMD12)
2695     {
2696         // If indirTree is of TYP_SIMD12, addr is not contained. See comment in LowerIndir().
2697         assert(!indirTree->Addr()->isContained());
2698
2699         // Vector3 is read/written as two reads/writes: 8 byte and 4 byte.
2700         // To assemble the vector properly we would need an additional
2701         // XMM register.
2702         internalFloatDef = buildInternalFloatRegisterDefForNode(indirTree);
2703
2704         // In case of GT_IND we need an internal register different from targetReg and
2705         // both of the registers are used at the same time.
2706         if (indirTree->OperGet() == GT_IND)
2707         {
2708             setInternalRegsDelayFree = true;
2709         }
2710     }
2711 #endif // FEATURE_SIMD
2712
2713     regMaskTP indirCandidates = RBM_NONE;
2714     int       srcCount        = BuildIndirUses(indirTree, indirCandidates);
2715     if (indirTree->gtOper == GT_STOREIND)
2716     {
2717         GenTree* source = indirTree->gtGetOp2();
2718         if (indirTree->AsStoreInd()->IsRMWMemoryOp())
2719         {
2720             // Because 'source' is contained, we haven't yet determined its special register requirements, if any.
2721             // As it happens, the Shift or Rotate cases are the only ones with special requirements.
2722             assert(source->isContained() && source->OperIsRMWMemOp());
2723             GenTree*      nonMemSource = nullptr;
2724             GenTreeIndir* otherIndir   = nullptr;
2725
2726             if (source->OperIsShiftOrRotate())
2727             {
2728                 srcCount += BuildShiftRotate(source);
2729             }
2730             else
2731             {
2732                 regMaskTP srcCandidates = RBM_NONE;
2733
2734 #ifdef _TARGET_X86_
2735                 // Determine if we need byte regs for the non-mem source, if any.
2736                 // Note that BuildShiftRotate (above) will handle the byte requirement as needed,
2737                 // but STOREIND isn't itself an RMW op, so we have to explicitly set it for that case.
2738
2741                 if (indirTree->AsStoreInd()->IsRMWDstOp1())
2742                 {
2743                     otherIndir = source->gtGetOp1()->AsIndir();
2744                     if (source->OperIsBinary())
2745                     {
2746                         nonMemSource = source->gtGetOp2();
2747                     }
2748                 }
2749                 else if (indirTree->AsStoreInd()->IsRMWDstOp2())
2750                 {
2751                     otherIndir   = source->gtGetOp2()->AsIndir();
2752                     nonMemSource = source->gtGetOp1();
2753                 }
2754                 if ((nonMemSource != nullptr) && !nonMemSource->isContained() && varTypeIsByte(indirTree))
2755                 {
2756                     srcCandidates = RBM_BYTE_REGS;
2757                 }
2758 #endif
2759                 if (otherIndir != nullptr)
2760                 {
2761                     // Any lclVars in the addressing mode of this indirection are contained.
2762                     // If they are marked as lastUse, transfer the last use flag to the store indir.
2763                     GenTree* base    = otherIndir->Base();
2764                     GenTree* dstBase = indirTree->Base();
2765                     CheckAndMoveRMWLastUse(base, dstBase);
2766                     GenTree* index    = otherIndir->Index();
2767                     GenTree* dstIndex = indirTree->Index();
2768                     CheckAndMoveRMWLastUse(index, dstIndex);
2769                 }
2770                 srcCount += BuildBinaryUses(source->AsOp(), srcCandidates);
2771             }
2772         }
2773         else
2774         {
2775 #ifdef _TARGET_X86_
2776             if (varTypeIsByte(indirTree) && !source->isContained())
2777             {
2778                 BuildUse(source, allByteRegs());
2779                 srcCount++;
2780             }
2781             else
2782 #endif
2783             {
2784                 srcCount += BuildOperandUses(source);
2785             }
2786         }
2787     }
2788 #ifdef FEATURE_SIMD
2789     buildInternalRegisterUses();
2790 #endif // FEATURE_SIMD
2791
2792     if (indirTree->gtOper != GT_STOREIND)
2793     {
2794         BuildDef(indirTree);
2795     }
2796     return srcCount;
2797 }
2798
2799 //------------------------------------------------------------------------
2800 // BuildMul: Set the NodeInfo for a multiply.
2801 //
2802 // Arguments:
2803 //    tree      - The node of interest
2804 //
2805 // Return Value:
2806 //    The number of sources consumed by this node.
2807 //
2808 int LinearScan::BuildMul(GenTree* tree)
2809 {
2810     assert(tree->OperIsMul());
2811     GenTree* op1 = tree->gtGetOp1();
2812     GenTree* op2 = tree->gtGetOp2();
2813
2814     // Only non-floating point mul has special requirements
2815     if (varTypeIsFloating(tree->TypeGet()))
2816     {
2817         return BuildSimple(tree);
2818     }
2819
2820     int       srcCount      = BuildBinaryUses(tree->AsOp());
2821     regMaskTP dstCandidates = RBM_NONE;
2822
2823     bool isUnsignedMultiply    = ((tree->gtFlags & GTF_UNSIGNED) != 0);
2824     bool requiresOverflowCheck = tree->gtOverflowEx();
2825
2826     // There are three forms of x86 multiply:
2827     // one-op form:     RDX:RAX = RAX * r/m
2828     // two-op form:     reg *= r/m
2829     // three-op form:   reg = r/m * imm
2830
2831     // This special widening 32x32->64 MUL is not used on x64
2832     CLANG_FORMAT_COMMENT_ANCHOR;
2833 #if defined(_TARGET_X86_)
2834     if (tree->OperGet() != GT_MUL_LONG)
2835 #endif
2836     {
2837         assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
2838     }
2839
2840     // We do use the widening multiply to implement
2841     // the overflow checking for unsigned multiply
2842     //
2843     if (isUnsignedMultiply && requiresOverflowCheck)
2844     {
2845         // The only encoding provided is RDX:RAX = RAX * rm
2846         //
2847         // Here we set RAX as the only destination candidate
2848         // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX
2849         //
2850         dstCandidates = RBM_RAX;
2851     }
2852     else if (tree->OperGet() == GT_MULHI)
2853     {
2854         // Have to use the encoding RDX:RAX = RAX * rm. Since we only care about the
2855         // upper half of the result, set the destination candidate to RBM_RDX.
2856         dstCandidates = RBM_RDX;
2857     }
2858 #if defined(_TARGET_X86_)
2859     else if (tree->OperGet() == GT_MUL_LONG)
2860     {
2861         // Have to use the encoding RDX:RAX = RAX * rm.
2862         dstCandidates = RBM_RAX;
2863     }
2864 #endif
2865     GenTree* containedMemOp = nullptr;
2866     if (op1->isContained() && !op1->IsCnsIntOrI())
2867     {
2868         assert(!op2->isContained() || op2->IsCnsIntOrI());
2869         containedMemOp = op1;
2870     }
2871     else if (op2->isContained() && !op2->IsCnsIntOrI())
2872     {
2873         containedMemOp = op2;
2874     }
2875     regMaskTP killMask = getKillSetForMul(tree->AsOp());
2876     BuildDefsWithKills(tree, 1, dstCandidates, killMask);
2877     return srcCount;
2878 }
2879
2880 //------------------------------------------------------------------------------
2881 // SetContainsAVXFlags: Set the ContainsAVX flag when the type is a floating point type; also set the
2882 // Contains256bitAVX flag when the SIMD vector size is 32 bytes.
2883 //
2884 // Arguments:
2885 //    isFloatingPointType   - true if it is floating point type
2886 //    sizeOfSIMDVector      - SIMD Vector size
2887 //
2888 void LinearScan::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
2889 {
2890     if (isFloatingPointType && compiler->canUseVexEncoding())
2891     {
2892         compiler->getEmitter()->SetContainsAVX(true);
2893         if (sizeOfSIMDVector == 32)
2894         {
2895             compiler->getEmitter()->SetContains256bitAVX(true);
2896         }
2897     }
2898 }
2899
2900 #endif // _TARGET_XARCH_