src/jit/lsraxarch.cpp
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7 XX                                                                           XX
8 XX                    Register Requirements for AMD64                        XX
9 XX                                                                           XX
10 XX  This encapsulates all the logic for setting register requirements for    XX
11 XX  the AMD64 architecture.                                                  XX
12 XX                                                                           XX
13 XX                                                                           XX
14 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
15 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
16 */
17
18 #include "jitpch.h"
19 #ifdef _MSC_VER
20 #pragma hdrstop
21 #endif
22
23 #ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator
24
25 #ifdef _TARGET_XARCH_
26
27 #include "jit.h"
28 #include "sideeffects.h"
29 #include "lower.h"
30
31 //------------------------------------------------------------------------
32 // BuildNode: Set register requirements for a node
33 //
34 // Arguments:
35 //    treeNode - the node of interest
36 //
37 // Notes:
38 // Preconditions:
39 //    LSRA has been initialized and there is a TreeNodeInfo node
40 //    already allocated and initialized for every tree in the IR.
41 // Postconditions:
42 //    Every TreeNodeInfo instance has the right annotations on register
43 //    requirements needed by LSRA to build the Interval Table (source,
44 //    destination and internal [temp] register counts).
45 //
46 void LinearScan::BuildNode(GenTree* tree)
47 {
48     TreeNodeInfo* info = currentNodeInfo;
49     assert(!tree->isContained());
50
51     if (tree->IsValue())
52     {
53         info->dstCount = 1;
54         if (tree->IsUnusedValue())
55         {
56             info->isLocalDefUse = true;
57         }
58     }
59     else
60     {
61         info->dstCount = 0;
62     }
63
64     // floating type generates AVX instruction (vmovss etc.), set the flag
65     SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet()));
66     switch (tree->OperGet())
67     {
68         default:
69             BuildSimple(tree);
70             break;
71
72         case GT_LCL_VAR:
73             // Because we do containment analysis before we redo dataflow and identify register
74             // candidates, the containment analysis only uses !lvDoNotEnregister to estimate register
75             // candidates.
76             // If there is a lclVar that was estimated to be a register candidate but turns out not to
77             // be one, and it was marked regOptional, it should now be marked contained instead.
78             // TODO-XArch-CQ: When this is being called while RefPositions are being created,
79             // use lvLRACandidate here instead.
80             if (tree->IsRegOptional())
81             {
82                 if (!compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvTracked ||
83                     compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvDoNotEnregister)
84                 {
85                     tree->ClearRegOptional();
86                     tree->SetContained();
87                     info->dstCount = 0;
88                     return;
89                 }
90             }
91             __fallthrough;
92
93         case GT_LCL_FLD:
94             info->srcCount = 0;
95
96 #ifdef FEATURE_SIMD
97             // Need an additional register to read upper 4 bytes of Vector3.
98             if (tree->TypeGet() == TYP_SIMD12)
99             {
100                 // We need an internal register different from targetReg in which 'tree' produces its result
101                 // because both targetReg and internal reg will be in use at the same time.
102                 info->internalFloatCount     = 1;
103                 info->isInternalRegDelayFree = true;
104                 info->setInternalCandidates(this, allSIMDRegs());
105             }
106 #endif
107             break;
108
109         case GT_STORE_LCL_FLD:
110         case GT_STORE_LCL_VAR:
111             BuildStoreLoc(tree->AsLclVarCommon());
112             break;
113
114         case GT_FIELD_LIST:
115             // These should always be contained. We don't correctly allocate or
116             // generate code for a non-contained GT_FIELD_LIST.
117             noway_assert(!"Non-contained GT_FIELD_LIST");
118             break;
119
120         case GT_LIST:
121         case GT_ARGPLACE:
122         case GT_NO_OP:
123         case GT_START_NONGC:
124         case GT_PROF_HOOK:
125             info->srcCount = 0;
126             assert(info->dstCount == 0);
127             break;
128
129         case GT_CNS_DBL:
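            // A floating-point constant reads no sources and produces its value in a register
            // (the constant itself is typically loaded from the data section).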
130             info->srcCount = 0;
131             assert(info->dstCount == 1);
132             break;
133
134 #if !defined(_TARGET_64BIT_)
135
136         case GT_LONG:
137             assert(tree->IsUnusedValue()); // Contained nodes are already processed, only unused GT_LONG can reach here.
138             // An unused GT_LONG node needs to consume its sources, but need not produce a register.
139             tree->gtType = TYP_VOID;
140             tree->ClearUnusedValue();
141             info->isLocalDefUse = false;
142             info->srcCount      = 2;
143             info->dstCount      = 0;
144             appendLocationInfoToList(tree->gtGetOp1());
145             appendLocationInfoToList(tree->gtGetOp2());
146             break;
147
148 #endif // !defined(_TARGET_64BIT_)
149
150         case GT_BOX:
151         case GT_COMMA:
152         case GT_QMARK:
153         case GT_COLON:
154             info->srcCount = 0;
155             assert(info->dstCount == 0);
156             unreached();
157             break;
158
159         case GT_RETURN:
160             BuildReturn(tree);
161             break;
162
163         case GT_RETFILT:
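            // A RETFILT returns the filter's TYP_INT result in the standard integer return
            // register; a TYP_VOID RETFILT consumes no sources.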
164             assert(info->dstCount == 0);
165             if (tree->TypeGet() == TYP_VOID)
166             {
167                 info->srcCount = 0;
168             }
169             else
170             {
171                 assert(tree->TypeGet() == TYP_INT);
172
173                 info->srcCount = 1;
174
175                 info->setSrcCandidates(this, RBM_INTRET);
176                 LocationInfoListNode* locationInfo = getLocationInfo(tree->gtOp.gtOp1);
177                 locationInfo->info.setSrcCandidates(this, RBM_INTRET);
178                 useList.Append(locationInfo);
179             }
180             break;
181
182         // A GT_NOP is a passthrough if it is void or if it has a child,
183         // but it must be considered to produce a dummy value if it has a
184         // type but no child.
185         case GT_NOP:
186             info->srcCount = 0;
187             assert((tree->gtOp.gtOp1 == nullptr) || tree->isContained());
188             if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
189             {
190                 assert(info->dstCount == 1);
191             }
192             else
193             {
194                 assert(info->dstCount == 0);
195             }
196             break;
197
198         case GT_JTRUE:
199         {
200             info->srcCount = 0;
201             assert(info->dstCount == 0);
202             GenTree* cmp = tree->gtGetOp1();
203             assert(!cmp->IsValue());
204         }
205         break;
206
207         case GT_JCC:
208             info->srcCount = 0;
209             assert(info->dstCount == 0);
210             break;
211
212         case GT_SETCC:
213             info->srcCount = 0;
214             assert(info->dstCount == 1);
215 #ifdef _TARGET_X86_
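            // On x86, setcc can only write to a byte-addressable register (EAX/EBX/ECX/EDX).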
216             info->setDstCandidates(this, RBM_BYTE_REGS);
217 #endif // _TARGET_X86_
218             break;
219
220         case GT_JMP:
221             info->srcCount = 0;
222             assert(info->dstCount == 0);
223             break;
224
225         case GT_SWITCH:
226             // This should never occur since switch nodes must not be visible at this
227             // point in the JIT.
228             info->srcCount = 0;
229             noway_assert(!"Switch must be lowered at this point");
230             break;
231
232         case GT_JMPTABLE:
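            // GT_JMPTABLE materializes the address of the switch jump table into a register.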
233             info->srcCount = 0;
234             assert(info->dstCount == 1);
235             break;
236
237         case GT_SWITCH_TABLE:
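            // A table-based switch uses an internal register as a temporary for the jump
            // table base address while computing the branch target.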
238             info->internalIntCount = 1;
239             assert(info->dstCount == 0);
240             info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
241             assert(info->srcCount == 2);
242             break;
243
244         case GT_ASG:
245             noway_assert(!"We should never hit any assignment operator in lowering");
246             info->srcCount = 0;
247             break;
248
249 #if !defined(_TARGET_64BIT_)
250         case GT_ADD_LO:
251         case GT_ADD_HI:
252         case GT_SUB_LO:
253         case GT_SUB_HI:
254 #endif
255         case GT_ADD:
256         case GT_SUB:
257         case GT_AND:
258         case GT_OR:
259         case GT_XOR:
260         case GT_BT:
261             info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
262             break;
263
264         case GT_RETURNTRAP:
265             // This just turns into a compare of its child with an int + a conditional call.
266             info->srcCount = GetOperandInfo(tree->gtOp.gtOp1);
267             assert(info->dstCount == 0);
268             info->internalIntCount = 1;
269             info->setInternalCandidates(this, allRegs(TYP_INT));
270             break;
271
272         case GT_MOD:
273         case GT_DIV:
274         case GT_UMOD:
275         case GT_UDIV:
276             BuildModDiv(tree->AsOp());
277             break;
278
279         case GT_MUL:
280         case GT_MULHI:
281 #if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND)
282         case GT_MUL_LONG:
283 #endif
284             BuildMul(tree->AsOp());
285             break;
286
287         case GT_INTRINSIC:
288             BuildIntrinsic(tree->AsOp());
289             break;
290
291 #ifdef FEATURE_SIMD
292         case GT_SIMD:
293             BuildSIMD(tree->AsSIMD());
294             break;
295 #endif // FEATURE_SIMD
296
297 #ifdef FEATURE_HW_INTRINSICS
298         case GT_HWIntrinsic:
299             BuildHWIntrinsic(tree->AsHWIntrinsic());
300             break;
301 #endif // FEATURE_HW_INTRINSICS
302
303         case GT_CAST:
304             BuildCast(tree);
305             break;
306
307         case GT_BITCAST:
308         {
309             LocationInfoListNode* locationInfo = getLocationInfo(tree->gtOp.gtOp1);
310             locationInfo->info.isTgtPref       = true;
311             useList.Append(locationInfo);
312             info->srcCount = 1;
313             info->dstCount = 1;
314         }
315         break;
316
317         case GT_NEG:
318             info->srcCount = GetOperandInfo(tree->gtOp.gtOp1);
319
320             // TODO-XArch-CQ:
321             // SSE instruction set doesn't have an instruction to negate a number.
322             // The recommended way is to xor the float/double number with a bitmask.
323             // The only way to xor is using xorps or xorpd both of which operate on
324             // 128-bit operands.  To hold the bit-mask we would need another xmm
325             // register or a 16-byte aligned 128-bit data constant. Right now emitter
326             // lacks the support for emitting such constants or instruction with mem
327             // addressing mode referring to a 128-bit operand. For now we use an
328             // internal xmm register to load 32/64-bit bitmask from data section.
329             // Note that by trading additional data section memory (128-bit) we can
330             // save on the need for an internal register and also a memory-to-reg
331             // move.
332             //
333             // Note: another option to avoid internal register requirement is by
334             // lowering as GT_SUB(0, src).  This will generate code different from
335             // Jit64 and could possibly result in compat issues (?).
336             if (varTypeIsFloating(tree))
337             {
338                 info->internalFloatCount = 1;
339                 info->setInternalCandidates(this, internalFloatRegCandidates());
340             }
341             break;
342
343         case GT_NOT:
344             info->srcCount = GetOperandInfo(tree->gtOp.gtOp1);
345             break;
346
347         case GT_LSH:
348         case GT_RSH:
349         case GT_RSZ:
350         case GT_ROL:
351         case GT_ROR:
352 #ifdef _TARGET_X86_
353         case GT_LSH_HI:
354         case GT_RSH_LO:
355 #endif
356             (void)BuildShiftRotate(tree);
357             break;
358
359         case GT_EQ:
360         case GT_NE:
361         case GT_LT:
362         case GT_LE:
363         case GT_GE:
364         case GT_GT:
365         case GT_TEST_EQ:
366         case GT_TEST_NE:
367         case GT_CMP:
368             BuildCmp(tree);
369             break;
370
371         case GT_CKFINITE:
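            // CKFINITE needs an internal integer register to hold the upper bits of the
            // floating-point value while its exponent is checked for NaN/infinity.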
372             appendLocationInfoToList(tree->gtOp.gtOp1);
373             info->srcCount = 1;
374             assert(info->dstCount == 1);
375             info->internalIntCount = 1;
376             break;
377
378         case GT_CMPXCHG:
379         {
380             info->srcCount = 3;
381             assert(info->dstCount == 1);
382
383             // comparand is preferenced to RAX.
384             // Remaining two operands can be in any reg other than RAX.
385             LocationInfoListNode* locationInfo = getLocationInfo(tree->gtCmpXchg.gtOpLocation);
386             locationInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~RBM_RAX);
387             useList.Append(locationInfo);
388             LocationInfoListNode* valueInfo = getLocationInfo(tree->gtCmpXchg.gtOpValue);
389             valueInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~RBM_RAX);
390             useList.Append(valueInfo);
391             info->setDstCandidates(this, RBM_RAX);
392             LocationInfoListNode* comparandInfo = getLocationInfo(tree->gtCmpXchg.gtOpComparand);
393             comparandInfo->info.setSrcCandidates(this, RBM_RAX);
394             useList.Append(comparandInfo);
395         }
396         break;
397
398         case GT_LOCKADD:
399             info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
400             assert(info->dstCount == ((tree->TypeGet() == TYP_VOID) ? 0 : 1));
401             break;
402
403         case GT_PUTARG_REG:
404             BuildPutArgReg(tree->AsUnOp());
405             break;
406
407         case GT_CALL:
408             BuildCall(tree->AsCall());
409             break;
410
411         case GT_ADDR:
412         {
413             // For a GT_ADDR, the child node should not be evaluated into a register
414             GenTree* child = tree->gtOp.gtOp1;
415             assert(!isCandidateLocalRef(child));
416             assert(child->isContained());
417             assert(info->dstCount == 1);
418             info->srcCount = 0;
419         }
420         break;
421
422 #if !defined(FEATURE_PUT_STRUCT_ARG_STK)
423         case GT_OBJ:
424 #endif
425         case GT_BLK:
426         case GT_DYN_BLK:
427             // These should all be eliminated prior to Lowering.
428             assert(!"Non-store block node in Lowering");
429             info->srcCount = 0;
430             break;
431
432 #ifdef FEATURE_PUT_STRUCT_ARG_STK
433         case GT_PUTARG_STK:
434             BuildPutArgStk(tree->AsPutArgStk());
435             break;
436 #endif // FEATURE_PUT_STRUCT_ARG_STK
437
438         case GT_STORE_BLK:
439         case GT_STORE_OBJ:
440         case GT_STORE_DYN_BLK:
441             BuildBlockStore(tree->AsBlk());
442             break;
443
444         case GT_INIT_VAL:
445             // Always a passthrough of its child's value.
446             assert(!"INIT_VAL should always be contained");
447             break;
448
449         case GT_LCLHEAP:
450             BuildLclHeap(tree);
451             break;
452
453         case GT_ARR_BOUNDS_CHECK:
454 #ifdef FEATURE_SIMD
455         case GT_SIMD_CHK:
456 #endif // FEATURE_SIMD
457 #ifdef FEATURE_HW_INTRINSICS
458         case GT_HW_INTRINSIC_CHK:
459 #endif // FEATURE_HW_INTRINSICS
460             // Consumes arrLen & index - has no result
461             info->srcCount = 2;
462             assert(info->dstCount == 0);
463             info->srcCount = GetOperandInfo(tree->AsBoundsChk()->gtIndex);
464             info->srcCount += GetOperandInfo(tree->AsBoundsChk()->gtArrLen);
465             break;
466
467         case GT_ARR_ELEM:
468             // These must have been lowered to GT_ARR_INDEX
469             noway_assert(!"We should never see a GT_ARR_ELEM after Lowering.");
470             info->srcCount = 0;
471             break;
472
473         case GT_ARR_INDEX:
474         {
475             info->srcCount = 2;
476             assert(info->dstCount == 1);
477             assert(!tree->AsArrIndex()->ArrObj()->isContained());
478             assert(!tree->AsArrIndex()->IndexExpr()->isContained());
479             // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
480             // times while the result is being computed.
481             LocationInfoListNode* arrObjInfo = getLocationInfo(tree->AsArrIndex()->ArrObj());
482             arrObjInfo->info.isDelayFree     = true;
483             useList.Append(arrObjInfo);
484             useList.Append(getLocationInfo(tree->AsArrIndex()->IndexExpr()));
485             info->hasDelayFreeSrc = true;
486         }
487         break;
488
489         case GT_ARR_OFFSET:
490             // This consumes the offset, if any, the arrObj and the effective index,
491             // and produces the flattened offset for this dimension.
492             assert(info->dstCount == 1);
493             if (tree->gtArrOffs.gtOffset->isContained())
494             {
495                 info->srcCount = 2;
496             }
497             else
498             {
499                 // Here we simply need an internal register, which must be different
500                 // from any of the operand's registers, but may be the same as targetReg.
501                 info->srcCount         = 3;
502                 info->internalIntCount = 1;
503                 appendLocationInfoToList(tree->AsArrOffs()->gtOffset);
504             }
505             appendLocationInfoToList(tree->AsArrOffs()->gtIndex);
506             appendLocationInfoToList(tree->AsArrOffs()->gtArrObj);
507             break;
508
509         case GT_LEA:
510             // The LEA usually passes its operands through to the GT_IND, in which case it will
511             // be contained, but we may be instantiating an address, in which case we set them here.
512             info->srcCount = 0;
513             assert(info->dstCount == 1);
514             if (tree->AsAddrMode()->HasBase())
515             {
516                 info->srcCount++;
517                 appendLocationInfoToList(tree->AsAddrMode()->Base());
518             }
519             if (tree->AsAddrMode()->HasIndex())
520             {
521                 info->srcCount++;
522                 appendLocationInfoToList(tree->AsAddrMode()->Index());
523             }
524             break;
525
526         case GT_STOREIND:
527             if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
528             {
529                 BuildGCWriteBarrier(tree);
530                 break;
531             }
532             BuildIndir(tree->AsIndir());
533             break;
534
535         case GT_NULLCHECK:
536             assert(info->dstCount == 0);
537             appendLocationInfoToList(tree->gtOp.gtOp1);
538             info->srcCount = 1;
539             break;
540
541         case GT_IND:
542             BuildIndir(tree->AsIndir());
543             assert(info->dstCount == 1);
544             break;
545
546         case GT_CATCH_ARG:
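            // The exception object is delivered to the catch handler in a fixed register.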
547             info->srcCount = 0;
548             assert(info->dstCount == 1);
549             info->setDstCandidates(this, RBM_EXCEPTION_OBJECT);
550             break;
551
552 #if !FEATURE_EH_FUNCLETS
553         case GT_END_LFIN:
554             info->srcCount = 0;
555             assert(info->dstCount == 0);
556             break;
557 #endif
558
559         case GT_CLS_VAR:
560             // These nodes are eliminated by rationalizer.
561             JITDUMP("Unexpected node %s in Lower.\n", GenTree::OpName(tree->OperGet()));
562             unreached();
563             break;
564
565         case GT_INDEX_ADDR:
566             assert(info->dstCount == 1);
567             info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
568
569             if (tree->AsIndexAddr()->Index()->TypeGet() == TYP_I_IMPL)
570             {
571                 info->internalIntCount = 1;
572             }
573             else
574             {
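                // Element sizes of 1, 2, 4 or 8 can be folded into the scale of an addressing
                // mode; any other element size needs an internal register for the multiply.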
575                 switch (tree->AsIndexAddr()->gtElemSize)
576                 {
577                     case 1:
578                     case 2:
579                     case 4:
580                     case 8:
581                         break;
582
583                     default:
584                         info->internalIntCount = 1;
585                         break;
586                 }
587             }
588             break;
589     } // end switch (tree->OperGet())
590
591     // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
592     // Even then we would like to set isTgtPref on Op1.
593     if (tree->OperIsBinary() && info->srcCount >= 1)
594     {
595         if (isRMWRegOper(tree))
596         {
597             GenTree* op1 = tree->gtOp.gtOp1;
598             GenTree* op2 = tree->gtOp.gtOp2;
599
600             // Commutative opers like add/mul/and/or/xor could reverse the order of
601             // operands if it is safe to do so.  In such a case we would like op2 to be
602             // target preferenced instead of op1.
603             if (tree->OperIsCommutative() && op1->isContained() && op2 != nullptr)
604             {
605                 op1 = op2;
606                 op2 = tree->gtOp.gtOp1;
607             }
608
609             // If we have a read-modify-write operation, we want to preference op1 to the target,
610             // if it is not contained.
611             if (!op1->isContained() && !op1->OperIs(GT_LIST))
612             {
613                 useList.GetTreeNodeInfo(op1).isTgtPref = true;
614             }
615
616             // Is this a non-commutative operator, or is op2 a contained memory op?
617             // In either case, we need to make op2 remain live until the op is complete, by marking
618             // the source(s) associated with op2 as "delayFree" if this node defines a register.
619             // Note that if op2 of a binary RMW operator is a memory op, even if the operator
620             // is commutative, codegen cannot reverse them.
621             // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
622             // more work to be done to correctly reverse the operands if they involve memory
623             // operands.  Also, we may need to handle more cases than GT_IND, especially once
624             // we've modified the register allocator to not require all nodes to be assigned
625             // a register (e.g. a spilled lclVar can often be referenced directly from memory).
626             // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
627
628             GenTree* delayUseSrc = nullptr;
629             // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
630             // to special case them.
631             if (tree->OperGet() == GT_XADD || tree->OperGet() == GT_XCHG || tree->OperGet() == GT_LOCKADD)
632             {
633                 // These tree nodes will have their op1 marked as isDelayFree=true.
634                 // Hence these tree nodes should have a Def position so that op1's reg
635                 // gets freed at DefLoc+1.
636                 if (tree->TypeGet() == TYP_VOID)
637                 {
638                     // Right now a GT_XADD node could be morphed into a
639                     // GT_LOCKADD of TYP_VOID. See gtExtractSideEffList().
640                     // Note that it is advantageous to use GT_LOCKADD
641                     // instead of GT_XADD as the former uses lock.add,
642                     // which allows its second operand to be a contained
643                     // immediate whereas the xadd instruction requires its
644                     // second operand to be in a register.
645                     assert(info->dstCount == 0);
646
647                     // Give it an artificial type and mark it as an unused value.
648                     // This results in a Def position created but not considered consumed by its parent node.
649                     tree->gtType        = TYP_INT;
650                     info->dstCount      = 1;
651                     info->isLocalDefUse = true;
652                     tree->SetUnusedValue();
653                 }
654                 else
655                 {
656                     assert(info->dstCount != 0);
657                 }
658
659                 delayUseSrc = op1;
660             }
661             else if ((info->dstCount != 0) && (op2 != nullptr) &&
662                      (!tree->OperIsCommutative() || (op2->isContained() && !op2->IsCnsIntOrI())))
663             {
664                 delayUseSrc = op2;
665             }
666             if ((delayUseSrc != nullptr) && CheckAndSetDelayFree(delayUseSrc))
667             {
668                 info->hasDelayFreeSrc = true;
669             }
670         }
671     }
672
673     BuildCheckByteable(tree);
674
675     // We need to be sure that we've set info->srcCount and info->dstCount appropriately
676     assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));
677     assert(info->isLocalDefUse == (tree->IsValue() && tree->IsUnusedValue()));
678     assert(!tree->IsUnusedValue() || (info->dstCount != 0));
679     assert(info->dstCount == tree->GetRegisterDstCount());
680 }
681
682 //---------------------------------------------------------------------
683 // CheckAndSetDelayFree - Set isDelayFree on the given operand or its child(ren), if appropriate
684 //
685 // Arguments
686 //    delayUseSrc - a node that may have a delayed use
687 //
688 // Return Value:
689 //    True iff the node or one of its children has been marked isDelayFree
690 //
691 // Notes:
692 //    Only register operands should be marked isDelayFree, not contained immediates or memory.
693 //
694 bool LinearScan::CheckAndSetDelayFree(GenTree* delayUseSrc)
695 {
696     // If delayUseSrc is an indirection and it doesn't produce a result, then we need to set "delayFree"
697     // on the base & index, if any.
698     // Otherwise, we set it on delayUseSrc itself.
699     bool returnValue = false;
700     if (delayUseSrc->isContained())
701     {
702         // If delayUseSrc is a non-Indir contained node (e.g. a local) there's no register use to delay.
703         if (delayUseSrc->isIndir())
704         {
705             GenTree* base  = delayUseSrc->AsIndir()->Base();
706             GenTree* index = delayUseSrc->AsIndir()->Index();
707             if ((base != nullptr) && !base->isContained())
708             {
709                 useList.GetTreeNodeInfo(base).isDelayFree = true;
710                 returnValue                               = true;
711             }
712             if (index != nullptr)
713             {
714                 assert(!index->isContained());
715                 useList.GetTreeNodeInfo(index).isDelayFree = true;
716                 returnValue                                = true;
717             }
718         }
719     }
720     else
721     {
722         useList.GetTreeNodeInfo(delayUseSrc).isDelayFree = true;
723         returnValue                                      = true;
724     }
725     return returnValue;
726 }
727
728 //------------------------------------------------------------------------
729 // BuildCheckByteable: Check the tree to see if "byte-able" registers are
730 // required, and set the tree node info accordingly.
731 //
732 // Arguments:
733 //    tree      - The node of interest
734 //
735 // Return Value:
736 //    None.
737 //
738 void LinearScan::BuildCheckByteable(GenTree* tree)
739 {
740 #ifdef _TARGET_X86_
741     TreeNodeInfo* info = currentNodeInfo;
742     // Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands
743     // if the tree node is a byte type.
744     //
745     // Though this looks conservative in theory, in practice we could not think of a case where
746     // the logic below leads to a conservative register specification.  If we ever find such a
747     // case, this logic will need to be fine-tuned for it.
748
749     if (ExcludeNonByteableRegisters(tree))
750     {
751         regMaskTP regMask;
752         if (info->dstCount > 0)
753         {
754             regMask = info->getDstCandidates(this);
755             assert(regMask != RBM_NONE);
756             info->setDstCandidates(this, regMask & ~RBM_NON_BYTE_REGS);
757         }
758
759         if (tree->OperIsSimple())
760         {
761             GenTree* op = tree->gtOp.gtOp1;
762             // We need byte registers on the operands of most simple operators that produce a byte result.
763             // However, indirections are simple operators but do not require their address in a byte register.
764             if ((op != nullptr) && !tree->OperIsIndir())
765             {
766                 // No need to set src candidates on a contained child operand.
767                 if (!op->isContained())
768                 {
769                     TreeNodeInfo& op1Info = useList.GetTreeNodeInfo(op);
770                     regMask               = op1Info.getSrcCandidates(this);
771                     assert(regMask != RBM_NONE);
772                     op1Info.setSrcCandidates(this, regMask & ~RBM_NON_BYTE_REGS);
773                 }
774             }
775
776             if (tree->OperIsBinary() && (tree->gtOp.gtOp2 != nullptr))
777             {
778                 op = tree->gtOp.gtOp2;
779                 if (!op->isContained())
780                 {
781                     TreeNodeInfo& op2Info = useList.GetTreeNodeInfo(op);
782                     regMask               = op2Info.getSrcCandidates(this);
783                     assert(regMask != RBM_NONE);
784                     op2Info.setSrcCandidates(this, regMask & ~RBM_NON_BYTE_REGS);
785                 }
786             }
787         }
788     }
789 #endif //_TARGET_X86_
790 }
791
792 //------------------------------------------------------------------------------
793 // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format
794 //
795 // Arguments:
796 //    tree      - a binary tree node
797 //
798 // Return Value:
799 //    Returns true if we can use the read-modify-write instruction form
800 //
801 // Notes:
802 //    This is used to determine whether to preference the source to the destination register.
803 //
804 bool LinearScan::isRMWRegOper(GenTree* tree)
805 {
806     // TODO-XArch-CQ: Make this more accurate.
807     // For now, we assume that most binary operators are of the RMW form.
808     assert(tree->OperIsBinary());
809
810     if (tree->OperIsCompare() || tree->OperIs(GT_CMP))
811     {
812         return false;
813     }
814
815     switch (tree->OperGet())
816     {
817         // These opers either support a three-operand form (e.g. GT_LEA), or do not read/write their first operand
818         case GT_LEA:
819         case GT_STOREIND:
820         case GT_ARR_INDEX:
821         case GT_STORE_BLK:
822         case GT_STORE_OBJ:
823             return false;
824
825         // x86/x64 does support a three-operand multiply when either op1 or op2 is a contained immediate
826         case GT_MUL:
827             return (!tree->gtOp.gtOp2->isContainedIntOrIImmed() && !tree->gtOp.gtOp1->isContainedIntOrIImmed());
828
829 #ifdef FEATURE_HW_INTRINSICS
830         case GT_HWIntrinsic:
831             return tree->isRMWHWIntrinsic(compiler);
832 #endif // FEATURE_HW_INTRINSICS
833
834         default:
835             return true;
836     }
837 }
838
839 //------------------------------------------------------------------------
840 // BuildShiftRotate: Set the NodeInfo for a shift or rotate.
841 //
842 // Arguments:
843 //    tree      - The node of interest
844 //
845 // Return Value:
846 //    None.
847 //
848 int LinearScan::BuildShiftRotate(GenTree* tree)
849 {
850     TreeNodeInfo* info = currentNodeInfo;
851     // For shift operations, the shift count must be
852     // placed in CL when the number of bits to shift
853     // is not a constant.
854     int                   srcCount    = 0;
855     GenTree*              shiftBy     = tree->gtOp.gtOp2;
856     GenTree*              source      = tree->gtOp.gtOp1;
857     LocationInfoListNode* shiftByInfo = nullptr;
858     // x64 can encode 8 bits of shift count, but only the low 5 (or 6 for 64-bit operands) are used;
859     // the rest are masked off. We will allow whatever can be encoded - hope you know what you are doing.
860     if (shiftBy->isContained())
861     {
862         srcCount += GetOperandInfo(source);
863     }
864     else
865     {
866         srcCount++;
867         shiftByInfo = getLocationInfo(shiftBy);
868         shiftByInfo->info.setSrcCandidates(this, RBM_RCX);
869         info->setDstCandidates(this, allRegs(TYP_INT) & ~RBM_RCX);
870         LocationInfoListNode* sourceInfo;
871         srcCount += GetOperandInfo(source, &sourceInfo);
872         for (; sourceInfo != nullptr; sourceInfo = sourceInfo->Next())
873         {
874             sourceInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~RBM_RCX);
875         }
876     }
877
878     // Note that Rotate Left/Right instructions don't set ZF and SF flags.
879     //
880     // If the operand being shifted is 32 bits, the upper three bits of the shift count are masked
881     // off by hardware to get the actual shift count.  Similarly, for 64-bit operands the
882     // shift count is narrowed to [0..63].  If the resulting shift count is zero,
883     // then shift operation won't modify flags.
884     //
885     // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
886     // if the shift count is known to be non-zero and in the range depending on the
887     // operand size.
888     CLANG_FORMAT_COMMENT_ANCHOR;
889
890 #ifdef _TARGET_X86_
891     // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
892     // we can have a three operand form. Increment the srcCount.
893     if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
894     {
895         assert((source->OperGet() == GT_LONG) && source->isContained());
896
897         GenTree*              sourceLo     = source->gtOp.gtOp1;
898         LocationInfoListNode* sourceLoInfo = useList.Begin();
899         LocationInfoListNode* sourceHiInfo = useList.GetSecond(INDEBUG(source->gtGetOp2()));
900
901         info->hasDelayFreeSrc = true;
902         if (tree->OperGet() == GT_LSH_HI)
903         {
904             sourceLoInfo->info.isDelayFree = true;
905         }
906         else
907         {
908             sourceHiInfo->info.isDelayFree = true;
909         }
910     }
911 #endif
912     if (shiftByInfo != nullptr)
913     {
914         if (tree->IsReverseOp())
915         {
916             useList.Prepend(shiftByInfo);
917         }
918         else
919         {
920             useList.Append(shiftByInfo);
921         }
922     }
923     if (!tree->isContained())
924     {
925         info->srcCount = srcCount;
926     }
927     return srcCount;
928 }
929
930 //------------------------------------------------------------------------
931 // BuildCall: Set the NodeInfo for a call.
932 //
933 // Arguments:
934 //    call      - The call node of interest
935 //
936 // Return Value:
937 //    None.
938 //
939 void LinearScan::BuildCall(GenTreeCall* call)
940 {
941     TreeNodeInfo*   info              = currentNodeInfo;
942     bool            hasMultiRegRetVal = false;
943     ReturnTypeDesc* retTypeDesc       = nullptr;
944
945     assert(!call->isContained());
946     info->srcCount = 0;
947     if (call->TypeGet() != TYP_VOID)
948     {
949         hasMultiRegRetVal = call->HasMultiRegRetVal();
950         if (hasMultiRegRetVal)
951         {
952             // dst count = number of registers in which the value is returned by call
953             retTypeDesc    = call->GetReturnTypeDesc();
954             info->dstCount = retTypeDesc->GetReturnRegCount();
955         }
956         else
957         {
958             assert(info->dstCount == 1);
959         }
960     }
961     else
962     {
963         assert(info->dstCount == 0);
964     }
965
966     GenTree*              ctrlExpr     = call->gtControlExpr;
967     LocationInfoListNode* ctrlExprInfo = nullptr;
968     if (call->gtCallType == CT_INDIRECT)
969     {
970         ctrlExpr = call->gtCallAddr;
971     }
972
973     // If this is a varargs call, we will clear the internal candidates in case we need
974     // to reserve some integer registers for copying float args.
975     // We have to do this because otherwise the default candidates are allRegs, and adding
976     // the individual specific registers will have no effect.
977     if (call->IsVarargs())
978     {
979         info->setInternalCandidates(this, RBM_NONE);
980     }
981
982     RegisterType registerType = call->TypeGet();
983
984     // Set destination candidates for return value of the call.
985     CLANG_FORMAT_COMMENT_ANCHOR;
986
987 #ifdef _TARGET_X86_
988     if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
989     {
990         // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
991         // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
992         // correct argument registers.
993         info->setDstCandidates(this, RBM_PINVOKE_TCB);
994     }
995     else
996 #endif // _TARGET_X86_
997         if (hasMultiRegRetVal)
998     {
999         assert(retTypeDesc != nullptr);
1000         info->setDstCandidates(this, retTypeDesc->GetABIReturnRegs());
1001     }
1002     else if (varTypeIsFloating(registerType))
1003     {
1004 #ifdef _TARGET_X86_
1005         // The return value will be on the X87 stack, and we will need to move it.
1006         info->setDstCandidates(this, allRegs(registerType));
1007 #else  // !_TARGET_X86_
1008         info->setDstCandidates(this, RBM_FLOATRET);
1009 #endif // !_TARGET_X86_
1010     }
1011     else if (registerType == TYP_LONG)
1012     {
1013         info->setDstCandidates(this, RBM_LNGRET);
1014     }
1015     else
1016     {
1017         info->setDstCandidates(this, RBM_INTRET);
1018     }
1019
1020     // number of args to a call =
1021     // callRegArgs + (callargs - placeholders, setup, etc)
1022     // there is an explicit thisPtr but it is redundant
1023
1024     bool callHasFloatRegArgs = false;
1025     bool isVarArgs           = call->IsVarargs();
1026
1027     // First, count reg args
1028     for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
1029     {
1030         assert(list->OperIsList());
1031
1032         // By this point, lowering has ensured that all call arguments are one of the following:
1033         // - an arg setup store
1034         // - an arg placeholder
1035         // - a nop
1036         // - a copy blk
1037         // - a field list
1038         // - a put arg
1039         //
1040         // Note that this property is statically checked by LinearScan::CheckBlock.
1041         GenTree* argNode = list->Current();
1042
1043         // Each register argument corresponds to one source.
1044         if (argNode->OperIsPutArgReg())
1045         {
1046             info->srcCount++;
1047             HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs);
1048             appendLocationInfoToList(argNode);
1049         }
1050 #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
1051         else if (argNode->OperGet() == GT_FIELD_LIST)
1052         {
1053             for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1054             {
1055                 assert(entry->Current()->OperIsPutArgReg());
1056                 info->srcCount++;
1057                 HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs);
1058                 appendLocationInfoToList(entry->Current());
1059             }
1060         }
1061 #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
1062
1063 #ifdef DEBUG
1064         // In DEBUG only, check validity with respect to the arg table entry.
1065
1066         fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
1067         assert(curArgTabEntry);
1068
1069         if (curArgTabEntry->regNum == REG_STK)
1070         {
1071             // late arg that is not passed in a register
1072             assert(argNode->gtOper == GT_PUTARG_STK);
1073
1074 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1075             // If the node is TYP_STRUCT and it is put on stack with
1076             // putarg_stk operation, we consume and produce no registers.
1077             // In this case the embedded Obj node should not produce
1078             // registers either, since it is contained.
1079             // Note that if it is a SIMD type the argument will be in a register.
1080             if (argNode->TypeGet() == TYP_STRUCT)
1081             {
1082                 assert(argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_OBJ);
1083                 assert(argNode->gtOp.gtOp1->isContained());
1084             }
1085 #endif // FEATURE_PUT_STRUCT_ARG_STK
1086             continue;
1087         }
1088 #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
1089         if (argNode->OperGet() == GT_FIELD_LIST)
1090         {
1091             assert(argNode->isContained());
1092             assert(varTypeIsStruct(argNode) || curArgTabEntry->isStruct);
1093
1094             int i = 0;
1095             for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1096             {
1097                 const regNumber argReg = (i == 0) ? curArgTabEntry->regNum : curArgTabEntry->otherRegNum;
1098                 assert(entry->Current()->gtRegNum == argReg);
1099                 assert(i < 2);
1100                 i++;
1101             }
1102         }
1103         else
1104 #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
1105         {
1106             const regNumber argReg = curArgTabEntry->regNum;
1107             assert(argNode->gtRegNum == argReg);
1108         }
1109 #endif // DEBUG
1110     }
1111
1112     // Now, count stack args
1113     // Note that these need to be computed into a register, but then
1114     // they're just stored to the stack - so the reg doesn't
1115     // need to remain live until the call.  In fact, it must not
1116     // because the code generator doesn't actually consider it live,
1117     // so it can't be spilled.
1118
1119     GenTree* args = call->gtCallArgs;
1120     while (args)
1121     {
1122         GenTree* arg = args->gtOp.gtOp1;
1123         if (!(arg->gtFlags & GTF_LATE_ARG) && !arg)
1124         {
1125             if (arg->IsValue() && !arg->isContained())
1126             {
1127                 // argInfo->isLocalDefUse = true;
1128                 assert(arg->IsUnusedValue());
1129             }
1130             // assert(argInfo->dstCount == 0);
1131         }
1132         args = args->gtOp.gtOp2;
1133     }
1134
1135     // set reg requirements on call target represented as control sequence.
1136     if (ctrlExpr != nullptr)
1137     {
1138         LocationInfoListNode* ctrlExprInfo  = nullptr;
1139         int                   ctrlExprCount = GetOperandInfo(ctrlExpr);
1140         if (ctrlExprCount != 0)
1141         {
1142             assert(ctrlExprCount == 1);
1143             ctrlExprInfo = useList.Last();
1144             info->srcCount++;
1145         }
1146
1147         // In case of a fast tail call implemented as jmp, make sure that gtControlExpr is
1148         // computed into a register.
1149         if (call->IsFastTailCall())
1150         {
1151             assert(!ctrlExpr->isContained() && ctrlExprInfo != nullptr);
1152             // Fast tail call - make sure that call target is always computed in RAX
1153             // so that epilog sequence can generate "jmp rax" to achieve fast tail call.
1154             ctrlExprInfo->info.setSrcCandidates(this, RBM_RAX);
1155         }
1156 #ifdef _TARGET_X86_
1157         else if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
1158         {
1159             // On x86, we need to generate a very specific pattern for indirect VSD calls:
1160             //
1161             //    3-byte nop
1162             //    call dword ptr [eax]
1163             //
1164             // Where EAX is also used as an argument to the stub dispatch helper. Make
1165             // sure that the call target address is computed into EAX in this case.
1166             assert(ctrlExprInfo != nullptr);
1167             assert(ctrlExpr->isIndir() && ctrlExpr->isContained());
1168             ctrlExprInfo->info.setSrcCandidates(this, RBM_VIRTUAL_STUB_TARGET);
1169         }
1170 #endif // _TARGET_X86_
1171
1172 #if FEATURE_VARARG
1173         // If it is a fast tail call, it is already preferenced to use RAX.
1174         // Therefore, there is no need to set src candidates on the call target again.
1175         if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExprInfo != nullptr))
1176         {
1177             // Don't assign the call target to any of the argument registers because
1178             // we will use them to also pass floating point arguments as required
1179             // by Amd64 ABI.
1180             ctrlExprInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~(RBM_ARG_REGS));
1181         }
1182 #endif // FEATURE_VARARG
1183     }
1184 }
1185
1186 //------------------------------------------------------------------------
1187 // BuildBlockStore: Set the NodeInfo for a block store.
1188 //
1189 // Arguments:
1190 //    blkNode       - The block store node of interest
1191 //
1192 // Return Value:
1193 //    None.
1194 //
1195 void LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
1196 {
1197     TreeNodeInfo* info    = currentNodeInfo;
1198     GenTree*      dstAddr = blkNode->Addr();
1199     unsigned      size    = blkNode->gtBlkSize;
1200     GenTree*      source  = blkNode->Data();
1201
1202     LocationInfoListNode* dstAddrInfo = nullptr;
1203     LocationInfoListNode* sourceInfo  = nullptr;
1204     LocationInfoListNode* sizeInfo    = nullptr;
1205
1206     // The sources are the destination address and either the init value or the source address.
1207     // We may require an additional source or temp register for the size.
1208     if (!dstAddr->isContained())
1209     {
1210         info->srcCount++;
1211         dstAddrInfo = getLocationInfo(dstAddr);
1212     }
1213     assert(info->dstCount == 0);
1214     info->setInternalCandidates(this, RBM_NONE);
1215     GenTree* srcAddrOrFill = nullptr;
1216     bool     isInitBlk     = blkNode->OperIsInitBlkOp();
1217
1218     regMaskTP dstAddrRegMask = RBM_NONE;
1219     regMaskTP sourceRegMask  = RBM_NONE;
1220     regMaskTP blkSizeRegMask = RBM_NONE;
1221
1222     if (isInitBlk)
1223     {
1224         GenTree* initVal = source;
1225         if (initVal->OperIsInitVal())
1226         {
1227             assert(initVal->isContained());
1228             initVal = initVal->gtGetOp1();
1229         }
1230         srcAddrOrFill = initVal;
1231         if (!initVal->isContained())
1232         {
1233             info->srcCount++;
1234             sourceInfo = getLocationInfo(initVal);
1235         }
1236
1237         switch (blkNode->gtBlkOpKind)
1238         {
1239             case GenTreeBlk::BlkOpKindUnroll:
1240                 assert(initVal->IsCnsIntOrI());
1241                 if (size >= XMM_REGSIZE_BYTES)
1242                 {
1243                     // Reserve an XMM register to fill it with a pack of 16 init value constants.
1244                     info->internalFloatCount = 1;
1245                     info->setInternalCandidates(this, internalFloatRegCandidates());
1246                     // use XMM register to fill with constants, it's AVX instruction and set the flag
1247                     SetContainsAVXFlags();
1248                 }
1249 #ifdef _TARGET_X86_
1250                 if ((size & 1) != 0)
1251                 {
1252                     // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
1253                     // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
1254                     // when unrolling, so only allow byteable registers as the source value. (We could
1255                     // consider just using BlkOpKindRepInstr instead.)
1256                     sourceRegMask = RBM_BYTE_REGS;
1257                 }
1258 #endif // _TARGET_X86_
1259                 break;
1260
1261             case GenTreeBlk::BlkOpKindRepInstr:
1262                 // rep stos has the following register requirements:
1263                 // a) The memory address to be in RDI.
1264                 // b) The fill value has to be in RAX.
1265                 // c) The buffer size will go in RCX.
1266                 dstAddrRegMask = RBM_RDI;
1267                 sourceRegMask  = RBM_RAX;
1268                 blkSizeRegMask = RBM_RCX;
1269                 break;
1270
1271             case GenTreeBlk::BlkOpKindHelper:
1272 #ifdef _TARGET_AMD64_
1273                 // The helper follows the regular AMD64 ABI.
1274                 dstAddrRegMask = RBM_ARG_0;
1275                 sourceRegMask  = RBM_ARG_1;
1276                 blkSizeRegMask = RBM_ARG_2;
1277 #else  // !_TARGET_AMD64_
1278                 dstAddrRegMask             = RBM_RDI;
1279                 sourceRegMask              = RBM_RAX;
1280                 blkSizeRegMask             = RBM_RCX;
1281 #endif // !_TARGET_AMD64_
1282                 break;
1283
1284             default:
1285                 unreached();
1286         }
1287     }
1288     else
1289     {
1290         // CopyObj or CopyBlk
1291         if (source->gtOper == GT_IND)
1292         {
1293             assert(source->isContained());
1294             srcAddrOrFill = source->gtGetOp1();
1295             if (!srcAddrOrFill->isContained())
1296             {
1297                 sourceInfo = getLocationInfo(srcAddrOrFill);
1298                 info->srcCount++;
1299             }
1300         }
1301         if (blkNode->OperGet() == GT_STORE_OBJ)
1302         {
1303             if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr)
1304             {
1305                 // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
1306                 blkSizeRegMask = RBM_RCX;
1307             }
1308             // The srcAddr must be in a register.  If it was under a GT_IND, we need to subsume all of its
1309             // sources.
1310             sourceRegMask  = RBM_RSI;
1311             dstAddrRegMask = RBM_RDI;
1312         }
1313         else
1314         {
1315             switch (blkNode->gtBlkOpKind)
1316             {
1317                 case GenTreeBlk::BlkOpKindUnroll:
1318                     // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1319                     //
1320                     // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1321                     // But on x86 only RBM_BYTE_REGS could be used as byte registers.  Therefore, exclude
1322                     // RBM_NON_BYTE_REGS from internal candidates.
1323                     if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
1324                     {
1325                         info->internalIntCount++;
1326                         regMaskTP regMask = allRegs(TYP_INT);
1327
1328 #ifdef _TARGET_X86_
1329                         if ((size & 1) != 0)
1330                         {
1331                             regMask &= ~RBM_NON_BYTE_REGS;
1332                         }
1333 #endif
1334                         info->setInternalCandidates(this, regMask);
1335                     }
1336
1337                     if (size >= XMM_REGSIZE_BYTES)
1338                     {
1339                         // If we have a buffer larger than XMM_REGSIZE_BYTES,
1340                         // reserve an XMM register to use it for a
1341                         // series of 16-byte loads and stores.
1342                         info->internalFloatCount = 1;
1343                         info->addInternalCandidates(this, internalFloatRegCandidates());
1344                         // Uses XMM reg for load and store and hence check to see whether AVX instructions
1345                         // are used for codegen, set ContainsAVX flag
1346                         SetContainsAVXFlags();
1347                     }
1348                     break;
1349
1350                 case GenTreeBlk::BlkOpKindRepInstr:
1351                     // rep movs has the following register requirements:
1352                     // a) The dest address has to be in RDI.
1353                     // b) The src address has to be in RSI.
1354                     // c) The buffer size will go in RCX.
1355                     dstAddrRegMask = RBM_RDI;
1356                     sourceRegMask  = RBM_RSI;
1357                     blkSizeRegMask = RBM_RCX;
1358                     break;
1359
1360                 case GenTreeBlk::BlkOpKindHelper:
1361 #ifdef _TARGET_AMD64_
1362                     // The helper follows the regular AMD64 ABI.
1363                     dstAddrRegMask = RBM_ARG_0;
1364                     sourceRegMask  = RBM_ARG_1;
1365                     blkSizeRegMask = RBM_ARG_2;
1366 #else  // !_TARGET_AMD64_
1367                     dstAddrRegMask         = RBM_RDI;
1368                     sourceRegMask          = RBM_RAX;
1369                     blkSizeRegMask         = RBM_RCX;
1370 #endif // !_TARGET_AMD64_
1371                     break;
1372
1373                 default:
1374                     unreached();
1375             }
1376         }
1377     }
1378
1379     if (dstAddrInfo != nullptr)
1380     {
1381         if (dstAddrRegMask != RBM_NONE)
1382         {
1383             dstAddrInfo->info.setSrcCandidates(this, dstAddrRegMask);
1384         }
1385         useList.Append(dstAddrInfo);
1386     }
1387     if (sourceRegMask != RBM_NONE)
1388     {
1389         if (sourceInfo != nullptr)
1390         {
1391             sourceInfo->info.setSrcCandidates(this, sourceRegMask);
1392         }
1393         else
1394         {
1395             // This is a local source; we'll use a temp register for its address.
1396             info->addInternalCandidates(this, sourceRegMask);
1397             info->internalIntCount++;
1398         }
1399     }
1400     if (sourceInfo != nullptr)
1401     {
1402         useList.Add(sourceInfo, blkNode->IsReverseOp());
1403     }
1404
1405     if (blkNode->OperIs(GT_STORE_DYN_BLK))
1406     {
1407         // The block size argument is a third argument to GT_STORE_DYN_BLK
1408         info->srcCount++;
1409
1410         GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
1411         sizeInfo           = getLocationInfo(blockSize);
1412         useList.Add(sizeInfo, blkNode->AsDynBlk()->gtEvalSizeFirst);
1413     }
1414
1415     if (blkSizeRegMask != RBM_NONE)
1416     {
1417         if (size != 0)
1418         {
1419             // Reserve a temp register for the block size argument.
1420             info->addInternalCandidates(this, blkSizeRegMask);
1421             info->internalIntCount++;
1422         }
1423         else
1424         {
1425             // The block size argument is the third argument to GT_STORE_DYN_BLK
1426             assert((blkNode->gtOper == GT_STORE_DYN_BLK) && (sizeInfo != nullptr));
1427             info->setSrcCount(3);
1428             sizeInfo->info.setSrcCandidates(this, blkSizeRegMask);
1429         }
1430     }
1431 }
1432
1433 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1434 //------------------------------------------------------------------------
1435 // BuildPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
1436 //
1437 // Arguments:
1438 //    putArgStk - The node of interest
1439 //
1440 // Return Value:
1441 //    None.
1442 //
1443 void LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
1444 {
1445     TreeNodeInfo* info = currentNodeInfo;
1446     info->srcCount     = 0;
1447     assert(info->dstCount == 0);
1448
1449     if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
1450     {
1451         putArgStk->gtOp1->SetContained();
1452
1453 #ifdef _TARGET_X86_
1454         unsigned fieldCount    = 0;
1455         bool     needsByteTemp = false;
1456         bool     needsSimdTemp = false;
1457         unsigned prevOffset    = putArgStk->getArgSize();
1458         for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
1459         {
1460             GenTree* const  fieldNode   = current->Current();
1461             const var_types fieldType   = fieldNode->TypeGet();
1462             const unsigned  fieldOffset = current->gtFieldOffset;
1463             assert(fieldType != TYP_LONG);
1464
1465 #if defined(FEATURE_SIMD)
1466             // Note that we need to check the GT_FIELD_LIST type, not 'fieldType'. This is because the
1467             // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where
1468             // we "round up" to 16.
1469             if (current->gtFieldType == TYP_SIMD12)
1470             {
1471                 needsSimdTemp = true;
1472             }
1473 #endif // defined(FEATURE_SIMD)
1474
1475             // We can treat as a slot any field that is stored at a slot boundary, where the previous
1476             // field is not in the same slot. (Note that we store the fields in reverse order.)
1477             const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
1478             if (!fieldIsSlot)
1479             {
1480                 if (varTypeIsByte(fieldType))
1481                 {
1482                     // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
1483                     // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
1484                     // need a byte-addressable register for the store. We will enforce this requirement on an internal
1485                     // register, which we can use to copy multiple byte values.
1486                     needsByteTemp = true;
1487                 }
1488             }
1489
1490             if (varTypeIsGC(fieldType))
1491             {
1492                 putArgStk->gtNumberReferenceSlots++;
1493             }
1494             prevOffset = fieldOffset;
1495             fieldCount++;
1496             if (!fieldNode->isContained())
1497             {
1498                 appendLocationInfoToList(fieldNode);
1499                 info->srcCount++;
1500             }
1501         }
1502
1503         if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push)
1504         {
1505             // If any of the fields cannot be stored with an actual push, we may need a temporary
1506             // register to load the value before storing it to the stack location.
1507             info->internalIntCount = 1;
1508             regMaskTP regMask      = allRegs(TYP_INT);
1509             if (needsByteTemp)
1510             {
1511                 regMask &= ~RBM_NON_BYTE_REGS;
1512             }
1513             info->setInternalCandidates(this, regMask);
1514         }
1515
1516 #if defined(FEATURE_SIMD)
1517         // For PutArgStk of a TYP_SIMD12, we need a SIMD temp register.
1518         if (needsSimdTemp)
1519         {
1520             assert(info->dstCount == 0);
1521             info->internalFloatCount += 1;
1522             info->addInternalCandidates(this, allSIMDRegs());
1523         }
1524 #endif // defined(FEATURE_SIMD)
1525
1526         return;
1527 #endif // _TARGET_X86_
1528     }
1529
1530     GenTree*  src  = putArgStk->gtOp1;
1531     var_types type = src->TypeGet();
1532
1533 #if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1534     // For PutArgStk of a TYP_SIMD12, we need an extra register.
1535     if (putArgStk->isSIMD12())
1536     {
1537         appendLocationInfoToList(putArgStk->gtOp1);
1538         info->srcCount           = 1;
1539         info->internalFloatCount = 1;
1540         info->setInternalCandidates(this, allSIMDRegs());
1541         return;
1542     }
1543 #endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1544
1545     if (type != TYP_STRUCT)
1546     {
1547         BuildSimple(putArgStk);
1548         return;
1549     }
1550
1551     GenTree* dst     = putArgStk;
1552     GenTree* srcAddr = nullptr;
1553
1554     info->srcCount = GetOperandInfo(src);
1555
1556     // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
1557     // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
1558     // our framework assemblies, so this is the main code generation scheme we'll use.
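    // (Roughly, and for illustration only: the unrolled scheme copies 16 bytes at a time through a
    //  reserved XMM register and finishes any sub-16-byte remainder with ordinary integer moves.)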
1559     ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
1560     switch (putArgStk->gtPutArgStkKind)
1561     {
1562         case GenTreePutArgStk::Kind::Push:
1563         case GenTreePutArgStk::Kind::PushAllSlots:
1564         case GenTreePutArgStk::Kind::Unroll:
1565             // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1566             //
1567             // x86-specific note: if the size is odd, the last copy operation would be of size 1 byte.
1568             // But on x86 only RBM_BYTE_REGS can be used as byte registers.  Therefore, exclude
1569             // RBM_NON_BYTE_REGS from internal candidates.
1570             if ((putArgStk->gtNumberReferenceSlots == 0) && (size & (XMM_REGSIZE_BYTES - 1)) != 0)
1571             {
1572                 info->internalIntCount++;
1573                 regMaskTP regMask = allRegs(TYP_INT);
1574
1575 #ifdef _TARGET_X86_
1576                 if ((size % 2) != 0)
1577                 {
1578                     regMask &= ~RBM_NON_BYTE_REGS;
1579                 }
1580 #endif
1581                 info->setInternalCandidates(this, regMask);
1582             }
1583
1584 #ifdef _TARGET_X86_
1585             if (size >= 8)
1586 #else  // !_TARGET_X86_
1587             if (size >= XMM_REGSIZE_BYTES)
1588 #endif // !_TARGET_X86_
1589             {
1590                 // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
1591                 // or larger than or equal to 8 bytes on x86, reserve an XMM register to use for a
1592                 // series of 16-byte loads and stores.
1593                 info->internalFloatCount = 1;
1594                 info->addInternalCandidates(this, internalFloatRegCandidates());
1595                 SetContainsAVXFlags();
1596             }
1597             break;
1598
1599         case GenTreePutArgStk::Kind::RepInstr:
1600             info->internalIntCount += 3;
1601             info->setInternalCandidates(this, (RBM_RDI | RBM_RCX | RBM_RSI));
1602             break;
1603
1604         default:
1605             unreached();
1606     }
1607 }
1608 #endif // FEATURE_PUT_STRUCT_ARG_STK
1609
1610 //------------------------------------------------------------------------
1611 // BuildLclHeap: Set the NodeInfo for a GT_LCLHEAP.
1612 //
1613 // Arguments:
1614 //    tree      - The node of interest
1615 //
1616 // Return Value:
1617 //    None.
1618 //
1619 void LinearScan::BuildLclHeap(GenTree* tree)
1620 {
1621     TreeNodeInfo* info = currentNodeInfo;
1622     info->srcCount     = 1;
1623     assert(info->dstCount == 1);
1624
1625     // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
1626     // Here '-' means don't care.
1627     //
1628     //     Size?                    Init Memory?         # temp regs
1629     //      0                            -                  0 (returns 0)
1630     //      const and <=6 reg words      -                  0 (pushes '0')
1631     //      const and >6 reg words       Yes                0 (pushes '0')
1632     //      const and <PageSize          No                 0 (amd64) 1 (x86)
1633     //                                                        (x86: tmpReg for subtracting from esp)
1634     //      const and >=PageSize         No                 2 (regCnt and tmpReg for subtracting from sp)
1635     //      Non-const                    Yes                0 (regCnt=targetReg and pushes '0')
1636     //      Non-const                    No                 2 (regCnt and tmpReg for subtracting from sp)
1637     //
1638     // Note: Here we don't need internal register to be different from targetReg.
1639     // Rather, require it to be different from operand's reg.
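    // Illustrative example (sizes assumed, not taken from this function): a constant 32-byte
    // localloc on amd64 is only four pointer-sized words (<= 6), so it can be emitted as four
    // 'push 0' instructions, which reserve and zero the space without needing any temp registers.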
1640
1641     GenTree* size = tree->gtOp.gtOp1;
1642     if (size->IsCnsIntOrI())
1643     {
1644         assert(size->isContained());
1645         info->srcCount = 0;
1646         size_t sizeVal = size->gtIntCon.gtIconVal;
1647
1648         if (sizeVal == 0)
1649         {
1650             info->internalIntCount = 0;
1651         }
1652         else
1653         {
1654             // Compute the amount of memory, properly rounded up to STACK_ALIGN.
1655             // Note: The GenTree node is not updated here as it is cheap to recompute the stack-aligned size.
1656             // This should also help in debugging as we can examine the original size specified with localloc.
1657             sizeVal = AlignUp(sizeVal, STACK_ALIGN);
1658
1659             // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
1660             // we will generate 'push 0'.
1661             assert((sizeVal % REGSIZE_BYTES) == 0);
1662             size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
1663             if (cntRegSizedWords <= 6)
1664             {
1665                 info->internalIntCount = 0;
1666             }
1667             else if (!compiler->info.compInitMem)
1668             {
1669                 // No need to initialize allocated stack space.
1670                 if (sizeVal < compiler->eeGetPageSize())
1671                 {
1672 #ifdef _TARGET_X86_
1673                     info->internalIntCount = 1; // x86 needs a register here to avoid generating "sub" on ESP.
1674 #else                                           // !_TARGET_X86_
1675                     info->internalIntCount = 0;
1676 #endif                                          // !_TARGET_X86_
1677                 }
1678                 else
1679                 {
1680                     // We need two registers: regCnt and RegTmp
1681                     info->internalIntCount = 2;
1682                 }
1683             }
1684             else
1685             {
1686                 // >6 and need to zero initialize allocated stack space.
1687                 info->internalIntCount = 0;
1688             }
1689         }
1690     }
1691     else
1692     {
1693         appendLocationInfoToList(size);
1694         if (!compiler->info.compInitMem)
1695         {
1696             info->internalIntCount = 2;
1697         }
1698         else
1699         {
1700             info->internalIntCount = 0;
1701         }
1702     }
1703 }
1704
1705 //------------------------------------------------------------------------
1706 // BuildModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
1707 //
1708 // Arguments:
1709 //    tree      - The node of interest
1710 //
1711 // Return Value:
1712 //    None.
1713 //
1714 void LinearScan::BuildModDiv(GenTree* tree)
1715 {
1716     TreeNodeInfo* info = currentNodeInfo;
1717     GenTree*      op1  = tree->gtGetOp1();
1718     GenTree*      op2  = tree->gtGetOp2();
1719
1720     assert(info->dstCount == 1);
1721
1722     if (varTypeIsFloating(tree->TypeGet()))
1723     {
1724         info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
1725         return;
1726     }
1727
1728     // AMD64 div/idiv instructions:
1729     //    take the dividend in RDX:RAX and compute
1730     //    the quotient in RAX and the remainder in RDX
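    // For illustration only, a 32-bit signed division is emitted roughly as:
    //     mov  eax, dividend
    //     cdq                ; sign-extend EAX into EDX:EAX
    //     idiv r/m32         ; quotient -> EAX, remainder -> EDX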
1731
1732     if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
1733     {
1734         // We are interested in just the remainder.
1735         // RAX is used as a trashable register during computation of remainder.
1736         info->setDstCandidates(this, RBM_RDX);
1737     }
1738     else
1739     {
1740         // We are interested in just the quotient.
1741         // RDX gets used as a trashable register during computation of the quotient.
1742         info->setDstCandidates(this, RBM_RAX);
1743     }
1744
1745 #ifdef _TARGET_X86_
1746     if (op1->OperGet() == GT_LONG)
1747     {
1748         assert(op1->isContained());
1749
1750         // To avoid a register move, we would like op1's low part in RAX and its high part in RDX.
1751         GenTree* loVal = op1->gtGetOp1();
1752         GenTree* hiVal = op1->gtGetOp2();
1753
1754         assert(op2->IsCnsIntOrI());
1755         assert(tree->OperGet() == GT_UMOD);
1756
1757         // This situation also requires an internal register.
1758         info->internalIntCount = 1;
1759         info->setInternalCandidates(this, allRegs(TYP_INT));
1760
1761         LocationInfoListNode* loValInfo = getLocationInfo(loVal);
1762         LocationInfoListNode* hiValInfo = getLocationInfo(hiVal);
1763         loValInfo->info.setSrcCandidates(this, RBM_EAX);
1764         hiValInfo->info.setSrcCandidates(this, RBM_EDX);
1765         useList.Append(loValInfo);
1766         useList.Append(hiValInfo);
1767         info->srcCount = 2;
1768     }
1769     else
1770 #endif
1771     {
1772         // If possible, we would like to have op1 in RAX to avoid a register move.
1773         LocationInfoListNode* op1Info = getLocationInfo(op1);
1774         op1Info->info.setSrcCandidates(this, RBM_RAX);
1775         useList.Append(op1Info);
1776         info->srcCount = 1;
1777     }
1778
1779     LocationInfoListNode* op2Info;
1780     info->srcCount += GetOperandInfo(op2, &op2Info);
1781     for (; op2Info != nullptr; op2Info = op2Info->Next())
1782     {
1783         op2Info->info.setSrcCandidates(this, allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
1784     }
1785 }
1786
1787 //------------------------------------------------------------------------
1788 // BuildIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
1789 //
1790 // Arguments:
1791 //    tree      - The node of interest
1792 //
1793 // Return Value:
1794 //    None.
1795 //
1796 void LinearScan::BuildIntrinsic(GenTree* tree)
1797 {
1798     TreeNodeInfo* info = currentNodeInfo;
1799     // Both operand and its result must be of floating point type.
1800     GenTree* op1 = tree->gtGetOp1();
1801     assert(varTypeIsFloating(op1));
1802     assert(op1->TypeGet() == tree->TypeGet());
1803
1804     info->srcCount = GetOperandInfo(op1);
1805     assert(info->dstCount == 1);
1806
1807     switch (tree->gtIntrinsic.gtIntrinsicId)
1808     {
1809         case CORINFO_INTRINSIC_Sqrt:
1810             break;
1811
1812         case CORINFO_INTRINSIC_Abs:
1813             // Abs(float x) = x & 0x7fffffff
1814             // Abs(double x) = x & 0x7fffffff ffffffff
1815
1816             // In the case of Abs we need an internal register to hold the mask.
1817
1818             // TODO-XArch-CQ: avoid using an internal register for the mask.
1819             // Andps or andpd both will operate on 128-bit operands.
1820             // The data section constant to hold the mask is 64 bits in size.
1821             // Therefore, we need both the operand and the mask to be in an
1822             // xmm register. When we add support in the emitter to emit 128-bit
1823             // data constants and instructions that operate on 128-bit
1824             // memory operands we can avoid the need for an internal register.
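            // For illustration only, the float case ends up roughly as
            //     andps xmmOp, xmmMask
            // where xmmMask is the internal register holding the sign-clearing mask loaded from
            // the data section constant.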
1825             if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs)
1826             {
1827                 info->internalFloatCount = 1;
1828                 info->setInternalCandidates(this, internalFloatRegCandidates());
1829             }
1830             break;
1831
1832 #ifdef _TARGET_X86_
1833         case CORINFO_INTRINSIC_Cos:
1834         case CORINFO_INTRINSIC_Sin:
1835             NYI_X86("Math intrinsics Cos and Sin");
1836             break;
1837 #endif // _TARGET_X86_
1838
1839         case CORINFO_INTRINSIC_Round:
1840         case CORINFO_INTRINSIC_Ceiling:
1841         case CORINFO_INTRINSIC_Floor:
1842 #if defined(LEGACY_BACKEND)
1843             NYI_X86("Math intrinsics Round, Ceiling, and Floor");
1844 #endif // LEGACY_BACKEND
1845             break;
1846
1847         default:
1848             // Right now only Sqrt/Abs are treated as math intrinsics
1849             noway_assert(!"Unsupported math intrinsic");
1850             unreached();
1851             break;
1852     }
1853 }
1854
1855 #ifdef FEATURE_SIMD
1856 //------------------------------------------------------------------------
1857 // BuildSIMD: Set the NodeInfo for a GT_SIMD tree.
1858 //
1859 // Arguments:
1860 //    tree       - The GT_SIMD node of interest
1861 //    simdTree   - The GT_SIMD node of interest
1862 // Return Value:
1863 //    None.
1864
1865 void LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
1866 {
1867     TreeNodeInfo* info = currentNodeInfo;
1868     // Only SIMDIntrinsicInit can be contained. Other than that,
1869     // only SIMDIntrinsicOpEquality and SIMDIntrinsicOpInEquality can have 0 dstCount.
1870     if (simdTree->isContained())
1871     {
1872         assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicInit);
1873     }
1874     else if (info->dstCount != 1)
1875     {
1876         assert((simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ||
1877                (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality));
1878     }
1879     SetContainsAVXFlags(true, simdTree->gtSIMDSize);
1880     GenTree* op1   = simdTree->gtOp.gtOp1;
1881     GenTree* op2   = simdTree->gtOp.gtOp2;
1882     info->srcCount = 0;
1883     if (!op1->OperIs(GT_LIST))
1884     {
1885         info->srcCount += GetOperandInfo(op1);
1886     }
1887     if ((op2 != nullptr) && !op2->isContained())
1888     {
1889         info->srcCount += GetOperandInfo(op2);
1890     }
1891
1892     switch (simdTree->gtSIMDIntrinsicID)
1893     {
1894         case SIMDIntrinsicInit:
1895         {
1896             // This sets all fields of a SIMD struct to the given value.
1897             // Mark op1 as contained if it is either zero or int constant of all 1's,
1898             // or a float constant with 16 or 32 byte simdType (AVX case)
1899             //
1900             // Should never see small int base type vectors except for zero initialization.
1901             assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0));
1902
1903 #if !defined(_TARGET_64BIT_)
1904             if (op1->OperGet() == GT_LONG)
1905             {
1906                 assert(op1->isContained());
1907                 GenTree* op1lo = op1->gtGetOp1();
1908                 GenTree* op1hi = op1->gtGetOp2();
1909
1910                 if (op1lo->isContained())
1911                 {
1912                     assert(op1hi->isContained());
1913                     assert((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
1914                            (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)));
1915                     assert(info->srcCount == 0);
1916                 }
1917                 else
1918                 {
1919                     assert(info->srcCount == 2);
1920                     info->internalFloatCount = 1;
1921                     info->setInternalCandidates(this, allSIMDRegs());
1922                     info->isInternalRegDelayFree = true;
1923                 }
1924             }
1925 #endif // !defined(_TARGET_64BIT_)
1926         }
1927         break;
1928
1929         case SIMDIntrinsicInitN:
1930         {
1931             var_types baseType = simdTree->gtSIMDBaseType;
1932             info->srcCount     = (short)(simdTree->gtSIMDSize / genTypeSize(baseType));
1933             int initCount      = 0;
1934             for (GenTree* list = op1; list != nullptr; list = list->gtGetOp2())
1935             {
1936                 assert(list->OperGet() == GT_LIST);
1937                 GenTree* listItem = list->gtGetOp1();
1938                 assert(listItem->TypeGet() == baseType);
1939                 assert(!listItem->isContained());
1940                 appendLocationInfoToList(listItem);
1941                 initCount++;
1942             }
1943             assert(initCount == info->srcCount);
1944
1945             // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
1946             info->internalFloatCount = 1;
1947             info->setInternalCandidates(this, allSIMDRegs());
1948         }
1949         break;
1950
1951         case SIMDIntrinsicInitArray:
1952             // We have an array and an index, which may be contained.
1953             assert(info->srcCount == (simdTree->gtGetOp2()->isContained() ? 1 : 2));
1954             break;
1955
1956         case SIMDIntrinsicDiv:
1957             // SSE2 has no instruction support for division on integer vectors
1958             noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
1959             assert(info->srcCount == 2);
1960             break;
1961
1962         case SIMDIntrinsicAbs:
1963             // Float/double vectors: Abs gets implemented as a bitwise-AND operation
1964             // with a mask and hence should never be seen here.
1965             //
1966             // Must be a Vector<int>, Vector<short>, or Vector<sbyte>
1967             assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT ||
1968                    simdTree->gtSIMDBaseType == TYP_BYTE);
1969             assert(compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
1970             assert(info->srcCount == 1);
1971             break;
1972
1973         case SIMDIntrinsicSqrt:
1974             // SSE2 has no instruction support for sqrt on integer vectors.
1975             noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
1976             assert(info->srcCount == 1);
1977             break;
1978
1979         case SIMDIntrinsicAdd:
1980         case SIMDIntrinsicSub:
1981         case SIMDIntrinsicMul:
1982         case SIMDIntrinsicBitwiseAnd:
1983         case SIMDIntrinsicBitwiseAndNot:
1984         case SIMDIntrinsicBitwiseOr:
1985         case SIMDIntrinsicBitwiseXor:
1986         case SIMDIntrinsicMin:
1987         case SIMDIntrinsicMax:
1988             assert(info->srcCount == 2);
1989
1990             // SSE2 32-bit integer multiplication requires two temp regs
1991             if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
1992                 compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
1993             {
1994                 info->internalFloatCount = 2;
1995                 info->setInternalCandidates(this, allSIMDRegs());
1996             }
1997             break;
1998
1999         case SIMDIntrinsicEqual:
2000             assert(info->srcCount == 2);
2001             break;
2002
2003         // SSE2 doesn't support < and <= directly on int vectors.
2004         // Instead we need to use > and >= with swapped operands.
2005         case SIMDIntrinsicLessThan:
2006         case SIMDIntrinsicLessThanOrEqual:
2007             assert(info->srcCount == 2);
2008             noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
2009             break;
2010
2011         // SIMDIntrinsicGreaterThan is supported only on non-floating point base type vectors.
2012         // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors.
2013         // Instead we need to use < and <= with swapped operands.
2014         case SIMDIntrinsicGreaterThan:
2015             noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
2016             assert(info->srcCount == 2);
2017             break;
2018
2019         case SIMDIntrinsicOpEquality:
2020         case SIMDIntrinsicOpInEquality:
2021             if (simdTree->gtGetOp2()->isContained())
2022             {
2023                 // If the second operand is contained then ContainCheckSIMD has determined
2024                 // that PTEST can be used. We only need a single source register and no
2025                 // internal registers.
2026                 assert(info->srcCount == 1);
2027             }
2028             else
2029             {
2030                 // Can't use PTEST so we need 2 source registers, 1 internal SIMD register
2031                 // (to hold the result of PCMPEQD or other similar SIMD compare instruction)
2032                 // and one internal INT register (to hold the result of PMOVMSKB).
2033                 assert(info->srcCount == 2);
2034                 info->internalFloatCount = 1;
2035                 info->setInternalCandidates(this, allSIMDRegs());
2036                 info->internalIntCount = 1;
2037                 info->addInternalCandidates(this, allRegs(TYP_INT));
2038             }
2039             // These SIMD nodes only set the condition flags.
2040             info->dstCount = 0;
2041             break;
2042
2043         case SIMDIntrinsicDotProduct:
2044             // Float/Double vectors:
2045             // For SSE, or AVX with 32-byte vectors, we also need an internal register
2046             // as scratch. Further we need the targetReg and internal reg to be distinct
2047             // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
2048             // don't need a tmpReg.
2049             //
2050             // 32-byte integer vector on SSE4/AVX:
2051             // will take advantage of phaddd, which operates only on 128-bit xmm reg.
2052             // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
2053             // registers since targetReg is an int type register.
2054             //
2055             // See genSIMDIntrinsicDotProduct() for details on code sequence generated
2056             // and the need for scratch registers.
2057             if (varTypeIsFloating(simdTree->gtSIMDBaseType))
2058             {
2059                 if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) ||
2060                     (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32))
2061                 {
2062                     info->internalFloatCount     = 1;
2063                     info->isInternalRegDelayFree = true;
2064                     info->setInternalCandidates(this, allSIMDRegs());
2065                 }
2066                 // else don't need scratch reg(s).
2067             }
2068             else
2069             {
2070                 assert(simdTree->gtSIMDBaseType == TYP_INT && compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
2071
2072                 // No need to set isInternalRegDelayFree since targetReg is an
2073                 // int type reg and is guaranteed to be different from xmm/ymm
2074                 // regs.
2075                 info->internalFloatCount = (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) ? 2 : 1;
2076                 info->setInternalCandidates(this, allSIMDRegs());
2077             }
2078             assert(info->srcCount == 2);
2079             break;
2080
2081         case SIMDIntrinsicGetItem:
2082         {
2083             // This implements get_Item method. The sources are:
2084             //  - the source SIMD struct
2085             //  - index (which element to get)
2086             // The result is baseType of SIMD struct.
2087             // op1 may be a contained memory op, but if so we will consume its address.
2088             // op2 may be a contained constant.
2089             op1 = simdTree->gtOp.gtOp1;
2090             op2 = simdTree->gtOp.gtOp2;
2091
2092             if (!op1->isContained())
2093             {
2094                 // If the index is not a constant, we will use the SIMD temp location to store the vector.
2095                 // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
2096                 // can use that in the process of extracting the element.
2097                 //
2098                 // If the index is a constant and the base type is a small int we can use pextrw, but on AVX
2099                 // we will need a temp if we are indexing into the upper half of the AVX register.
2100                 // In all other cases with constant index, we need a temp xmm register to extract the
2101                 // element if index is other than zero.
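                // Illustrative example (values assumed, not from this code): indexing element 10 of a
                // Vector<short> under AVX2 gives a byte offset of 20, which lies in the upper 128-bit
                // lane, so a float temp is needed to move that lane down before the extraction.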
2102
2103                 if (!op2->IsCnsIntOrI())
2104                 {
2105                     (void)compiler->getSIMDInitTempVarNum();
2106                 }
2107                 else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
2108                 {
2109                     bool needFloatTemp;
2110                     if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
2111                         (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported))
2112                     {
2113                         int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
2114                         needFloatTemp    = (byteShiftCnt >= 16);
2115                     }
2116                     else
2117                     {
2118                         needFloatTemp = !op2->IsIntegralConst(0);
2119                     }
2120
2121                     if (needFloatTemp)
2122                     {
2123                         info->internalFloatCount = 1;
2124                         info->setInternalCandidates(this, allSIMDRegs());
2125                     }
2126                 }
2127             }
2128         }
2129         break;
2130
2131         case SIMDIntrinsicSetX:
2132         case SIMDIntrinsicSetY:
2133         case SIMDIntrinsicSetZ:
2134         case SIMDIntrinsicSetW:
2135             assert(info->srcCount == 2);
2136
2137             // We need an internal integer register for SSE2 codegen
2138             if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
2139             {
2140                 info->internalIntCount = 1;
2141                 info->setInternalCandidates(this, allRegs(TYP_INT));
2142             }
2143
2144             break;
2145
2146         case SIMDIntrinsicCast:
2147             assert(info->srcCount == 1);
2148             break;
2149
2150         case SIMDIntrinsicConvertToSingle:
2151             assert(info->srcCount == 1);
2152             if (simdTree->gtSIMDBaseType == TYP_UINT)
2153             {
2154                 // We need an internal register different from targetReg.
2155                 info->isInternalRegDelayFree = true;
2156                 info->internalIntCount       = 1;
2157                 info->internalFloatCount     = 2;
2158                 info->setInternalCandidates(this, allSIMDRegs() | allRegs(TYP_INT));
2159             }
2160             break;
2161
2162         case SIMDIntrinsicConvertToInt32:
2163             assert(info->srcCount == 1);
2164             break;
2165
2166         case SIMDIntrinsicWidenLo:
2167         case SIMDIntrinsicWidenHi:
2168             assert(info->srcCount == 1);
2169             if (varTypeIsIntegral(simdTree->gtSIMDBaseType))
2170             {
2171                 // We need an internal register different from targetReg.
2172                 info->isInternalRegDelayFree = true;
2173                 info->internalFloatCount     = 1;
2174                 info->setInternalCandidates(this, allSIMDRegs());
2175             }
2176             break;
2177
2178         case SIMDIntrinsicConvertToInt64:
2179             assert(info->srcCount == 1);
2180             // We need an internal register different from targetReg.
2181             info->isInternalRegDelayFree = true;
2182             info->internalIntCount       = 1;
2183             if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
2184             {
2185                 info->internalFloatCount = 2;
2186             }
2187             else
2188             {
2189                 info->internalFloatCount = 1;
2190             }
2191             info->setInternalCandidates(this, allSIMDRegs() | allRegs(TYP_INT));
2192             break;
2193
2194         case SIMDIntrinsicConvertToDouble:
2195             assert(info->srcCount == 1);
2196             // We need an internal register different from targetReg.
2197             info->isInternalRegDelayFree = true;
2198             info->internalIntCount       = 1;
2199 #ifdef _TARGET_X86_
2200             if (simdTree->gtSIMDBaseType == TYP_LONG)
2201             {
2202                 info->internalFloatCount = 3;
2203             }
2204             else
2205 #endif
2206                 if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) || (simdTree->gtSIMDBaseType == TYP_ULONG))
2207             {
2208                 info->internalFloatCount = 2;
2209             }
2210             else
2211             {
2212                 info->internalFloatCount = 1;
2213             }
2214             info->setInternalCandidates(this, allSIMDRegs() | allRegs(TYP_INT));
2215             break;
2216
2217         case SIMDIntrinsicNarrow:
2218             assert(info->srcCount == 2);
2219             // We need an internal register different from targetReg.
2220             info->isInternalRegDelayFree = true;
2221             if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->gtSIMDBaseType != TYP_DOUBLE))
2222             {
2223                 info->internalFloatCount = 2;
2224             }
2225             else
2226             {
2227                 info->internalFloatCount = 1;
2228             }
2229             info->setInternalCandidates(this, allSIMDRegs());
2230             break;
2231
2232         case SIMDIntrinsicShuffleSSE2:
2233             assert(info->srcCount == 1);
2234             // Second operand is an integer constant and marked as contained.
2235             assert(simdTree->gtOp.gtOp2->isContainedIntOrIImmed());
2236             break;
2237
2238         case SIMDIntrinsicGetX:
2239         case SIMDIntrinsicGetY:
2240         case SIMDIntrinsicGetZ:
2241         case SIMDIntrinsicGetW:
2242         case SIMDIntrinsicGetOne:
2243         case SIMDIntrinsicGetZero:
2244         case SIMDIntrinsicGetCount:
2245         case SIMDIntrinsicGetAllOnes:
2246             assert(!"Get intrinsics should not be seen during Lowering.");
2247             unreached();
2248
2249         default:
2250             noway_assert(!"Unimplemented SIMD node type.");
2251             unreached();
2252     }
2253 }
2254 #endif // FEATURE_SIMD
2255
2256 #ifdef FEATURE_HW_INTRINSICS
2257 //------------------------------------------------------------------------
2258 // BuildHWIntrinsic: Set the NodeInfo for a GT_HWIntrinsic tree.
2259 //
2260 // Arguments:
2261 //    intrinsicTree - The GT_HWIntrinsic node of interest
2262 //
2263 // Return Value:
2264 //    None.
2265
2266 void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
2267 {
2268     TreeNodeInfo*       info        = currentNodeInfo;
2269     NamedIntrinsic      intrinsicID = intrinsicTree->gtHWIntrinsicId;
2270     var_types           baseType    = intrinsicTree->gtSIMDBaseType;
2271     InstructionSet      isa         = Compiler::isaOfHWIntrinsic(intrinsicID);
2272     HWIntrinsicCategory category    = Compiler::categoryOfHWIntrinsic(intrinsicID);
2273     HWIntrinsicFlag     flags       = Compiler::flagsOfHWIntrinsic(intrinsicID);
2274     int                 numArgs     = Compiler::numArgsOfHWIntrinsic(intrinsicTree);
2275
2276     if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2)
2277     {
2278         SetContainsAVXFlags(true, 32);
2279     }
2280
2281     GenTree* op1   = intrinsicTree->gtOp.gtOp1;
2282     GenTree* op2   = intrinsicTree->gtOp.gtOp2;
2283     info->srcCount = 0;
2284
2285     if (op1 != nullptr)
2286     {
2287         if (op1->OperIsList())
2288         {
2289             for (GenTreeArgList* list = op1->AsArgList(); list != nullptr; list = list->Rest())
2290             {
2291                 info->srcCount += GetOperandInfo(list->Current());
2292             }
2293         }
2294         else
2295         {
2296             info->srcCount += GetOperandInfo(op1);
2297         }
2298     }
2299
2300     if (op2 != nullptr)
2301     {
2302         info->srcCount += GetOperandInfo(op2);
2303     }
2304
2305     if ((category == HW_Category_IMM) && ((flags & HW_Flag_NoJmpTableIMM) == 0))
2306     {
2307         GenTree* lastOp = Compiler::lastOpOfHWIntrinsic(intrinsicTree, numArgs);
2308         assert(lastOp != nullptr);
2309         if (Compiler::isImmHWIntrinsic(intrinsicID, lastOp) && !lastOp->isContainedIntOrIImmed())
2310         {
2311             assert(!lastOp->IsCnsIntOrI());
2312
2313             // We need two extra registers when lastOp isn't a constant so
2314             // the offset into the jump table for the fallback path
2315             // can be computed.
2316
2317             info->internalIntCount = 2;
2318             info->setInternalCandidates(this, allRegs(TYP_INT));
2319         }
2320     }
2321
2322     // Check for "srcCount >= 2" to match against 3+ operand nodes where one is constant
2323     if ((op2 == nullptr) && (info->srcCount >= 2) && intrinsicTree->isRMWHWIntrinsic(compiler))
2324     {
2325         // TODO-XArch-CQ: This is currently done in order to handle intrinsics which have more than
2326         // two arguments but which still have RMW semantics (such as NI_SSE41_Insert). We should make
2327         // this handling more general and move it back out to LinearScan::BuildNode.
2328
2329         assert(numArgs > 2);
2330         LocationInfoListNode* op2Info = useList.Begin()->Next();
2331         op2Info->info.isDelayFree     = true;
2332         info->hasDelayFreeSrc         = true;
2333     }
2334
2335     switch (intrinsicID)
2336     {
2337         case NI_SSE_CompareEqualOrderedScalar:
2338         case NI_SSE_CompareEqualUnorderedScalar:
2339         case NI_SSE_CompareNotEqualOrderedScalar:
2340         case NI_SSE_CompareNotEqualUnorderedScalar:
2341         case NI_SSE2_CompareEqualOrderedScalar:
2342         case NI_SSE2_CompareEqualUnorderedScalar:
2343         case NI_SSE2_CompareNotEqualOrderedScalar:
2344         case NI_SSE2_CompareNotEqualUnorderedScalar:
2345             info->internalIntCount = 1;
2346             info->setInternalCandidates(this, RBM_BYTE_REGS);
2347             info->isInternalRegDelayFree = true;
2348             break;
2349
2350         case NI_SSE_SetScalarVector128:
2351         case NI_SSE2_SetScalarVector128:
2352             // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
2353             info->internalFloatCount = 1;
2354             info->setInternalCandidates(this, allSIMDRegs());
2355             info->isInternalRegDelayFree = true;
2356             break;
2357
2358         case NI_SSE_ConvertToSingle:
2359         case NI_SSE_StaticCast:
2360         case NI_SSE2_ConvertToDouble:
2361         case NI_AVX_ExtendToVector256:
2362         case NI_AVX_GetLowerHalf:
2363         case NI_AVX_StaticCast:
2364         {
2365             assert(info->srcCount == 1);
2366             assert(info->dstCount == 1);
2367             useList.Last()->info.isTgtPref = true;
2368             break;
2369         }
2370
2371         case NI_AVX_SetAllVector256:
2372         {
2373             if (varTypeIsIntegral(baseType))
2374             {
2375                 info->internalFloatCount = 1;
2376                 if (!compiler->compSupports(InstructionSet_AVX2) && varTypeIsByte(baseType))
2377                 {
2378                     info->internalFloatCount += 1;
2379                 }
2380                 info->setInternalCandidates(this, allSIMDRegs());
2381             }
2382             break;
2383         }
2384
2385         case NI_SSE2_MaskMove:
2386         {
2387             // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
2388             LocationInfoListNode* op3Info = useList.Begin()->Next()->Next();
2389             op3Info->info.setSrcCandidates(this, RBM_EDI);
2390             break;
2391         }
2392
2393         case NI_SSE41_BlendVariable:
2394             if (!compiler->canUseVexEncoding())
2395             {
2396                 // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
2397                 LocationInfoListNode* op2Info = useList.Begin()->Next();
2398                 LocationInfoListNode* op3Info = op2Info->Next();
2399                 op2Info->info.isDelayFree     = true;
2400                 op3Info->info.isDelayFree     = true;
2401                 op3Info->info.setSrcCandidates(this, RBM_XMM0);
2402                 info->hasDelayFreeSrc = true;
2403             }
2404             break;
2405
2406         case NI_SSE41_TestAllOnes:
2407         {
2408             info->internalFloatCount = 1;
2409             info->setInternalCandidates(this, allSIMDRegs());
2410             break;
2411         }
2412
2413         case NI_SSE41_Extract:
2414             if (baseType == TYP_FLOAT)
2415             {
2416                 info->internalIntCount += 1;
2417             }
2418 #ifdef _TARGET_X86_
2419             else if (varTypeIsByte(baseType))
2420             {
2421                 info->setDstCandidates(this, RBM_BYTE_REGS);
2422             }
2423 #endif
2424             break;
2425
2426 #ifdef _TARGET_X86_
2427         case NI_SSE42_Crc32:
2428         {
2429             // CRC32 may operate over "byte" but on x86 only RBM_BYTE_REGS can be used as byte registers.
2430             //
2431             // TODO - currently we use the BaseType to bring the type of the second argument
2432             // to the code generator. We may encode the overload info in another way.
2433             var_types srcType = intrinsicTree->gtSIMDBaseType;
2434             if (varTypeIsByte(srcType))
2435             {
2436                 LocationInfoListNode* op2Info = useList.GetSecond(INDEBUG(intrinsicTree->gtGetOp2()));
2437                 op2Info->info.setSrcCandidates(this, RBM_BYTE_REGS);
2438             }
2439             break;
2440         }
2441 #endif // _TARGET_X86_
2442
2443         default:
2444             assert((intrinsicID > NI_HW_INTRINSIC_START) && (intrinsicID < NI_HW_INTRINSIC_END));
2445             break;
2446     }
2447 }
2448 #endif
2449
2450 //------------------------------------------------------------------------
2451 // BuildCast: Set the NodeInfo for a GT_CAST.
2452 //
2453 // Arguments:
2454 //    tree      - The node of interest
2455 //
2456 // Return Value:
2457 //    None.
2458 //
2459 void LinearScan::BuildCast(GenTree* tree)
2460 {
2461     TreeNodeInfo* info = currentNodeInfo;
2462     // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register.
2463     //         see CodeGen::genIntToIntCast()
2464
2465     // Non-overflow casts to/from float/double are done using SSE2 instructions,
2466     // which allow the source operand to be either a reg or a memop. Given the
2467     // fact that casts from small int to float/double are done as two-level casts,
2468     // the source operand is always guaranteed to be of size 4 or 8 bytes.
2469     var_types castToType = tree->CastToType();
2470     GenTree*  castOp     = tree->gtCast.CastOp();
2471     var_types castOpType = castOp->TypeGet();
2472
2473     info->srcCount = GetOperandInfo(castOp);
2474     assert(info->dstCount == 1);
2475     if (tree->gtFlags & GTF_UNSIGNED)
2476     {
2477         castOpType = genUnsignedType(castOpType);
2478     }
2479
2480     // some overflow checks need a temp reg:
2481     //  - GT_CAST from INT64/UINT64 to UINT32
2482     if (tree->gtOverflow() && (castToType == TYP_UINT))
2483     {
2484         if (genTypeSize(castOpType) == 8)
2485         {
2486             // Here we don't need internal register to be different from targetReg,
2487             // rather require it to be different from operand's reg.
2488             info->internalIntCount = 1;
2489         }
2490     }
2491 }
2492
2493 //-----------------------------------------------------------------------------------------
2494 // BuildIndir: Specify register requirements for address expression of an indirection operation.
2495 //
2496 // Arguments:
2497 //    indirTree    -   GT_IND or GT_STOREIND gentree node
2498 //
2499 void LinearScan::BuildIndir(GenTreeIndir* indirTree)
2500 {
2501     TreeNodeInfo* info = currentNodeInfo;
2502     // If this is the rhs of a block copy (i.e. non-enregisterable struct),
2503     // it has no register requirements.
2504     if (indirTree->TypeGet() == TYP_STRUCT)
2505     {
2506         return;
2507     }
2508
2509     int indirSrcCount = GetIndirInfo(indirTree);
2510     if (indirTree->gtOper == GT_STOREIND)
2511     {
2512         GenTree* source = indirTree->gtOp.gtOp2;
2513         if (indirTree->AsStoreInd()->IsRMWMemoryOp())
2514         {
2515             // Because 'source' is contained, we haven't yet determined its special register requirements, if any.
2516             // As it happens, the Shift or Rotate cases are the only ones with special requirements.
2517             assert(source->isContained() && source->OperIsRMWMemOp());
2518             GenTree*      nonMemSource = nullptr;
2519             GenTreeIndir* otherIndir   = nullptr;
2520
2521             if (source->OperIsShiftOrRotate())
2522             {
2523                 info->srcCount += BuildShiftRotate(source);
2524             }
2525             else
2526             {
2527                 info->srcCount += appendBinaryLocationInfoToList(source->AsOp());
2528             }
2529             if (indirTree->AsStoreInd()->IsRMWDstOp1())
2530             {
2531                 otherIndir = source->gtGetOp1()->AsIndir();
2532                 if (source->OperIsBinary())
2533                 {
2534                     nonMemSource = source->gtOp.gtOp2;
2535                 }
2536             }
2537             else if (indirTree->AsStoreInd()->IsRMWDstOp2())
2538             {
2539                 otherIndir   = source->gtGetOp2()->AsIndir();
2540                 nonMemSource = source->gtOp.gtOp1;
2541             }
2542             if (otherIndir != nullptr)
2543             {
2544                 // Any lclVars in the addressing mode of this indirection are contained.
2545                 // If they are marked as lastUse, transfer the last use flag to the store indir.
2546                 GenTree* base    = otherIndir->Base();
2547                 GenTree* dstBase = indirTree->Base();
2548                 CheckAndMoveRMWLastUse(base, dstBase);
2549                 GenTree* index    = otherIndir->Index();
2550                 GenTree* dstIndex = indirTree->Index();
2551                 CheckAndMoveRMWLastUse(index, dstIndex);
2552             }
2553             if (nonMemSource != nullptr)
2554             {
2555                 assert(!nonMemSource->isContained() || (!nonMemSource->isMemoryOp() && !nonMemSource->IsLocal()));
2556 #ifdef _TARGET_X86_
2557                 if (varTypeIsByte(indirTree) && !nonMemSource->isContained())
2558                 {
2559                     // If storeInd is of TYP_BYTE, set source to byteable registers.
2560                     TreeNodeInfo& nonMemSourceInfo = useList.GetTreeNodeInfo(nonMemSource);
2561                     regMaskTP     regMask          = nonMemSourceInfo.getSrcCandidates(this);
2562                     regMask &= ~RBM_NON_BYTE_REGS;
2563                     assert(regMask != RBM_NONE);
2564                     nonMemSourceInfo.setSrcCandidates(this, regMask);
2565                 }
2566 #endif
2567             }
2568         }
2569         else
2570         {
2571 #ifdef _TARGET_X86_
2572             if (varTypeIsByte(indirTree) && !source->isContained())
2573             {
2574                 // If storeInd is of TYP_BYTE, set source to byteable registers.
2575                 LocationInfoListNode* sourceInfo = getLocationInfo(source);
2576                 regMaskTP             regMask    = sourceInfo->info.getSrcCandidates(this);
2577                 regMask &= ~RBM_NON_BYTE_REGS;
2578                 assert(regMask != RBM_NONE);
2579                 sourceInfo->info.setSrcCandidates(this, regMask);
2580                 useList.Append(sourceInfo);
2581                 info->srcCount++;
2582             }
2583             else
2584 #endif
2585             {
2586                 info->srcCount += GetOperandInfo(source);
2587             }
2588         }
2589     }
2590     info->srcCount += indirSrcCount;
2591
2592 #ifdef FEATURE_SIMD
2593     if (indirTree->TypeGet() == TYP_SIMD12)
2594     {
2595         // If indirTree is of TYP_SIMD12, addr is not contained. See comment in LowerIndir().
2596         assert(!indirTree->Addr()->isContained());
2597
2598         // Vector3 is read/written as two reads/writes: 8 byte and 4 byte.
2599         // To assemble the vector properly we would need an additional
2600         // XMM register.
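        // (Illustrative only: a TYP_SIMD12 load is roughly an 8-byte read for elements 0-1 plus a
        // 4-byte read for element 2 via the internal XMM register, which is then merged into the
        // destination.)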
2601         info->internalFloatCount = 1;
2602
2603         // In case of GT_IND we need an internal register different from targetReg and
2604         // both of the registers are used at the same time.
2605         if (indirTree->OperGet() == GT_IND)
2606         {
2607             info->isInternalRegDelayFree = true;
2608         }
2609
2610         info->setInternalCandidates(this, allSIMDRegs());
2611
2612         return;
2613     }
2614 #endif // FEATURE_SIMD
2615
2616     assert(indirTree->Addr()->gtOper != GT_ARR_ELEM);
2617 }
2618
2619 //------------------------------------------------------------------------
2620 // BuildMul: Set the NodeInfo for a multiply.
2621 //
2622 // Arguments:
2623 //    tree      - The node of interest
2624 //
2625 // Return Value:
2626 //    None.
2627 //
2628 void LinearScan::BuildMul(GenTree* tree)
2629 {
2630     TreeNodeInfo* info = currentNodeInfo;
2631 #if defined(_TARGET_X86_)
2632     assert(tree->OperIs(GT_MUL, GT_MULHI, GT_MUL_LONG));
2633 #else
2634     assert(tree->OperIs(GT_MUL, GT_MULHI));
2635 #endif
2636     GenTree* op1   = tree->gtOp.gtOp1;
2637     GenTree* op2   = tree->gtOp.gtOp2;
2638     info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
2639     assert(info->dstCount == 1);
2640
2641     // Case of float/double mul.
2642     if (varTypeIsFloating(tree->TypeGet()))
2643     {
2644         return;
2645     }
2646
2647     bool isUnsignedMultiply    = ((tree->gtFlags & GTF_UNSIGNED) != 0);
2648     bool requiresOverflowCheck = tree->gtOverflowEx();
2649
2650     // There are three forms of x86 multiply:
2651     // one-op form:     RDX:RAX = RAX * r/m
2652     // two-op form:     reg *= r/m
2653     // three-op form:   reg = r/m * imm
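    // Illustrative examples of the three forms (registers chosen arbitrarily):
    //     mul  ecx                  ; one-op:   EDX:EAX = EAX * ECX
    //     imul eax, dword ptr [mem] ; two-op:   EAX = EAX * [mem]
    //     imul eax, ecx, 12         ; three-op: EAX = ECX * 12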
2654
2655     // This special widening 32x32->64 MUL is not used on x64
2656     CLANG_FORMAT_COMMENT_ANCHOR;
2657 #if defined(_TARGET_X86_)
2658     if (tree->OperGet() != GT_MUL_LONG)
2659 #endif
2660     {
2661         assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
2662     }
2663
2664     // We do use the widening multiply to implement
2665     // the overflow checking for unsigned multiply
2666     //
2667     if (isUnsignedMultiply && requiresOverflowCheck)
2668     {
2669         // The only encoding provided is RDX:RAX = RAX * rm
2670         //
2671         // Here we set RAX as the only destination candidate
2672         // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX
2673         //
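        // (Illustrative note: 'mul r/m' sets CF/OF when the upper half of the product, left in RDX,
        // is non-zero, which is what the overflow check relies on.)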
2674         info->setDstCandidates(this, RBM_RAX);
2675     }
2676     else if (tree->OperGet() == GT_MULHI)
2677     {
2678         // Have to use the encoding RDX:RAX = RAX * rm. Since we only care about the
2679         // upper 32 bits of the result, set the destination candidate to RDX.
2680         info->setDstCandidates(this, RBM_RDX);
2681     }
2682 #if defined(_TARGET_X86_)
2683     else if (tree->OperGet() == GT_MUL_LONG)
2684     {
2685         // Have to use the encoding EDX:EAX = EAX * rm (x86 widening multiply).
2686         info->setDstCandidates(this, RBM_RAX);
2687     }
2688 #endif
2689     GenTree* containedMemOp = nullptr;
2690     if (op1->isContained() && !op1->IsCnsIntOrI())
2691     {
2692         assert(!op2->isContained() || op2->IsCnsIntOrI());
2693         containedMemOp = op1;
2694     }
2695     else if (op2->isContained() && !op2->IsCnsIntOrI())
2696     {
2697         containedMemOp = op2;
2698     }
2699     if ((containedMemOp != nullptr) && CheckAndSetDelayFree(containedMemOp))
2700     {
2701         info->hasDelayFreeSrc = true;
2702     }
2703 }
2704
2705 //------------------------------------------------------------------------------
2706 // SetContainsAVXFlags: Set the ContainsAVX flag when the type is a floating point type, and set
2707 // the Contains256bitAVX flag when the SIMD vector size is 32 bytes.
2708 //
2709 // Arguments:
2710 //    isFloatingPointType   - true if it is floating point type
2711 //    sizeOfSIMDVector      - SIMD Vector size
2712 //
2713 void LinearScan::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
2714 {
2715     if (isFloatingPointType && compiler->canUseVexEncoding())
2716     {
2717         compiler->getEmitter()->SetContainsAVX(true);
2718         if (sizeOfSIMDVector == 32)
2719         {
2720             compiler->getEmitter()->SetContains256bitAVX(true);
2721         }
2722     }
2723 }
2724
2725 #ifdef _TARGET_X86_
2726 //------------------------------------------------------------------------
2727 // ExcludeNonByteableRegisters: Determines if we need to exclude non-byteable registers for
2728 // various reasons
2729 //
2730 // Arguments:
2731 //    tree      - The node of interest
2732 //
2733 // Return Value:
2734 //    True if we need to exclude non-byteable registers; false otherwise.
2735 //
2736 bool LinearScan::ExcludeNonByteableRegisters(GenTree* tree)
2737 {
2738     // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr'
2739     // Storeind itself will not produce any value and hence dstCount=0. But op2 could be a TYP_INT
2740     // value. In this case we need to exclude esi/edi from the src candidates of op2.
2741     if (varTypeIsByte(tree))
2742     {
2743         return true;
2744     }
2745     // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool.
2746     else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType()))
2747     {
2748         return true;
2749     }
2750     else if (tree->OperIsCompare() || tree->OperIs(GT_CMP))
2751     {
2752         GenTree* op1 = tree->gtGetOp1();
2753         GenTree* op2 = tree->gtGetOp2();
2754
2755         // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
2756         // ubyte as the result of comparison and if the result needs to be materialized into a reg
2757         // simply zero extend it to TYP_INT size.  Here is an example of generated code:
2758         //         cmp dl, byte ptr[addr mode]
2759         //         movzx edx, dl
2760         if (varTypeIsByte(op1) && varTypeIsByte(op2))
2761         {
2762             return true;
2763         }
2764         // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses
2765         // ubyte as the result of the comparison and if the result needs to be materialized into a reg
2766         // simply zero extend it to TYP_INT size.
2767         else if (varTypeIsByte(op1) && op2->IsCnsIntOrI())
2768         {
2769             return true;
2770         }
2771         // Example5: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses
2772         // ubyte as the result of the comparison and if the result needs to be materialized into a reg
2773         // simply zero extend it to TYP_INT size.
2774         else if (op1->IsCnsIntOrI() && varTypeIsByte(op2))
2775         {
2776             return true;
2777         }
2778         else
2779         {
2780             return false;
2781         }
2782     }
2783 #ifdef FEATURE_SIMD
2784     else if (tree->OperGet() == GT_SIMD)
2785     {
2786         GenTreeSIMD* simdNode = tree->AsSIMD();
2787         switch (simdNode->gtSIMDIntrinsicID)
2788         {
2789             case SIMDIntrinsicOpEquality:
2790             case SIMDIntrinsicOpInEquality:
2791                 // We manifest it into a byte register, so the target must be byteable.
2792                 return true;
2793
2794             case SIMDIntrinsicGetItem:
2795             {
2796                 // This logic is duplicated from genSIMDIntrinsicGetItem().
2797                 // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
2798                 // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
2799                 // cases will require this, so the non-byteable registers can be excluded.
2800
2801                 GenTree*  op1      = simdNode->gtGetOp1();
2802                 GenTree*  op2      = simdNode->gtGetOp2();
2803                 var_types baseType = simdNode->gtSIMDBaseType;
2804                 if (!isContainableMemoryOp(op1) && op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
2805                 {
2806                     bool     ZeroOrSignExtnReqd = true;
2807                     unsigned baseSize           = genTypeSize(baseType);
2808                     if (baseSize == 1)
2809                     {
2810                         if ((op2->gtIntCon.gtIconVal % 2) == 1)
2811                         {
2812                             ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
2813                         }
2814                     }
2815                     else
2816                     {
2817                         assert(baseSize == 2);
2818                         ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
2819                     }
2820                     return ZeroOrSignExtnReqd;
2821                 }
2822                 break;
2823             }
2824
2825             default:
2826                 break;
2827         }
2828         return false;
2829     }
2830 #endif // FEATURE_SIMD
2831     else
2832     {
2833         return false;
2834     }
2835 }
2836 #endif // _TARGET_X86_
2837
2838 #endif // _TARGET_XARCH_
2839
2840 #endif // !LEGACY_BACKEND