Set isInternalRegDelayFree for several of the x86 hwintrinsics
[platform/upstream/coreclr.git] / src/jit/lsraxarch.cpp
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7 XX                                                                           XX
8 XX                    Register Requirements for AMD64                        XX
9 XX                                                                           XX
10 XX  This encapsulates all the logic for setting register requirements for    XX
11 XX  the AMD64 architecture.                                                  XX
12 XX                                                                           XX
13 XX                                                                           XX
14 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
15 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
16 */
17
18 #include "jitpch.h"
19 #ifdef _MSC_VER
20 #pragma hdrstop
21 #endif
22
23 #ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator
24
25 #ifdef _TARGET_XARCH_
26
27 #include "jit.h"
28 #include "sideeffects.h"
29 #include "lower.h"
30
31 //------------------------------------------------------------------------
32 // BuildNode: Set register requirements for a node
33 //
34 // Arguments:
35 //    treeNode - the node of interest
36 //
37 // Notes:
38 // Preconditions:
39 //    LSRA has been initialized and there is a TreeNodeInfo node
40 //    already allocated and initialized for every tree in the IR.
41 // Postconditions:
42 //    Every TreeNodeInfo instance has the right annotations on register
43 //    requirements needed by LSRA to build the Interval Table (source,
44 //    destination and internal [temp] register counts).
45 //
46 void LinearScan::BuildNode(GenTree* tree)
47 {
48     TreeNodeInfo* info = currentNodeInfo;
49     assert(!tree->isContained());
50
51     if (tree->IsValue())
52     {
53         info->dstCount = 1;
54         if (tree->IsUnusedValue())
55         {
56             info->isLocalDefUse = true;
57         }
58     }
59     else
60     {
61         info->dstCount = 0;
62     }
63
64     // A floating-point type generates AVX instructions (vmovss etc.) when AVX is available, so set the flag.
65     SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet()));
66     switch (tree->OperGet())
67     {
68         default:
69             BuildSimple(tree);
70             break;
71
72         case GT_LCL_VAR:
73             // Because we do containment analysis before we redo dataflow and identify register
74             // candidates, the containment analysis only uses !lvDoNotEnregister to estimate register
75             // candidates.
76             // If a lclVar that was estimated to be a register candidate turns out not to be one,
77             // and it was marked regOptional, it should now be marked contained instead.
78             // TODO-XArch-CQ: When this is being called while RefPositions are being created,
79             // use lvLRACandidate here instead.
80             if (tree->IsRegOptional())
81             {
82                 if (!compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvTracked ||
83                     compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvDoNotEnregister)
84                 {
85                     tree->ClearRegOptional();
86                     tree->SetContained();
87                     info->dstCount = 0;
88                     return;
89                 }
90             }
91             __fallthrough;
92
93         case GT_LCL_FLD:
94             info->srcCount = 0;
95
96 #ifdef FEATURE_SIMD
97             // Need an additional register to read upper 4 bytes of Vector3.
98             if (tree->TypeGet() == TYP_SIMD12)
99             {
100                 // We need an internal register different from targetReg in which 'tree' produces its result
101                 // because both targetReg and internal reg will be in use at the same time.
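                // For illustration only -- a rough sketch of the kind of sequence this enables
                // (the exact instructions emitted by codegen may differ):
                //     movsd  xmm0, qword ptr [addr]     ; lower 8 bytes -> targetReg
                //     movss  xmm1, dword ptr [addr+8]   ; upper 4 bytes -> internal reg
                //     shufps xmm0, xmm1, 0x44           ; combine; both regs live at once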
102                 info->internalFloatCount     = 1;
103                 info->isInternalRegDelayFree = true;
104                 info->setInternalCandidates(this, allSIMDRegs());
105             }
106 #endif
107             break;
108
109         case GT_STORE_LCL_FLD:
110         case GT_STORE_LCL_VAR:
111             BuildStoreLoc(tree->AsLclVarCommon());
112             break;
113
114         case GT_LIST:
115         case GT_FIELD_LIST:
116         case GT_ARGPLACE:
117         case GT_NO_OP:
118         case GT_START_NONGC:
119         case GT_PROF_HOOK:
120             info->srcCount = 0;
121             assert(info->dstCount == 0);
122             break;
123
124         case GT_CNS_DBL:
125             info->srcCount = 0;
126             assert(info->dstCount == 1);
127             break;
128
129 #if !defined(_TARGET_64BIT_)
130
131         case GT_LONG:
132             assert(tree->IsUnusedValue()); // Contained nodes are already processed, only unused GT_LONG can reach here.
133             // An unused GT_LONG node needs to consume its sources, but need not produce a register.
134             tree->gtType = TYP_VOID;
135             tree->ClearUnusedValue();
136             info->isLocalDefUse = false;
137             info->srcCount      = 2;
138             info->dstCount      = 0;
139             appendLocationInfoToList(tree->gtGetOp1());
140             appendLocationInfoToList(tree->gtGetOp2());
141             break;
142
143 #endif // !defined(_TARGET_64BIT_)
144
145         case GT_BOX:
146         case GT_COMMA:
147         case GT_QMARK:
148         case GT_COLON:
149             info->srcCount = 0;
150             assert(info->dstCount == 0);
151             unreached();
152             break;
153
154         case GT_RETURN:
155             BuildReturn(tree);
156             break;
157
158         case GT_RETFILT:
159             assert(info->dstCount == 0);
160             if (tree->TypeGet() == TYP_VOID)
161             {
162                 info->srcCount = 0;
163             }
164             else
165             {
166                 assert(tree->TypeGet() == TYP_INT);
167
168                 info->srcCount = 1;
169
170                 info->setSrcCandidates(this, RBM_INTRET);
171                 LocationInfoListNode* locationInfo = getLocationInfo(tree->gtOp.gtOp1);
172                 locationInfo->info.setSrcCandidates(this, RBM_INTRET);
173                 useList.Append(locationInfo);
174             }
175             break;
176
177         // A GT_NOP is a passthrough if it is void or if it has
178         // a child, but it must be considered to produce a dummy value if it
179         // has a type but no child.
180         case GT_NOP:
181             info->srcCount = 0;
182             assert((tree->gtOp.gtOp1 == nullptr) || tree->isContained());
183             if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
184             {
185                 assert(info->dstCount == 1);
186             }
187             else
188             {
189                 assert(info->dstCount == 0);
190             }
191             break;
192
193         case GT_JTRUE:
194         {
195             info->srcCount = 0;
196             assert(info->dstCount == 0);
197             GenTree* cmp = tree->gtGetOp1();
198             assert(!cmp->IsValue());
199         }
200         break;
201
202         case GT_JCC:
203             info->srcCount = 0;
204             assert(info->dstCount == 0);
205             break;
206
207         case GT_SETCC:
208             info->srcCount = 0;
209             assert(info->dstCount == 1);
210 #ifdef _TARGET_X86_
211             info->setDstCandidates(this, RBM_BYTE_REGS);
212 #endif // _TARGET_X86_
213             break;
214
215         case GT_JMP:
216             info->srcCount = 0;
217             assert(info->dstCount == 0);
218             break;
219
220         case GT_SWITCH:
221             // This should never occur since switch nodes must not be visible at this
222             // point in the JIT.
223             info->srcCount = 0;
224             noway_assert(!"Switch must be lowered at this point");
225             break;
226
227         case GT_JMPTABLE:
228             info->srcCount = 0;
229             assert(info->dstCount == 1);
230             break;
231
232         case GT_SWITCH_TABLE:
233             info->internalIntCount = 1;
234             assert(info->dstCount == 0);
235             info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
236             assert(info->srcCount == 2);
237             break;
238
239         case GT_ASG:
240             noway_assert(!"We should never hit any assignment operator in lowering");
241             info->srcCount = 0;
242             break;
243
244 #if !defined(_TARGET_64BIT_)
245         case GT_ADD_LO:
246         case GT_ADD_HI:
247         case GT_SUB_LO:
248         case GT_SUB_HI:
249 #endif
250         case GT_ADD:
251         case GT_SUB:
252         case GT_AND:
253         case GT_OR:
254         case GT_XOR:
255         case GT_BT:
256             info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
257             break;
258
259         case GT_RETURNTRAP:
260             // This just turns into a compare of its child with an int + a conditional call.
261             info->srcCount = GetOperandInfo(tree->gtOp.gtOp1);
262             assert(info->dstCount == 0);
263             info->internalIntCount = 1;
264             info->setInternalCandidates(this, allRegs(TYP_INT));
265             break;
266
267         case GT_MOD:
268         case GT_DIV:
269         case GT_UMOD:
270         case GT_UDIV:
271             BuildModDiv(tree->AsOp());
272             break;
273
274         case GT_MUL:
275         case GT_MULHI:
276 #if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND)
277         case GT_MUL_LONG:
278 #endif
279             BuildMul(tree->AsOp());
280             break;
281
282         case GT_INTRINSIC:
283             BuildIntrinsic(tree->AsOp());
284             break;
285
286 #ifdef FEATURE_SIMD
287         case GT_SIMD:
288             BuildSIMD(tree->AsSIMD());
289             break;
290 #endif // FEATURE_SIMD
291
292 #ifdef FEATURE_HW_INTRINSICS
293         case GT_HWIntrinsic:
294             BuildHWIntrinsic(tree->AsHWIntrinsic());
295             break;
296 #endif // FEATURE_HW_INTRINSICS
297
298         case GT_CAST:
299             BuildCast(tree);
300             break;
301
302         case GT_BITCAST:
303         {
304             LocationInfoListNode* locationInfo = getLocationInfo(tree->gtOp.gtOp1);
305             locationInfo->info.isTgtPref       = true;
306             useList.Append(locationInfo);
307             info->srcCount = 1;
308             info->dstCount = 1;
309         }
310         break;
311
312         case GT_NEG:
313             info->srcCount = GetOperandInfo(tree->gtOp.gtOp1);
314
315             // TODO-XArch-CQ:
316             // SSE instruction set doesn't have an instruction to negate a number.
317             // The recommended way is to xor the float/double number with a bitmask.
318             // The only way to xor is using xorps or xorpd both of which operate on
319             // 128-bit operands.  To hold the bit-mask we would need another xmm
320             // register or a 16-byte aligned 128-bit data constant. Right now emitter
321             // lacks the support for emitting such constants or instruction with mem
322             // addressing mode referring to a 128-bit operand. For now we use an
323             // internal xmm register to load 32/64-bit bitmask from data section.
324             // Note that by trading additional data section memory (128-bit) we can
325             // save on the need for an internal register and also a memory-to-reg
326             // move.
327             //
328             // Note: another option to avoid internal register requirement is by
329             // lowering as GT_SUB(0, src).  This will generate code different from
330             // Jit64 and could possibly result in compat issues (?).
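            // For illustration, a sketch of the sequence the internal register enables,
            // assuming the mask constant lives in the data section (names are examples):
            //     movss xmm1, dword ptr [negBitmask]   ; 0x80000000 sign-bit mask -> internal reg
            //     xorps xmm0, xmm1                     ; flip the sign bit of the result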
331             if (varTypeIsFloating(tree))
332             {
333                 info->internalFloatCount = 1;
334                 info->setInternalCandidates(this, internalFloatRegCandidates());
335             }
336             break;
337
338         case GT_NOT:
339             info->srcCount = GetOperandInfo(tree->gtOp.gtOp1);
340             break;
341
342         case GT_LSH:
343         case GT_RSH:
344         case GT_RSZ:
345         case GT_ROL:
346         case GT_ROR:
347 #ifdef _TARGET_X86_
348         case GT_LSH_HI:
349         case GT_RSH_LO:
350 #endif
351             (void)BuildShiftRotate(tree);
352             break;
353
354         case GT_EQ:
355         case GT_NE:
356         case GT_LT:
357         case GT_LE:
358         case GT_GE:
359         case GT_GT:
360         case GT_TEST_EQ:
361         case GT_TEST_NE:
362         case GT_CMP:
363             BuildCmp(tree);
364             break;
365
366         case GT_CKFINITE:
367             appendLocationInfoToList(tree->gtOp.gtOp1);
368             info->srcCount = 1;
369             assert(info->dstCount == 1);
370             info->internalIntCount = 1;
371             break;
372
373         case GT_CMPXCHG:
374         {
375             info->srcCount = 3;
376             assert(info->dstCount == 1);
377
378             // comparand is preferenced to RAX.
379             // Remaining two operands can be in any reg other than RAX.
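            // An illustrative sketch of the pattern these constraints support
            // (register names other than RAX are examples only):
            //     mov  eax, comparand        ; comparand is in RAX
            //     lock cmpxchg [rbx], ecx    ; location/value must avoid RAX
            //     ; RAX then holds the original value, hence RAX as the dst candidate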
380             LocationInfoListNode* locationInfo = getLocationInfo(tree->gtCmpXchg.gtOpLocation);
381             locationInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~RBM_RAX);
382             useList.Append(locationInfo);
383             LocationInfoListNode* valueInfo = getLocationInfo(tree->gtCmpXchg.gtOpValue);
384             valueInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~RBM_RAX);
385             useList.Append(valueInfo);
386             info->setDstCandidates(this, RBM_RAX);
387             LocationInfoListNode* comparandInfo = getLocationInfo(tree->gtCmpXchg.gtOpComparand);
388             comparandInfo->info.setSrcCandidates(this, RBM_RAX);
389             useList.Append(comparandInfo);
390         }
391         break;
392
393         case GT_LOCKADD:
394             info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
395             assert(info->dstCount == ((tree->TypeGet() == TYP_VOID) ? 0 : 1));
396             break;
397
398         case GT_PUTARG_REG:
399             BuildPutArgReg(tree->AsUnOp());
400             break;
401
402         case GT_CALL:
403             BuildCall(tree->AsCall());
404             break;
405
406         case GT_ADDR:
407         {
408             // For a GT_ADDR, the child node should not be evaluated into a register
409             GenTree* child = tree->gtOp.gtOp1;
410             assert(!isCandidateLocalRef(child));
411             assert(child->isContained());
412             assert(info->dstCount == 1);
413             info->srcCount = 0;
414         }
415         break;
416
417 #if !defined(FEATURE_PUT_STRUCT_ARG_STK)
418         case GT_OBJ:
419 #endif
420         case GT_BLK:
421         case GT_DYN_BLK:
422             // These should all be eliminated prior to Lowering.
423             assert(!"Non-store block node in Lowering");
424             info->srcCount = 0;
425             break;
426
427 #ifdef FEATURE_PUT_STRUCT_ARG_STK
428         case GT_PUTARG_STK:
429             BuildPutArgStk(tree->AsPutArgStk());
430             break;
431 #endif // FEATURE_PUT_STRUCT_ARG_STK
432
433         case GT_STORE_BLK:
434         case GT_STORE_OBJ:
435         case GT_STORE_DYN_BLK:
436             BuildBlockStore(tree->AsBlk());
437             break;
438
439         case GT_INIT_VAL:
440             // Always a passthrough of its child's value.
441             assert(!"INIT_VAL should always be contained");
442             break;
443
444         case GT_LCLHEAP:
445             BuildLclHeap(tree);
446             break;
447
448         case GT_ARR_BOUNDS_CHECK:
449 #ifdef FEATURE_SIMD
450         case GT_SIMD_CHK:
451 #endif // FEATURE_SIMD
452 #ifdef FEATURE_HW_INTRINSICS
453         case GT_HW_INTRINSIC_CHK:
454 #endif // FEATURE_HW_INTRINSICS
455             // Consumes arrLen & index - has no result
457             assert(info->dstCount == 0);
458             info->srcCount = GetOperandInfo(tree->AsBoundsChk()->gtIndex);
459             info->srcCount += GetOperandInfo(tree->AsBoundsChk()->gtArrLen);
460             break;
461
462         case GT_ARR_ELEM:
463             // These must have been lowered to GT_ARR_INDEX
464             noway_assert(!"We should never see a GT_ARR_ELEM after Lowering.");
465             info->srcCount = 0;
466             break;
467
468         case GT_ARR_INDEX:
469         {
470             info->srcCount = 2;
471             assert(info->dstCount == 1);
472             assert(!tree->AsArrIndex()->ArrObj()->isContained());
473             assert(!tree->AsArrIndex()->IndexExpr()->isContained());
474             // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
475             // times while the result is being computed.
476             LocationInfoListNode* arrObjInfo = getLocationInfo(tree->AsArrIndex()->ArrObj());
477             arrObjInfo->info.isDelayFree     = true;
478             useList.Append(arrObjInfo);
479             useList.Append(getLocationInfo(tree->AsArrIndex()->IndexExpr()));
480             info->hasDelayFreeSrc = true;
481         }
482         break;
483
484         case GT_ARR_OFFSET:
485             // This consumes the offset, if any, the arrObj and the effective index,
486             // and produces the flattened offset for this dimension.
487             assert(info->dstCount == 1);
488             if (tree->gtArrOffs.gtOffset->isContained())
489             {
490                 info->srcCount = 2;
491             }
492             else
493             {
494                 // Here we simply need an internal register, which must be different
495                 // from any of the operand's registers, but may be the same as targetReg.
496                 info->srcCount         = 3;
497                 info->internalIntCount = 1;
498                 appendLocationInfoToList(tree->AsArrOffs()->gtOffset);
499             }
500             appendLocationInfoToList(tree->AsArrOffs()->gtIndex);
501             appendLocationInfoToList(tree->AsArrOffs()->gtArrObj);
502             break;
503
504         case GT_LEA:
505             // The LEA usually passes its operands through to the GT_IND, in which case it will
506             // be contained, but we may be instantiating an address, in which case we set them here.
507             info->srcCount = 0;
508             assert(info->dstCount == 1);
509             if (tree->AsAddrMode()->HasBase())
510             {
511                 info->srcCount++;
512                 appendLocationInfoToList(tree->AsAddrMode()->Base());
513             }
514             if (tree->AsAddrMode()->HasIndex())
515             {
516                 info->srcCount++;
517                 appendLocationInfoToList(tree->AsAddrMode()->Index());
518             }
519             break;
520
521         case GT_STOREIND:
522             if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
523             {
524                 BuildGCWriteBarrier(tree);
525                 break;
526             }
527             BuildIndir(tree->AsIndir());
528             break;
529
530         case GT_NULLCHECK:
531             assert(info->dstCount == 0);
532             appendLocationInfoToList(tree->gtOp.gtOp1);
533             info->srcCount = 1;
534             break;
535
536         case GT_IND:
537             BuildIndir(tree->AsIndir());
538             assert(info->dstCount == 1);
539             break;
540
541         case GT_CATCH_ARG:
542             info->srcCount = 0;
543             assert(info->dstCount == 1);
544             info->setDstCandidates(this, RBM_EXCEPTION_OBJECT);
545             break;
546
547 #if !FEATURE_EH_FUNCLETS
548         case GT_END_LFIN:
549             info->srcCount = 0;
550             assert(info->dstCount == 0);
551             break;
552 #endif
553
554         case GT_CLS_VAR:
555             // These nodes are eliminated by rationalizer.
556             JITDUMP("Unexpected node %s in Lower.\n", GenTree::OpName(tree->OperGet()));
557             unreached();
558             break;
559
560         case GT_INDEX_ADDR:
561             assert(info->dstCount == 1);
562             info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
563
564             if (tree->AsIndexAddr()->Index()->TypeGet() == TYP_I_IMPL)
565             {
566                 info->internalIntCount = 1;
567             }
568             else
569             {
570                 switch (tree->AsIndexAddr()->gtElemSize)
571                 {
572                     case 1:
573                     case 2:
574                     case 4:
575                     case 8:
576                         break;
577
578                     default:
579                         info->internalIntCount = 1;
580                         break;
581                 }
582             }
583             break;
584     } // end switch (tree->OperGet())
585
586     // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
587     // Even then we would like to set isTgtPref on Op1.
588     if (tree->OperIsBinary() && info->srcCount >= 1)
589     {
590         if (isRMWRegOper(tree))
591         {
592             GenTree* op1 = tree->gtOp.gtOp1;
593             GenTree* op2 = tree->gtOp.gtOp2;
594
595             // Commutative opers like add/mul/and/or/xor could reverse the order of
596             // operands if it is safe to do so.  In such a case we would like op2 to be
597             // target preferenced instead of op1.
598             if (tree->OperIsCommutative() && op1->isContained() && op2 != nullptr)
599             {
600                 op1 = op2;
601                 op2 = tree->gtOp.gtOp1;
602             }
603
604             // If we have a read-modify-write operation, we want to preference op1 to the target,
605             // if it is not contained.
606             if (!op1->isContained() && !op1->OperIs(GT_LIST))
607             {
608                 useList.GetTreeNodeInfo(op1).isTgtPref = true;
609             }
610
611             // Is this a non-commutative operator, or is op2 a contained memory op?
612             // In either case, we need to make op2 remain live until the op is complete, by marking
613             // the source(s) associated with op2 as "delayFree" if this node defines a register.
614             // Note that if op2 of a binary RMW operator is a memory op, even if the operator
615             // is commutative, codegen cannot reverse them.
616             // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
617             // more work to be done to correctly reverse the operands if they involve memory
618             // operands.  Also, we may need to handle more cases than GT_IND, especially once
619             // we've modified the register allocator to not require all nodes to be assigned
620             // a register (e.g. a spilled lclVar can often be referenced directly from memory).
621             // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
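            // For example (an illustrative sketch), for a non-commutative RMW op with a
            // contained memory op2:
            //     sub eax, dword ptr [rbx+rcx]   ; rbx/rcx must stay live until here
            // the base/index registers of op2 must not be reassigned to the target, which
            // is what marking op2's source(s) as "delayFree" ensures.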
622
623             GenTree* delayUseSrc = nullptr;
624             // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
625             // to special case them.
626             if (tree->OperGet() == GT_XADD || tree->OperGet() == GT_XCHG || tree->OperGet() == GT_LOCKADD)
627             {
628                 // These tree nodes will have their op1 marked as isDelayFree=true.
629                 // Hence these tree nodes should have a Def position so that op1's reg
630                 // gets freed at DefLoc+1.
631                 if (tree->TypeGet() == TYP_VOID)
632                 {
633                     // Right now a GT_XADD node could be morphed into a
634                     // GT_LOCKADD of TYP_VOID. See gtExtractSideEffList().
635                     // Note that it is advantageous to use GT_LOCKADD
636                     // instead of GT_XADD as the former uses lock.add,
637                     // which allows its second operand to be a contained
638                     // immediate whereas the xadd instruction requires its
639                     // second operand to be in a register.
640                     assert(info->dstCount == 0);
641
642                     // Give it an artificial type and mark it as an unused value.
643                     // This results in a Def position created but not considered consumed by its parent node.
644                     tree->gtType        = TYP_INT;
645                     info->dstCount      = 1;
646                     info->isLocalDefUse = true;
647                     tree->SetUnusedValue();
648                 }
649                 else
650                 {
651                     assert(info->dstCount != 0);
652                 }
653
654                 delayUseSrc = op1;
655             }
656             else if ((info->dstCount != 0) && (op2 != nullptr) &&
657                      (!tree->OperIsCommutative() || (op2->isContained() && !op2->IsCnsIntOrI())))
658             {
659                 delayUseSrc = op2;
660             }
661             if ((delayUseSrc != nullptr) && CheckAndSetDelayFree(delayUseSrc))
662             {
663                 info->hasDelayFreeSrc = true;
664             }
665         }
666     }
667
668     BuildCheckByteable(tree);
669
670     // We need to be sure that we've set info->srcCount and info->dstCount appropriately
671     assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));
672     assert(info->isLocalDefUse == (tree->IsValue() && tree->IsUnusedValue()));
673     assert(!tree->IsUnusedValue() || (info->dstCount != 0));
674     assert(info->dstCount == tree->GetRegisterDstCount());
675 }
676
677 //---------------------------------------------------------------------
678 // CheckAndSetDelayFree - Set isDelayFree on the given operand or its child(ren), if appropriate
679 //
680 // Arguments
681 //    delayUseSrc - a node that may have a delayed use
682 //
683 // Return Value:
684 //    True iff the node or one of its children has been marked isDelayFree
685 //
686 // Notes:
687 //    Only register operands should be marked isDelayFree, not contained immediates or memory.
688 //
689 bool LinearScan::CheckAndSetDelayFree(GenTree* delayUseSrc)
690 {
691     // If delayUseSrc is an indirection and it doesn't produce a result, then we need to set 'delayFree'
692     // on the base & index, if any.
693     // Otherwise, we set it on delayUseSrc itself.
694     bool returnValue = false;
695     if (delayUseSrc->isContained())
696     {
697         // If delayUseSrc is a non-Indir contained node (e.g. a local) there's no register use to delay.
698         if (delayUseSrc->isIndir())
699         {
700             GenTree* base  = delayUseSrc->AsIndir()->Base();
701             GenTree* index = delayUseSrc->AsIndir()->Index();
702             if ((base != nullptr) && !base->isContained())
703             {
704                 useList.GetTreeNodeInfo(base).isDelayFree = true;
705                 returnValue                               = true;
706             }
707             if (index != nullptr)
708             {
709                 assert(!index->isContained());
710                 useList.GetTreeNodeInfo(index).isDelayFree = true;
711                 returnValue                                = true;
712             }
713         }
714     }
715     else
716     {
717         useList.GetTreeNodeInfo(delayUseSrc).isDelayFree = true;
718         returnValue                                      = true;
719     }
720     return returnValue;
721 }
722
723 //------------------------------------------------------------------------
724 // BuildCheckByteable: Check the tree to see if "byte-able" registers are
725 // required, and set the tree node info accordingly.
726 //
727 // Arguments:
728 //    tree      - The node of interest
729 //
730 // Return Value:
731 //    None.
732 //
733 void LinearScan::BuildCheckByteable(GenTree* tree)
734 {
735 #ifdef _TARGET_X86_
736     TreeNodeInfo* info = currentNodeInfo;
737     // Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands
738     // if the tree node is a byte type.
739     //
740     // Though this looks conservative in theory, in practice we could not think of a case where
741     // the logic below leads to an overly conservative register specification.  If we ever find
742     // such a case, this logic will need to be fine-tuned for it.
743
744     if (ExcludeNonByteableRegisters(tree))
745     {
746         regMaskTP regMask;
747         if (info->dstCount > 0)
748         {
749             regMask = info->getDstCandidates(this);
750             assert(regMask != RBM_NONE);
751             info->setDstCandidates(this, regMask & ~RBM_NON_BYTE_REGS);
752         }
753
754         if (tree->OperIsSimple())
755         {
756             GenTree* op = tree->gtOp.gtOp1;
757             if (op != nullptr)
758             {
759                 // No need to set src candidates on a contained child operand.
760                 if (!op->isContained())
761                 {
762                     TreeNodeInfo& op1Info = useList.GetTreeNodeInfo(op);
763                     regMask               = op1Info.getSrcCandidates(this);
764                     assert(regMask != RBM_NONE);
765                     op1Info.setSrcCandidates(this, regMask & ~RBM_NON_BYTE_REGS);
766                 }
767             }
768
769             if (tree->OperIsBinary() && (tree->gtOp.gtOp2 != nullptr))
770             {
771                 op = tree->gtOp.gtOp2;
772                 if (!op->isContained())
773                 {
774                     TreeNodeInfo& op2Info = useList.GetTreeNodeInfo(op);
775                     regMask               = op2Info.getSrcCandidates(this);
776                     assert(regMask != RBM_NONE);
777                     op2Info.setSrcCandidates(this, regMask & ~RBM_NON_BYTE_REGS);
778                 }
779             }
780         }
781     }
782 #endif //_TARGET_X86_
783 }
784
785 //------------------------------------------------------------------------------
786 // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format
787 //
788 // Arguments:
789 //    tree      - a binary tree node
790 //
791 // Return Value:
792 //    Returns true if we can use the read-modify-write instruction form
793 //
794 // Notes:
795 //    This is used to determine whether to preference the source to the destination register.
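//    For example, "add eax, ecx" computes eax = eax + ecx, destroying op1, so op1 is
//    preferenced to the target register; by contrast, "lea eax, [rcx+rdx*4]" writes a
//    third register and leaves both sources intact, so GT_LEA is not RMW.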
796 //
797 bool LinearScan::isRMWRegOper(GenTree* tree)
798 {
799     // TODO-XArch-CQ: Make this more accurate.
800     // For now, we assume that most binary operators are of the RMW form.
801     assert(tree->OperIsBinary());
802
803     if (tree->OperIsCompare() || tree->OperIs(GT_CMP))
804     {
805         return false;
806     }
807
808     switch (tree->OperGet())
809     {
810         // These Opers either support a three op form (i.e. GT_LEA), or do not read/write their first operand
811         case GT_LEA:
812         case GT_STOREIND:
813         case GT_ARR_INDEX:
814         case GT_STORE_BLK:
815         case GT_STORE_OBJ:
816             return false;
817
818         // x86/x64 does support a three-operand multiply when op1 or op2 is a contained immediate
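        // For example (an illustrative sketch): "imul eax, dword ptr [mem], 12" is the
        // non-RMW three-operand form, whereas "imul eax, ecx" is the RMW two-operand form.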
819         case GT_MUL:
820             return (!tree->gtOp.gtOp2->isContainedIntOrIImmed() && !tree->gtOp.gtOp1->isContainedIntOrIImmed());
821
822         default:
823             return true;
824     }
825 }
826
827 //------------------------------------------------------------------------
828 // BuildShiftRotate: Set the NodeInfo for a shift or rotate.
829 //
830 // Arguments:
831 //    tree      - The node of interest
832 //
833 // Return Value:
834 //    The number of source registers used by the shift/rotate.
835 //
836 int LinearScan::BuildShiftRotate(GenTree* tree)
837 {
838     TreeNodeInfo* info = currentNodeInfo;
839     // For shift operations, the shift count must be
840     // placed in CL whenever the number of bits to
841     // shift is not a constant.
842     int                   srcCount    = 0;
843     GenTree*              shiftBy     = tree->gtOp.gtOp2;
844     GenTree*              source      = tree->gtOp.gtOp1;
845     LocationInfoListNode* shiftByInfo = nullptr;
846     // x64 can encode 8 bits of shift count, but the hardware uses only the low 5 (or 6) bits; the others are masked off.
847     // We will allow whatever can be encoded - hope you know what you are doing.
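    // For illustration, the expected pattern with a non-constant shift count is roughly:
    //     mov ecx, shiftBy   ; shift count constrained to RCX
    //     shl eax, cl        ; so the target must avoid RCX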
848     if (shiftBy->isContained())
849     {
850         srcCount += GetOperandInfo(source);
851     }
852     else
853     {
854         srcCount++;
855         shiftByInfo = getLocationInfo(shiftBy);
856         shiftByInfo->info.setSrcCandidates(this, RBM_RCX);
857         info->setDstCandidates(this, allRegs(TYP_INT) & ~RBM_RCX);
858         LocationInfoListNode* sourceInfo;
859         srcCount += GetOperandInfo(source, &sourceInfo);
860         for (; sourceInfo != nullptr; sourceInfo = sourceInfo->Next())
861         {
862             sourceInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~RBM_RCX);
863         }
864     }
865
866     // Note that Rotate Left/Right instructions don't set ZF and SF flags.
867     //
868     // If the operand being shifted is 32-bits then upper three bits are masked
869     // by hardware to get actual shift count.  Similarly for 64-bit operands
870     // shift count is narrowed to [0..63].  If the resulting shift count is zero,
871     // then shift operation won't modify flags.
872     //
873     // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
874     // if the shift count is known to be non-zero and in the range depending on the
875     // operand size.
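    // For example (hypothetical): for GT_NE(x << 3, 0), "shl eax, 3" already sets ZF
    // (the constant count 3 is non-zero), so the following "test eax, eax" could be elided.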
876     CLANG_FORMAT_COMMENT_ANCHOR;
877
878 #ifdef _TARGET_X86_
879     // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
880     // we can have a three operand form. Increment the srcCount.
881     if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
882     {
883         assert((source->OperGet() == GT_LONG) && source->isContained());
884
885         GenTree*              sourceLo     = source->gtOp.gtOp1;
886         LocationInfoListNode* sourceLoInfo = useList.Begin();
887         LocationInfoListNode* sourceHiInfo = useList.GetSecond(INDEBUG(source->gtGetOp2()));
888
889         info->hasDelayFreeSrc = true;
890         if (tree->OperGet() == GT_LSH_HI)
891         {
892             sourceLoInfo->info.isDelayFree = true;
893         }
894         else
895         {
896             sourceHiInfo->info.isDelayFree = true;
897         }
898     }
899 #endif
900     if (shiftByInfo != nullptr)
901     {
902         if (tree->IsReverseOp())
903         {
904             useList.Prepend(shiftByInfo);
905         }
906         else
907         {
908             useList.Append(shiftByInfo);
909         }
910     }
911     if (!tree->isContained())
912     {
913         info->srcCount = srcCount;
914     }
915     return srcCount;
916 }
917
918 //------------------------------------------------------------------------
919 // BuildCall: Set the NodeInfo for a call.
920 //
921 // Arguments:
922 //    call      - The call node of interest
923 //
924 // Return Value:
925 //    None.
926 //
927 void LinearScan::BuildCall(GenTreeCall* call)
928 {
929     TreeNodeInfo*   info              = currentNodeInfo;
930     bool            hasMultiRegRetVal = false;
931     ReturnTypeDesc* retTypeDesc       = nullptr;
932
933     assert(!call->isContained());
934     info->srcCount = 0;
935     if (call->TypeGet() != TYP_VOID)
936     {
937         hasMultiRegRetVal = call->HasMultiRegRetVal();
938         if (hasMultiRegRetVal)
939         {
940             // dst count = number of registers in which the value is returned by call
941             retTypeDesc    = call->GetReturnTypeDesc();
942             info->dstCount = retTypeDesc->GetReturnRegCount();
943         }
944         else
945         {
946             assert(info->dstCount == 1);
947         }
948     }
949     else
950     {
951         assert(info->dstCount == 0);
952     }
953
954     GenTree*              ctrlExpr     = call->gtControlExpr;
956     if (call->gtCallType == CT_INDIRECT)
957     {
958         ctrlExpr = call->gtCallAddr;
959     }
960
961     // If this is a varargs call, we will clear the internal candidates in case we need
962     // to reserve some integer registers for copying float args.
963     // We have to do this because otherwise the default candidates are allRegs, and adding
964     // the individual specific registers will have no effect.
965     if (call->IsVarargs())
966     {
967         info->setInternalCandidates(this, RBM_NONE);
968     }
969
970     RegisterType registerType = call->TypeGet();
971
972     // Set destination candidates for return value of the call.
973     CLANG_FORMAT_COMMENT_ANCHOR;
974
975 #ifdef _TARGET_X86_
976     if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
977     {
978         // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
979         // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
980         // correct argument registers.
981         info->setDstCandidates(this, RBM_PINVOKE_TCB);
982     }
983     else
984 #endif // _TARGET_X86_
985         if (hasMultiRegRetVal)
986     {
987         assert(retTypeDesc != nullptr);
988         info->setDstCandidates(this, retTypeDesc->GetABIReturnRegs());
989     }
990     else if (varTypeIsFloating(registerType))
991     {
992 #ifdef _TARGET_X86_
993         // The return value will be on the X87 stack, and we will need to move it.
994         info->setDstCandidates(this, allRegs(registerType));
995 #else  // !_TARGET_X86_
996         info->setDstCandidates(this, RBM_FLOATRET);
997 #endif // !_TARGET_X86_
998     }
999     else if (registerType == TYP_LONG)
1000     {
1001         info->setDstCandidates(this, RBM_LNGRET);
1002     }
1003     else
1004     {
1005         info->setDstCandidates(this, RBM_INTRET);
1006     }
1007
1008     // The number of args to a call =
1009     // callRegArgs + (call args - placeholders, setup, etc.);
1010     // there is an explicit thisPtr, but it is redundant.
1011
1012     bool callHasFloatRegArgs = false;
1013     bool isVarArgs           = call->IsVarargs();
1014
1015     // First, count reg args
1016     for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
1017     {
1018         assert(list->OperIsList());
1019
1020         // By this point, lowering has ensured that all call arguments are one of the following:
1021         // - an arg setup store
1022         // - an arg placeholder
1023         // - a nop
1024         // - a copy blk
1025         // - a field list
1026         // - a put arg
1027         //
1028         // Note that this property is statically checked by LinearScan::CheckBlock.
1029         GenTree* argNode = list->Current();
1030
1031         // Each register argument corresponds to one source.
1032         if (argNode->OperIsPutArgReg())
1033         {
1034             info->srcCount++;
1035             HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs);
1036             appendLocationInfoToList(argNode);
1037         }
1038 #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
1039         else if (argNode->OperGet() == GT_FIELD_LIST)
1040         {
1041             for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1042             {
1043                 assert(entry->Current()->OperIsPutArgReg());
1044                 info->srcCount++;
1045                 HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs);
1046                 appendLocationInfoToList(entry->Current());
1047             }
1048         }
1049 #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
1050
1051 #ifdef DEBUG
1052         // In DEBUG only, check validity with respect to the arg table entry.
1053
1054         fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
1055         assert(curArgTabEntry);
1056
1057         if (curArgTabEntry->regNum == REG_STK)
1058         {
1059             // late arg that is not passed in a register
1060             assert(argNode->gtOper == GT_PUTARG_STK);
1061
1062 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1063             // If the node is TYP_STRUCT and it is put on stack with
1064             // putarg_stk operation, we consume and produce no registers.
1065             // In this case the embedded Obj node should not produce
1066             // registers too since it is contained.
1067             // Note that if it is a SIMD type the argument will be in a register.
1068             if (argNode->TypeGet() == TYP_STRUCT)
1069             {
1070                 assert(argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_OBJ);
1071                 assert(argNode->gtOp.gtOp1->isContained());
1072             }
1073 #endif // FEATURE_PUT_STRUCT_ARG_STK
1074             continue;
1075         }
1076 #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
1077         if (argNode->OperGet() == GT_FIELD_LIST)
1078         {
1079             assert(argNode->isContained());
1080             assert(varTypeIsStruct(argNode) || curArgTabEntry->isStruct);
1081
1082             int i = 0;
1083             for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1084             {
1085                 const regNumber argReg = (i == 0) ? curArgTabEntry->regNum : curArgTabEntry->otherRegNum;
1086                 assert(entry->Current()->gtRegNum == argReg);
1087                 assert(i < 2);
1088                 i++;
1089             }
1090         }
1091         else
1092 #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
1093         {
1094             const regNumber argReg = curArgTabEntry->regNum;
1095             assert(argNode->gtRegNum == argReg);
1096         }
1097 #endif // DEBUG
1098     }
1099
1100     // Now, count stack args
1101     // Note that these need to be computed into a register, but then
1102     // they're just stored to the stack - so the reg doesn't
1103     // need to remain live until the call.  In fact, it must not
1104     // because the code generator doesn't actually consider it live,
1105     // so it can't be spilled.
1106
1107     GenTree* args = call->gtCallArgs;
1108     while (args)
1109     {
1110         GenTree* arg = args->gtOp.gtOp1;
1111         if (!(arg->gtFlags & GTF_LATE_ARG))
1112         {
1113             if (arg->IsValue() && !arg->isContained())
1114             {
1115                 // argInfo->isLocalDefUse = true;
1116                 assert(arg->IsUnusedValue());
1117             }
1118             // assert(argInfo->dstCount == 0);
1119         }
1120         args = args->gtOp.gtOp2;
1121     }
1122
1123     // set reg requirements on call target represented as control sequence.
1124     if (ctrlExpr != nullptr)
1125     {
1126         LocationInfoListNode* ctrlExprInfo  = nullptr;
1127         int                   ctrlExprCount = GetOperandInfo(ctrlExpr);
1128         if (ctrlExprCount != 0)
1129         {
1130             assert(ctrlExprCount == 1);
1131             ctrlExprInfo = useList.Last();
1132             info->srcCount++;
1133         }
1134
1135         // In case of fast tail implemented as jmp, make sure that gtControlExpr is
1136         // computed into a register.
1137         if (call->IsFastTailCall())
1138         {
1139             assert(!ctrlExpr->isContained() && ctrlExprInfo != nullptr);
1140             // Fast tail call - make sure that call target is always computed in RAX
1141             // so that epilog sequence can generate "jmp rax" to achieve fast tail call.
1142             ctrlExprInfo->info.setSrcCandidates(this, RBM_RAX);
1143         }
1144 #ifdef _TARGET_X86_
1145         else if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
1146         {
1147             // On x86, we need to generate a very specific pattern for indirect VSD calls:
1148             //
1149             //    3-byte nop
1150             //    call dword ptr [eax]
1151             //
1152             // Where EAX is also used as an argument to the stub dispatch helper. Make
1153             // sure that the call target address is computed into EAX in this case.
1154             assert(ctrlExprInfo != nullptr);
1155             assert(ctrlExpr->isIndir() && ctrlExpr->isContained());
1156             ctrlExprInfo->info.setSrcCandidates(this, RBM_VIRTUAL_STUB_TARGET);
1157         }
1158 #endif // _TARGET_X86_
1159
1160 #if FEATURE_VARARG
1161         // If it is a fast tail call, it is already preferenced to use RAX.
1162         // Therefore, there is no need to set src candidates on the call target again.
1163         if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExprInfo != nullptr))
1164         {
1165             // Don't assign the call target to any of the argument registers because
1166             // we will use them to also pass floating point arguments as required
1167             // by Amd64 ABI.
1168             ctrlExprInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~(RBM_ARG_REGS));
1169         }
1170 #endif // FEATURE_VARARG
1171     }
1172 }
1173
1174 //------------------------------------------------------------------------
1175 // BuildBlockStore: Set the NodeInfo for a block store.
1176 //
1177 // Arguments:
1178 //    blkNode       - The block store node of interest
1179 //
1180 // Return Value:
1181 //    None.
1182 //
1183 void LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
1184 {
1185     TreeNodeInfo* info    = currentNodeInfo;
1186     GenTree*      dstAddr = blkNode->Addr();
1187     unsigned      size    = blkNode->gtBlkSize;
1188     GenTree*      source  = blkNode->Data();
1189
1190     LocationInfoListNode* dstAddrInfo = nullptr;
1191     LocationInfoListNode* sourceInfo  = nullptr;
1192     LocationInfoListNode* sizeInfo    = nullptr;
1193
1194     // Sources are dest address, initVal or source.
1195     // We may require an additional source or temp register for the size.
1196     if (!dstAddr->isContained())
1197     {
1198         info->srcCount++;
1199         dstAddrInfo = getLocationInfo(dstAddr);
1200     }
1201     assert(info->dstCount == 0);
1202     info->setInternalCandidates(this, RBM_NONE);
1203     GenTree* srcAddrOrFill = nullptr;
1204     bool     isInitBlk     = blkNode->OperIsInitBlkOp();
1205
1206     regMaskTP dstAddrRegMask = RBM_NONE;
1207     regMaskTP sourceRegMask  = RBM_NONE;
1208     regMaskTP blkSizeRegMask = RBM_NONE;
1209
1210     if (isInitBlk)
1211     {
1212         GenTree* initVal = source;
1213         if (initVal->OperIsInitVal())
1214         {
1215             assert(initVal->isContained());
1216             initVal = initVal->gtGetOp1();
1217         }
1218         srcAddrOrFill = initVal;
1219         if (!initVal->isContained())
1220         {
1221             info->srcCount++;
1222             sourceInfo = getLocationInfo(initVal);
1223         }
1224
1225         switch (blkNode->gtBlkOpKind)
1226         {
1227             case GenTreeBlk::BlkOpKindUnroll:
1228                 assert(initVal->IsCnsIntOrI());
1229                 if (size >= XMM_REGSIZE_BYTES)
1230                 {
1231                     // Reserve an XMM register to fill it with a pack of 16 init value constants.
1232                     info->internalFloatCount = 1;
1233                     info->setInternalCandidates(this, internalFloatRegCandidates());
1234                     // We use an XMM register to fill with constants; that may be an AVX instruction, so set the flag.
1235                     SetContainsAVXFlags();
1236                 }
1237 #ifdef _TARGET_X86_
1238                 if ((size & 1) != 0)
1239                 {
1240                     // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
1241                     // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
1242                     // when unrolling, so only allow byteable registers as the source value. (We could
1243                     // consider just using BlkOpKindRepInstr instead.)
1244                     sourceRegMask = RBM_BYTE_REGS;
1245                 }
1246 #endif // _TARGET_X86_
1247                 break;
1248
1249             case GenTreeBlk::BlkOpKindRepInstr:
1250                 // rep stos has the following register requirements:
1251                 // a) The destination address has to be in RDI.
1252                 // b) The fill value has to be in RAX.
1253                 // c) The buffer size will go in RCX.
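                // An illustrative sketch of the resulting pattern (operand sizes/suffixes may vary):
                //     mov rdi, dstAddr
                //     mov rax, fillValue
                //     mov rcx, blockSize
                //     rep stosb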
1254                 dstAddrRegMask = RBM_RDI;
1255                 sourceRegMask  = RBM_RAX;
1256                 blkSizeRegMask = RBM_RCX;
1257                 break;
1258
1259             case GenTreeBlk::BlkOpKindHelper:
1260 #ifdef _TARGET_AMD64_
1261                 // The helper follows the regular AMD64 ABI.
1262                 dstAddrRegMask = RBM_ARG_0;
1263                 sourceRegMask  = RBM_ARG_1;
1264                 blkSizeRegMask = RBM_ARG_2;
1265 #else  // !_TARGET_AMD64_
1266                 dstAddrRegMask             = RBM_RDI;
1267                 sourceRegMask              = RBM_RAX;
1268                 blkSizeRegMask             = RBM_RCX;
1269 #endif // !_TARGET_AMD64_
1270                 break;
1271
1272             default:
1273                 unreached();
1274         }
1275     }
1276     else
1277     {
1278         // CopyObj or CopyBlk
1279         if (source->gtOper == GT_IND)
1280         {
1281             assert(source->isContained());
1282             srcAddrOrFill = source->gtGetOp1();
1283             if (!srcAddrOrFill->isContained())
1284             {
1285                 sourceInfo = getLocationInfo(srcAddrOrFill);
1286                 info->srcCount++;
1287             }
1288         }
1289         if (blkNode->OperGet() == GT_STORE_OBJ)
1290         {
1291             if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr)
1292             {
1293                 // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
1294                 blkSizeRegMask = RBM_RCX;
1295             }
1296             // The srcAddr must be in a register.  If it was under a GT_IND, we need to subsume all of its
1297             // sources.
1298             sourceRegMask  = RBM_RSI;
1299             dstAddrRegMask = RBM_RDI;
1300         }
1301         else
1302         {
1303             switch (blkNode->gtBlkOpKind)
1304             {
1305                 case GenTreeBlk::BlkOpKindUnroll:
1306                     // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1307                     //
1308                     // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1309                     // But on x86 only RBM_BYTE_REGS could be used as byte registers.  Therefore, exclude
1310                     // RBM_NON_BYTE_REGS from internal candidates.
1311                     if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
1312                     {
1313                         info->internalIntCount++;
1314                         regMaskTP regMask = allRegs(TYP_INT);
1315
1316 #ifdef _TARGET_X86_
1317                         if ((size & 1) != 0)
1318                         {
1319                             regMask &= ~RBM_NON_BYTE_REGS;
1320                         }
1321 #endif
1322                         info->setInternalCandidates(this, regMask);
1323                     }
1324
1325                     if (size >= XMM_REGSIZE_BYTES)
1326                     {
1327                         // If we have a buffer larger than XMM_REGSIZE_BYTES,
1328                         // reserve an XMM register to use it for a
1329                         // series of 16-byte loads and stores.
1330                         info->internalFloatCount = 1;
1331                         info->addInternalCandidates(this, internalFloatRegCandidates());
1332                         // This uses an XMM reg for the loads and stores, which may be AVX
1333                         // instructions, so set the ContainsAVX flag.
1334                         SetContainsAVXFlags();
1335                     }
1336                     break;
1337
1338                 case GenTreeBlk::BlkOpKindRepInstr:
1339                     // rep movs has the following register requirements:
1340                     // a) The dest address has to be in RDI.
1341                     // b) The src address has to be in RSI.
1342                     // c) The buffer size will go in RCX.
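                    // An illustrative sketch of the resulting pattern (operand sizes/suffixes may vary):
                    //     mov rdi, dstAddr
                    //     mov rsi, srcAddr
                    //     mov rcx, blockSize
                    //     rep movsb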
1343                     dstAddrRegMask = RBM_RDI;
1344                     sourceRegMask  = RBM_RSI;
1345                     blkSizeRegMask = RBM_RCX;
1346                     break;
1347
1348                 case GenTreeBlk::BlkOpKindHelper:
1349 #ifdef _TARGET_AMD64_
1350                     // The helper follows the regular AMD64 ABI.
1351                     dstAddrRegMask = RBM_ARG_0;
1352                     sourceRegMask  = RBM_ARG_1;
1353                     blkSizeRegMask = RBM_ARG_2;
1354 #else  // !_TARGET_AMD64_
1355                     dstAddrRegMask         = RBM_RDI;
1356                     sourceRegMask          = RBM_RAX;
1357                     blkSizeRegMask         = RBM_RCX;
1358 #endif // !_TARGET_AMD64_
1359                     break;
1360
1361                 default:
1362                     unreached();
1363             }
1364         }
1365     }
1366
1367     if (dstAddrInfo != nullptr)
1368     {
1369         if (dstAddrRegMask != RBM_NONE)
1370         {
1371             dstAddrInfo->info.setSrcCandidates(this, dstAddrRegMask);
1372         }
1373         useList.Append(dstAddrInfo);
1374     }
1375     if (sourceRegMask != RBM_NONE)
1376     {
1377         if (sourceInfo != nullptr)
1378         {
1379             sourceInfo->info.setSrcCandidates(this, sourceRegMask);
1380         }
1381         else
1382         {
1383             // This is a local source; we'll use a temp register for its address.
1384             info->addInternalCandidates(this, sourceRegMask);
1385             info->internalIntCount++;
1386         }
1387     }
1388     if (sourceInfo != nullptr)
1389     {
1390         useList.Add(sourceInfo, blkNode->IsReverseOp());
1391     }
1392
1393     if (blkNode->OperIs(GT_STORE_DYN_BLK))
1394     {
1395         // The block size argument is a third argument to GT_STORE_DYN_BLK
1396         info->srcCount++;
1397
1398         GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
1399         sizeInfo           = getLocationInfo(blockSize);
1400         useList.Add(sizeInfo, blkNode->AsDynBlk()->gtEvalSizeFirst);
1401     }
1402
1403     if (blkSizeRegMask != RBM_NONE)
1404     {
1405         if (size != 0)
1406         {
1407             // Reserve a temp register for the block size argument.
1408             info->addInternalCandidates(this, blkSizeRegMask);
1409             info->internalIntCount++;
1410         }
1411         else
1412         {
1413             // The block size argument is a third argument to GT_STORE_DYN_BLK
1414             assert((blkNode->gtOper == GT_STORE_DYN_BLK) && (sizeInfo != nullptr));
1415             info->setSrcCount(3);
1416             sizeInfo->info.setSrcCandidates(this, blkSizeRegMask);
1417         }
1418     }
1419 }
1420
1421 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1422 //------------------------------------------------------------------------
1423 // BuildPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
1424 //
1425 // Arguments:
1426 //    tree      - The node of interest
1427 //
1428 // Return Value:
1429 //    None.
1430 //
1431 void LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
1432 {
1433     TreeNodeInfo* info = currentNodeInfo;
1434     info->srcCount     = 0;
1435     assert(info->dstCount == 0);
1436
1437     if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
1438     {
1439         putArgStk->gtOp1->SetContained();
1440
1441 #ifdef _TARGET_X86_
1442         unsigned fieldCount    = 0;
1443         bool     needsByteTemp = false;
1444         bool     needsSimdTemp = false;
1445         unsigned prevOffset    = putArgStk->getArgSize();
1446         for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
1447         {
1448             GenTree* const  fieldNode   = current->Current();
1449             const var_types fieldType   = fieldNode->TypeGet();
1450             const unsigned  fieldOffset = current->gtFieldOffset;
1451             assert(fieldType != TYP_LONG);
1452
1453 #if defined(FEATURE_SIMD)
1454             // Note that we need to check the GT_FIELD_LIST type, not 'fieldType'. This is because the
1455             // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where
1456             // we "round up" to 16.
1457             if (current->gtFieldType == TYP_SIMD12)
1458             {
1459                 needsSimdTemp = true;
1460             }
1461 #endif // defined(FEATURE_SIMD)
1462
1463             // We can treat as a slot any field that is stored at a slot boundary, where the previous
1464             // field is not in the same slot. (Note that we store the fields in reverse order.)
1465             const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
1466             if (!fieldIsSlot)
1467             {
1468                 if (varTypeIsByte(fieldType))
1469                 {
1470                     // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
1471                     // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
1472                     // need a byte-addressable register for the store. We will enforce this requirement on an internal
1473                     // register, which we can use to copy multiple byte values.
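                         // As a sketch (register choice illustrative), copying such a
                         // byte field through the byteable temp:
                         //     mov cl, byte ptr [src]          ; CL is byteable; ESI/EDI are not
                         //     mov byte ptr [esp + off], cl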
1474                     needsByteTemp = true;
1475                 }
1476             }
1477
1478             if (varTypeIsGC(fieldType))
1479             {
1480                 putArgStk->gtNumberReferenceSlots++;
1481             }
1482             prevOffset = fieldOffset;
1483             fieldCount++;
1484             if (!fieldNode->isContained())
1485             {
1486                 appendLocationInfoToList(fieldNode);
1487                 info->srcCount++;
1488             }
1489         }
1490
1491         if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push)
1492         {
1493             // If any of the fields cannot be stored with an actual push, we may need a temporary
1494             // register to load the value before storing it to the stack location.
1495             info->internalIntCount = 1;
1496             regMaskTP regMask      = allRegs(TYP_INT);
1497             if (needsByteTemp)
1498             {
1499                 regMask &= ~RBM_NON_BYTE_REGS;
1500             }
1501             info->setInternalCandidates(this, regMask);
1502         }
1503
1504 #if defined(FEATURE_SIMD)
1505         // For PutArgStk of a TYP_SIMD12, we need a SIMD temp register.
1506         if (needsSimdTemp)
1507         {
1508             assert(info->dstCount == 0);
1509             info->internalFloatCount += 1;
1510             info->addInternalCandidates(this, allSIMDRegs());
1511         }
1512 #endif // defined(FEATURE_SIMD)
1513
1514         return;
1515 #endif // _TARGET_X86_
1516     }
1517
1518     GenTree*  src  = putArgStk->gtOp1;
1519     var_types type = src->TypeGet();
1520
1521 #if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1522     // For PutArgStk of a TYP_SIMD12, we need an extra register.
1523     if (putArgStk->isSIMD12())
1524     {
1525         appendLocationInfoToList(putArgStk->gtOp1);
1526         info->srcCount           = 1;
1527         info->internalFloatCount = 1;
1528         info->setInternalCandidates(this, allSIMDRegs());
1529         return;
1530     }
1531 #endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1532
1533     if (type != TYP_STRUCT)
1534     {
1535         BuildSimple(putArgStk);
1536         return;
1537     }
1538
1539     GenTree* dst     = putArgStk;
1540     GenTree* srcAddr = nullptr;
1541
1542     info->srcCount = GetOperandInfo(src);
1543
1544     // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
1545     // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
1546     // our framework assemblies, so this is the main code generation scheme we'll use.
1547     ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
1548     switch (putArgStk->gtPutArgStkKind)
1549     {
1550         case GenTreePutArgStk::Kind::Push:
1551         case GenTreePutArgStk::Kind::PushAllSlots:
1552         case GenTreePutArgStk::Kind::Unroll:
1553             // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1554             //
1555             // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1556             // But on x86 only RBM_BYTE_REGS can be used as byte registers.  Therefore, exclude
1557             // RBM_NON_BYTE_REGS from internal candidates.
1558             if ((putArgStk->gtNumberReferenceSlots == 0) && (size & (XMM_REGSIZE_BYTES - 1)) != 0)
1559             {
1560                 info->internalIntCount++;
1561                 regMaskTP regMask = allRegs(TYP_INT);
1562
1563 #ifdef _TARGET_X86_
1564                 if ((size % 2) != 0)
1565                 {
1566                     regMask &= ~RBM_NON_BYTE_REGS;
1567                 }
1568 #endif
1569                 info->setInternalCandidates(this, regMask);
1570             }
1571
1572 #ifdef _TARGET_X86_
1573             if (size >= 8)
1574 #else  // !_TARGET_X86_
1575             if (size >= XMM_REGSIZE_BYTES)
1576 #endif // !_TARGET_X86_
1577             {
1578                 // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
1579                 // or larger than or equal to 8 bytes on x86, reserve an XMM register to use for a
1580                 // series of 16-byte loads and stores.
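                     // For instance (sketch), each 16-byte chunk is then copied through
                     // the reserved XMM register roughly as:
                     //     movdqu xmmTmp, xmmword ptr [src + off]
                     //     movdqu xmmword ptr [esp + off], xmmTmp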
1581                 info->internalFloatCount = 1;
1582                 info->addInternalCandidates(this, internalFloatRegCandidates());
1583                 SetContainsAVXFlags();
1584             }
1585             break;
1586
1587         case GenTreePutArgStk::Kind::RepInstr:
1588             info->internalIntCount += 3;
1589             info->setInternalCandidates(this, (RBM_RDI | RBM_RCX | RBM_RSI));
1590             break;
1591
1592         default:
1593             unreached();
1594     }
1595 }
1596 #endif // FEATURE_PUT_STRUCT_ARG_STK
1597
1598 //------------------------------------------------------------------------
1599 // BuildLclHeap: Set the NodeInfo for a GT_LCLHEAP.
1600 //
1601 // Arguments:
1602 //    tree      - The node of interest
1603 //
1604 // Return Value:
1605 //    None.
1606 //
1607 void LinearScan::BuildLclHeap(GenTree* tree)
1608 {
1609     TreeNodeInfo* info = currentNodeInfo;
1610     info->srcCount     = 1;
1611     assert(info->dstCount == 1);
1612
1613     // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
1614     // Here '-' means don't care.
1615     //
1616     //     Size?                    Init Memory?         # temp regs
1617     //      0                            -                  0 (returns 0)
1618     //      const and <=6 reg words      -                  0 (pushes '0')
1619     //      const and >6 reg words       Yes                0 (pushes '0')
1620     //      const and <PageSize          No                 0 (amd64) 1 (x86)
1621     //                                                        (x86: tmpReg for subtracting from esp)
1622     //      const and >=PageSize         No                 2 (regCnt and tmpReg for subtracting from sp)
1623     //      Non-const                    Yes                0 (regCnt=targetReg and pushes '0')
1624     //      Non-const                    No                 2 (regCnt and tmpReg for subtracting from sp)
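         // For example (sketch), a constant localloc(32) on amd64 is 4 reg-sized words,
         // so it falls in the second row above: no temp registers are needed and the
         // allocation is emitted as four 'push 0' instructions, which both extend and
         // zero the stack space.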
1625     //
1626     // Note: Here we don't need the internal register to be different from targetReg.
1627     // Rather, we require it to be different from the operand's reg.
1628
1629     GenTree* size = tree->gtOp.gtOp1;
1630     if (size->IsCnsIntOrI())
1631     {
1632         assert(size->isContained());
1633         info->srcCount = 0;
1634         size_t sizeVal = size->gtIntCon.gtIconVal;
1635
1636         if (sizeVal == 0)
1637         {
1638             info->internalIntCount = 0;
1639         }
1640         else
1641         {
1642             // Align the allocation size up to STACK_ALIGN.
1643             // Note: The GenTree node is not updated here as it is cheap to recompute the stack-aligned size.
1644             // This should also help in debugging as we can examine the original size specified with localloc.
1645             sizeVal = AlignUp(sizeVal, STACK_ALIGN);
1646
1647             // For small allocations up to 6 pointer-sized words (i.e. 48 bytes of localloc)
1648             // we will generate 'push 0'.
1649             assert((sizeVal % REGSIZE_BYTES) == 0);
1650             size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
1651             if (cntRegSizedWords <= 6)
1652             {
1653                 info->internalIntCount = 0;
1654             }
1655             else if (!compiler->info.compInitMem)
1656             {
1657                 // No need to initialize allocated stack space.
1658                 if (sizeVal < compiler->eeGetPageSize())
1659                 {
1660 #ifdef _TARGET_X86_
1661                     info->internalIntCount = 1; // x86 needs a register here to avoid generating "sub" on ESP.
1662 #else                                           // !_TARGET_X86_
1663                     info->internalIntCount = 0;
1664 #endif                                          // !_TARGET_X86_
1665                 }
1666                 else
1667                 {
1668                     // We need two registers: regCnt and tmpReg
1669                     info->internalIntCount = 2;
1670                 }
1671             }
1672             else
1673             {
1674                 // >6 and need to zero initialize allocated stack space.
1675                 info->internalIntCount = 0;
1676             }
1677         }
1678     }
1679     else
1680     {
1681         appendLocationInfoToList(size);
1682         if (!compiler->info.compInitMem)
1683         {
1684             info->internalIntCount = 2;
1685         }
1686         else
1687         {
1688             info->internalIntCount = 0;
1689         }
1690     }
1691 }
1692
1693 //------------------------------------------------------------------------
1694 // BuildModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
1695 //
1696 // Arguments:
1697 //    tree      - The node of interest
1698 //
1699 // Return Value:
1700 //    None.
1701 //
1702 void LinearScan::BuildModDiv(GenTree* tree)
1703 {
1704     TreeNodeInfo* info = currentNodeInfo;
1705     GenTree*      op1  = tree->gtGetOp1();
1706     GenTree*      op2  = tree->gtGetOp2();
1707
1708     assert(info->dstCount == 1);
1709
1710     if (varTypeIsFloating(tree->TypeGet()))
1711     {
1712         info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
1713         return;
1714     }
1715
1716     // The x86/x64 div and idiv instructions take the dividend in RDX:RAX
1717     // and compute:
1718     //    Quotient in RAX, Remainder in RDX
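         // For example (sketch), an unsigned 64-bit 'a / b' is emitted roughly as:
         //     mov rax, <a>     ; dividend (low part) in RAX
         //     xor rdx, rdx     ; zero the high part of the dividend
         //     div <b>          ; quotient -> RAX, remainder -> RDX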
1719
1720     if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
1721     {
1722         // We are interested in just the remainder.
1723         // RAX is used as a trashable register during the computation of the remainder.
1724         info->setDstCandidates(this, RBM_RDX);
1725     }
1726     else
1727     {
1728         // We are interested in just the quotient.
1729         // RDX is used as a trashable register during the computation of the quotient.
1730         info->setDstCandidates(this, RBM_RAX);
1731     }
1732
1733 #ifdef _TARGET_X86_
1734     if (op1->OperGet() == GT_LONG)
1735     {
1736         assert(op1->isContained());
1737
1738         // To avoid a register move, we would like op1's low part in RAX and its high part in RDX.
1739         GenTree* loVal = op1->gtGetOp1();
1740         GenTree* hiVal = op1->gtGetOp2();
1741
1742         assert(op2->IsCnsIntOrI());
1743         assert(tree->OperGet() == GT_UMOD);
1744
1745         // This situation also requires an internal register.
1746         info->internalIntCount = 1;
1747         info->setInternalCandidates(this, allRegs(TYP_INT));
1748
1749         LocationInfoListNode* loValInfo = getLocationInfo(loVal);
1750         LocationInfoListNode* hiValInfo = getLocationInfo(hiVal);
1751         loValInfo->info.setSrcCandidates(this, RBM_EAX);
1752         hiValInfo->info.setSrcCandidates(this, RBM_EDX);
1753         useList.Append(loValInfo);
1754         useList.Append(hiValInfo);
1755         info->srcCount = 2;
1756     }
1757     else
1758 #endif
1759     {
1760         // If possible, we would like op1 in RAX to avoid a register move.
1761         LocationInfoListNode* op1Info = getLocationInfo(op1);
1762         op1Info->info.setSrcCandidates(this, RBM_RAX);
1763         useList.Append(op1Info);
1764         info->srcCount = 1;
1765     }
1766
1767     LocationInfoListNode* op2Info;
1768     info->srcCount += GetOperandInfo(op2, &op2Info);
1769     for (; op2Info != nullptr; op2Info = op2Info->Next())
1770     {
1771         op2Info->info.setSrcCandidates(this, allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
1772     }
1773 }
1774
1775 //------------------------------------------------------------------------
1776 // BuildIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
1777 //
1778 // Arguments:
1779 //    tree      - The node of interest
1780 //
1781 // Return Value:
1782 //    None.
1783 //
1784 void LinearScan::BuildIntrinsic(GenTree* tree)
1785 {
1786     TreeNodeInfo* info = currentNodeInfo;
1787     // Both operand and its result must be of floating point type.
1788     GenTree* op1 = tree->gtGetOp1();
1789     assert(varTypeIsFloating(op1));
1790     assert(op1->TypeGet() == tree->TypeGet());
1791
1792     info->srcCount = GetOperandInfo(op1);
1793     assert(info->dstCount == 1);
1794
1795     switch (tree->gtIntrinsic.gtIntrinsicId)
1796     {
1797         case CORINFO_INTRINSIC_Sqrt:
1798             break;
1799
1800         case CORINFO_INTRINSIC_Abs:
1801             // Abs(float x) = x & 0x7fffffff
1802             // Abs(double x) = x & 0x7fffffffffffffff
1803
1804             // In the case of Abs we need an internal register to hold the mask.
1805
1806             // TODO-XArch-CQ: avoid using an internal register for the mask.
1807             // Andps and andpd both operate on 128-bit operands, while the
1808             // data section constant that holds the mask is only 64 bits in size.
1809             // Therefore, we need both the operand and the mask to be in an
1810             // xmm register. When we add support in the emitter to emit 128-bit
1811             // data constants and instructions that operate on 128-bit
1812             // memory operands we can avoid the need for an internal register.
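             // As a sketch of the expected sequence (label name illustrative):
             //     movsd xmmTmp, qword ptr [signMask]   ; 0x7fffffffffffffff
             //     andps xmmReg, xmmTmp                 ; clear the sign bit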
1813             if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs)
1814             {
1815                 info->internalFloatCount = 1;
1816                 info->setInternalCandidates(this, internalFloatRegCandidates());
1817             }
1818             break;
1819
1820 #ifdef _TARGET_X86_
1821         case CORINFO_INTRINSIC_Cos:
1822         case CORINFO_INTRINSIC_Sin:
1823             NYI_X86("Math intrinsics Cos and Sin");
1824             break;
1825 #endif // _TARGET_X86_
1826
1827         case CORINFO_INTRINSIC_Round:
1828         case CORINFO_INTRINSIC_Ceiling:
1829         case CORINFO_INTRINSIC_Floor:
1830 #if defined(LEGACY_BACKEND)
1831             NYI_X86("Math intrinsics Round, Ceiling, and Floor");
1832 #endif // LEGACY_BACKEND
1833             break;
1834
1835         default:
1836             // Right now only Sqrt/Abs are treated as math intrinsics
1837             noway_assert(!"Unsupported math intrinsic");
1838             unreached();
1839             break;
1840     }
1841 }
1842
1843 #ifdef FEATURE_SIMD
1844 //------------------------------------------------------------------------
1845 // BuildSIMD: Set the NodeInfo for a GT_SIMD tree.
1846 //
1847 // Arguments:
1848 //    tree       - The GT_SIMD node of interest
1849 //
1850 // Return Value:
1851 //    None.
1852
1853 void LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
1854 {
1855     TreeNodeInfo* info = currentNodeInfo;
1856     // Only SIMDIntrinsicInit can be contained. Other than that,
1857     // only SIMDIntrinsicOpEquality and SIMDIntrinsicOpInEquality can have 0 dstCount.
1858     if (simdTree->isContained())
1859     {
1860         assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicInit);
1861     }
1862     else if (info->dstCount != 1)
1863     {
1864         assert((simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ||
1865                (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality));
1866     }
1867     SetContainsAVXFlags(true, simdTree->gtSIMDSize);
1868     GenTree* op1   = simdTree->gtOp.gtOp1;
1869     GenTree* op2   = simdTree->gtOp.gtOp2;
1870     info->srcCount = 0;
1871     if (!op1->OperIs(GT_LIST))
1872     {
1873         info->srcCount += GetOperandInfo(op1);
1874     }
1875     if ((op2 != nullptr) && !op2->isContained())
1876     {
1877         info->srcCount += GetOperandInfo(op2);
1878     }
1879
1880     switch (simdTree->gtSIMDIntrinsicID)
1881     {
1882         case SIMDIntrinsicInit:
1883         {
1884             // This sets all fields of a SIMD struct to the given value.
1885             // Mark op1 as contained if it is either zero or an int constant of all 1's,
1886             // or a float constant with a 16- or 32-byte simdType (AVX case).
1887             //
1888             // Should never see small int base type vectors except for zero initialization.
1889             assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0));
1890
1891 #if !defined(_TARGET_64BIT_)
1892             if (op1->OperGet() == GT_LONG)
1893             {
1894                 assert(op1->isContained());
1895                 GenTree* op1lo = op1->gtGetOp1();
1896                 GenTree* op1hi = op1->gtGetOp2();
1897
1898                 if (op1lo->isContained())
1899                 {
1900                     assert(op1hi->isContained());
1901                     assert((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
1902                            (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)));
1903                     assert(info->srcCount == 0);
1904                 }
1905                 else
1906                 {
1907                     assert(info->srcCount == 2);
1908                     info->internalFloatCount = 1;
1909                     info->setInternalCandidates(this, allSIMDRegs());
1910                     info->isInternalRegDelayFree = true;
1911                 }
1912             }
1913 #endif // !defined(_TARGET_64BIT_)
1914         }
1915         break;
1916
1917         case SIMDIntrinsicInitN:
1918         {
1919             var_types baseType = simdTree->gtSIMDBaseType;
1920             info->srcCount     = (short)(simdTree->gtSIMDSize / genTypeSize(baseType));
1921             int initCount      = 0;
1922             for (GenTree* list = op1; list != nullptr; list = list->gtGetOp2())
1923             {
1924                 assert(list->OperGet() == GT_LIST);
1925                 GenTree* listItem = list->gtGetOp1();
1926                 assert(listItem->TypeGet() == baseType);
1927                 assert(!listItem->isContained());
1928                 appendLocationInfoToList(listItem);
1929                 initCount++;
1930             }
1931             assert(initCount == info->srcCount);
1932
1933             // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
1934             info->internalFloatCount = 1;
1935             info->setInternalCandidates(this, allSIMDRegs());
1936         }
1937         break;
1938
1939         case SIMDIntrinsicInitArray:
1940             // We have an array and an index, which may be contained.
1941             assert(info->srcCount == (simdTree->gtGetOp2()->isContained() ? 1 : 2));
1942             break;
1943
1944         case SIMDIntrinsicDiv:
1945             // SSE2 has no instruction support for division on integer vectors
1946             noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
1947             assert(info->srcCount == 2);
1948             break;
1949
1950         case SIMDIntrinsicAbs:
1951             // float/double vectors: This gets implemented as a bitwise-AND operation
1952             // with a mask and hence should never be seen here.
1953             //
1954             // Must be a Vector<int>, Vector<short>, or Vector<sbyte>.
1955             assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT ||
1956                    simdTree->gtSIMDBaseType == TYP_BYTE);
1957             assert(compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
1958             assert(info->srcCount == 1);
1959             break;
1960
1961         case SIMDIntrinsicSqrt:
1962             // SSE2 has no instruction support for sqrt on integer vectors.
1963             noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
1964             assert(info->srcCount == 1);
1965             break;
1966
1967         case SIMDIntrinsicAdd:
1968         case SIMDIntrinsicSub:
1969         case SIMDIntrinsicMul:
1970         case SIMDIntrinsicBitwiseAnd:
1971         case SIMDIntrinsicBitwiseAndNot:
1972         case SIMDIntrinsicBitwiseOr:
1973         case SIMDIntrinsicBitwiseXor:
1974         case SIMDIntrinsicMin:
1975         case SIMDIntrinsicMax:
1976             assert(info->srcCount == 2);
1977
1978             // SSE2 32-bit integer multiplication requires two temp regs
1979             if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
1980                 compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
1981             {
1982                 info->internalFloatCount = 2;
1983                 info->setInternalCandidates(this, allSIMDRegs());
1984             }
1985             break;
1986
1987         case SIMDIntrinsicEqual:
1988             assert(info->srcCount == 2);
1989             break;
1990
1991         // SSE2 doesn't support < and <= directly on int vectors.
1992         // Instead we need to use > and >= with swapped operands.
1993         case SIMDIntrinsicLessThan:
1994         case SIMDIntrinsicLessThanOrEqual:
1995             assert(info->srcCount == 2);
1996             noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
1997             break;
1998
1999         // SIMDIntrinsicGreaterThan is supported only on non-floating point base type vectors.
2000         // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors.
2001         // Instead we need to use < and <= with swapped operands.
2002         case SIMDIntrinsicGreaterThan:
2003             noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
2004             assert(info->srcCount == 2);
2005             break;
2006
2007         case SIMDIntrinsicOpEquality:
2008         case SIMDIntrinsicOpInEquality:
2009             if (simdTree->gtGetOp2()->isContained())
2010             {
2011                 // If the second operand is contained then ContainCheckSIMD has determined
2012                 // that PTEST can be used. We only need a single source register and no
2013                 // internal registers.
2014                 assert(info->srcCount == 1);
2015             }
2016             else
2017             {
2018                 // Can't use PTEST so we need 2 source registers, 1 internal SIMD register
2019                 // (to hold the result of PCMPEQD or other similar SIMD compare instruction)
2020                 // and one internal INT register (to hold the result of PMOVMSKB).
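                 // As a sketch for Vector<int> equality without PTEST:
                 //     pcmpeqd  xmmTmp, xmmOp2   ; per-element compare into the SIMD temp
                 //     pmovmskb intTmp, xmmTmp   ; gather per-byte masks into the INT temp
                 //     cmp      intTmp, 0xFFFF   ; were all 16 bytes equal?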
2021                 assert(info->srcCount == 2);
2022                 info->internalFloatCount = 1;
2023                 info->setInternalCandidates(this, allSIMDRegs());
2024                 info->internalIntCount = 1;
2025                 info->addInternalCandidates(this, allRegs(TYP_INT));
2026             }
2027             // These SIMD nodes only set the condition flags.
2028             info->dstCount = 0;
2029             break;
2030
2031         case SIMDIntrinsicDotProduct:
2032             // Float/Double vectors:
2033             // For SSE, or AVX with 32-byte vectors, we also need an internal register
2034             // as scratch. Further we need the targetReg and internal reg to be distinct
2035             // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
2036             // don't need a tmpReg.
2037             //
2038             // 32-byte integer vector on SSE4/AVX:
2039             // will take advantage of phaddd, which operates only on 128-bit xmm reg.
2040             // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
2041             // registers since targetReg is an int type register.
2042             //
2043             // See genSIMDIntrinsicDotProduct() for details on code sequence generated
2044             // and the need for scratch registers.
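             // As a rough sketch for a float vector on SSE2 (tmpReg is the internal reg):
             //     mulps  xmm0, xmm1        ; element-wise products
             //     movaps tmpReg, xmm0
             //     shufps tmpReg, xmm0, ... ; rotate lanes, then
             //     addps  xmm0, tmpReg      ; shuffle+add again to sum all lanes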
2045             if (varTypeIsFloating(simdTree->gtSIMDBaseType))
2046             {
2047                 if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) ||
2048                     (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32))
2049                 {
2050                     info->internalFloatCount     = 1;
2051                     info->isInternalRegDelayFree = true;
2052                     info->setInternalCandidates(this, allSIMDRegs());
2053                 }
2054                 // else don't need scratch reg(s).
2055             }
2056             else
2057             {
2058                 assert(simdTree->gtSIMDBaseType == TYP_INT && compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
2059
2060                 // No need to set isInternalRegDelayFree since targetReg is
2061                 // an int type reg and guaranteed to be different from xmm/ymm
2062                 // regs.
2063                 info->internalFloatCount = (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) ? 2 : 1;
2064                 info->setInternalCandidates(this, allSIMDRegs());
2065             }
2066             assert(info->srcCount == 2);
2067             break;
2068
2069         case SIMDIntrinsicGetItem:
2070         {
2071             // This implements the get_Item method. The sources are:
2072             //  - the source SIMD struct
2073             //  - index (which element to get)
2074             // The result is baseType of SIMD struct.
2075             // op1 may be a contained memory op, but if so we will consume its address.
2076             // op2 may be a contained constant.
2077             op1 = simdTree->gtOp.gtOp1;
2078             op2 = simdTree->gtOp.gtOp2;
2079
2080             if (!op1->isContained())
2081             {
2082                 // If the index is not a constant, we will use the SIMD temp location to store the vector.
2083                 // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
2084                 // can use that in the process of extracting the element.
2085                 //
2086                 // If the index is a constant and base type is a small int we can use pextrw, but on AVX
2087                 // we will need a temp if we are indexing into the upper half of the AVX register.
2088                 // In all other cases with constant index, we need a temp xmm register to extract the
2089                 // element if index is other than zero.
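                 // For instance (sketch), a constant index into a Vector<short> can use:
                 //     pextrw targetReg, xmmSrc, <index>
                 // while a non-zero constant index into an int vector first shifts the
                 // element down into the temp:
                 //     movaps xmmTmp, xmmSrc
                 //     psrldq xmmTmp, <byteShiftCnt>
                 //     movd   targetReg, xmmTmp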
2090
2091                 if (!op2->IsCnsIntOrI())
2092                 {
2093                     (void)compiler->getSIMDInitTempVarNum();
2094                 }
2095                 else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
2096                 {
2097                     bool needFloatTemp;
2098                     if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
2099                         (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported))
2100                     {
2101                         int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
2102                         needFloatTemp    = (byteShiftCnt >= 16);
2103                     }
2104                     else
2105                     {
2106                         needFloatTemp = !op2->IsIntegralConst(0);
2107                     }
2108
2109                     if (needFloatTemp)
2110                     {
2111                         info->internalFloatCount = 1;
2112                         info->setInternalCandidates(this, allSIMDRegs());
2113                     }
2114                 }
2115             }
2116         }
2117         break;
2118
2119         case SIMDIntrinsicSetX:
2120         case SIMDIntrinsicSetY:
2121         case SIMDIntrinsicSetZ:
2122         case SIMDIntrinsicSetW:
2123             assert(info->srcCount == 2);
2124
2125             // We need an internal integer register for SSE2 codegen
2126             if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
2127             {
2128                 info->internalIntCount = 1;
2129                 info->setInternalCandidates(this, allRegs(TYP_INT));
2130             }
2131
2132             break;
2133
2134         case SIMDIntrinsicCast:
2135             assert(info->srcCount == 1);
2136             break;
2137
2138         case SIMDIntrinsicConvertToSingle:
2139             assert(info->srcCount == 1);
2140             if (simdTree->gtSIMDBaseType == TYP_UINT)
2141             {
2142                 // We need an internal register different from targetReg.
2143                 info->isInternalRegDelayFree = true;
2144                 info->internalIntCount       = 1;
2145                 info->internalFloatCount     = 2;
2146                 info->setInternalCandidates(this, allSIMDRegs() | allRegs(TYP_INT));
2147             }
2148             break;
2149
2150         case SIMDIntrinsicConvertToInt32:
2151             assert(info->srcCount == 1);
2152             break;
2153
2154         case SIMDIntrinsicWidenLo:
2155         case SIMDIntrinsicWidenHi:
2156             assert(info->srcCount == 1);
2157             if (varTypeIsIntegral(simdTree->gtSIMDBaseType))
2158             {
2159                 // We need an internal register different from targetReg.
2160                 info->isInternalRegDelayFree = true;
2161                 info->internalFloatCount     = 1;
2162                 info->setInternalCandidates(this, allSIMDRegs());
2163             }
2164             break;
2165
2166         case SIMDIntrinsicConvertToInt64:
2167             assert(info->srcCount == 1);
2168             // We need an internal register different from targetReg.
2169             info->isInternalRegDelayFree = true;
2170             info->internalIntCount       = 1;
2171             if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
2172             {
2173                 info->internalFloatCount = 2;
2174             }
2175             else
2176             {
2177                 info->internalFloatCount = 1;
2178             }
2179             info->setInternalCandidates(this, allSIMDRegs() | allRegs(TYP_INT));
2180             break;
2181
2182         case SIMDIntrinsicConvertToDouble:
2183             assert(info->srcCount == 1);
2184             // We need an internal register different from targetReg.
2185             info->isInternalRegDelayFree = true;
2186             info->internalIntCount       = 1;
2187 #ifdef _TARGET_X86_
2188             if (simdTree->gtSIMDBaseType == TYP_LONG)
2189             {
2190                 info->internalFloatCount = 3;
2191             }
2192             else
2193 #endif
2194                 if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) || (simdTree->gtSIMDBaseType == TYP_ULONG))
2195             {
2196                 info->internalFloatCount = 2;
2197             }
2198             else
2199             {
2200                 info->internalFloatCount = 1;
2201             }
2202             info->setInternalCandidates(this, allSIMDRegs() | allRegs(TYP_INT));
2203             break;
2204
2205         case SIMDIntrinsicNarrow:
2206             assert(info->srcCount == 2);
2207             // We need an internal register different from targetReg.
2208             info->isInternalRegDelayFree = true;
2209             if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->gtSIMDBaseType != TYP_DOUBLE))
2210             {
2211                 info->internalFloatCount = 2;
2212             }
2213             else
2214             {
2215                 info->internalFloatCount = 1;
2216             }
2217             info->setInternalCandidates(this, allSIMDRegs());
2218             break;
2219
2220         case SIMDIntrinsicShuffleSSE2:
2221             assert(info->srcCount == 1);
2222             // Second operand is an integer constant and marked as contained.
2223             assert(simdTree->gtOp.gtOp2->isContainedIntOrIImmed());
2224             break;
2225
2226         case SIMDIntrinsicGetX:
2227         case SIMDIntrinsicGetY:
2228         case SIMDIntrinsicGetZ:
2229         case SIMDIntrinsicGetW:
2230         case SIMDIntrinsicGetOne:
2231         case SIMDIntrinsicGetZero:
2232         case SIMDIntrinsicGetCount:
2233         case SIMDIntrinsicGetAllOnes:
2234             assert(!"Get intrinsics should not be seen during Lowering.");
2235             unreached();
2236
2237         default:
2238             noway_assert(!"Unimplemented SIMD node type.");
2239             unreached();
2240     }
2241 }
2242 #endif // FEATURE_SIMD
2243
2244 #ifdef FEATURE_HW_INTRINSICS
2245 //------------------------------------------------------------------------
2246 // BuildHWIntrinsic: Set the NodeInfo for a GT_HWIntrinsic tree.
2247 //
2248 // Arguments:
2249 //    tree       - The GT_HWIntrinsic node of interest
2250 //
2251 // Return Value:
2252 //    None.
2253
2254 void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
2255 {
2256     TreeNodeInfo*       info        = currentNodeInfo;
2257     NamedIntrinsic      intrinsicID = intrinsicTree->gtHWIntrinsicId;
2258     InstructionSet      isa         = Compiler::isaOfHWIntrinsic(intrinsicID);
2259     HWIntrinsicCategory category    = Compiler::categoryOfHWIntrinsic(intrinsicID);
2260     HWIntrinsicFlag     flags       = Compiler::flagsOfHWIntrinsic(intrinsicID);
2261     int                 numArgs     = Compiler::numArgsOfHWIntrinsic(intrinsicID);
2262
2263     if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2)
2264     {
2265         SetContainsAVXFlags(true, 32);
2266     }
2267
2268     GenTree* op1   = intrinsicTree->gtOp.gtOp1;
2269     GenTree* op2   = intrinsicTree->gtOp.gtOp2;
2270     info->srcCount = 0;
2271
2272     if (op1 != nullptr)
2273     {
2274         if (op1->OperIsList())
2275         {
2276             for (GenTreeArgList* list = op1->AsArgList(); list != nullptr; list = list->Rest())
2277             {
2278                 info->srcCount += GetOperandInfo(list->Current());
2279             }
2280         }
2281         else
2282         {
2283             info->srcCount += GetOperandInfo(op1);
2284         }
2285     }
2286
2287     if (op2 != nullptr)
2288     {
2289         info->srcCount += GetOperandInfo(op2);
2290     }
2291
2292     if ((category == HW_Category_IMM) && ((flags & HW_Flag_NoJmpTableIMM) == 0))
2293     {
2294         GenTree* lastOp = Compiler::lastOpOfHWIntrinsic(intrinsicTree, numArgs);
2295         assert(lastOp != nullptr);
2296         if (Compiler::isImmHWIntrinsic(intrinsicID, lastOp) && !lastOp->isContainedIntOrIImmed())
2297         {
2298             assert(!lastOp->IsCnsIntOrI());
2299
2300             // We need two extra registers when lastOp isn't a constant so
2301             // the offset into the jump table for the fallback path
2302             // can be computed.
2303
2304             info->internalIntCount = 2;
2305             info->setInternalCandidates(this, allRegs(TYP_INT));
2306         }
2307     }
2308
2309     if (!compiler->canUseVexEncoding())
2310     {
2311         // On machines without VEX support, we sometimes have to inject an intermediate
2312         // `movaps targetReg, op1Reg` in order to maintain the correct behavior. This
2313         // becomes a problem if `op2Reg == targetReg` since that means we will overwrite
2314         // op2. In order to resolve this, we currently mark the second operand as delay free.
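         // For example (sketch), a binary SSE op without VEX is emitted as:
         //     movaps targetReg, op1Reg   ; RMW: target must start out as op1
         //     addps  targetReg, op2Reg   ; would read a clobbered value if
         //                                ; op2Reg == targetReg and op2 were
         //                                ; not delay free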
2315
2316         if ((flags & HW_Flag_NoRMWSemantics) == 0)
2317         {
2318             assert(category != HW_Category_MemoryLoad);
2319             assert(category != HW_Category_MemoryStore);
2320
2321             assert((flags & HW_Flag_NoCodeGen) == 0);
2322
2323             assert(numArgs != 0);
2324             assert(numArgs != 1);
2325
2326             if (info->srcCount >= 2)
2327             {
2328                 LocationInfoListNode* op2Info = useList.Begin()->Next();
2329                 op2Info->info.isDelayFree     = true;
2330                 info->hasDelayFreeSrc         = true;
2331             }
2332         }
2333     }
2334
2335     switch (intrinsicID)
2336     {
2337         case NI_SSE_CompareEqualOrderedScalar:
2338         case NI_SSE_CompareEqualUnorderedScalar:
2339         case NI_SSE_CompareNotEqualOrderedScalar:
2340         case NI_SSE_CompareNotEqualUnorderedScalar:
2341         case NI_SSE2_CompareEqualOrderedScalar:
2342         case NI_SSE2_CompareEqualUnorderedScalar:
2343         case NI_SSE2_CompareNotEqualOrderedScalar:
2344         case NI_SSE2_CompareNotEqualUnorderedScalar:
2345             info->internalIntCount = 1;
2346             info->setInternalCandidates(this, RBM_BYTE_REGS);
2347             info->isInternalRegDelayFree = true;
2348             break;
2349
2350         case NI_SSE_SetScalarVector128:
2351             // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
2352             info->internalFloatCount = 1;
2353             info->setInternalCandidates(this, allSIMDRegs());
2354             info->isInternalRegDelayFree = true;
2355             break;
2356
2357         case NI_SSE_ConvertToSingle:
2358         case NI_SSE_StaticCast:
2359         case NI_SSE2_ConvertToDouble:
2360             assert(info->srcCount == 1);
2361             assert(info->dstCount == 1);
2362             useList.Last()->info.isTgtPref = true;
2363             break;
2364
2365         case NI_SSE41_BlendVariable:
2366             if (!compiler->canUseVexEncoding())
2367             {
2368                 // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
2369                 LocationInfoListNode* op2Info = useList.Begin()->Next();
2370                 LocationInfoListNode* op3Info = op2Info->Next();
2371                 op2Info->info.isDelayFree     = true;
2372                 op3Info->info.isDelayFree     = true;
2373                 op3Info->info.setSrcCandidates(this, RBM_XMM0);
2374                 info->hasDelayFreeSrc = true;
2375             }
2376             break;
2377
2378         case NI_SSE41_TestAllOnes:
2379         {
2380             info->internalFloatCount = 1;
2381             info->setInternalCandidates(this, allSIMDRegs());
2382             break;
2383         }
2384
2385 #ifdef _TARGET_X86_
2386         case NI_SSE42_Crc32:
2387         {
2388             // CRC32 may operate over "byte" but on x86 only RBM_BYTE_REGS can be used as byte registers.
2389             //
2390             // TODO - currently we use the BaseType to bring the type of the second argument
2391             // to the code generator. We may encode the overload info in another way.
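             // For instance (sketch): 'crc32 eax, dl' is encodable, but ESI/EDI have
             // no byte form on x86, so a byte-typed op2 must land in EAX/EBX/ECX/EDX.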
2392             var_types srcType = intrinsicTree->gtSIMDBaseType;
2393             if (varTypeIsByte(srcType))
2394             {
2395                 LocationInfoListNode* op2Info = useList.GetSecond(INDEBUG(intrinsicTree->gtGetOp2()));
2396                 op2Info->info.setSrcCandidates(this, RBM_BYTE_REGS);
2397             }
2398             break;
2399         }
2400 #endif // _TARGET_X86_
2401
2402         default:
2403             assert((intrinsicID > NI_HW_INTRINSIC_START) && (intrinsicID < NI_HW_INTRINSIC_END));
2404             break;
2405     }
2406 }
2407 #endif
2408
2409 //------------------------------------------------------------------------
2410 // BuildCast: Set the NodeInfo for a GT_CAST.
2411 //
2412 // Arguments:
2413 //    tree      - The node of interest
2414 //
2415 // Return Value:
2416 //    None.
2417 //
2418 void LinearScan::BuildCast(GenTree* tree)
2419 {
2420     TreeNodeInfo* info = currentNodeInfo;
2421     // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register.
2422     //         see CodeGen::genIntToIntCast()
2423
2424     // Non-overflow casts to/from float/double are done using SSE2 instructions
2425     // which allow the source operand to be either a reg or a memop. Given the
2426     // fact that casts from small int to float/double are done as two-level casts,
2427     // the source operand is always guaranteed to be of size 4 or 8 bytes.
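     // For example (sketch), a non-overflow int-to-float cast can be emitted as:
     //     cvtsi2ss xmm0, dword ptr [mem]   ; source may be a reg or a memop
     // with a small-int source first widened to int by a preceding cast.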
2428     var_types castToType = tree->CastToType();
2429     GenTree*  castOp     = tree->gtCast.CastOp();
2430     var_types castOpType = castOp->TypeGet();
2431
2432     info->srcCount = GetOperandInfo(castOp);
2433     assert(info->dstCount == 1);
2434     if (tree->gtFlags & GTF_UNSIGNED)
2435     {
2436         castOpType = genUnsignedType(castOpType);
2437     }
2438
2439     // some overflow checks need a temp reg:
2440     //  - GT_CAST from INT64/UINT64 to UINT32
2441     if (tree->gtOverflow() && (castToType == TYP_UINT))
2442     {
2443         if (genTypeSize(castOpType) == 8)
2444         {
2445             // Here we don't need internal register to be different from targetReg,
2446             // rather require it to be different from operand's reg.
2447             info->internalIntCount = 1;
2448         }
2449     }
2450 }
2451
2452 //-----------------------------------------------------------------------------------------
2453 // BuildIndir: Specify register requirements for address expression of an indirection operation.
2454 //
2455 // Arguments:
2456 //    indirTree    -   GT_IND or GT_STOREIND gentree node
2457 //
2458 void LinearScan::BuildIndir(GenTreeIndir* indirTree)
2459 {
2460     TreeNodeInfo* info = currentNodeInfo;
2461     // If this is the rhs of a block copy (i.e. non-enregisterable struct),
2462     // it has no register requirements.
2463     if (indirTree->TypeGet() == TYP_STRUCT)
2464     {
2465         return;
2466     }
2467
2468     int indirSrcCount = GetIndirInfo(indirTree);
2469     if (indirTree->gtOper == GT_STOREIND)
2470     {
2471         GenTree* source = indirTree->gtOp.gtOp2;
2472         if (indirTree->AsStoreInd()->IsRMWMemoryOp())
2473         {
2474             // Because 'source' is contained, we haven't yet determined its special register requirements, if any.
2475             // As it happens, the Shift or Rotate cases are the only ones with special requirements.
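             // For example (sketch), an RMW store like '*p = *p + x' becomes:
             //     add dword ptr [addr], reg   ; or 'rol dword ptr [addr], cl' for
             //                                 ; the shift/rotate special case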
2476             assert(source->isContained() && source->OperIsRMWMemOp());
2477             GenTree* nonMemSource = nullptr;
2478
2479             if (source->OperIsShiftOrRotate())
2480             {
2481                 info->srcCount += BuildShiftRotate(source);
2482             }
2483             else
2484             {
2485                 info->srcCount += appendBinaryLocationInfoToList(source->AsOp());
2486             }
2487             if (indirTree->AsStoreInd()->IsRMWDstOp1())
2488             {
2489                 if (source->OperIsBinary())
2490                 {
2491                     nonMemSource = source->gtOp.gtOp2;
2492                 }
2493             }
2494             else if (indirTree->AsStoreInd()->IsRMWDstOp2())
2495             {
2496                 nonMemSource = source->gtOp.gtOp1;
2497             }
2498             if (nonMemSource != nullptr)
2499             {
2500                 assert(!nonMemSource->isContained() || (!nonMemSource->isMemoryOp() && !nonMemSource->IsLocal()));
2501 #ifdef _TARGET_X86_
2502                 if (varTypeIsByte(indirTree) && !nonMemSource->isContained())
2503                 {
2504                     // If storeInd is of TYP_BYTE, set source to byteable registers.
2505                     TreeNodeInfo& nonMemSourceInfo = useList.GetTreeNodeInfo(nonMemSource);
2506                     regMaskTP     regMask          = nonMemSourceInfo.getSrcCandidates(this);
2507                     regMask &= ~RBM_NON_BYTE_REGS;
2508                     assert(regMask != RBM_NONE);
2509                     nonMemSourceInfo.setSrcCandidates(this, regMask);
2510                 }
2511 #endif
2512             }
2513         }
2514         else
2515         {
2516 #ifdef _TARGET_X86_
2517             if (varTypeIsByte(indirTree) && !source->isContained())
2518             {
2519                 // If storeInd is of TYP_BYTE, set source to byteable registers.
2520                 LocationInfoListNode* sourceInfo = getLocationInfo(source);
2521                 regMaskTP             regMask    = sourceInfo->info.getSrcCandidates(this);
2522                 regMask &= ~RBM_NON_BYTE_REGS;
2523                 assert(regMask != RBM_NONE);
2524                 sourceInfo->info.setSrcCandidates(this, regMask);
2525                 useList.Append(sourceInfo);
2526                 info->srcCount++;
2527             }
2528             else
2529 #endif
2530             {
2531                 info->srcCount += GetOperandInfo(source);
2532             }
2533         }
2534     }
2535     info->srcCount += indirSrcCount;
2536
2537 #ifdef FEATURE_SIMD
2538     if (indirTree->TypeGet() == TYP_SIMD12)
2539     {
2540         // If indirTree is of TYP_SIMD12, addr is not contained. See comment in LowerIndir().
2541         assert(!indirTree->Addr()->isContained());
2542
2543         // Vector3 is read/written as two reads/writes: 8 bytes and 4 bytes.
2544         // To assemble the vector properly we would need an additional
2545         // XMM register.
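         // As a rough sketch of a TYP_SIMD12 (Vector3) load:
         //     movsd xmmDst, qword ptr [addr]     ; low 8 bytes (X, Y)
         //     movss xmmTmp, dword ptr [addr + 8] ; high 4 bytes (Z)
         // followed by a shufps/movlhps to combine the two into xmmDst.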
2546         info->internalFloatCount = 1;
2547
2548         // In the case of GT_IND we need an internal register different from targetReg,
2549         // since both of the registers are used at the same time.
2550         if (indirTree->OperGet() == GT_IND)
2551         {
2552             info->isInternalRegDelayFree = true;
2553         }
2554
2555         info->setInternalCandidates(this, allSIMDRegs());
2556
2557         return;
2558     }
2559 #endif // FEATURE_SIMD
2560
2561     assert(indirTree->Addr()->gtOper != GT_ARR_ELEM);
2562 }
2563
2564 //------------------------------------------------------------------------
2565 // BuildMul: Set the NodeInfo for a multiply.
2566 //
2567 // Arguments:
2568 //    tree      - The node of interest
2569 //
2570 // Return Value:
2571 //    None.
2572 //
2573 void LinearScan::BuildMul(GenTree* tree)
2574 {
2575     TreeNodeInfo* info = currentNodeInfo;
2576 #if defined(_TARGET_X86_)
2577     assert(tree->OperIs(GT_MUL, GT_MULHI, GT_MUL_LONG));
2578 #else
2579     assert(tree->OperIs(GT_MUL, GT_MULHI));
2580 #endif
2581     GenTree* op1   = tree->gtOp.gtOp1;
2582     GenTree* op2   = tree->gtOp.gtOp2;
2583     info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
2584     assert(info->dstCount == 1);
2585
2586     // Case of float/double mul.
2587     if (varTypeIsFloating(tree->TypeGet()))
2588     {
2589         return;
2590     }
2591
2592     bool isUnsignedMultiply    = ((tree->gtFlags & GTF_UNSIGNED) != 0);
2593     bool requiresOverflowCheck = tree->gtOverflowEx();
2594
2595     // There are three forms of x86 multiply:
2596     // one-op form:     RDX:RAX = RAX * r/m
2597     // two-op form:     reg *= r/m
2598     // three-op form:   reg = r/m * imm
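     // For example (sketch):
     //     mul  dword ptr [mem]        ; one-op:   EDX:EAX = EAX * [mem]
     //     imul eax, dword ptr [mem]   ; two-op:   EAX = EAX * [mem]
     //     imul eax, ecx, 12           ; three-op: EAX = ECX * 12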
2599
2600     // This special widening 32x32->64 MUL is not used on x64
2601     CLANG_FORMAT_COMMENT_ANCHOR;
2602 #if defined(_TARGET_X86_)
2603     if (tree->OperGet() != GT_MUL_LONG)
2604 #endif
2605     {
2606         assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
2607     }
2608
2609     // We do use the widening multiply to implement
2610     // the overflow checking for unsigned multiply
2611     //
2612     if (isUnsignedMultiply && requiresOverflowCheck)
2613     {
2614         // The only encoding provided is RDX:RAX = RAX * rm
2615         //
2616         // Here we set RAX as the only destination candidate
2617         // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX
2618         //
2619         info->setDstCandidates(this, RBM_RAX);
2620     }
2621     else if (tree->OperGet() == GT_MULHI)
2622     {
2623         // Have to use the encoding: RDX:RAX = RAX * rm. Since we only care about the
2624         // upper 32 bits of the result set the destination candidate to REG_RDX.
2625         info->setDstCandidates(this, RBM_RDX);
2626     }
2627 #if defined(_TARGET_X86_)
2628     else if (tree->OperGet() == GT_MUL_LONG)
2629     {
2630         // Have to use the encoding: RDX:RAX = RAX * rm
2631         info->setDstCandidates(this, RBM_RAX);
2632     }
2633 #endif
2634     GenTree* containedMemOp = nullptr;
2635     if (op1->isContained() && !op1->IsCnsIntOrI())
2636     {
2637         assert(!op2->isContained() || op2->IsCnsIntOrI());
2638         containedMemOp = op1;
2639     }
2640     else if (op2->isContained() && !op2->IsCnsIntOrI())
2641     {
2642         containedMemOp = op2;
2643     }
2644     if ((containedMemOp != nullptr) && CheckAndSetDelayFree(containedMemOp))
2645     {
2646         info->hasDelayFreeSrc = true;
2647     }
2648 }
2649
2650 //------------------------------------------------------------------------------
2651 // SetContainsAVXFlags: Set the ContainsAVX flag when the type is floating point, and set the
2652 // Contains256bitAVX flag when the SIMD vector size is 32 bytes.
2653 //
2654 // Arguments:
2655 //    isFloatingPointType   - true if it is floating point type
2656 //    sizeOfSIMDVector      - SIMD Vector size
2657 //
2658 void LinearScan::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
2659 {
2660     if (isFloatingPointType && compiler->canUseVexEncoding())
2661     {
2662         compiler->getEmitter()->SetContainsAVX(true);
2663         if (sizeOfSIMDVector == 32)
2664         {
2665             compiler->getEmitter()->SetContains256bitAVX(true);
2666         }
2667     }
2668 }
2669
2670 #ifdef _TARGET_X86_
2671 //------------------------------------------------------------------------
2672 // ExcludeNonByteableRegisters: Determines if we need to exclude non-byteable registers for
2673 // various reasons
2674 //
2675 // Arguments:
2676 //    tree      - The node of interest
2677 //
2678 // Return Value:
2679 //    True if we need to exclude non-byteable registers; false otherwise.
2680 //
2681 bool LinearScan::ExcludeNonByteableRegisters(GenTree* tree)
2682 {
2683     // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr'
2684     // Storeind itself will not produce any value and hence dstCount=0. But op2 could be a TYP_INT
2685     // value. In this case we need to exclude esi/edi from the src candidates of op2.
2686     if (varTypeIsByte(tree))
2687     {
2688         return true;
2689     }
2690     // Example2: GT_CAST(int <- bool <- int) - here the type of the GT_CAST node is int and castToType is bool.
2691     else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType()))
2692     {
2693         return true;
2694     }
2695     else if (tree->OperIsCompare() || tree->OperIs(GT_CMP))
2696     {
2697         GenTree* op1 = tree->gtGetOp1();
2698         GenTree* op2 = tree->gtGetOp2();
2699
2700         // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
2701         // ubyte as the result of the comparison and if the result needs to be materialized into a reg
2702         // simply zero extend it to TYP_INT size.  Here is an example of generated code:
2703         //         cmp dl, byte ptr[addr mode]
2704         //         movzx edx, dl
2705         if (varTypeIsByte(op1) && varTypeIsByte(op2))
2706         {
2707             return true;
2708         }
2709         // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses
2710         // ubyte as the result of the comparison and if the result needs to be materialized into a reg
2711         // simply zero extend it to TYP_INT size.
2712         else if (varTypeIsByte(op1) && op2->IsCnsIntOrI())
2713         {
2714             return true;
2715         }
2716         // Example5: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses
2717         // ubyte as the result of the comparison and if the result needs to be materialized into a reg
2718         // simply zero extend it to TYP_INT size.
2719         else if (op1->IsCnsIntOrI() && varTypeIsByte(op2))
2720         {
2721             return true;
2722         }
2723         else
2724         {
2725             return false;
2726         }
2727     }
2728 #ifdef FEATURE_SIMD
2729     else if (tree->OperGet() == GT_SIMD)
2730     {
2731         GenTreeSIMD* simdNode = tree->AsSIMD();
2732         switch (simdNode->gtSIMDIntrinsicID)
2733         {
2734             case SIMDIntrinsicOpEquality:
2735             case SIMDIntrinsicOpInEquality:
2736                 // We manifest it into a byte register, so the target must be byteable.
2737                 return true;
2738
2739             case SIMDIntrinsicGetItem:
2740             {
2741                 // This logic is duplicated from genSIMDIntrinsicGetItem().
2742                 // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
2743                 // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
2744                 // cases will require this, so the non-byteable registers can be excluded.
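                     // For example (sketch), extracting a TYP_UBYTE element may emit:
                     //     pextrw edx, xmm0, <index / 2>
                     //     movzx  edx, dl            ; DL requires a byteable register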
2745
2746                 GenTree*  op1      = simdNode->gtGetOp1();
2747                 GenTree*  op2      = simdNode->gtGetOp2();
2748                 var_types baseType = simdNode->gtSIMDBaseType;
2749                 if (!isContainableMemoryOp(op1) && op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
2750                 {
2751                     bool     ZeroOrSignExtnReqd = true;
2752                     unsigned baseSize           = genTypeSize(baseType);
2753                     if (baseSize == 1)
2754                     {
2755                         if ((op2->gtIntCon.gtIconVal % 2) == 1)
2756                         {
2757                             ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
2758                         }
2759                     }
2760                     else
2761                     {
2762                         assert(baseSize == 2);
2763                         ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
2764                     }
2765                     return ZeroOrSignExtnReqd;
2766                 }
2767                 break;
2768             }
2769
2770             default:
2771                 break;
2772         }
2773         return false;
2774     }
2775 #endif // FEATURE_SIMD
2776     else
2777     {
2778         return false;
2779     }
2780 }
2781 #endif // _TARGET_X86_
2782
2783 #endif // _TARGET_XARCH_
2784
2785 #endif // !LEGACY_BACKEND