Implement scalar Sse2 hardware intrinsics
src/jit/lsraxarch.cpp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                    Register Requirements for AMD64                        XX
XX                                                                           XX
XX  This encapsulates all the logic for setting register requirements for    XX
XX  the AMD64 architecture.                                                  XX
XX                                                                           XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/

#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif

#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator

#ifdef _TARGET_XARCH_

#include "jit.h"
#include "sideeffects.h"
#include "lower.h"

//------------------------------------------------------------------------
// BuildNode: Set register requirements for a node
//
// Arguments:
//    tree - the node of interest

// Notes:
// Preconditions:
//    LSRA has been initialized and there is a TreeNodeInfo node
//    already allocated and initialized for every tree in the IR.
// Postconditions:
//    Every TreeNodeInfo instance has the right annotations on register
//    requirements needed by LSRA to build the Interval Table (source,
//    destination and internal [temp] register counts).
//
void LinearScan::BuildNode(GenTree* tree)
{
    TreeNodeInfo* info = currentNodeInfo;
    assert(!tree->isContained());

    if (tree->IsValue())
    {
        info->dstCount = 1;
        if (tree->IsUnusedValue())
        {
            info->isLocalDefUse = true;
        }
    }
    else
    {
        info->dstCount = 0;
    }

    // Floating-point types generate AVX instructions (vmovss etc.) when AVX is available, so set the flag.
    SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet()));
    switch (tree->OperGet())
    {
        default:
            BuildSimple(tree);
            break;

        case GT_LCL_VAR:
            // Because we do containment analysis before we redo dataflow and identify register
            // candidates, the containment analysis only uses !lvDoNotEnregister to estimate
            // register candidates.
            // If a lclVar that was estimated to be a register candidate turns out not to be,
            // and it was marked regOptional, it should now be marked contained instead.
            // TODO-XArch-CQ: When this is being called while RefPositions are being created,
            // use lvLRACandidate here instead.
            if (tree->IsRegOptional())
            {
                if (!compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvTracked ||
                    compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvDoNotEnregister)
                {
                    tree->ClearRegOptional();
                    tree->SetContained();
                    info->dstCount = 0;
                    return;
                }
            }
            __fallthrough;

        case GT_LCL_FLD:
            info->srcCount = 0;

#ifdef FEATURE_SIMD
            // Need an additional register to read upper 4 bytes of Vector3.
            if (tree->TypeGet() == TYP_SIMD12)
            {
                // We need an internal register different from targetReg in which 'tree' produces its result
                // because both targetReg and internal reg will be in use at the same time.
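                // A sketch of why (assuming illustrative operands; the exact emitter
                // output may differ): the 12-byte value is assembled from two loads,
                //    movsd  targetReg, qword ptr [base]    ; low 8 bytes
                //    movss  tmpXmm, dword ptr [base+8]     ; upper 4 bytes
                //    shufps targetReg, tmpXmm, ...         ; combine halves
                // so the internal register must stay live alongside targetReg.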
                info->internalFloatCount     = 1;
                info->isInternalRegDelayFree = true;
                info->setInternalCandidates(this, allSIMDRegs());
            }
#endif
            break;

        case GT_STORE_LCL_FLD:
        case GT_STORE_LCL_VAR:
            BuildStoreLoc(tree->AsLclVarCommon());
            break;

        case GT_LIST:
        case GT_FIELD_LIST:
        case GT_ARGPLACE:
        case GT_NO_OP:
        case GT_START_NONGC:
        case GT_PROF_HOOK:
            info->srcCount = 0;
            assert(info->dstCount == 0);
            break;

        case GT_CNS_DBL:
            info->srcCount = 0;
            assert(info->dstCount == 1);
            break;

#if !defined(_TARGET_64BIT_)

        case GT_LONG:
            assert(tree->IsUnusedValue()); // Contained nodes are already processed, only unused GT_LONG can reach here.
            // An unused GT_LONG node needs to consume its sources, but need not produce a register.
            tree->gtType = TYP_VOID;
            tree->ClearUnusedValue();
            info->isLocalDefUse = false;
            info->srcCount      = 2;
            info->dstCount      = 0;
            appendLocationInfoToList(tree->gtGetOp1());
            appendLocationInfoToList(tree->gtGetOp2());
            break;

#endif // !defined(_TARGET_64BIT_)

        case GT_BOX:
        case GT_COMMA:
        case GT_QMARK:
        case GT_COLON:
            info->srcCount = 0;
            assert(info->dstCount == 0);
            unreached();
            break;

        case GT_RETURN:
            BuildReturn(tree);
            break;

        case GT_RETFILT:
            assert(info->dstCount == 0);
            if (tree->TypeGet() == TYP_VOID)
            {
                info->srcCount = 0;
            }
            else
            {
                assert(tree->TypeGet() == TYP_INT);

                info->srcCount = 1;

                info->setSrcCandidates(this, RBM_INTRET);
                LocationInfoListNode* locationInfo = getLocationInfo(tree->gtOp.gtOp1);
                locationInfo->info.setSrcCandidates(this, RBM_INTRET);
                useList.Append(locationInfo);
            }
            break;

        // A GT_NOP is a passthrough if it is void or has a child, but it must be
        // considered to produce a dummy value if it has a type and no child.
        case GT_NOP:
            info->srcCount = 0;
            assert((tree->gtOp.gtOp1 == nullptr) || tree->isContained());
            if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
            {
                assert(info->dstCount == 1);
            }
            else
            {
                assert(info->dstCount == 0);
            }
            break;

        case GT_JTRUE:
        {
            info->srcCount = 0;
            assert(info->dstCount == 0);
            GenTree* cmp = tree->gtGetOp1();
            assert(!cmp->IsValue());
        }
        break;

        case GT_JCC:
            info->srcCount = 0;
            assert(info->dstCount == 0);
            break;

        case GT_SETCC:
            info->srcCount = 0;
            assert(info->dstCount == 1);
#ifdef _TARGET_X86_
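            // setcc writes only an 8-bit register (e.g. "sete al"), and on x86 only
            // EAX, EBX, ECX and EDX have byte-addressable forms, so the destination is
            // restricted to RBM_BYTE_REGS here. (Informational note; on x64 every
            // register is byte-addressable, so no restriction is needed there.)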
            info->setDstCandidates(this, RBM_BYTE_REGS);
#endif // _TARGET_X86_
            break;

        case GT_JMP:
            info->srcCount = 0;
            assert(info->dstCount == 0);
            break;

        case GT_SWITCH:
            // This should never occur since switch nodes must not be visible at this
            // point in the JIT.
            info->srcCount = 0;
            noway_assert(!"Switch must be lowered at this point");
            break;

        case GT_JMPTABLE:
            info->srcCount = 0;
            assert(info->dstCount == 1);
            break;

        case GT_SWITCH_TABLE:
            info->internalIntCount = 1;
            assert(info->dstCount == 0);
            info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
            assert(info->srcCount == 2);
            break;

        case GT_ASG:
            noway_assert(!"We should never hit any assignment operator in lowering");
            info->srcCount = 0;
            break;

#if !defined(_TARGET_64BIT_)
        case GT_ADD_LO:
        case GT_ADD_HI:
        case GT_SUB_LO:
        case GT_SUB_HI:
#endif
        case GT_ADD:
        case GT_SUB:
        case GT_AND:
        case GT_OR:
        case GT_XOR:
        case GT_BT:
            info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
            break;

        case GT_RETURNTRAP:
            // This just turns into a compare of its child with an int + a conditional call.
            info->srcCount = GetOperandInfo(tree->gtOp.gtOp1);
            assert(info->dstCount == 0);
            info->internalIntCount = 1;
            info->setInternalCandidates(this, allRegs(TYP_INT));
            break;

        case GT_MOD:
        case GT_DIV:
        case GT_UMOD:
        case GT_UDIV:
            BuildModDiv(tree->AsOp());
            break;

        case GT_MUL:
        case GT_MULHI:
#if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND)
        case GT_MUL_LONG:
#endif
            BuildMul(tree->AsOp());
            break;

        case GT_INTRINSIC:
            BuildIntrinsic(tree->AsOp());
            break;

#ifdef FEATURE_SIMD
        case GT_SIMD:
            BuildSIMD(tree->AsSIMD());
            break;
#endif // FEATURE_SIMD

#ifdef FEATURE_HW_INTRINSICS
        case GT_HWIntrinsic:
            BuildHWIntrinsic(tree->AsHWIntrinsic());
            break;
#endif // FEATURE_HW_INTRINSICS

        case GT_CAST:
            BuildCast(tree);
            break;

        case GT_BITCAST:
        {
            LocationInfoListNode* locationInfo = getLocationInfo(tree->gtOp.gtOp1);
            locationInfo->info.isTgtPref       = true;
            useList.Append(locationInfo);
            info->srcCount = 1;
            info->dstCount = 1;
        }
        break;

        case GT_NEG:
            info->srcCount = GetOperandInfo(tree->gtOp.gtOp1);

            // TODO-XArch-CQ:
            // SSE instruction set doesn't have an instruction to negate a number.
            // The recommended way is to xor the float/double number with a bitmask.
            // The only way to xor is using xorps or xorpd both of which operate on
            // 128-bit operands.  To hold the bit-mask we would need another xmm
            // register or a 16-byte aligned 128-bit data constant. Right now emitter
            // lacks the support for emitting such constants or instruction with mem
            // addressing mode referring to a 128-bit operand. For now we use an
            // internal xmm register to load 32/64-bit bitmask from data section.
            // Note that by trading additional data section memory (128-bit) we can
            // save on the need for an internal register and also a memory-to-reg
            // move.
            //
            // Note: another option to avoid internal register requirement is by
            // lowering as GT_SUB(0, src).  This will generate code different from
            // Jit64 and could possibly result in compat issues (?).
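            // As an illustrative sketch (assuming a hypothetical data-section constant
            // negMask; the emitter chooses the real encoding), negating a double becomes:
            //    movsd xmm1, qword ptr [negMask]   ; internal reg <- 0x8000000000000000
            //    xorpd xmm0, xmm1                  ; flip the sign bit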
            if (varTypeIsFloating(tree))
            {
                info->internalFloatCount = 1;
                info->setInternalCandidates(this, internalFloatRegCandidates());
            }
            break;

        case GT_NOT:
            info->srcCount = GetOperandInfo(tree->gtOp.gtOp1);
            break;

        case GT_LSH:
        case GT_RSH:
        case GT_RSZ:
        case GT_ROL:
        case GT_ROR:
#ifdef _TARGET_X86_
        case GT_LSH_HI:
        case GT_RSH_LO:
#endif
            (void)BuildShiftRotate(tree);
            break;

        case GT_EQ:
        case GT_NE:
        case GT_LT:
        case GT_LE:
        case GT_GE:
        case GT_GT:
        case GT_TEST_EQ:
        case GT_TEST_NE:
        case GT_CMP:
            BuildCmp(tree);
            break;

        case GT_CKFINITE:
            appendLocationInfoToList(tree->gtOp.gtOp1);
            info->srcCount = 1;
            assert(info->dstCount == 1);
            info->internalIntCount = 1;
            break;

        case GT_CMPXCHG:
        {
            info->srcCount = 3;
            assert(info->dstCount == 1);

            // comparand is preferenced to RAX.
            // Remaining two operands can be in any reg other than RAX.
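            // For reference, the hardware semantics being modeled are roughly:
            //    ; RAX = comparand
            //    lock cmpxchg [location], value  ; if RAX == [location], store value
            //                                    ; RAX ends up holding the original [location]
            // which is why both the comparand and the destination are constrained to RAX.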
            LocationInfoListNode* locationInfo = getLocationInfo(tree->gtCmpXchg.gtOpLocation);
            locationInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~RBM_RAX);
            useList.Append(locationInfo);
            LocationInfoListNode* valueInfo = getLocationInfo(tree->gtCmpXchg.gtOpValue);
            valueInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~RBM_RAX);
            useList.Append(valueInfo);
            info->setDstCandidates(this, RBM_RAX);
            LocationInfoListNode* comparandInfo = getLocationInfo(tree->gtCmpXchg.gtOpComparand);
            comparandInfo->info.setSrcCandidates(this, RBM_RAX);
            useList.Append(comparandInfo);
        }
        break;

        case GT_LOCKADD:
            info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
            assert(info->dstCount == ((tree->TypeGet() == TYP_VOID) ? 0 : 1));
            break;

        case GT_PUTARG_REG:
            BuildPutArgReg(tree->AsUnOp());
            break;

        case GT_CALL:
            BuildCall(tree->AsCall());
            break;

        case GT_ADDR:
        {
            // For a GT_ADDR, the child node should not be evaluated into a register
            GenTree* child = tree->gtOp.gtOp1;
            assert(!isCandidateLocalRef(child));
            assert(child->isContained());
            assert(info->dstCount == 1);
            info->srcCount = 0;
        }
        break;

#if !defined(FEATURE_PUT_STRUCT_ARG_STK)
        case GT_OBJ:
#endif
        case GT_BLK:
        case GT_DYN_BLK:
            // These should all be eliminated prior to Lowering.
            assert(!"Non-store block node in Lowering");
            info->srcCount = 0;
            break;

#ifdef FEATURE_PUT_STRUCT_ARG_STK
        case GT_PUTARG_STK:
            BuildPutArgStk(tree->AsPutArgStk());
            break;
#endif // FEATURE_PUT_STRUCT_ARG_STK

        case GT_STORE_BLK:
        case GT_STORE_OBJ:
        case GT_STORE_DYN_BLK:
            BuildBlockStore(tree->AsBlk());
            break;

        case GT_INIT_VAL:
            // Always a passthrough of its child's value.
            assert(!"INIT_VAL should always be contained");
            break;

        case GT_LCLHEAP:
            BuildLclHeap(tree);
            break;

        case GT_ARR_BOUNDS_CHECK:
#ifdef FEATURE_SIMD
        case GT_SIMD_CHK:
#endif // FEATURE_SIMD
            // Consumes arrLen & index - has no result
            assert(info->dstCount == 0);
            info->srcCount = GetOperandInfo(tree->AsBoundsChk()->gtIndex);
            info->srcCount += GetOperandInfo(tree->AsBoundsChk()->gtArrLen);
            break;

        case GT_ARR_ELEM:
            // These must have been lowered to GT_ARR_INDEX
            noway_assert(!"We should never see a GT_ARR_ELEM after Lowering.");
            info->srcCount = 0;
            break;

        case GT_ARR_INDEX:
        {
            info->srcCount = 2;
            assert(info->dstCount == 1);
            assert(!tree->AsArrIndex()->ArrObj()->isContained());
            assert(!tree->AsArrIndex()->IndexExpr()->isContained());
            // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
            // times while the result is being computed.
            LocationInfoListNode* arrObjInfo = getLocationInfo(tree->AsArrIndex()->ArrObj());
            arrObjInfo->info.isDelayFree     = true;
            useList.Append(arrObjInfo);
            useList.Append(getLocationInfo(tree->AsArrIndex()->IndexExpr()));
            info->hasDelayFreeSrc = true;
        }
        break;

        case GT_ARR_OFFSET:
            // This consumes the offset, if any, the arrObj and the effective index,
            // and produces the flattened offset for this dimension.
            assert(info->dstCount == 1);
            if (tree->gtArrOffs.gtOffset->isContained())
            {
                info->srcCount = 2;
            }
            else
            {
                // Here we simply need an internal register, which must be different
                // from any of the operand's registers, but may be the same as targetReg.
                info->srcCount         = 3;
                info->internalIntCount = 1;
                appendLocationInfoToList(tree->AsArrOffs()->gtOffset);
            }
            appendLocationInfoToList(tree->AsArrOffs()->gtIndex);
            appendLocationInfoToList(tree->AsArrOffs()->gtArrObj);
            break;

        case GT_LEA:
            // The LEA usually passes its operands through to the GT_IND, in which case it will
            // be contained, but we may be instantiating an address, in which case we set them here.
            info->srcCount = 0;
            assert(info->dstCount == 1);
            if (tree->AsAddrMode()->HasBase())
            {
                info->srcCount++;
                appendLocationInfoToList(tree->AsAddrMode()->Base());
            }
            if (tree->AsAddrMode()->HasIndex())
            {
                info->srcCount++;
                appendLocationInfoToList(tree->AsAddrMode()->Index());
            }
            break;

        case GT_STOREIND:
            if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
            {
                BuildGCWriteBarrier(tree);
                break;
            }
            BuildIndir(tree->AsIndir());
            break;

        case GT_NULLCHECK:
            assert(info->dstCount == 0);
            appendLocationInfoToList(tree->gtOp.gtOp1);
            info->srcCount = 1;
            break;

        case GT_IND:
            BuildIndir(tree->AsIndir());
            assert(info->dstCount == 1);
            break;

        case GT_CATCH_ARG:
            info->srcCount = 0;
            assert(info->dstCount == 1);
            info->setDstCandidates(this, RBM_EXCEPTION_OBJECT);
            break;

#if !FEATURE_EH_FUNCLETS
        case GT_END_LFIN:
            info->srcCount = 0;
            assert(info->dstCount == 0);
            break;
#endif

        case GT_CLS_VAR:
            // These nodes are eliminated by rationalizer.
            JITDUMP("Unexpected node %s in Lower.\n", GenTree::OpName(tree->OperGet()));
            unreached();
            break;

        case GT_INDEX_ADDR:
            assert(info->dstCount == 1);
            info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());

            if (tree->AsIndexAddr()->Index()->TypeGet() == TYP_I_IMPL)
            {
                info->internalIntCount = 1;
            }
            else
            {
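                // Element sizes of 1, 2, 4 and 8 can be folded into the scale of an
                // addressing mode (e.g. "lea rax, [rcx+rdx*8]", as a sketch); any other
                // size needs an internal register for the multiply.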
                switch (tree->AsIndexAddr()->gtElemSize)
                {
                    case 1:
                    case 2:
                    case 4:
                    case 8:
                        break;

                    default:
                        info->internalIntCount = 1;
                        break;
                }
            }
            break;
    } // end switch (tree->OperGet())

    // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
    // Even then we would like to set isTgtPref on Op1.
    if (tree->OperIsBinary() && info->srcCount >= 1)
    {
        if (isRMWRegOper(tree))
        {
            GenTree* op1 = tree->gtOp.gtOp1;
            GenTree* op2 = tree->gtOp.gtOp2;

            // Commutative opers like add/mul/and/or/xor could reverse the order of
            // operands if it is safe to do so.  In such a case we would like op2 to be
            // target preferenced instead of op1.
            if (tree->OperIsCommutative() && op1->isContained() && op2 != nullptr)
            {
                op1 = op2;
                op2 = tree->gtOp.gtOp1;
            }

            // If we have a read-modify-write operation, we want to preference op1 to the target,
            // if it is not contained.
            if (!op1->isContained() && !op1->OperIs(GT_LIST))
            {
                useList.GetTreeNodeInfo(op1).isTgtPref = true;
            }

            // Is this a non-commutative operator, or is op2 a contained memory op?
            // In either case, we need to make op2 remain live until the op is complete, by marking
            // the source(s) associated with op2 as "delayFree".
            // Note that if op2 of a binary RMW operator is a memory op, even if the operator
            // is commutative, codegen cannot reverse them.
            // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
            // more work to be done to correctly reverse the operands if they involve memory
            // operands.  Also, we may need to handle more cases than GT_IND, especially once
            // we've modified the register allocator to not require all nodes to be assigned
            // a register (e.g. a spilled lclVar can often be referenced directly from memory).
            // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
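            // For example (a sketch), "t = x - [base+index]" is emitted as
            //    mov t, x
            //    sub t, [base+index]
            // so base/index must remain live until the sub has written t; marking them
            // delayFree keeps t from being assigned one of their registers.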

            GenTree* delayUseSrc = nullptr;
            // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
            // to special case them.
            if (tree->OperGet() == GT_XADD || tree->OperGet() == GT_XCHG || tree->OperGet() == GT_LOCKADD)
            {
                // These tree nodes will have their op1 marked as isDelayFree=true.
                // Hence these tree nodes should have a Def position so that op1's reg
                // gets freed at DefLoc+1.
                if (tree->TypeGet() == TYP_VOID)
                {
                    // Right now a GT_XADD node could be morphed into a
                    // GT_LOCKADD of TYP_VOID. See gtExtractSideEffList().
                    // Note that it is advantageous to use GT_LOCKADD
                    // instead of GT_XADD, as the former uses lock.add,
                    // which allows its second operand to be a contained
                    // immediate, whereas the xadd instruction requires its
                    // second operand to be in a register.
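                    // Roughly (a sketch of the difference):
                    //    lock add dword ptr [mem], 1     ; GT_LOCKADD: immediate operand is fine
                    //    mov  reg, 1
                    //    lock xadd dword ptr [mem], reg  ; GT_XADD: operand must be a register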
                    assert(info->dstCount == 0);

                    // Give it an artificial type and mark it as an unused value.
                    // This results in a Def position created but not considered consumed by its parent node.
                    tree->gtType        = TYP_INT;
                    info->dstCount      = 1;
                    info->isLocalDefUse = true;
                    tree->SetUnusedValue();
                }
                else
                {
                    assert(info->dstCount != 0);
                }

                delayUseSrc = op1;
            }
            else if ((op2 != nullptr) && (!tree->OperIsCommutative() || (op2->isContained() && !op2->IsCnsIntOrI())))
            {
                delayUseSrc = op2;
            }
            if ((delayUseSrc != nullptr) && CheckAndSetDelayFree(delayUseSrc))
            {
                info->hasDelayFreeSrc = true;
            }
        }
    }

    BuildCheckByteable(tree);

    // We need to be sure that we've set info->srcCount and info->dstCount appropriately
    assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));
    assert(info->isLocalDefUse == (tree->IsValue() && tree->IsUnusedValue()));
    assert(!tree->IsUnusedValue() || (info->dstCount != 0));
    assert(info->dstCount == tree->GetRegisterDstCount());
}

//---------------------------------------------------------------------
// CheckAndSetDelayFree - Set isDelayFree on the given operand or its child(ren), if appropriate
//
// Arguments
//    delayUseSrc - a node that may have a delayed use
//
// Return Value:
//    True iff the node or one of its children has been marked isDelayFree
//
// Notes:
//    Only register operands should be marked isDelayFree, not contained immediates or memory.
//
bool LinearScan::CheckAndSetDelayFree(GenTree* delayUseSrc)
{
    // If delayUseSrc is an indirection and it doesn't produce a result, then we need to set "delayFree"
    // on the base & index, if any.
    // Otherwise, we set it on delayUseSrc itself.
    bool returnValue = false;
    if (delayUseSrc->isContained())
    {
        // If delayUseSrc is a non-Indir contained node (e.g. a local) there's no register use to delay.
        if (delayUseSrc->isIndir())
        {
            GenTree* base  = delayUseSrc->AsIndir()->Base();
            GenTree* index = delayUseSrc->AsIndir()->Index();
            if ((base != nullptr) && !base->isContained())
            {
                useList.GetTreeNodeInfo(base).isDelayFree = true;
                returnValue                               = true;
            }
            if (index != nullptr)
            {
                assert(!index->isContained());
                useList.GetTreeNodeInfo(index).isDelayFree = true;
                returnValue                                = true;
            }
        }
    }
    else
    {
        useList.GetTreeNodeInfo(delayUseSrc).isDelayFree = true;
        returnValue                                      = true;
    }
    return returnValue;
}

//------------------------------------------------------------------------
// BuildCheckByteable: Check the tree to see if "byte-able" registers are
// required, and set the tree node info accordingly.
//
// Arguments:
//    tree      - The node of interest
//
// Return Value:
//    None.
//
void LinearScan::BuildCheckByteable(GenTree* tree)
{
#ifdef _TARGET_X86_
    TreeNodeInfo* info = currentNodeInfo;
    // Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands
    // if the tree node is a byte type.
    //
    // Though this looks conservative in theory, in practice we could not think of a case where
    // the below logic leads to conservative register specification.  In future when or if we find
    // one such case, this logic needs to be fine tuned for that case(s).

    if (ExcludeNonByteableRegisters(tree))
    {
        regMaskTP regMask;
        if (info->dstCount > 0)
        {
            regMask = info->getDstCandidates(this);
            assert(regMask != RBM_NONE);
            info->setDstCandidates(this, regMask & ~RBM_NON_BYTE_REGS);
        }

        if (tree->OperIsSimple())
        {
            GenTree* op = tree->gtOp.gtOp1;
            if (op != nullptr)
            {
                // No need to set src candidates on a contained child operand.
                if (!op->isContained())
                {
                    TreeNodeInfo& op1Info = useList.GetTreeNodeInfo(op);
                    regMask               = op1Info.getSrcCandidates(this);
                    assert(regMask != RBM_NONE);
                    op1Info.setSrcCandidates(this, regMask & ~RBM_NON_BYTE_REGS);
                }
            }

            if (tree->OperIsBinary() && (tree->gtOp.gtOp2 != nullptr))
            {
                op = tree->gtOp.gtOp2;
                if (!op->isContained())
                {
                    TreeNodeInfo& op2Info = useList.GetTreeNodeInfo(op);
                    regMask               = op2Info.getSrcCandidates(this);
                    assert(regMask != RBM_NONE);
                    op2Info.setSrcCandidates(this, regMask & ~RBM_NON_BYTE_REGS);
                }
            }
        }
    }
#endif //_TARGET_X86_
}

//------------------------------------------------------------------------------
// isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format
//
// Arguments:
//    tree      - a binary tree node
//
// Return Value:
//    Returns true if we can use the read-modify-write instruction form
//
// Notes:
//    This is used to determine whether to preference the source to the destination register.
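//    For example (illustrative), "a = a + b" maps to the two-operand form
//    "add regA, regB", which overwrites its first operand; preferencing op1
//    to the target register avoids an extra reg-to-reg mov.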
//
bool LinearScan::isRMWRegOper(GenTree* tree)
{
    // TODO-XArch-CQ: Make this more accurate.
    // For now, we assume that most binary operators are of the RMW form.
    assert(tree->OperIsBinary());

    if (tree->OperIsCompare() || tree->OperIs(GT_CMP))
    {
        return false;
    }

    switch (tree->OperGet())
    {
        // These Opers either support a three op form (i.e. GT_LEA), or do not read/write their first operand
        case GT_LEA:
        case GT_STOREIND:
        case GT_ARR_INDEX:
        case GT_STORE_BLK:
        case GT_STORE_OBJ:
            return false;

        // x86/x64 does support a three-operand multiply when op1 or op2 is a contained immediate
        case GT_MUL:
            return (!tree->gtOp.gtOp2->isContainedIntOrIImmed() && !tree->gtOp.gtOp1->isContainedIntOrIImmed());

        default:
            return true;
    }
}

//------------------------------------------------------------------------
// BuildShiftRotate: Set the NodeInfo for a shift or rotate.
//
// Arguments:
//    tree      - The node of interest
//
// Return Value:
//    The number of sources consumed by this node.
//
int LinearScan::BuildShiftRotate(GenTree* tree)
{
    TreeNodeInfo* info = currentNodeInfo;
    // For shift operations, the number of bits to shift must be in CL
    // when the shift count is not a constant.
    int                   srcCount    = 0;
    GenTree*              shiftBy     = tree->gtOp.gtOp2;
    GenTree*              source      = tree->gtOp.gtOp1;
    LocationInfoListNode* shiftByInfo = nullptr;
    // x64 can encode 8 bits of shift count, but the hardware uses only the low 5 bits (6 for
    // 64-bit operands); the remaining bits are masked off.
    // We will allow whatever can be encoded - hope you know what you are doing.
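    // For a non-constant shift count, the generated code is roughly (a sketch):
    //    mov ecx, <shiftByReg>   ; count must be in CL
    //    shl <dstReg>, cl
    // hence the RBM_RCX constraints below.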
    if (shiftBy->isContained())
    {
        srcCount += GetOperandInfo(source);
    }
    else
    {
        srcCount++;
        shiftByInfo = getLocationInfo(shiftBy);
        shiftByInfo->info.setSrcCandidates(this, RBM_RCX);
        info->setDstCandidates(this, allRegs(TYP_INT) & ~RBM_RCX);
        LocationInfoListNode* sourceInfo;
        srcCount += GetOperandInfo(source, &sourceInfo);
        for (; sourceInfo != nullptr; sourceInfo = sourceInfo->Next())
        {
            sourceInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~RBM_RCX);
        }
    }

    // Note that Rotate Left/Right instructions don't set ZF and SF flags.
    //
    // If the operand being shifted is 32 bits, the upper three bits of the count are masked
    // off by hardware to get the actual shift count.  Similarly, for 64-bit operands the
    // shift count is narrowed to [0..63].  If the resulting shift count is zero, the shift
    // operation won't modify flags.
    //
    // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
    // if the shift count is known to be non-zero and in the range depending on the
    // operand size.
    CLANG_FORMAT_COMMENT_ANCHOR;

#ifdef _TARGET_X86_
    // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
    // we can have a three operand form. Increment the srcCount.
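    // For example (a sketch), GT_LSH_HI becomes "shld hiReg, loReg, cl", which reads
    // both halves of the long while overwriting one of them; that is why one half is
    // marked delay-free below.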
    if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
    {
        assert((source->OperGet() == GT_LONG) && source->isContained());

        GenTree*              sourceLo     = source->gtOp.gtOp1;
        LocationInfoListNode* sourceLoInfo = useList.Begin();
        LocationInfoListNode* sourceHiInfo = useList.GetSecond(INDEBUG(source->gtGetOp2()));

        info->hasDelayFreeSrc = true;
        if (tree->OperGet() == GT_LSH_HI)
        {
            sourceLoInfo->info.isDelayFree = true;
        }
        else
        {
            sourceHiInfo->info.isDelayFree = true;
        }
    }
#endif
    if (shiftByInfo != nullptr)
    {
        if (tree->IsReverseOp())
        {
            useList.Prepend(shiftByInfo);
        }
        else
        {
            useList.Append(shiftByInfo);
        }
    }
    if (!tree->isContained())
    {
        info->srcCount = srcCount;
    }
    return srcCount;
}

//------------------------------------------------------------------------
// BuildCall: Set the NodeInfo for a call.
//
// Arguments:
//    call      - The call node of interest
//
// Return Value:
//    None.
//
void LinearScan::BuildCall(GenTreeCall* call)
{
    TreeNodeInfo*   info              = currentNodeInfo;
    bool            hasMultiRegRetVal = false;
    ReturnTypeDesc* retTypeDesc       = nullptr;

    assert(!call->isContained());
    info->srcCount = 0;
    if (call->TypeGet() != TYP_VOID)
    {
        hasMultiRegRetVal = call->HasMultiRegRetVal();
        if (hasMultiRegRetVal)
        {
            // dst count = number of registers in which the value is returned by call
            retTypeDesc    = call->GetReturnTypeDesc();
            info->dstCount = retTypeDesc->GetReturnRegCount();
        }
        else
        {
            assert(info->dstCount == 1);
        }
    }
    else
    {
        assert(info->dstCount == 0);
    }

    GenTree* ctrlExpr = call->gtControlExpr;
    if (call->gtCallType == CT_INDIRECT)
    {
        ctrlExpr = call->gtCallAddr;
    }

    // If this is a varargs call, we will clear the internal candidates in case we need
    // to reserve some integer registers for copying float args.
    // We have to do this because otherwise the default candidates are allRegs, and adding
    // the individual specific registers will have no effect.
    if (call->IsVarargs())
    {
        info->setInternalCandidates(this, RBM_NONE);
    }

    RegisterType registerType = call->TypeGet();

    // Set destination candidates for return value of the call.
    CLANG_FORMAT_COMMENT_ANCHOR;

#ifdef _TARGET_X86_
    if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
    {
        // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
        // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
        // correct argument registers.
        info->setDstCandidates(this, RBM_PINVOKE_TCB);
    }
    else
#endif // _TARGET_X86_
        if (hasMultiRegRetVal)
    {
        assert(retTypeDesc != nullptr);
        info->setDstCandidates(this, retTypeDesc->GetABIReturnRegs());
    }
    else if (varTypeIsFloating(registerType))
    {
#ifdef _TARGET_X86_
        // The return value will be on the X87 stack, and we will need to move it.
        info->setDstCandidates(this, allRegs(registerType));
#else  // !_TARGET_X86_
        info->setDstCandidates(this, RBM_FLOATRET);
#endif // !_TARGET_X86_
    }
    else if (registerType == TYP_LONG)
    {
        info->setDstCandidates(this, RBM_LNGRET);
    }
    else
    {
        info->setDstCandidates(this, RBM_INTRET);
    }

    // number of args to a call =
    // callRegArgs + (callargs - placeholders, setup, etc)
    // there is an explicit thisPtr but it is redundant

    bool callHasFloatRegArgs = false;
    bool isVarArgs           = call->IsVarargs();

    // First, count reg args
    for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
    {
        assert(list->OperIsList());

        // By this point, lowering has ensured that all call arguments are one of the following:
        // - an arg setup store
        // - an arg placeholder
        // - a nop
        // - a copy blk
        // - a field list
        // - a put arg
        //
        // Note that this property is statically checked by LinearScan::CheckBlock.
        GenTree* argNode = list->Current();

        // Each register argument corresponds to one source.
        if (argNode->OperIsPutArgReg())
        {
            info->srcCount++;
            HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs);
            appendLocationInfoToList(argNode);
        }
#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
        else if (argNode->OperGet() == GT_FIELD_LIST)
        {
            for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
            {
                assert(entry->Current()->OperIsPutArgReg());
                info->srcCount++;
                HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs);
                appendLocationInfoToList(entry->Current());
            }
        }
#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING

#ifdef DEBUG
        // In DEBUG only, check validity with respect to the arg table entry.

        fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
        assert(curArgTabEntry);

        if (curArgTabEntry->regNum == REG_STK)
        {
            // late arg that is not passed in a register
            assert(argNode->gtOper == GT_PUTARG_STK);

#ifdef FEATURE_PUT_STRUCT_ARG_STK
            // If the node is TYP_STRUCT and it is put on stack with
            // putarg_stk operation, we consume and produce no registers.
            // In this case the embedded Obj node should not produce
            // registers too since it is contained.
            // Note that if it is a SIMD type the argument will be in a register.
            if (argNode->TypeGet() == TYP_STRUCT)
            {
                assert(argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_OBJ);
                assert(argNode->gtOp.gtOp1->isContained());
            }
#endif // FEATURE_PUT_STRUCT_ARG_STK
            continue;
        }
#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
        if (argNode->OperGet() == GT_FIELD_LIST)
        {
            assert(argNode->isContained());
            assert(varTypeIsStruct(argNode) || curArgTabEntry->isStruct);

            int i = 0;
            for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
            {
                const regNumber argReg = (i == 0) ? curArgTabEntry->regNum : curArgTabEntry->otherRegNum;
                assert(entry->Current()->gtRegNum == argReg);
                assert(i < 2);
                i++;
            }
        }
        else
#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
        {
            const regNumber argReg = curArgTabEntry->regNum;
            assert(argNode->gtRegNum == argReg);
        }
#endif // DEBUG
    }

    // Now, count stack args
    // Note that these need to be computed into a register, but then
    // they're just stored to the stack - so the reg doesn't
    // need to remain live until the call.  In fact, it must not
    // because the code generator doesn't actually consider it live,
    // so it can't be spilled.

    GenTree* args = call->gtCallArgs;
    while (args)
    {
        GenTree* arg = args->gtOp.gtOp1;
        if ((arg != nullptr) && !(arg->gtFlags & GTF_LATE_ARG))
        {
            if (arg->IsValue() && !arg->isContained())
            {
                // argInfo->isLocalDefUse = true;
                assert(arg->IsUnusedValue());
            }
            // assert(argInfo->dstCount == 0);
        }
        args = args->gtOp.gtOp2;
    }

    // set reg requirements on call target represented as control sequence.
    if (ctrlExpr != nullptr)
    {
        LocationInfoListNode* ctrlExprInfo  = nullptr;
        int                   ctrlExprCount = GetOperandInfo(ctrlExpr);
        if (ctrlExprCount != 0)
        {
            assert(ctrlExprCount == 1);
            ctrlExprInfo = useList.Last();
            info->srcCount++;
        }

        // In case of fast tail implemented as jmp, make sure that gtControlExpr is
        // computed into a register.
        if (call->IsFastTailCall())
        {
            assert(!ctrlExpr->isContained() && ctrlExprInfo != nullptr);
            // Fast tail call - make sure that call target is always computed in RAX
            // so that epilog sequence can generate "jmp rax" to achieve fast tail call.
            ctrlExprInfo->info.setSrcCandidates(this, RBM_RAX);
        }
#ifdef _TARGET_X86_
        else if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
        {
            // On x86, we need to generate a very specific pattern for indirect VSD calls:
            //
            //    3-byte nop
            //    call dword ptr [eax]
            //
            // Where EAX is also used as an argument to the stub dispatch helper. Make
            // sure that the call target address is computed into EAX in this case.
            assert(ctrlExprInfo != nullptr);
            assert(ctrlExpr->isIndir() && ctrlExpr->isContained());
            ctrlExprInfo->info.setSrcCandidates(this, RBM_VIRTUAL_STUB_TARGET);
        }
#endif // _TARGET_X86_

#if FEATURE_VARARG
        // If it is a fast tail call, it is already preferenced to use RAX.
        // Therefore, no need to set src candidates on call tgt again.
        if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExprInfo != nullptr))
        {
            // Don't assign the call target to any of the argument registers because
            // we will use them to also pass floating point arguments as required
            // by Amd64 ABI.
            ctrlExprInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~(RBM_ARG_REGS));
        }
#endif // FEATURE_VARARG
    }
}

//------------------------------------------------------------------------
// BuildBlockStore: Set the NodeInfo for a block store.
//
// Arguments:
//    blkNode       - The block store node of interest
//
// Return Value:
//    None.
//
void LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
{
    TreeNodeInfo* info    = currentNodeInfo;
    GenTree*      dstAddr = blkNode->Addr();
    unsigned      size    = blkNode->gtBlkSize;
    GenTree*      source  = blkNode->Data();

    LocationInfoListNode* dstAddrInfo = nullptr;
    LocationInfoListNode* sourceInfo  = nullptr;
    LocationInfoListNode* sizeInfo    = nullptr;

    // Sources are dest address, initVal or source.
    // We may require an additional source or temp register for the size.
    if (!dstAddr->isContained())
    {
        info->srcCount++;
        dstAddrInfo = getLocationInfo(dstAddr);
    }
    assert(info->dstCount == 0);
    info->setInternalCandidates(this, RBM_NONE);
    GenTree* srcAddrOrFill = nullptr;
    bool     isInitBlk     = blkNode->OperIsInitBlkOp();

    regMaskTP dstAddrRegMask = RBM_NONE;
    regMaskTP sourceRegMask  = RBM_NONE;
    regMaskTP blkSizeRegMask = RBM_NONE;

    if (isInitBlk)
    {
        GenTree* initVal = source;
        if (initVal->OperIsInitVal())
        {
            assert(initVal->isContained());
            initVal = initVal->gtGetOp1();
        }
        srcAddrOrFill = initVal;
        if (!initVal->isContained())
        {
            info->srcCount++;
            sourceInfo = getLocationInfo(initVal);
        }

        switch (blkNode->gtBlkOpKind)
        {
            case GenTreeBlk::BlkOpKindUnroll:
                assert(initVal->IsCnsIntOrI());
                if (size >= XMM_REGSIZE_BYTES)
                {
                    // Reserve an XMM register to fill it with a pack of 16 init value constants.
                    info->internalFloatCount = 1;
                    info->setInternalCandidates(this, internalFloatRegCandidates());
                    // Filling with a constant uses an XMM register; that may be an AVX instruction, so set the flag.
                    SetContainsAVXFlags();
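                    // Sketch of the idea (not the exact emitter output), for an init
                    // value of 0x2A:
                    //    mov    eax, 0x2A2A2A2A
                    //    movd   xmm0, eax
                    //    pshufd xmm0, xmm0, 0     ; broadcast to all 16 bytes
                    //    movdqu [dst], xmm0       ; repeated per 16-byte chunk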
                }
#ifdef _TARGET_X86_
                if ((size & 1) != 0)
                {
                    // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
                    // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
                    // when unrolling, so only allow byteable registers as the source value. (We could
                    // consider just using BlkOpKindRepInstr instead.)
                    sourceRegMask = RBM_BYTE_REGS;
                }
#endif // _TARGET_X86_
                break;

            case GenTreeBlk::BlkOpKindRepInstr:
                // rep stos has the following register requirements:
                // a) The memory address has to be in RDI.
                // b) The fill value has to be in RAX.
                // c) The buffer size will go in RCX.
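                // i.e. the generated sequence is roughly (a sketch):
                //    mov rdi, <dstAddr> ; mov rax, <fillValue> ; mov rcx, <count>
                //    rep stosb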
                dstAddrRegMask = RBM_RDI;
                sourceRegMask  = RBM_RAX;
                blkSizeRegMask = RBM_RCX;
                break;

            case GenTreeBlk::BlkOpKindHelper:
#ifdef _TARGET_AMD64_
                // The helper follows the regular AMD64 ABI.
                dstAddrRegMask = RBM_ARG_0;
                sourceRegMask  = RBM_ARG_1;
                blkSizeRegMask = RBM_ARG_2;
#else  // !_TARGET_AMD64_
                dstAddrRegMask             = RBM_RDI;
                sourceRegMask              = RBM_RAX;
                blkSizeRegMask             = RBM_RCX;
#endif // !_TARGET_AMD64_
                break;

            default:
                unreached();
        }
    }
    else
    {
        // CopyObj or CopyBlk
        if (source->gtOper == GT_IND)
        {
            assert(source->isContained());
            srcAddrOrFill = source->gtGetOp1();
            if (!srcAddrOrFill->isContained())
            {
                sourceInfo = getLocationInfo(srcAddrOrFill);
                info->srcCount++;
            }
        }
        if (blkNode->OperGet() == GT_STORE_OBJ)
        {
            if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr)
            {
                // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
                blkSizeRegMask = RBM_RCX;
            }
            // The srcAddr must be in a register.  If it was under a GT_IND, we need to subsume all of its
            // sources.
            sourceRegMask  = RBM_RSI;
            dstAddrRegMask = RBM_RDI;
        }
        else
        {
            switch (blkNode->gtBlkOpKind)
            {
                case GenTreeBlk::BlkOpKindUnroll:
                    // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
                    //
                    // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
                    // But on x86 only RBM_BYTE_REGS could be used as byte registers.  Therefore, exclude
                    // RBM_NON_BYTE_REGS from internal candidates.
                    if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
                    {
                        info->internalIntCount++;
                        regMaskTP regMask = allRegs(TYP_INT);

#ifdef _TARGET_X86_
                        if ((size & 1) != 0)
                        {
                            regMask &= ~RBM_NON_BYTE_REGS;
                        }
#endif
                        info->setInternalCandidates(this, regMask);
                    }

                    if (size >= XMM_REGSIZE_BYTES)
                    {
                        // If we have a buffer larger than XMM_REGSIZE_BYTES,
                        // reserve an XMM register to use it for a
                        // series of 16-byte loads and stores.
                        info->internalFloatCount = 1;
                        info->addInternalCandidates(this, internalFloatRegCandidates());
                        // Uses XMM reg for load and store and hence check to see whether AVX instructions
                        // are used for codegen, set ContainsAVX flag
                        SetContainsAVXFlags();
                    }
                    break;

                case GenTreeBlk::BlkOpKindRepInstr:
                    // rep movs has the following register requirements:
                    // a) The dest address has to be in RDI.
                    // b) The src address has to be in RSI.
                    // c) The buffer size will go in RCX.
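                    // i.e. roughly (a sketch):
                    //    mov rdi, <dstAddr> ; mov rsi, <srcAddr> ; mov rcx, <count>
                    //    rep movsb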
                    dstAddrRegMask = RBM_RDI;
                    sourceRegMask  = RBM_RSI;
                    blkSizeRegMask = RBM_RCX;
                    break;

                case GenTreeBlk::BlkOpKindHelper:
#ifdef _TARGET_AMD64_
                    // The helper follows the regular AMD64 ABI.
                    dstAddrRegMask = RBM_ARG_0;
                    sourceRegMask  = RBM_ARG_1;
                    blkSizeRegMask = RBM_ARG_2;
#else  // !_TARGET_AMD64_
                    dstAddrRegMask         = RBM_RDI;
                    sourceRegMask          = RBM_RAX;
                    blkSizeRegMask         = RBM_RCX;
#endif // !_TARGET_AMD64_
                    break;

                default:
                    unreached();
            }
        }
    }

    if (dstAddrInfo != nullptr)
    {
        if (dstAddrRegMask != RBM_NONE)
        {
            dstAddrInfo->info.setSrcCandidates(this, dstAddrRegMask);
        }
        useList.Append(dstAddrInfo);
    }
    if (sourceRegMask != RBM_NONE)
    {
        if (sourceInfo != nullptr)
        {
            sourceInfo->info.setSrcCandidates(this, sourceRegMask);
        }
        else
        {
            // This is a local source; we'll use a temp register for its address.
            info->addInternalCandidates(this, sourceRegMask);
            info->internalIntCount++;
        }
    }
    if (sourceInfo != nullptr)
    {
        useList.Add(sourceInfo, blkNode->IsReverseOp());
    }

    if (blkNode->OperIs(GT_STORE_DYN_BLK))
    {
        // The block size argument is a third argument to GT_STORE_DYN_BLK
        info->srcCount++;

        GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
        sizeInfo           = getLocationInfo(blockSize);
        useList.Add(sizeInfo, blkNode->AsDynBlk()->gtEvalSizeFirst);
    }

    if (blkSizeRegMask != RBM_NONE)
    {
        if (size != 0)
        {
            // Reserve a temp register for the block size argument.
            info->addInternalCandidates(this, blkSizeRegMask);
            info->internalIntCount++;
        }
        else
        {
            // The block size argument is a third argument to GT_STORE_DYN_BLK
            assert((blkNode->gtOper == GT_STORE_DYN_BLK) && (sizeInfo != nullptr));
            info->setSrcCount(3);
            sizeInfo->info.setSrcCandidates(this, blkSizeRegMask);
        }
    }
}

#ifdef FEATURE_PUT_STRUCT_ARG_STK
//------------------------------------------------------------------------
// BuildPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
//
// Arguments:
//    putArgStk - The node of interest
//
// Return Value:
//    None.
//
void LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
{
    TreeNodeInfo* info = currentNodeInfo;
    info->srcCount     = 0;
    assert(info->dstCount == 0);

    if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
    {
        putArgStk->gtOp1->SetContained();

#ifdef _TARGET_X86_
        unsigned fieldCount    = 0;
        bool     needsByteTemp = false;
        bool     needsSimdTemp = false;
        unsigned prevOffset    = putArgStk->getArgSize();
        for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
        {
            GenTree* const  fieldNode   = current->Current();
            const var_types fieldType   = fieldNode->TypeGet();
            const unsigned  fieldOffset = current->gtFieldOffset;
            assert(fieldType != TYP_LONG);

#if defined(FEATURE_SIMD)
            // Note that we need to check the GT_FIELD_LIST type, not 'fieldType'. This is because the
            // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where
            // we "round up" to 16.
            if (current->gtFieldType == TYP_SIMD12)
            {
                needsSimdTemp = true;
            }
#endif // defined(FEATURE_SIMD)

            // We can treat as a slot any field that is stored at a slot boundary, where the previous
            // field is not in the same slot. (Note that we store the fields in reverse order.)
            const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
            if (!fieldIsSlot)
            {
                if (varTypeIsByte(fieldType))
                {
                    // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
                    // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
                    // need a byte-addressable register for the store. We will enforce this requirement on an internal
                    // register, which we can use to copy multiple byte values.
                    needsByteTemp = true;
                }
            }

            if (varTypeIsGC(fieldType))
            {
                putArgStk->gtNumberReferenceSlots++;
            }
            prevOffset = fieldOffset;
            fieldCount++;
            if (!fieldNode->isContained())
            {
                appendLocationInfoToList(fieldNode);
                info->srcCount++;
            }
        }

        if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push)
        {
            // If any of the fields cannot be stored with an actual push, we may need a temporary
            // register to load the value before storing it to the stack location.
            info->internalIntCount = 1;
            regMaskTP regMask      = allRegs(TYP_INT);
            if (needsByteTemp)
            {
                regMask &= ~RBM_NON_BYTE_REGS;
            }
            info->setInternalCandidates(this, regMask);
        }

#if defined(FEATURE_SIMD)
        // For PutArgStk of a TYP_SIMD12, we need a SIMD temp register.
        if (needsSimdTemp)
        {
            assert(info->dstCount == 0);
            info->internalFloatCount += 1;
            info->addInternalCandidates(this, allSIMDRegs());
1507         }
1508 #endif // defined(FEATURE_SIMD)
1509
1510         return;
1511 #endif // _TARGET_X86_
1512     }
1513
1514     GenTree*  src  = putArgStk->gtOp1;
1515     var_types type = src->TypeGet();
1516
1517 #if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1518     // For PutArgStk of a TYP_SIMD12, we need an extra register.
1519     if (putArgStk->isSIMD12())
1520     {
1521         appendLocationInfoToList(putArgStk->gtOp1);
1522         info->srcCount           = 1;
1523         info->internalFloatCount = 1;
1524         info->setInternalCandidates(this, allSIMDRegs());
1525         return;
1526     }
1527 #endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1528
1529     if (type != TYP_STRUCT)
1530     {
1531         BuildSimple(putArgStk);
1532         return;
1533     }
1534
1535     GenTree* dst     = putArgStk;
1536     GenTree* srcAddr = nullptr;
1537
1538     info->srcCount = GetOperandInfo(src);
1539
1540     // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
1541     // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
1542     // our framework assemblies, so this is the main code generation scheme we'll use.
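         //
         // As a sketch, a 32-byte struct (size assumed for illustration) would be
         // copied with unrolled 16-byte SSE2 moves along the lines of:
         //     movdqu xmm0, [src]
         //     movdqu [dst], xmm0
         //     movdqu xmm0, [src+16]
         //     movdqu [dst+16], xmm0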
1543     ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
1544     switch (putArgStk->gtPutArgStkKind)
1545     {
1546         case GenTreePutArgStk::Kind::Push:
1547         case GenTreePutArgStk::Kind::PushAllSlots:
1548         case GenTreePutArgStk::Kind::Unroll:
1549             // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1550             //
1551             // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1552             // But on x86 only RBM_BYTE_REGS can be used as byte registers. Therefore, exclude
1553             // RBM_NON_BYTE_REGS from internal candidates.
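                 //
                 // For example, a 25-byte copy would be split as 16 + 8 + 1 bytes; the
                 // trailing 1-byte mov is what requires a byte-addressable temp on x86.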
1554             if ((putArgStk->gtNumberReferenceSlots == 0) && (size & (XMM_REGSIZE_BYTES - 1)) != 0)
1555             {
1556                 info->internalIntCount++;
1557                 regMaskTP regMask = allRegs(TYP_INT);
1558
1559 #ifdef _TARGET_X86_
1560                 if ((size % 2) != 0)
1561                 {
1562                     regMask &= ~RBM_NON_BYTE_REGS;
1563                 }
1564 #endif
1565                 info->setInternalCandidates(this, regMask);
1566             }
1567
1568 #ifdef _TARGET_X86_
1569             if (size >= 8)
1570 #else  // !_TARGET_X86_
1571             if (size >= XMM_REGSIZE_BYTES)
1572 #endif // !_TARGET_X86_
1573             {
1574                 // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
1575                 // or larger than or equal to 8 bytes on x86, reserve an XMM register to use
1576                 // for a series of 16-byte loads and stores.
1577                 info->internalFloatCount = 1;
1578                 info->addInternalCandidates(this, internalFloatRegCandidates());
1579                 SetContainsAVXFlags();
1580             }
1581             break;
1582
1583         case GenTreePutArgStk::Kind::RepInstr:
1584             info->internalIntCount += 3;
1585             info->setInternalCandidates(this, (RBM_RDI | RBM_RCX | RBM_RSI));
1586             break;
1587
1588         default:
1589             unreached();
1590     }
1591 }
1592 #endif // FEATURE_PUT_STRUCT_ARG_STK
1593
1594 //------------------------------------------------------------------------
1595 // BuildLclHeap: Set the NodeInfo for a GT_LCLHEAP.
1596 //
1597 // Arguments:
1598 //    tree      - The node of interest
1599 //
1600 // Return Value:
1601 //    None.
1602 //
1603 void LinearScan::BuildLclHeap(GenTree* tree)
1604 {
1605     TreeNodeInfo* info = currentNodeInfo;
1606     info->srcCount     = 1;
1607     assert(info->dstCount == 1);
1608
1609     // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
1610     // Here '-' means don't care.
1611     //
1612     //     Size?                    Init Memory?         # temp regs
1613     //      0                            -                  0 (returns 0)
1614     //      const and <=6 reg words      -                  0 (pushes '0')
1615     //      const and >6 reg words       Yes                0 (pushes '0')
1616     //      const and <PageSize          No                 0 (amd64) 1 (x86)
1617     //                                                        (x86: tmpReg for subtracting from esp)
1618     //      const and >=PageSize         No                 2 (regCnt and tmpReg for subtracting from sp)
1619     //      Non-const                    Yes                0 (regCnt=targetReg and pushes '0')
1620     //      Non-const                    No                 2 (regCnt and tmpReg for subtracting from sp)
1621     //
1622     // Note: Here we don't need the internal register to be different from targetReg.
1623     // Rather, we require it to be different from the operand's reg.
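         //
         // For example (amd64, with the size chosen for illustration): localloc(40) is
         // aligned up to 48 bytes = 6 register-sized words, so it is emitted as six
         // 'push 0' instructions and needs no temp registers.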
1624
1625     GenTree* size = tree->gtOp.gtOp1;
1626     if (size->IsCnsIntOrI())
1627     {
1628         assert(size->isContained());
1629         info->srcCount = 0;
1630         size_t sizeVal = size->gtIntCon.gtIconVal;
1631
1632         if (sizeVal == 0)
1633         {
1634             info->internalIntCount = 0;
1635         }
1636         else
1637         {
1638             // Compute the amount of memory to allocate, properly aligned to STACK_ALIGN.
1639             // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
1640             // This should also help in debugging as we can examine the original size specified with localloc.
1641             sizeVal = AlignUp(sizeVal, STACK_ALIGN);
1642
1643             // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
1644             // we will generate 'push 0'.
1645             assert((sizeVal % REGSIZE_BYTES) == 0);
1646             size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
1647             if (cntRegSizedWords <= 6)
1648             {
1649                 info->internalIntCount = 0;
1650             }
1651             else if (!compiler->info.compInitMem)
1652             {
1653                 // No need to initialize allocated stack space.
1654                 if (sizeVal < compiler->eeGetPageSize())
1655                 {
1656 #ifdef _TARGET_X86_
1657                     info->internalIntCount = 1; // x86 needs a register here to avoid generating "sub" on ESP.
1658 #else                                           // !_TARGET_X86_
1659                     info->internalIntCount = 0;
1660 #endif                                          // !_TARGET_X86_
1661                 }
1662                 else
1663                 {
1664                     // We need two registers: regCnt and tmpReg
1665                     info->internalIntCount = 2;
1666                 }
1667             }
1668             else
1669             {
1670                 // >6 and need to zero initialize allocated stack space.
1671                 info->internalIntCount = 0;
1672             }
1673         }
1674     }
1675     else
1676     {
1677         appendLocationInfoToList(size);
1678         if (!compiler->info.compInitMem)
1679         {
1680             info->internalIntCount = 2;
1681         }
1682         else
1683         {
1684             info->internalIntCount = 0;
1685         }
1686     }
1687 }
1688
1689 //------------------------------------------------------------------------
1690 // BuildModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
1691 //
1692 // Arguments:
1693 //    tree      - The node of interest
1694 //
1695 // Return Value:
1696 //    None.
1697 //
1698 void LinearScan::BuildModDiv(GenTree* tree)
1699 {
1700     TreeNodeInfo* info = currentNodeInfo;
1701     GenTree*      op1  = tree->gtGetOp1();
1702     GenTree*      op2  = tree->gtGetOp2();
1703
1704     assert(info->dstCount == 1);
1705
1706     if (varTypeIsFloating(tree->TypeGet()))
1707     {
1708         info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
1709         return;
1710     }
1711
1712     // AMD64 div/idiv instructions:
1713     //    take the dividend in RDX:RAX and compute
1714     //    the quotient in RAX and the remainder in RDX
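     //
     // E.g. a signed 32-bit 'a / b' comes out roughly as:
     //    mov eax, a
     //    cdq              ; sign-extend EAX into EDX
     //    idiv b           ; quotient -> EAX, remainder -> EDX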
1715
1716     if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
1717     {
1718         // We are interested in just the remainder.
1719         // RAX is used as a trashable register during computation of remainder.
1720         info->setDstCandidates(this, RBM_RDX);
1721     }
1722     else
1723     {
1724         // We are interested in just the quotient.
1725         // RDX is used as a trashable register during computation of the quotient.
1726         info->setDstCandidates(this, RBM_RAX);
1727     }
1728
1729 #ifdef _TARGET_X86_
1730     if (op1->OperGet() == GT_LONG)
1731     {
1732         assert(op1->isContained());
1733
1734         // To avoid a register move, we would like op1's low part in RAX and its high part in RDX.
1735         GenTree* loVal = op1->gtGetOp1();
1736         GenTree* hiVal = op1->gtGetOp2();
1737
1738         assert(op2->IsCnsIntOrI());
1739         assert(tree->OperGet() == GT_UMOD);
1740
1741         // This situation also requires an internal register.
1742         info->internalIntCount = 1;
1743         info->setInternalCandidates(this, allRegs(TYP_INT));
1744
1745         LocationInfoListNode* loValInfo = getLocationInfo(loVal);
1746         LocationInfoListNode* hiValInfo = getLocationInfo(hiVal);
1747         loValInfo->info.setSrcCandidates(this, RBM_EAX);
1748         hiValInfo->info.setSrcCandidates(this, RBM_EDX);
1749         useList.Append(loValInfo);
1750         useList.Append(hiValInfo);
1751         info->srcCount = 2;
1752     }
1753     else
1754 #endif
1755     {
1756         // If possible, we would like to have op1 in RAX to avoid a register move.
1757         LocationInfoListNode* op1Info = getLocationInfo(op1);
1758         op1Info->info.setSrcCandidates(this, RBM_RAX);
1759         useList.Append(op1Info);
1760         info->srcCount = 1;
1761     }
1762
1763     LocationInfoListNode* op2Info;
1764     info->srcCount += GetOperandInfo(op2, &op2Info);
1765     for (; op2Info != nullptr; op2Info = op2Info->Next())
1766     {
1767         op2Info->info.setSrcCandidates(this, allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
1768     }
1769 }
1770
1771 //------------------------------------------------------------------------
1772 // BuildIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
1773 //
1774 // Arguments:
1775 //    tree      - The node of interest
1776 //
1777 // Return Value:
1778 //    None.
1779 //
1780 void LinearScan::BuildIntrinsic(GenTree* tree)
1781 {
1782     TreeNodeInfo* info = currentNodeInfo;
1783     // Both the operand and its result must be of floating point type.
1784     GenTree* op1 = tree->gtGetOp1();
1785     assert(varTypeIsFloating(op1));
1786     assert(op1->TypeGet() == tree->TypeGet());
1787
1788     info->srcCount = GetOperandInfo(op1);
1789     assert(info->dstCount == 1);
1790
1791     switch (tree->gtIntrinsic.gtIntrinsicId)
1792     {
1793         case CORINFO_INTRINSIC_Sqrt:
1794             break;
1795
1796         case CORINFO_INTRINSIC_Abs:
1797             // Abs(float x) = x & 0x7fffffff
1798             // Abs(double x) = x & 0x7fffffffffffffff
1799
1800             // In case of Abs we need an internal register to hold mask.
1801
1802             // TODO-XArch-CQ: avoid using an internal register for the mask.
1803             // Andps or andpd both will operate on 128-bit operands.
1804             // The data section constant that holds the mask is 64 bits in size.
1805             // Therefore, we need both the operand and mask to be in
1806             // xmm register. When we add support in emitter to emit 128-bit
1807             // data constants and instructions that operate on 128-bit
1808             // memory operands we can avoid the need for an internal register.
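                 //
                 // As a rough sketch, the float case is emitted along the lines of:
                 //     movss xmm1, [maskConstant]   ; internal reg <- 0x7fffffff
                 //     andps xmm0, xmm1             ; clears the sign bit of x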
1809             if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs)
1810             {
1811                 info->internalFloatCount = 1;
1812                 info->setInternalCandidates(this, internalFloatRegCandidates());
1813             }
1814             break;
1815
1816 #ifdef _TARGET_X86_
1817         case CORINFO_INTRINSIC_Cos:
1818         case CORINFO_INTRINSIC_Sin:
1819             NYI_X86("Math intrinsics Cos and Sin");
1820             break;
1821 #endif // _TARGET_X86_
1822
1823         case CORINFO_INTRINSIC_Round:
1824         case CORINFO_INTRINSIC_Ceiling:
1825         case CORINFO_INTRINSIC_Floor:
1826 #if defined(LEGACY_BACKEND)
1827             NYI_X86("Math intrinsics Round, Ceiling, and Floor");
1828 #endif // LEGACY_BACKEND
1829             break;
1830
1831         default:
1832             // Right now only Sqrt, Abs, Round, Ceiling, and Floor are treated as math intrinsics
1833             noway_assert(!"Unsupported math intrinsic");
1834             unreached();
1835             break;
1836     }
1837 }
1838
1839 #ifdef FEATURE_SIMD
1840 //------------------------------------------------------------------------
1841 // BuildSIMD: Set the NodeInfo for a GT_SIMD tree.
1842 //
1843 // Arguments:
1844 //    simdTree   - The GT_SIMD node of interest
1845 //
1846 // Return Value:
1847 //    None.
1848
1849 void LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
1850 {
1851     TreeNodeInfo* info = currentNodeInfo;
1852     // Only SIMDIntrinsicInit can be contained. Other than that,
1853     // only SIMDIntrinsicOpEquality and SIMDIntrinsicOpInEquality can have 0 dstCount.
1854     if (simdTree->isContained())
1855     {
1856         assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicInit);
1857     }
1858     else if (info->dstCount != 1)
1859     {
1860         assert((simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ||
1861                (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality));
1862     }
1863     SetContainsAVXFlags(true, simdTree->gtSIMDSize);
1864     GenTree* op1   = simdTree->gtOp.gtOp1;
1865     GenTree* op2   = simdTree->gtOp.gtOp2;
1866     info->srcCount = 0;
1867     if (!op1->OperIs(GT_LIST))
1868     {
1869         info->srcCount += GetOperandInfo(op1);
1870     }
1871     if ((op2 != nullptr) && !op2->isContained())
1872     {
1873         info->srcCount += GetOperandInfo(op2);
1874     }
1875
1876     switch (simdTree->gtSIMDIntrinsicID)
1877     {
1878         case SIMDIntrinsicInit:
1879         {
1880             // This sets all fields of a SIMD struct to the given value.
1881             // Mark op1 as contained if it is either zero or an int constant of all 1's,
1882             // or a float constant with a 16 or 32 byte simdType (AVX case).
1883             //
1884             // We should never see small int base type vectors except for zero initialization.
1885             assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0));
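                 // E.g. the zero and all-ones cases can be emitted with no source
                 // register at all, along the lines of:
                 //     xorps   xmm0, xmm0   ; all zeros
                 //     pcmpeqd xmm0, xmm0   ; all ones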
1886
1887 #if !defined(_TARGET_64BIT_)
1888             if (op1->OperGet() == GT_LONG)
1889             {
1890                 assert(op1->isContained());
1891                 GenTree* op1lo = op1->gtGetOp1();
1892                 GenTree* op1hi = op1->gtGetOp2();
1893
1894                 if (op1lo->isContained())
1895                 {
1896                     assert(op1hi->isContained());
1897                     assert((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
1898                            (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)));
1899                     assert(info->srcCount == 0);
1900                 }
1901                 else
1902                 {
1903                     assert(info->srcCount == 2);
1904                     info->internalFloatCount = 1;
1905                     info->setInternalCandidates(this, allSIMDRegs());
1906                     info->isInternalRegDelayFree = true;
1907                 }
1908             }
1909 #endif // !defined(_TARGET_64BIT_)
1910         }
1911         break;
1912
1913         case SIMDIntrinsicInitN:
1914         {
1915             var_types baseType = simdTree->gtSIMDBaseType;
1916             info->srcCount     = (short)(simdTree->gtSIMDSize / genTypeSize(baseType));
1917             int initCount      = 0;
1918             for (GenTree* list = op1; list != nullptr; list = list->gtGetOp2())
1919             {
1920                 assert(list->OperGet() == GT_LIST);
1921                 GenTree* listItem = list->gtGetOp1();
1922                 assert(listItem->TypeGet() == baseType);
1923                 assert(!listItem->isContained());
1924                 appendLocationInfoToList(listItem);
1925                 initCount++;
1926             }
1927             assert(initCount == info->srcCount);
1928
1929             // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
1930             info->internalFloatCount = 1;
1931             info->setInternalCandidates(this, allSIMDRegs());
1932         }
1933         break;
1934
1935         case SIMDIntrinsicInitArray:
1936             // We have an array and an index, which may be contained.
1937             assert(info->srcCount == (simdTree->gtGetOp2()->isContained() ? 1 : 2));
1938             break;
1939
1940         case SIMDIntrinsicDiv:
1941             // SSE2 has no instruction support for division on integer vectors
1942             noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
1943             assert(info->srcCount == 2);
1944             break;
1945
1946         case SIMDIntrinsicAbs:
1947             // float/double vectors: This gets implemented as a bitwise-AND operation
1948             // with a mask and hence should never be seen here.
1949             //
1950             // Must be a Vector<int>, Vector<short>, or Vector<sbyte>.
1951             assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT ||
1952                    simdTree->gtSIMDBaseType == TYP_BYTE);
1953             assert(compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
1954             assert(info->srcCount == 1);
1955             break;
1956
1957         case SIMDIntrinsicSqrt:
1958             // SSE2 has no instruction support for sqrt on integer vectors.
1959             noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
1960             assert(info->srcCount == 1);
1961             break;
1962
1963         case SIMDIntrinsicAdd:
1964         case SIMDIntrinsicSub:
1965         case SIMDIntrinsicMul:
1966         case SIMDIntrinsicBitwiseAnd:
1967         case SIMDIntrinsicBitwiseAndNot:
1968         case SIMDIntrinsicBitwiseOr:
1969         case SIMDIntrinsicBitwiseXor:
1970         case SIMDIntrinsicMin:
1971         case SIMDIntrinsicMax:
1972             assert(info->srcCount == 2);
1973
1974             // SSE2 32-bit integer multiplication requires two temp regs
1975             if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
1976                 compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
1977             {
1978                 info->internalFloatCount = 2;
1979                 info->setInternalCandidates(this, allSIMDRegs());
1980             }
1981             break;
1982
1983         case SIMDIntrinsicEqual:
1984             assert(info->srcCount == 2);
1985             break;
1986
1987         // SSE2 doesn't support < and <= directly on int vectors.
1988         // Instead we need to use > and >= with swapped operands.
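             // (E.g. an integral 'x < y' is rewritten as 'y > x', i.e. pcmpgtd with
             // swapped operands, before we get here; the noway_assert below checks this.)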
1989         case SIMDIntrinsicLessThan:
1990         case SIMDIntrinsicLessThanOrEqual:
1991             assert(info->srcCount == 2);
1992             noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
1993             break;
1994
1995         // SIMDIntrinsicGreaterThan is supported only on non-floating point base type vectors.
1996         // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors.
1997         // Instead we need to use < and <= with swapped operands.
1998         case SIMDIntrinsicGreaterThan:
1999             noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
2000             assert(info->srcCount == 2);
2001             break;
2002
2003         case SIMDIntrinsicOpEquality:
2004         case SIMDIntrinsicOpInEquality:
2005             if (simdTree->gtGetOp2()->isContained())
2006             {
2007                 // If the second operand is contained then ContainCheckSIMD has determined
2008                 // that PTEST can be used. We only need a single source register and no
2009                 // internal registers.
2010                 assert(info->srcCount == 1);
2011             }
2012             else
2013             {
2014                 // Can't use PTEST so we need 2 source registers, 1 internal SIMD register
2015                 // (to hold the result of PCMPEQD or other similar SIMD compare instruction)
2016                 // and one internal INT register (to hold the result of PMOVMSKB).
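                     //
                     // I.e. a rough sketch of the non-PTEST sequence:
                     //     pcmpeqd  xmmTmp, xmmOp2   ; per-element compare
                     //     pmovmskb intTmp, xmmTmp   ; gather per-byte mask bits
                     //     cmp      intTmp, 0xFFFF   ; all ones <=> vectors equal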
2017                 assert(info->srcCount == 2);
2018                 info->internalFloatCount = 1;
2019                 info->setInternalCandidates(this, allSIMDRegs());
2020                 info->internalIntCount = 1;
2021                 info->addInternalCandidates(this, allRegs(TYP_INT));
2022             }
2023             // These SIMD nodes only set the condition flags.
2024             info->dstCount = 0;
2025             break;
2026
2027         case SIMDIntrinsicDotProduct:
2028             // Float/Double vectors:
2029             // For SSE, or AVX with 32-byte vectors, we also need an internal register
2030             // as scratch. Further we need the targetReg and internal reg to be distinct
2031             // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
2032             // don't need a tmpReg.
2033             //
2034             // 32-byte integer vector on SSE4/AVX:
2035             // will take advantage of phaddd, which operates only on 128-bit xmm reg.
2036             // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
2037             // registers since targetReg is an int type register.
2038             //
2039             // See genSIMDIntrinsicDotProduct() for details on code sequence generated
2040             // and the need for scratch registers.
2041             if (varTypeIsFloating(simdTree->gtSIMDBaseType))
2042             {
2043                 if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) ||
2044                     (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32))
2045                 {
2046                     info->internalFloatCount     = 1;
2047                     info->isInternalRegDelayFree = true;
2048                     info->setInternalCandidates(this, allSIMDRegs());
2049                 }
2050                 // else don't need scratch reg(s).
2051             }
2052             else
2053             {
2054                 assert(simdTree->gtSIMDBaseType == TYP_INT && compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
2055
2056                 // No need to set isInternalRegDelayFree since targetReg is
2057                 // an int type reg and guaranteed to be different from xmm/ymm
2058                 // regs.
2059                 info->internalFloatCount = (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) ? 2 : 1;
2060                 info->setInternalCandidates(this, allSIMDRegs());
2061             }
2062             assert(info->srcCount == 2);
2063             break;
2064
2065         case SIMDIntrinsicGetItem:
2066         {
2067             // This implements get_Item method. The sources are:
2068             //  - the source SIMD struct
2069             //  - index (which element to get)
2070             // The result is baseType of SIMD struct.
2071             // op1 may be a contained memory op, but if so we will consume its address.
2072             // op2 may be a contained constant.
2073             op1 = simdTree->gtOp.gtOp1;
2074             op2 = simdTree->gtOp.gtOp2;
2075
2076             if (!op1->isContained())
2077             {
2078                 // If the index is not a constant, we will use the SIMD temp location to store the vector.
2079                 // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
2080                 // can use that in the process of extracting the element.
2081                 //
2082                 // If the index is a constant and the base type is a small int, we can use pextrw, but on AVX
2083                 // we will need a temp if we are indexing into the upper half of the AVX register.
2084                 // In all other cases with constant index, we need a temp xmm register to extract the
2085                 // element if index is other than zero.
2086
2087                 if (!op2->IsCnsIntOrI())
2088                 {
2089                     (void)compiler->getSIMDInitTempVarNum();
2090                 }
2091                 else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
2092                 {
2093                     bool needFloatTemp;
2094                     if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
2095                         (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported))
2096                     {
2097                         int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
2098                         needFloatTemp    = (byteShiftCnt >= 16);
2099                     }
2100                     else
2101                     {
2102                         needFloatTemp = !op2->IsIntegralConst(0);
2103                     }
2104
2105                     if (needFloatTemp)
2106                     {
2107                         info->internalFloatCount = 1;
2108                         info->setInternalCandidates(this, allSIMDRegs());
2109                     }
2110                 }
2111             }
2112         }
2113         break;
2114
2115         case SIMDIntrinsicSetX:
2116         case SIMDIntrinsicSetY:
2117         case SIMDIntrinsicSetZ:
2118         case SIMDIntrinsicSetW:
2119             assert(info->srcCount == 2);
2120
2121             // We need an internal integer register for SSE2 codegen
2122             if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
2123             {
2124                 info->internalIntCount = 1;
2125                 info->setInternalCandidates(this, allRegs(TYP_INT));
2126             }
2127
2128             break;
2129
2130         case SIMDIntrinsicCast:
2131             assert(info->srcCount == 1);
2132             break;
2133
2134         case SIMDIntrinsicConvertToSingle:
2135             assert(info->srcCount == 1);
2136             if (simdTree->gtSIMDBaseType == TYP_UINT)
2137             {
2138                 // We need an internal register different from targetReg.
2139                 info->isInternalRegDelayFree = true;
2140                 info->internalIntCount       = 1;
2141                 info->internalFloatCount     = 2;
2142                 info->setInternalCandidates(this, allSIMDRegs() | allRegs(TYP_INT));
2143             }
2144             break;
2145
2146         case SIMDIntrinsicConvertToInt32:
2147             assert(info->srcCount == 1);
2148             break;
2149
2150         case SIMDIntrinsicWidenLo:
2151         case SIMDIntrinsicWidenHi:
2152             assert(info->srcCount == 1);
2153             if (varTypeIsIntegral(simdTree->gtSIMDBaseType))
2154             {
2155                 // We need an internal register different from targetReg.
2156                 info->isInternalRegDelayFree = true;
2157                 info->internalFloatCount     = 1;
2158                 info->setInternalCandidates(this, allSIMDRegs());
2159             }
2160             break;
2161
2162         case SIMDIntrinsicConvertToInt64:
2163             assert(info->srcCount == 1);
2164             // We need an internal register different from targetReg.
2165             info->isInternalRegDelayFree = true;
2166             info->internalIntCount       = 1;
2167             if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
2168             {
2169                 info->internalFloatCount = 2;
2170             }
2171             else
2172             {
2173                 info->internalFloatCount = 1;
2174             }
2175             info->setInternalCandidates(this, allSIMDRegs() | allRegs(TYP_INT));
2176             break;
2177
2178         case SIMDIntrinsicConvertToDouble:
2179             assert(info->srcCount == 1);
2180             // We need an internal register different from targetReg.
2181             info->isInternalRegDelayFree = true;
2182             info->internalIntCount       = 1;
2183 #ifdef _TARGET_X86_
2184             if (simdTree->gtSIMDBaseType == TYP_LONG)
2185             {
2186                 info->internalFloatCount = 3;
2187             }
2188             else
2189 #endif
2190                 if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) || (simdTree->gtSIMDBaseType == TYP_ULONG))
2191             {
2192                 info->internalFloatCount = 2;
2193             }
2194             else
2195             {
2196                 info->internalFloatCount = 1;
2197             }
2198             info->setInternalCandidates(this, allSIMDRegs() | allRegs(TYP_INT));
2199             break;
2200
2201         case SIMDIntrinsicNarrow:
2202             assert(info->srcCount == 2);
2203             // We need an internal register different from targetReg.
2204             info->isInternalRegDelayFree = true;
2205             if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->gtSIMDBaseType != TYP_DOUBLE))
2206             {
2207                 info->internalFloatCount = 2;
2208             }
2209             else
2210             {
2211                 info->internalFloatCount = 1;
2212             }
2213             info->setInternalCandidates(this, allSIMDRegs());
2214             break;
2215
2216         case SIMDIntrinsicShuffleSSE2:
2217             assert(info->srcCount == 1);
2218             // Second operand is an integer constant and marked as contained.
2219             assert(simdTree->gtOp.gtOp2->isContainedIntOrIImmed());
2220             break;
2221
2222         case SIMDIntrinsicGetX:
2223         case SIMDIntrinsicGetY:
2224         case SIMDIntrinsicGetZ:
2225         case SIMDIntrinsicGetW:
2226         case SIMDIntrinsicGetOne:
2227         case SIMDIntrinsicGetZero:
2228         case SIMDIntrinsicGetCount:
2229         case SIMDIntrinsicGetAllOnes:
2230             assert(!"Get intrinsics should not be seen during Lowering.");
2231             unreached();
2232
2233         default:
2234             noway_assert(!"Unimplemented SIMD node type.");
2235             unreached();
2236     }
2237 }
2238 #endif // FEATURE_SIMD
2239
2240 #ifdef FEATURE_HW_INTRINSICS
2241 //------------------------------------------------------------------------
2242 // BuildHWIntrinsic: Set the NodeInfo for a GT_HWIntrinsic tree.
2243 //
2244 // Arguments:
2245 //    intrinsicTree - The GT_HWIntrinsic node of interest
2246 //
2247 // Return Value:
2248 //    None.
2249
2250 void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
2251 {
2252     TreeNodeInfo*  info        = currentNodeInfo;
2253     NamedIntrinsic intrinsicID = intrinsicTree->gtHWIntrinsicId;
2254     InstructionSet isa         = Compiler::isaOfHWIntrinsic(intrinsicID);
2255     if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2)
2256     {
2257         SetContainsAVXFlags(true, 32);
2258     }
2259     GenTree* op1   = intrinsicTree->gtOp.gtOp1;
2260     GenTree* op2   = intrinsicTree->gtOp.gtOp2;
2261     info->srcCount = 0;
2262
2263     if (op1 != nullptr)
2264     {
2265         if (op1->OperIsList())
2266         {
2267             for (GenTreeArgList* list = op1->AsArgList(); list != nullptr; list = list->Rest())
2268             {
2269                 info->srcCount += GetOperandInfo(list->Current());
2270             }
2271         }
2272         else
2273         {
2274             info->srcCount += GetOperandInfo(op1);
2275         }
2276     }
2277
2278     if (op2 != nullptr)
2279     {
2280         info->srcCount += GetOperandInfo(op2);
2281     }
2282
2283     switch (intrinsicID)
2284     {
2285         case NI_SSE_CompareEqualOrderedScalar:
2286         case NI_SSE_CompareEqualUnorderedScalar:
2287         case NI_SSE_CompareNotEqualOrderedScalar:
2288         case NI_SSE_CompareNotEqualUnorderedScalar:
2289         case NI_SSE2_CompareEqualOrderedScalar:
2290         case NI_SSE2_CompareEqualUnorderedScalar:
2291         case NI_SSE2_CompareNotEqualOrderedScalar:
2292         case NI_SSE2_CompareNotEqualUnorderedScalar:
2293             info->internalIntCount = 1;
2294             info->setInternalCandidates(this, RBM_BYTE_REGS);
2295             break;
2296
2297         case NI_SSE_SetScalarVector128:
2298             // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
2299             info->internalFloatCount = 1;
2300             info->setInternalCandidates(this, allSIMDRegs());
2301             break;
2302
2303         case NI_SSE_Shuffle:
2304         {
2305             assert(op1->OperIsList());
2306             GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current();
2307
2308             if (!op3->isContainedIntOrIImmed())
2309             {
2310                 assert(!op3->IsCnsIntOrI());
2311
2312                 // We need two extra registers when op3 isn't a constant so
2313                 // that the offset into the jump table for the fallback path
2314                 // can be computed.
2315
2316                 info->internalIntCount = 2;
2317                 info->setInternalCandidates(this, allRegs(TYP_INT));
2318             }
2319             break;
2320         }
2321
2322         case NI_SSE_ConvertToSingle:
2323         case NI_SSE_StaticCast:
2324         case NI_SSE2_ConvertToDouble:
2325             assert(info->srcCount == 1);
2326             assert(info->dstCount == 1);
2327             useList.Last()->info.isTgtPref = true;
2328             break;
2329
2330         case NI_SSE41_BlendVariable:
2331             if (!compiler->canUseVexEncoding())
2332             {
2333                 // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
2334                 LocationInfoListNode* op2Info = useList.Begin()->Next();
2335                 LocationInfoListNode* op3Info = op2Info->Next();
2336                 op2Info->info.isDelayFree     = true;
2337                 op3Info->info.isDelayFree     = true;
2338                 op3Info->info.setSrcCandidates(this, RBM_XMM0);
2339                 info->hasDelayFreeSrc = true;
2340             }
2341             break;
2342
2343 #ifdef _TARGET_X86_
2344         case NI_SSE42_Crc32:
2345         {
2346             // CRC32 may operate over "byte" but on x86 only RBM_BYTE_REGS can be used as byte registers.
2347             //
2348             // TODO - currently we use the BaseType to bring the type of the second argument
2349             // to the code generator. We may encode the overload info in another way.
2350             var_types srcType = intrinsicTree->gtSIMDBaseType;
2351             if (varTypeIsByte(srcType))
2352             {
2353                 LocationInfoListNode* op2Info = useList.GetSecond(INDEBUG(intrinsicTree->gtGetOp2()));
2354                 op2Info->info.setSrcCandidates(this, RBM_BYTE_REGS);
2355             }
2356             break;
2357         }
2358 #endif // _TARGET_X86_
2359
2360         default:
2361             assert((intrinsicID > NI_HW_INTRINSIC_START) && (intrinsicID < NI_HW_INTRINSIC_END));
2362             break;
2363     }
2364 }
2365 #endif
2366
2367 //------------------------------------------------------------------------
2368 // BuildCast: Set the NodeInfo for a GT_CAST.
2369 //
2370 // Arguments:
2371 //    tree      - The node of interest
2372 //
2373 // Return Value:
2374 //    None.
2375 //
2376 void LinearScan::BuildCast(GenTree* tree)
2377 {
2378     TreeNodeInfo* info = currentNodeInfo;
2379     // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register.
2380     //         see CodeGen::genIntToIntCast()
2381
2382     // Non-overflow casts to/from float/double are done using SSE2 instructions
2383     // which allow the source operand to be either a reg or a memop. Given the
2384     // fact that casts from small int to float/double are done as two-level casts,
2385     // the source operand is always guaranteed to be of size 4 or 8 bytes.
2386     var_types castToType = tree->CastToType();
2387     GenTree*  castOp     = tree->gtCast.CastOp();
2388     var_types castOpType = castOp->TypeGet();
2389
2390     info->srcCount = GetOperandInfo(castOp);
2391     assert(info->dstCount == 1);
2392     if (tree->gtFlags & GTF_UNSIGNED)
2393     {
2394         castOpType = genUnsignedType(castOpType);
2395     }
2396
2397     // some overflow checks need a temp reg:
2398     //  - GT_CAST from INT64/UINT64 to UINT32
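     // (A sketch of why, assuming the usual range-check scheme: the value is compared
     // against 0xFFFFFFFF, which cannot be encoded as a sign-extended 32-bit immediate,
     // so it is first materialized in the temp reg. See CodeGen::genIntToIntCast().)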
2399     if (tree->gtOverflow() && (castToType == TYP_UINT))
2400     {
2401         if (genTypeSize(castOpType) == 8)
2402         {
2403             // Here we don't need the internal register to be different from targetReg;
2404             // rather, we require it to be different from the operand's reg.
2405             info->internalIntCount = 1;
2406         }
2407     }
2408 }
2409
2410 //-----------------------------------------------------------------------------------------
2411 // BuildIndir: Specify register requirements for address expression of an indirection operation.
2412 //
2413 // Arguments:
2414 //    indirTree    -   GT_IND or GT_STOREIND gentree node
2415 //
2416 void LinearScan::BuildIndir(GenTreeIndir* indirTree)
2417 {
2418     TreeNodeInfo* info = currentNodeInfo;
2419     // If this is the rhs of a block copy (i.e. non-enregisterable struct),
2420     // it has no register requirements.
2421     if (indirTree->TypeGet() == TYP_STRUCT)
2422     {
2423         return;
2424     }
2425
2426     int indirSrcCount = GetIndirInfo(indirTree);
2427     if (indirTree->gtOper == GT_STOREIND)
2428     {
2429         GenTree* source = indirTree->gtOp.gtOp2;
2430         if (indirTree->AsStoreInd()->IsRMWMemoryOp())
2431         {
2432             // Because 'source' is contained, we haven't yet determined its special register requirements, if any.
2433             // As it happens, the Shift or Rotate cases are the only ones with special requirements.
2434             assert(source->isContained() && source->OperIsRMWMemOp());
2435             GenTree* nonMemSource = nullptr;
2436
2437             if (source->OperIsShiftOrRotate())
2438             {
2439                 info->srcCount += BuildShiftRotate(source);
2440             }
2441             else
2442             {
2443                 info->srcCount += appendBinaryLocationInfoToList(source->AsOp());
2444             }
2445             if (indirTree->AsStoreInd()->IsRMWDstOp1())
2446             {
2447                 if (source->OperIsBinary())
2448                 {
2449                     nonMemSource = source->gtOp.gtOp2;
2450                 }
2451             }
2452             else if (indirTree->AsStoreInd()->IsRMWDstOp2())
2453             {
2454                 nonMemSource = source->gtOp.gtOp1;
2455             }
2456             if (nonMemSource != nullptr)
2457             {
2458                 assert(!nonMemSource->isContained() || (!nonMemSource->isMemoryOp() && !nonMemSource->IsLocal()));
2459 #ifdef _TARGET_X86_
2460                 if (varTypeIsByte(indirTree) && !nonMemSource->isContained())
2461                 {
2462                     // If storeInd is of TYP_BYTE, set source to byteable registers.
2463                     TreeNodeInfo& nonMemSourceInfo = useList.GetTreeNodeInfo(nonMemSource);
2464                     regMaskTP     regMask          = nonMemSourceInfo.getSrcCandidates(this);
2465                     regMask &= ~RBM_NON_BYTE_REGS;
2466                     assert(regMask != RBM_NONE);
2467                     nonMemSourceInfo.setSrcCandidates(this, regMask);
2468                 }
2469 #endif
2470             }
2471         }
2472         else
2473         {
2474 #ifdef _TARGET_X86_
2475             if (varTypeIsByte(indirTree) && !source->isContained())
2476             {
2477                 // If storeInd is of TYP_BYTE, set source to byteable registers.
2478                 LocationInfoListNode* sourceInfo = getLocationInfo(source);
2479                 regMaskTP             regMask    = sourceInfo->info.getSrcCandidates(this);
2480                 regMask &= ~RBM_NON_BYTE_REGS;
2481                 assert(regMask != RBM_NONE);
2482                 sourceInfo->info.setSrcCandidates(this, regMask);
2483                 useList.Append(sourceInfo);
2484                 info->srcCount++;
2485             }
2486             else
2487 #endif
2488             {
2489                 info->srcCount += GetOperandInfo(source);
2490             }
2491         }
2492     }
2493     info->srcCount += indirSrcCount;
2494
2495 #ifdef FEATURE_SIMD
2496     if (indirTree->TypeGet() == TYP_SIMD12)
2497     {
2498         // If indirTree is of TYP_SIMD12, addr is not contained. See comment in LowerIndir().
2499         assert(!indirTree->Addr()->isContained());
2500
2501         // Vector3 is read/written as two reads/writes: 8 byte and 4 byte.
2502         // To assemble the vector properly we would need an additional
2503         // XMM register.
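             //
             // A rough sketch for a TYP_SIMD12 (Vector3) load:
             //     movsd xmm0, [addr]     ; low 8 bytes
             //     movss xmm1, [addr+8]   ; high 4 bytes into the internal reg
             //     (then shuffle xmm1 into the upper element of xmm0)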
2504         info->internalFloatCount = 1;
2505
2506         // In case of GT_IND we need an internal register different from targetReg and
2507         // both of the registers are used at the same time.
2508         if (indirTree->OperGet() == GT_IND)
2509         {
2510             info->isInternalRegDelayFree = true;
2511         }
2512
2513         info->setInternalCandidates(this, allSIMDRegs());
2514
2515         return;
2516     }
2517 #endif // FEATURE_SIMD
2518
2519     assert(indirTree->Addr()->gtOper != GT_ARR_ELEM);
2520 }
2521
2522 //------------------------------------------------------------------------
2523 // BuildMul: Set the NodeInfo for a multiply.
2524 //
2525 // Arguments:
2526 //    tree      - The node of interest
2527 //
2528 // Return Value:
2529 //    None.
2530 //
2531 void LinearScan::BuildMul(GenTree* tree)
2532 {
2533     TreeNodeInfo* info = currentNodeInfo;
2534 #if defined(_TARGET_X86_)
2535     assert(tree->OperIs(GT_MUL, GT_MULHI, GT_MUL_LONG));
2536 #else
2537     assert(tree->OperIs(GT_MUL, GT_MULHI));
2538 #endif
2539     GenTree* op1   = tree->gtOp.gtOp1;
2540     GenTree* op2   = tree->gtOp.gtOp2;
2541     info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
2542     assert(info->dstCount == 1);
2543
2544     // Case of float/double mul.
2545     if (varTypeIsFloating(tree->TypeGet()))
2546     {
2547         return;
2548     }
2549
2550     bool isUnsignedMultiply    = ((tree->gtFlags & GTF_UNSIGNED) != 0);
2551     bool requiresOverflowCheck = tree->gtOverflowEx();
2552
2553     // There are three forms of x86 multiply:
2554     // one-op form:     RDX:RAX = RAX * r/m
2555     // two-op form:     reg *= r/m
2556     // three-op form:   reg = r/m * imm
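     //
     // For example:
     //     mul  ecx            ; one-op:   EDX:EAX = EAX * ECX (widening)
     //     imul eax, ecx       ; two-op:   EAX = EAX * ECX
     //     imul eax, ecx, 9    ; three-op: EAX = ECX * 9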
2557
2558     // This special widening 32x32->64 MUL is not used on x64
2559     CLANG_FORMAT_COMMENT_ANCHOR;
2560 #if defined(_TARGET_X86_)
2561     if (tree->OperGet() != GT_MUL_LONG)
2562 #endif
2563     {
2564         assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
2565     }
2566
2567     // We do use the widening multiply to implement
2568     // the overflow checking for unsigned multiply
2569     //
2570     if (isUnsignedMultiply && requiresOverflowCheck)
2571     {
2572         // The only encoding provided is RDX:RAX = RAX * rm
2573         //
2574         // Here we set RAX as the only destination candidate
2575         // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX
2576         //
2577         info->setDstCandidates(this, RBM_RAX);
2578     }
2579     else if (tree->OperGet() == GT_MULHI)
2580     {
2581         // Have to use the encoding: RDX:RAX = RAX * rm. Since we only care about the
2582         // upper 32 bits of the result, set the destination candidate to REG_RDX.
2583         info->setDstCandidates(this, RBM_RDX);
2584     }
2585 #if defined(_TARGET_X86_)
2586     else if (tree->OperGet() == GT_MUL_LONG)
2587     {
2588         // Have to use the encoding: RDX:RAX = RAX * rm
2589         info->setDstCandidates(this, RBM_RAX);
2590     }
2591 #endif
2592     GenTree* containedMemOp = nullptr;
2593     if (op1->isContained() && !op1->IsCnsIntOrI())
2594     {
2595         assert(!op2->isContained() || op2->IsCnsIntOrI());
2596         containedMemOp = op1;
2597     }
2598     else if (op2->isContained() && !op2->IsCnsIntOrI())
2599     {
2600         containedMemOp = op2;
2601     }
2602     if ((containedMemOp != nullptr) && CheckAndSetDelayFree(containedMemOp))
2603     {
2604         info->hasDelayFreeSrc = true;
2605     }
2606 }
2607
2608 //------------------------------------------------------------------------------
2609 // SetContainsAVXFlags: Set the ContainsAVX flag when the type is a floating point type,
2610 // and set the Contains256bitAVX flag when the SIMD vector size is 32 bytes
2611 //
2612 // Arguments:
2613 //    isFloatingPointType   - true if it is floating point type
2614 //    sizeOfSIMDVector      - SIMD Vector size
2615 //
2616 void LinearScan::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
2617 {
2618     if (isFloatingPointType && compiler->canUseVexEncoding())
2619     {
2620         compiler->getEmitter()->SetContainsAVX(true);
2621         if (sizeOfSIMDVector == 32)
2622         {
2623             compiler->getEmitter()->SetContains256bitAVX(true);
2624         }
2625     }
2626 }
2627
2628 #ifdef _TARGET_X86_
2629 //------------------------------------------------------------------------
2630 // ExcludeNonByteableRegisters: Determines if we need to exclude non-byteable registers for
2631 // various reasons
2632 //
2633 // Arguments:
2634 //    tree      - The node of interest
2635 //
2636 // Return Value:
2637 //    true if we need to exclude non-byteable registers; otherwise false
2638 //
2639 bool LinearScan::ExcludeNonByteableRegisters(GenTree* tree)
2640 {
2641     // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr'
2642     // Storeind itself will not produce any value and hence dstCount=0. But op2 could be TYP_INT
2643     // value. In this case we need to exclude esi/edi from the src candidates of op2.
2644     if (varTypeIsByte(tree))
2645     {
2646         return true;
2647     }
2648     // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool.
2649     else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType()))
2650     {
2651         return true;
2652     }
2653     else if (tree->OperIsCompare() || tree->OperIs(GT_CMP))
2654     {
2655         GenTree* op1 = tree->gtGetOp1();
2656         GenTree* op2 = tree->gtGetOp2();
2657
2658         // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
2659         // ubyte as the result of comparison and if the result needs to be materialized into a reg
2660         // simply zero extend it to TYP_INT size.  Here is an example of generated code:
2661         //         cmp dl, byte ptr[addr mode]
2662         //         movzx edx, dl
2663         if (varTypeIsByte(op1) && varTypeIsByte(op2))
2664         {
2665             return true;
2666         }
2667         // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses
2668         // ubyte as the result of the comparison and if the result needs to be materialized into a reg
2669         // simply zero extend it to TYP_INT size.
2670         else if (varTypeIsByte(op1) && op2->IsCnsIntOrI())
2671         {
2672             return true;
2673         }
2674         // Example5: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses
2675         // ubyte as the result of the comparison and if the result needs to be materialized into a reg
2676         // simply zero extend it to TYP_INT size.
2677         else if (op1->IsCnsIntOrI() && varTypeIsByte(op2))
2678         {
2679             return true;
2680         }
2681         else
2682         {
2683             return false;
2684         }
2685     }
2686 #ifdef FEATURE_SIMD
2687     else if (tree->OperGet() == GT_SIMD)
2688     {
2689         GenTreeSIMD* simdNode = tree->AsSIMD();
2690         switch (simdNode->gtSIMDIntrinsicID)
2691         {
2692             case SIMDIntrinsicOpEquality:
2693             case SIMDIntrinsicOpInEquality:
2694                 // We manifest it into a byte register, so the target must be byteable.
2695                 return true;
2696
2697             case SIMDIntrinsicGetItem:
2698             {
2699                 // This logic is duplicated from genSIMDIntrinsicGetItem().
2700                 // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
2701                 // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
2702                 // cases will require this, so the non-byteable registers can be excluded.
2703
2704                 GenTree*  op1      = simdNode->gtGetOp1();
2705                 GenTree*  op2      = simdNode->gtGetOp2();
2706                 var_types baseType = simdNode->gtSIMDBaseType;
2707                 if (!isContainableMemoryOp(op1) && op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
2708                 {
2709                     bool     ZeroOrSignExtnReqd = true;
2710                     unsigned baseSize           = genTypeSize(baseType);
2711                     if (baseSize == 1)
2712                     {
2713                         if ((op2->gtIntCon.gtIconVal % 2) == 1)
2714                         {
2715                             ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
2716                         }
2717                     }
2718                     else
2719                     {
2720                         assert(baseSize == 2);
2721                         ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
2722                     }
2723                     return ZeroOrSignExtnReqd;
2724                 }
2725                 break;
2726             }
2727
2728             default:
2729                 break;
2730         }
2731         return false;
2732     }
2733 #endif // FEATURE_SIMD
2734     else
2735     {
2736         return false;
2737     }
2738 }
2739 #endif // _TARGET_X86_
2740
2741 #endif // _TARGET_XARCH_
2742
2743 #endif // !LEGACY_BACKEND