src/jit/lsraxarch.cpp
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7 XX                                                                           XX
8 XX                    Register Requirements for AMD64                        XX
9 XX                                                                           XX
10 XX  This encapsulates all the logic for setting register requirements for    XX
11 XX  the AMD64 architecture.                                                  XX
12 XX                                                                           XX
13 XX                                                                           XX
14 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
15 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
16 */
17
18 #include "jitpch.h"
19 #ifdef _MSC_VER
20 #pragma hdrstop
21 #endif
22
23 #ifdef _TARGET_XARCH_
24
25 #include "jit.h"
26 #include "sideeffects.h"
27 #include "lower.h"
28
29 //------------------------------------------------------------------------
30 // BuildNode: Build the RefPositions for a node
31 //
32 // Arguments:
33 //    treeNode - the node of interest
34 //
35 // Return Value:
36 //    The number of sources consumed by this node.
37 //
38 // Notes:
39 // Preconditions:
40 //    LSRA has been initialized.
41 //
42 // Postconditions:
43 //    RefPositions have been built for all the register defs and uses required
44 //    for this node.
45 //
46 int LinearScan::BuildNode(GenTree* tree)
47 {
48     assert(!tree->isContained());
49     Interval* prefSrcInterval = nullptr;
50     int       srcCount;
51     int       dstCount      = 0;
52     regMaskTP dstCandidates = RBM_NONE;
53     regMaskTP killMask      = RBM_NONE;
54     bool      isLocalDefUse = false;
55
56     // Reset the build-related members of LinearScan.
57     clearBuildState();
58
59     // Set the default dstCount. This may be modified below.
60     if (tree->IsValue())
61     {
62         dstCount = 1;
63         if (tree->IsUnusedValue())
64         {
65             isLocalDefUse = true;
66         }
67     }
68     else
69     {
70         dstCount = 0;
71     }
72
73     // Floating-point types generate AVX instructions (vmovss etc.), so set the flag.
74     if (varTypeIsFloating(tree->TypeGet()))
75     {
76         SetContainsAVXFlags();
77     }
78
79     switch (tree->OperGet())
80     {
81         default:
82             srcCount = BuildSimple(tree);
83             break;
84
85         case GT_LCL_VAR:
86             // Because we do containment analysis before we redo dataflow and identify register
87             // candidates, the containment analysis only uses !lvDoNotEnregister to estimate register
88             // candidates.
89             // If a lclVar was estimated to be a register candidate but turns out not to be, and it
90             // was marked regOptional, it should now be marked contained instead.
91             // TODO-XArch-CQ: When this is being called while RefPositions are being created,
92             // use lvLRACandidate here instead.
93             if (tree->IsRegOptional())
94             {
95                 if (!compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvTracked ||
96                     compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvDoNotEnregister)
97                 {
98                     tree->ClearRegOptional();
99                     tree->SetContained();
100                     INDEBUG(dumpNodeInfo(tree, dstCandidates, 0, 0));
101                     return 0;
102                 }
103             }
104             __fallthrough;
105
106         case GT_LCL_FLD:
107         {
108             // We handle tracked variables differently from non-tracked ones.  If it is tracked,
109             // we will simply add a use of the tracked variable at its parent/consumer.
110             // Otherwise, for a use we need to actually add the appropriate references for loading
111             // or storing the variable.
112             //
113             // A tracked variable won't actually get used until the appropriate ancestor tree node
114             // is processed, unless this is marked "isLocalDefUse" because it is a stack-based argument
115             // to a call or an orphaned dead node.
116             //
117             LclVarDsc* const varDsc = &compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum];
118             if (isCandidateVar(varDsc))
119             {
120                 INDEBUG(dumpNodeInfo(tree, dstCandidates, 0, 1));
121                 return 0;
122             }
123             srcCount = 0;
124 #ifdef FEATURE_SIMD
125             // Need an additional register to read upper 4 bytes of Vector3.
126             if (tree->TypeGet() == TYP_SIMD12)
127             {
128                 // We need an internal register different from targetReg in which 'tree' produces its result
129                 // because both targetReg and internal reg will be in use at the same time.
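                // For illustration (a sketch of what codegen typically does for TYP_SIMD12, which occupies
                // 12 bytes): the value is loaded as an 8-byte low part plus a 4-byte upper part, and the
                // internal float register reserved below is used to combine the upper 4 bytes with the low 8.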
130                 buildInternalFloatRegisterDefForNode(tree, allSIMDRegs());
131                 setInternalRegsDelayFree = true;
132                 buildInternalRegisterUses();
133             }
134 #endif
135             BuildDef(tree);
136         }
137         break;
138
139         case GT_STORE_LCL_FLD:
140         case GT_STORE_LCL_VAR:
141             srcCount = BuildStoreLoc(tree->AsLclVarCommon());
142             break;
143
144         case GT_FIELD_LIST:
145             // These should always be contained. We don't correctly allocate or
146             // generate code for a non-contained GT_FIELD_LIST.
147             noway_assert(!"Non-contained GT_FIELD_LIST");
148             srcCount = 0;
149             break;
150
151         case GT_LIST:
152         case GT_ARGPLACE:
153         case GT_NO_OP:
154         case GT_START_NONGC:
155             srcCount = 0;
156             assert(dstCount == 0);
157             break;
158
159         case GT_START_PREEMPTGC:
160             // This kills GC refs in callee save regs
161             srcCount = 0;
162             assert(dstCount == 0);
163             BuildDefsWithKills(tree, 0, RBM_NONE, RBM_NONE);
164             break;
165
166         case GT_PROF_HOOK:
167             srcCount = 0;
168             assert(dstCount == 0);
169             killMask = getKillSetForProfilerHook();
170             BuildDefsWithKills(tree, 0, RBM_NONE, killMask);
171             break;
172
173         case GT_CNS_INT:
174         case GT_CNS_LNG:
175         case GT_CNS_DBL:
176         {
177             srcCount = 0;
178             assert(dstCount == 1);
179             assert(!tree->IsReuseRegVal());
180             RefPosition* def               = BuildDef(tree);
181             def->getInterval()->isConstant = true;
182         }
183         break;
184
185 #if !defined(_TARGET_64BIT_)
186
187         case GT_LONG:
188             assert(tree->IsUnusedValue()); // Contained nodes are already processed, only unused GT_LONG can reach here.
189             // An unused GT_LONG node needs to consume its sources, but need not produce a register.
190             tree->gtType = TYP_VOID;
191             tree->ClearUnusedValue();
192             isLocalDefUse = false;
193             srcCount      = 2;
194             dstCount      = 0;
195             BuildUse(tree->gtGetOp1());
196             BuildUse(tree->gtGetOp2());
197             break;
198
199 #endif // !defined(_TARGET_64BIT_)
200
201         case GT_BOX:
202         case GT_COMMA:
203         case GT_QMARK:
204         case GT_COLON:
205             srcCount = 0;
206             unreached();
207             break;
208
209         case GT_RETURN:
210             srcCount = BuildReturn(tree);
211             killMask = getKillSetForReturn();
212             BuildDefsWithKills(tree, 0, RBM_NONE, killMask);
213             break;
214
215         case GT_RETFILT:
216             assert(dstCount == 0);
217             if (tree->TypeGet() == TYP_VOID)
218             {
219                 srcCount = 0;
220             }
221             else
222             {
223                 assert(tree->TypeGet() == TYP_INT);
224                 srcCount = 1;
225                 BuildUse(tree->gtGetOp1(), RBM_INTRET);
226             }
227             break;
228
229         // A GT_NOP is a passthrough if it is void or if it has a child,
230         // but must be considered to produce a dummy value if it
231         // has a type but no child.
232         case GT_NOP:
233             srcCount = 0;
234             assert((tree->gtGetOp1() == nullptr) || tree->isContained());
235             if (tree->TypeGet() != TYP_VOID && tree->gtGetOp1() == nullptr)
236             {
237                 assert(dstCount == 1);
238                 // Op1 is null here (per the condition above), so there is nothing to consume; just build the def.
239                 BuildDef(tree);
240             }
241             else
242             {
243                 assert(dstCount == 0);
244             }
245             break;
246
247         case GT_JTRUE:
248         {
249             srcCount = 0;
250             assert(dstCount == 0);
251             GenTree* cmp = tree->gtGetOp1();
252             assert(!cmp->IsValue());
253         }
254         break;
255
256         case GT_JCC:
257             srcCount = 0;
258             assert(dstCount == 0);
259             break;
260
261         case GT_SETCC:
262             srcCount = 0;
263             assert(dstCount == 1);
264             // This defines a byte value (note that on x64 allByteRegs() is defined as RBM_ALLINT).
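            // For illustration: setcc writes only an 8-bit register (e.g. "sete al"), and on x86 only
            // AL/BL/CL/DL are byte-addressable, which is why the def is restricted to allByteRegs().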
265             BuildDef(tree, allByteRegs());
266             break;
267
268         case GT_JMP:
269             srcCount = 0;
270             assert(dstCount == 0);
271             break;
272
273         case GT_SWITCH:
274             // This should never occur since switch nodes must not be visible at this
275             // point in the JIT.
276             srcCount = 0;
277             noway_assert(!"Switch must be lowered at this point");
278             break;
279
280         case GT_JMPTABLE:
281             srcCount = 0;
282             assert(dstCount == 1);
283             BuildDef(tree);
284             break;
285
286         case GT_SWITCH_TABLE:
287         {
288             assert(dstCount == 0);
289             buildInternalIntRegisterDefForNode(tree);
290             srcCount = BuildBinaryUses(tree->AsOp());
291             buildInternalRegisterUses();
292             assert(srcCount == 2);
293         }
294         break;
295
296         case GT_ASG:
297             noway_assert(!"We should never hit any assignment operator in lowering");
298             srcCount = 0;
299             break;
300
301 #if !defined(_TARGET_64BIT_)
302         case GT_ADD_LO:
303         case GT_ADD_HI:
304         case GT_SUB_LO:
305         case GT_SUB_HI:
306 #endif
307         case GT_ADD:
308         case GT_SUB:
309         case GT_AND:
310         case GT_OR:
311         case GT_XOR:
312             srcCount = BuildBinaryUses(tree->AsOp());
313             assert(dstCount == 1);
314             BuildDef(tree);
315             break;
316
317         case GT_BT:
318             srcCount = BuildBinaryUses(tree->AsOp());
319             assert(dstCount == 0);
320             break;
321
322         case GT_RETURNTRAP:
323         {
324             // This just turns into a compare of its child with an int + a conditional call.
325             RefPosition* internalDef = buildInternalIntRegisterDefForNode(tree);
326             srcCount                 = BuildOperandUses(tree->gtGetOp1());
327             buildInternalRegisterUses();
328             killMask = compiler->compHelperCallKillSet(CORINFO_HELP_STOP_FOR_GC);
329             BuildDefsWithKills(tree, 0, RBM_NONE, killMask);
330         }
331         break;
332
333         case GT_MOD:
334         case GT_DIV:
335         case GT_UMOD:
336         case GT_UDIV:
337             srcCount = BuildModDiv(tree->AsOp());
338             break;
339
340 #if defined(_TARGET_X86_)
341         case GT_MUL_LONG:
342             dstCount = 2;
343             __fallthrough;
344 #endif
345         case GT_MUL:
346         case GT_MULHI:
347             srcCount = BuildMul(tree->AsOp());
348             break;
349
350         case GT_INTRINSIC:
351             srcCount = BuildIntrinsic(tree->AsOp());
352             break;
353
354 #ifdef FEATURE_SIMD
355         case GT_SIMD:
356             srcCount = BuildSIMD(tree->AsSIMD());
357             break;
358 #endif // FEATURE_SIMD
359
360 #ifdef FEATURE_HW_INTRINSICS
361         case GT_HWIntrinsic:
362             srcCount = BuildHWIntrinsic(tree->AsHWIntrinsic());
363             break;
364 #endif // FEATURE_HW_INTRINSICS
365
366         case GT_CAST:
367             assert(dstCount == 1);
368             srcCount = BuildCast(tree->AsCast());
369             break;
370
371         case GT_BITCAST:
372         {
373             assert(dstCount == 1);
374             tgtPrefUse = BuildUse(tree->gtGetOp1());
375             BuildDef(tree);
376             srcCount = 1;
377         }
378         break;
379
380         case GT_NEG:
381             // TODO-XArch-CQ:
382             // SSE instruction set doesn't have an instruction to negate a number.
383             // The recommended way is to xor the float/double number with a bitmask.
384             // The only way to xor is using xorps or xorpd both of which operate on
385             // 128-bit operands.  To hold the bit-mask we would need another xmm
386             // register or a 16-byte aligned 128-bit data constant. Right now emitter
387             // lacks the support for emitting such constants or instruction with mem
388             // addressing mode referring to a 128-bit operand. For now we use an
389             // internal xmm register to load 32/64-bit bitmask from data section.
390             // Note that by trading additional data section memory (128-bit) we can
391             // save on the need for an internal register and also a memory-to-reg
392             // move.
393             //
394             // Note: another option to avoid internal register requirement is by
395             // lowering as GT_SUB(0, src).  This will generate code different from
396             // Jit64 and could possibly result in compat issues (?).
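            // For illustration (the exact sequence is chosen by codegen/emit), negating a double in
            // xmm0 ends up looking roughly like:
            //     movsd  xmm1, qword ptr [sign-bit mask in the data section]  ; loaded into the internal reg
            //     xorps  xmm0, xmm1                                           ; flip the sign bit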
397             if (varTypeIsFloating(tree))
398             {
399
400                 RefPosition* internalDef = buildInternalFloatRegisterDefForNode(tree, internalFloatRegCandidates());
401                 srcCount                 = BuildOperandUses(tree->gtGetOp1());
402                 buildInternalRegisterUses();
403             }
404             else
405             {
406                 srcCount = BuildOperandUses(tree->gtGetOp1());
407             }
408             BuildDef(tree);
409             break;
410
411         case GT_NOT:
412             srcCount = BuildOperandUses(tree->gtGetOp1());
413             BuildDef(tree);
414             break;
415
416         case GT_LSH:
417         case GT_RSH:
418         case GT_RSZ:
419         case GT_ROL:
420         case GT_ROR:
421 #ifdef _TARGET_X86_
422         case GT_LSH_HI:
423         case GT_RSH_LO:
424 #endif
425             srcCount = BuildShiftRotate(tree);
426             break;
427
428         case GT_EQ:
429         case GT_NE:
430         case GT_LT:
431         case GT_LE:
432         case GT_GE:
433         case GT_GT:
434         case GT_TEST_EQ:
435         case GT_TEST_NE:
436         case GT_CMP:
437             srcCount = BuildCmp(tree);
438             break;
439
440         case GT_CKFINITE:
441         {
442             assert(dstCount == 1);
443             RefPosition* internalDef = buildInternalIntRegisterDefForNode(tree);
444             srcCount                 = BuildOperandUses(tree->gtGetOp1());
445             buildInternalRegisterUses();
446             BuildDef(tree);
447         }
448         break;
449
450         case GT_CMPXCHG:
451         {
452             srcCount = 3;
453             assert(dstCount == 1);
454
455             // Comparand is preferenced to RAX.
456             // The remaining two operands can be in any reg other than RAX.
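            // For illustration: the hardware form is "lock cmpxchg [location], value", which implicitly
            // compares RAX with [location] and leaves the original memory value in RAX - hence the fixed
            // RAX requirements below.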
457             BuildUse(tree->gtCmpXchg.gtOpLocation, allRegs(TYP_INT) & ~RBM_RAX);
458             BuildUse(tree->gtCmpXchg.gtOpValue, allRegs(TYP_INT) & ~RBM_RAX);
459             BuildUse(tree->gtCmpXchg.gtOpComparand, RBM_RAX);
460             BuildDef(tree, RBM_RAX);
461         }
462         break;
463
464         case GT_XADD:
465         case GT_XCHG:
466         {
467             // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
468             // to special case them.
469             // These tree nodes will have their op1 marked as isDelayFree=true.
470             // That is, op1's reg remains in use until the subsequent instruction.
471             GenTree* addr = tree->gtGetOp1();
472             GenTree* data = tree->gtGetOp2();
473             assert(!addr->isContained());
474             RefPosition* addrUse = BuildUse(addr);
475             setDelayFree(addrUse);
476             tgtPrefUse = addrUse;
477             assert(!data->isContained());
478             BuildUse(data);
479             srcCount = 2;
480             assert(dstCount == 1);
481             BuildDef(tree);
482         }
483         break;
484
485         case GT_PUTARG_REG:
486             srcCount = BuildPutArgReg(tree->AsUnOp());
487             break;
488
489         case GT_CALL:
490             srcCount = BuildCall(tree->AsCall());
491             if (tree->AsCall()->HasMultiRegRetVal())
492             {
493                 dstCount = tree->AsCall()->GetReturnTypeDesc()->GetReturnRegCount();
494             }
495             break;
496
497         case GT_ADDR:
498         {
499             // For a GT_ADDR, the child node should not be evaluated into a register
500             GenTree* child = tree->gtGetOp1();
501             assert(!isCandidateLocalRef(child));
502             assert(child->isContained());
503             assert(dstCount == 1);
504             srcCount = 0;
505         }
506         break;
507
508 #if !defined(FEATURE_PUT_STRUCT_ARG_STK)
509         case GT_OBJ:
510 #endif
511         case GT_BLK:
512         case GT_DYN_BLK:
513             // These should all be eliminated prior to Lowering.
514             assert(!"Non-store block node in Lowering");
515             srcCount = 0;
516             break;
517
518 #ifdef FEATURE_PUT_STRUCT_ARG_STK
519         case GT_PUTARG_STK:
520             srcCount = BuildPutArgStk(tree->AsPutArgStk());
521             break;
522 #endif // FEATURE_PUT_STRUCT_ARG_STK
523
524         case GT_STORE_BLK:
525         case GT_STORE_OBJ:
526         case GT_STORE_DYN_BLK:
527             srcCount = BuildBlockStore(tree->AsBlk());
528             break;
529
530         case GT_INIT_VAL:
531             // Always a passthrough of its child's value.
532             assert(!"INIT_VAL should always be contained");
533             srcCount = 0;
534             break;
535
536         case GT_LCLHEAP:
537             srcCount = BuildLclHeap(tree);
538             break;
539
540         case GT_ARR_BOUNDS_CHECK:
541 #ifdef FEATURE_SIMD
542         case GT_SIMD_CHK:
543 #endif // FEATURE_SIMD
544 #ifdef FEATURE_HW_INTRINSICS
545         case GT_HW_INTRINSIC_CHK:
546 #endif // FEATURE_HW_INTRINSICS
547
548             // Consumes arrLen & index - has no result
549             srcCount = 2;
550             assert(dstCount == 0);
551             srcCount = BuildOperandUses(tree->AsBoundsChk()->gtIndex);
552             srcCount += BuildOperandUses(tree->AsBoundsChk()->gtArrLen);
553             break;
554
555         case GT_ARR_ELEM:
556             // These must have been lowered to GT_ARR_INDEX
557             noway_assert(!"We should never see a GT_ARR_ELEM after Lowering.");
558             srcCount = 0;
559             break;
560
561         case GT_ARR_INDEX:
562         {
563             srcCount = 2;
564             assert(dstCount == 1);
565             assert(!tree->AsArrIndex()->ArrObj()->isContained());
566             assert(!tree->AsArrIndex()->IndexExpr()->isContained());
567             // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
568             // times while the result is being computed.
569             RefPosition* arrObjUse = BuildUse(tree->AsArrIndex()->ArrObj());
570             setDelayFree(arrObjUse);
571             BuildUse(tree->AsArrIndex()->IndexExpr());
572             BuildDef(tree);
573         }
574         break;
575
576         case GT_ARR_OFFSET:
577         {
578             // This consumes the offset, if any, the arrObj and the effective index,
579             // and produces the flattened offset for this dimension.
580             assert(dstCount == 1);
581             srcCount                 = 0;
582             RefPosition* internalDef = nullptr;
583             if (tree->gtArrOffs.gtOffset->isContained())
584             {
585                 srcCount = 2;
586             }
587             else
588             {
589                 // Here we simply need an internal register, which must be different
590                 // from any of the operand's registers, but may be the same as targetReg.
591                 srcCount    = 3;
592                 internalDef = buildInternalIntRegisterDefForNode(tree);
593                 BuildUse(tree->AsArrOffs()->gtOffset);
594             }
595             BuildUse(tree->AsArrOffs()->gtIndex);
596             BuildUse(tree->AsArrOffs()->gtArrObj);
597             if (internalDef != nullptr)
598             {
599                 buildInternalRegisterUses();
600             }
601             BuildDef(tree);
602         }
603         break;
604
605         case GT_LEA:
606             // The LEA usually passes its operands through to the GT_IND, in which case it will
607             // be contained, but we may be instantiating an address, in which case we set them here.
608             srcCount = 0;
609             assert(dstCount == 1);
610             if (tree->AsAddrMode()->HasBase())
611             {
612                 srcCount++;
613                 BuildUse(tree->AsAddrMode()->Base());
614             }
615             if (tree->AsAddrMode()->HasIndex())
616             {
617                 srcCount++;
618                 BuildUse(tree->AsAddrMode()->Index());
619             }
620             BuildDef(tree);
621             break;
622
623         case GT_STOREIND:
624             if (compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(tree))
625             {
626                 srcCount = BuildGCWriteBarrier(tree);
627                 break;
628             }
629             srcCount = BuildIndir(tree->AsIndir());
630             break;
631
632         case GT_NULLCHECK:
633         {
634             assert(dstCount == 0);
635             regMaskTP indirCandidates = RBM_NONE;
636             BuildUse(tree->gtGetOp1(), indirCandidates);
637             srcCount = 1;
638             break;
639         }
640
641         case GT_IND:
642             srcCount = BuildIndir(tree->AsIndir());
643             assert(dstCount == 1);
644             break;
645
646         case GT_CATCH_ARG:
647             srcCount = 0;
648             assert(dstCount == 1);
649             BuildDef(tree, RBM_EXCEPTION_OBJECT);
650             break;
651
652 #if !FEATURE_EH_FUNCLETS
653         case GT_END_LFIN:
654             srcCount = 0;
655             assert(dstCount == 0);
656             break;
657 #endif
658
659         case GT_CLS_VAR:
660             // These nodes are eliminated by rationalizer.
661             JITDUMP("Unexpected node %s in Lower.\n", GenTree::OpName(tree->OperGet()));
662             unreached();
663             break;
664
665         case GT_INDEX_ADDR:
666         {
667             assert(dstCount == 1);
668             RefPosition* internalDef = nullptr;
669 #ifdef _TARGET_64BIT_
670             // On 64-bit we always need a temporary register:
671             //   - if the index is `native int` then we need to load the array
672             //     length into a register to widen it to `native int`
673             //   - if the index is `int` (or smaller) then we need to widen
674             //     it to `long` to perform the address calculation
675             internalDef = buildInternalIntRegisterDefForNode(tree);
676 #else  // !_TARGET_64BIT_
677             assert(!varTypeIsLong(tree->AsIndexAddr()->Index()->TypeGet()));
678             switch (tree->AsIndexAddr()->gtElemSize)
679             {
680                 case 1:
681                 case 2:
682                 case 4:
683                 case 8:
684                     break;
685
686                 default:
687                     internalDef = buildInternalIntRegisterDefForNode(tree);
688                     break;
689             }
690 #endif // !_TARGET_64BIT_
691             srcCount = BuildBinaryUses(tree->AsOp());
692             if (internalDef != nullptr)
693             {
694                 buildInternalRegisterUses();
695             }
696             BuildDef(tree);
697         }
698         break;
699
700     } // end switch (tree->OperGet())
701
702     // We need to be sure that we've set srcCount and dstCount appropriately.
703     // Note that for XARCH, the maximum number of registers defined is 2.
704     assert((dstCount < 2) || ((dstCount == 2) && tree->IsMultiRegNode()));
705     assert(isLocalDefUse == (tree->IsValue() && tree->IsUnusedValue()));
706     assert(!tree->IsUnusedValue() || (dstCount != 0));
707     assert(dstCount == tree->GetRegisterDstCount());
708     INDEBUG(dumpNodeInfo(tree, dstCandidates, srcCount, dstCount));
709     return srcCount;
710 }
711
712 //------------------------------------------------------------------------
713 // getTgtPrefOperands: Identify whether the operands of an Op should be preferenced to the target.
714 //
715 // Arguments:
716 //    tree    - the node of interest.
717 //    prefOp1 - a bool "out" parameter indicating, on return, whether op1 should be preferenced to the target.
718 //    prefOp2 - a bool "out" parameter indicating, on return, whether op2 should be preferenced to the target.
719 //
720 // Return Value:
721 //    This has two "out" parameters for returning the results (see above).
722 //
723 // Notes:
724 //    The caller is responsible for initializing the two "out" parameters to false.
725 //
726 void LinearScan::getTgtPrefOperands(GenTreeOp* tree, bool& prefOp1, bool& prefOp2)
727 {
728     // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
729     // Even then we would like to set isTgtPref on Op1.
730     if (tree->OperIsBinary() && isRMWRegOper(tree))
731     {
732         GenTree* op1 = tree->gtGetOp1();
733         GenTree* op2 = tree->gtGetOp2();
734
735         // If we have a read-modify-write operation, we want to preference op1 to the target,
736         // if it is not contained.
737         if (!op1->isContained() && !op1->OperIs(GT_LIST))
738         {
739             prefOp1 = true;
740         }
741
742         // For commutative opers like add/mul/and/or/xor, codegen may reverse the order of the operands if it is safe to do so.
743         // In that case we will preference both, to increase the chance of getting a match.
744         if (tree->OperIsCommutative() && op2 != nullptr && !op2->isContained())
745         {
746             prefOp2 = true;
747         }
748     }
749 }
750
751 //------------------------------------------------------------------------------
752 // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format
753 //
754 // Arguments:
755 //    tree      - a binary tree node
756 //
757 // Return Value:
758 //    Returns true if we can use the read-modify-write instruction form
759 //
760 // Notes:
761 //    This is used to determine whether to preference the source to the destination register.
762 //
763 bool LinearScan::isRMWRegOper(GenTree* tree)
764 {
765     // TODO-XArch-CQ: Make this more accurate.
766     // For now, we assume that most binary operators are of the RMW form.
767     assert(tree->OperIsBinary());
768
769     if (tree->OperIsCompare() || tree->OperIs(GT_CMP) || tree->OperIs(GT_BT))
770     {
771         return false;
772     }
773
774     switch (tree->OperGet())
775     {
776         // These Opers either support a three op form (i.e. GT_LEA), or do not read/write their first operand
777         case GT_LEA:
778         case GT_STOREIND:
779         case GT_ARR_INDEX:
780         case GT_STORE_BLK:
781         case GT_STORE_OBJ:
782         case GT_SWITCH_TABLE:
783         case GT_LOCKADD:
784 #ifdef _TARGET_X86_
785         case GT_LONG:
786 #endif
787             return false;
788
789         // x86/x64 does support a three-operand multiply when op1 or op2 is a contained immediate
790         case GT_MUL:
791             return (!tree->gtGetOp2()->isContainedIntOrIImmed() && !tree->gtGetOp1()->isContainedIntOrIImmed());
792
793 #ifdef FEATURE_HW_INTRINSICS
794         case GT_HWIntrinsic:
795             return tree->isRMWHWIntrinsic(compiler);
796 #endif // FEATURE_HW_INTRINSICS
797
798         default:
799             return true;
800     }
801 }
802
803 // Support for building RefPositions for RMW nodes.
804 int LinearScan::BuildRMWUses(GenTreeOp* node, regMaskTP candidates)
805 {
806     int       srcCount      = 0;
807     GenTree*  op1           = node->gtOp1;
808     GenTree*  op2           = node->gtGetOp2IfPresent();
809     bool      isReverseOp   = node->IsReverseOp();
810     regMaskTP op1Candidates = candidates;
811     regMaskTP op2Candidates = candidates;
812
813 #ifdef _TARGET_X86_
814     if (varTypeIsByte(node))
815     {
816         regMaskTP byteCandidates = (candidates == RBM_NONE) ? allByteRegs() : (candidates & allByteRegs());
817         if (!op1->isContained())
818         {
819             assert(byteCandidates != RBM_NONE);
820             op1Candidates = byteCandidates;
821         }
822         if (node->OperIsCommutative() && !op2->isContained())
823         {
824             assert(byteCandidates != RBM_NONE);
825             op2Candidates = byteCandidates;
826         }
827     }
828 #endif // _TARGET_X86_
829
830     bool prefOp1 = false;
831     bool prefOp2 = false;
832     getTgtPrefOperands(node, prefOp1, prefOp2);
833     assert(!prefOp2 || node->OperIsCommutative());
834     assert(!isReverseOp || node->OperIsCommutative());
835
836     // Determine which operand, if any, should be delayRegFree. Normally, this would be op2,
837     // but if we have a commutative operator and op1 is a contained memory op, it would be op1.
838     // We need to make the delayRegFree operand remain live until the op is complete, by marking
839     // the source(s) associated with op2 as "delayFree".
840     // Note that if op2 of a binary RMW operator is a memory op, even if the operator
841     // is commutative, codegen cannot reverse them.
842     // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
843     // more work to be done to correctly reverse the operands if they involve memory
844     // operands.  Also, we may need to handle more cases than GT_IND, especially once
845     // we've modified the register allocator to not require all nodes to be assigned
846     // a register (e.g. a spilled lclVar can often be referenced directly from memory).
847     // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
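    // For example (illustrative): if op2 is a contained address mode [base + index] consumed by an RMW add,
    // the base and index registers stay in use through the instruction that also writes the def (which
    // shares op1's register), so they must not be assigned the same register as that def - hence delayFree.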
848     GenTree* delayUseOperand = op2;
849     if (node->OperIsCommutative())
850     {
851         if (op1->isContained() && op2 != nullptr)
852         {
853             delayUseOperand = op1;
854         }
855         else if (!op2->isContained() || op2->IsCnsIntOrI())
856         {
857             // If we have a commutative operator and op2 is not a memory op, we don't need
858             // to set delayRegFree on either operand because codegen can swap them.
859             delayUseOperand = nullptr;
860         }
861     }
862     else if (op1->isContained())
863     {
864         delayUseOperand = nullptr;
865     }
866     if (delayUseOperand != nullptr)
867     {
868         assert(!prefOp1 || delayUseOperand != op1);
869         assert(!prefOp2 || delayUseOperand != op2);
870     }
871
872     if (isReverseOp)
873     {
874         op1 = op2;
875         op2 = node->gtOp1;
876     }
877
878     // Build first use
879     if (prefOp1)
880     {
881         assert(!op1->isContained());
882         tgtPrefUse = BuildUse(op1, op1Candidates);
883         srcCount++;
884     }
885     else if (delayUseOperand == op1)
886     {
887         srcCount += BuildDelayFreeUses(op1, op1Candidates);
888     }
889     else
890     {
891         srcCount += BuildOperandUses(op1, op1Candidates);
892     }
893     // Build second use
894     if (op2 != nullptr)
895     {
896         if (prefOp2)
897         {
898             assert(!op2->isContained());
899             tgtPrefUse2 = BuildUse(op2, op2Candidates);
900             srcCount++;
901         }
902         else if (delayUseOperand == op2)
903         {
904             srcCount += BuildDelayFreeUses(op2, op2Candidates);
905         }
906         else
907         {
908             srcCount += BuildOperandUses(op2, op2Candidates);
909         }
910     }
911     return srcCount;
912 }
913
914 //------------------------------------------------------------------------
915 // BuildShiftRotate: Set the NodeInfo for a shift or rotate.
916 //
917 // Arguments:
918 //    tree      - The node of interest
919 //
920 // Return Value:
921 //    The number of sources consumed by this node.
922 //
923 int LinearScan::BuildShiftRotate(GenTree* tree)
924 {
925     // For shift operations, the shift count must be
926     // placed in CL when the number of bits to shift
927     // is not a constant.
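    // For illustration: a variable-count shift is encoded as e.g. "shl eax, cl", so the shift count is
    // constrained to RCX below, and the other operands and the def avoid RCX.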
928     int       srcCount      = 0;
929     GenTree*  shiftBy       = tree->gtGetOp2();
930     GenTree*  source        = tree->gtGetOp1();
931     regMaskTP srcCandidates = RBM_NONE;
932     regMaskTP dstCandidates = RBM_NONE;
933
934     // x64 can encode an 8-bit shift count, but the hardware uses only the low 5 (or 6 for 64-bit operands)
935     // bits; the others are masked off. We will allow whatever can be encoded - hope you know what you are doing.
936     if (shiftBy->isContained())
937     {
938         assert(shiftBy->OperIsConst());
939     }
940     else
941     {
942         srcCandidates = allRegs(TYP_INT) & ~RBM_RCX;
943         dstCandidates = allRegs(TYP_INT) & ~RBM_RCX;
944     }
945
946     // Note that Rotate Left/Right instructions don't set ZF and SF flags.
947     //
948     // If the operand being shifted is 32-bits then upper three bits are masked
949     // by hardware to get actual shift count.  Similarly for 64-bit operands
950     // shift count is narrowed to [0..63].  If the resulting shift count is zero,
951     // then shift operation won't modify flags.
952     //
953     // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
954     // if the shift count is known to be non-zero and in the range depending on the
955     // operand size.
956     CLANG_FORMAT_COMMENT_ANCHOR;
957
958 #ifdef _TARGET_X86_
959     // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
960     // we can have a three operand form.
961     if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
962     {
963         assert((source->OperGet() == GT_LONG) && source->isContained());
964
965         GenTree* sourceLo = source->gtGetOp1();
966         GenTree* sourceHi = source->gtGetOp2();
967         assert(!sourceLo->isContained() && !sourceHi->isContained());
968         RefPosition* sourceLoUse = BuildUse(sourceLo, srcCandidates);
969         RefPosition* sourceHiUse = BuildUse(sourceHi, srcCandidates);
970
971         if (!tree->isContained())
972         {
973             if (tree->OperGet() == GT_LSH_HI)
974             {
975                 setDelayFree(sourceLoUse);
976             }
977             else
978             {
979                 setDelayFree(sourceHiUse);
980             }
981         }
982     }
983     else
984 #endif
985         if (!source->isContained())
986     {
987         tgtPrefUse = BuildUse(source, srcCandidates);
988         srcCount++;
989     }
990     else
991     {
992         srcCount += BuildOperandUses(source, srcCandidates);
993     }
994     if (!tree->isContained())
995     {
996         if (!shiftBy->isContained())
997         {
998             srcCount += BuildDelayFreeUses(shiftBy, RBM_RCX);
999             buildKillPositionsForNode(tree, currentLoc + 1, RBM_RCX);
1000         }
1001         BuildDef(tree, dstCandidates);
1002     }
1003     else
1004     {
1005         if (!shiftBy->isContained())
1006         {
1007             srcCount += BuildOperandUses(shiftBy, RBM_RCX);
1008             buildKillPositionsForNode(tree, currentLoc + 1, RBM_RCX);
1009         }
1010     }
1011     return srcCount;
1012 }
1013
1014 //------------------------------------------------------------------------
1015 // BuildCall: Set the NodeInfo for a call.
1016 //
1017 // Arguments:
1018 //    call      - The call node of interest
1019 //
1020 // Return Value:
1021 //    The number of sources consumed by this node.
1022 //
1023 int LinearScan::BuildCall(GenTreeCall* call)
1024 {
1025     bool            hasMultiRegRetVal = false;
1026     ReturnTypeDesc* retTypeDesc       = nullptr;
1027     int             srcCount          = 0;
1028     int             dstCount          = 0;
1029     regMaskTP       dstCandidates     = RBM_NONE;
1030
1031     assert(!call->isContained());
1032     if (call->TypeGet() != TYP_VOID)
1033     {
1034         hasMultiRegRetVal = call->HasMultiRegRetVal();
1035         if (hasMultiRegRetVal)
1036         {
1037             // dst count = number of registers in which the value is returned by call
1038             retTypeDesc = call->GetReturnTypeDesc();
1039             dstCount    = retTypeDesc->GetReturnRegCount();
1040         }
1041         else
1042         {
1043             dstCount = 1;
1044         }
1045     }
1046
1047     GenTree* ctrlExpr = call->gtControlExpr;
1048     if (call->gtCallType == CT_INDIRECT)
1049     {
1050         ctrlExpr = call->gtCallAddr;
1051     }
1052
1053     RegisterType registerType = call->TypeGet();
1054
1055     // Set destination candidates for return value of the call.
1056     CLANG_FORMAT_COMMENT_ANCHOR;
1057
1058 #ifdef _TARGET_X86_
1059     if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
1060     {
1061         // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
1062         // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
1063         // correct argument registers.
1064         dstCandidates = RBM_PINVOKE_TCB;
1065     }
1066     else
1067 #endif // _TARGET_X86_
1068         if (hasMultiRegRetVal)
1069     {
1070         assert(retTypeDesc != nullptr);
1071         dstCandidates = retTypeDesc->GetABIReturnRegs();
1072         assert((int)genCountBits(dstCandidates) == dstCount);
1073     }
1074     else if (varTypeIsFloating(registerType))
1075     {
1076 #ifdef _TARGET_X86_
1077         // The return value will be on the X87 stack, and we will need to move it.
1078         dstCandidates = allRegs(registerType);
1079 #else  // !_TARGET_X86_
1080         dstCandidates              = RBM_FLOATRET;
1081 #endif // !_TARGET_X86_
1082     }
1083     else if (registerType == TYP_LONG)
1084     {
1085         dstCandidates = RBM_LNGRET;
1086     }
1087     else
1088     {
1089         dstCandidates = RBM_INTRET;
1090     }
1091
1092     // number of args to a call =
1093     // callRegArgs + (callargs - placeholders, setup, etc)
1094     // there is an explicit thisPtr but it is redundant
1095
1096     bool callHasFloatRegArgs = false;
1097     bool isVarArgs           = call->IsVarargs();
1098
1099     // First, determine internal registers.
1100     // We will need one for any float arguments to a varArgs call.
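    // For illustration (Windows x64 varargs convention): a floating-point register argument must also be
    // duplicated into the corresponding integer argument register (e.g. a double passed in XMM1 is also
    // copied to RDX); the internal register reserved by HandleFloatVarArgs keeps that integer reg available.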
1101     for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
1102     {
1103         GenTree* argNode = list->Current();
1104         if (argNode->OperIsPutArgReg())
1105         {
1106             HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs);
1107         }
1108         else if (argNode->OperGet() == GT_FIELD_LIST)
1109         {
1110             for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1111             {
1112                 assert(entry->Current()->OperIsPutArgReg());
1113                 HandleFloatVarArgs(call, entry->Current(), &callHasFloatRegArgs);
1114             }
1115         }
1116     }
1117
1118     // Now, count reg args
1119     for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
1120     {
1121         // By this point, lowering has ensured that all call arguments are one of the following:
1122         // - an arg setup store
1123         // - an arg placeholder
1124         // - a nop
1125         // - a copy blk
1126         // - a field list
1127         // - a put arg
1128         //
1129         // Note that this property is statically checked by LinearScan::CheckBlock.
1130         GenTree* argNode = list->Current();
1131
1132         // Each register argument corresponds to one source.
1133         if (argNode->OperIsPutArgReg())
1134         {
1135             srcCount++;
1136             BuildUse(argNode, genRegMask(argNode->gtRegNum));
1137         }
1138 #ifdef UNIX_AMD64_ABI
1139         else if (argNode->OperGet() == GT_FIELD_LIST)
1140         {
1141             for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1142             {
1143                 assert(entry->Current()->OperIsPutArgReg());
1144                 srcCount++;
1145                 BuildUse(entry->Current(), genRegMask(entry->Current()->gtRegNum));
1146             }
1147         }
1148 #endif // UNIX_AMD64_ABI
1149
1150 #ifdef DEBUG
1151         // In DEBUG only, check validity with respect to the arg table entry.
1152
1153         fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
1154         assert(curArgTabEntry);
1155
1156         if (curArgTabEntry->regNum == REG_STK)
1157         {
1158             // late arg that is not passed in a register
1159             assert(argNode->gtOper == GT_PUTARG_STK);
1160
1161 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1162             // If the node is TYP_STRUCT and it is put on stack with
1163             // putarg_stk operation, we consume and produce no registers.
1164             // In this case the embedded Obj node should not produce
1165             // registers either, since it is contained.
1166             // Note that if it is a SIMD type the argument will be in a register.
1167             if (argNode->TypeGet() == TYP_STRUCT)
1168             {
1169                 assert(argNode->gtGetOp1() != nullptr && argNode->gtGetOp1()->OperGet() == GT_OBJ);
1170                 assert(argNode->gtGetOp1()->isContained());
1171             }
1172 #endif // FEATURE_PUT_STRUCT_ARG_STK
1173             continue;
1174         }
1175 #ifdef UNIX_AMD64_ABI
1176         if (argNode->OperGet() == GT_FIELD_LIST)
1177         {
1178             assert(argNode->isContained());
1179             assert(varTypeIsStruct(argNode) || curArgTabEntry->isStruct);
1180
1181             int i = 0;
1182             for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1183             {
1184                 const regNumber argReg = (i == 0) ? curArgTabEntry->regNum : curArgTabEntry->otherRegNum;
1185                 assert(entry->Current()->gtRegNum == argReg);
1186                 assert(i < 2);
1187                 i++;
1188             }
1189         }
1190         else
1191 #endif // UNIX_AMD64_ABI
1192         {
1193             const regNumber argReg = curArgTabEntry->regNum;
1194             assert(argNode->gtRegNum == argReg);
1195         }
1196 #endif // DEBUG
1197     }
1198
1199     // Now, count stack args
1200     // Note that these need to be computed into a register, but then
1201     // they're just stored to the stack - so the reg doesn't
1202     // need to remain live until the call.  In fact, it must not
1203     // because the code generator doesn't actually consider it live,
1204     // so it can't be spilled.
1205
1206     GenTree* args = call->gtCallArgs;
1207     while (args)
1208     {
1209         GenTree* arg = args->gtGetOp1();
1210         if (!(arg->gtFlags & GTF_LATE_ARG))
1211         {
1212             if (arg->IsValue() && !arg->isContained())
1213             {
1214                 assert(arg->IsUnusedValue());
1215             }
1216         }
1217         args = args->gtGetOp2();
1218     }
1219
1220     // Set reg requirements on the call target, represented as a control expression.
1221     if (ctrlExpr != nullptr)
1222     {
1223         regMaskTP ctrlExprCandidates = RBM_NONE;
1224
1225         // In case of fast tail implemented as jmp, make sure that gtControlExpr is
1226         // computed into a register.
1227         if (call->IsFastTailCall())
1228         {
1229             assert(!ctrlExpr->isContained());
1230             // Fast tail call - make sure that call target is always computed in RAX
1231             // so that epilog sequence can generate "jmp rax" to achieve fast tail call.
1232             ctrlExprCandidates = RBM_RAX;
1233         }
1234 #ifdef _TARGET_X86_
1235         else if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
1236         {
1237             // On x86, we need to generate a very specific pattern for indirect VSD calls:
1238             //
1239             //    3-byte nop
1240             //    call dword ptr [eax]
1241             //
1242             // Where EAX is also used as an argument to the stub dispatch helper. Make
1243             // sure that the call target address is computed into EAX in this case.
1244             assert(ctrlExpr->isIndir() && ctrlExpr->isContained());
1245             ctrlExprCandidates = RBM_VIRTUAL_STUB_TARGET;
1246         }
1247 #endif // _TARGET_X86_
1248
1249 #if FEATURE_VARARG
1250         // If it is a fast tail call, it is already preferenced to use RAX.
1251         // Therefore, there is no need to set src candidates on the call target again.
1252         if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall())
1253         {
1254             // Don't assign the call target to any of the argument registers because
1255             // we will use them to also pass floating point arguments as required
1256             // by Amd64 ABI.
1257             ctrlExprCandidates = allRegs(TYP_INT) & ~(RBM_ARG_REGS);
1258         }
1259 #endif // FEATURE_VARARG
1260         srcCount += BuildOperandUses(ctrlExpr, ctrlExprCandidates);
1261     }
1262
1263     buildInternalRegisterUses();
1264
1265     // Now generate defs and kills.
1266     regMaskTP killMask = getKillSetForCall(call);
1267     BuildDefsWithKills(call, dstCount, dstCandidates, killMask);
1268     return srcCount;
1269 }
1270
1271 //------------------------------------------------------------------------
1272 // BuildBlockStore: Set the NodeInfo for a block store.
1273 //
1274 // Arguments:
1275 //    blkNode       - The block store node of interest
1276 //
1277 // Return Value:
1278 //    The number of sources consumed by this node.
1279 //
1280 int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
1281 {
1282     GenTree* dstAddr  = blkNode->Addr();
1283     unsigned size     = blkNode->gtBlkSize;
1284     GenTree* source   = blkNode->Data();
1285     int      srcCount = 0;
1286
1287     GenTree* srcAddrOrFill = nullptr;
1288     bool     isInitBlk     = blkNode->OperIsInitBlkOp();
1289
1290     regMaskTP dstAddrRegMask = RBM_NONE;
1291     regMaskTP sourceRegMask  = RBM_NONE;
1292     regMaskTP blkSizeRegMask = RBM_NONE;
1293
1294     if (isInitBlk)
1295     {
1296         GenTree* initVal = source;
1297         if (initVal->OperIsInitVal())
1298         {
1299             assert(initVal->isContained());
1300             initVal = initVal->gtGetOp1();
1301         }
1302         srcAddrOrFill = initVal;
1303
1304         switch (blkNode->gtBlkOpKind)
1305         {
1306             case GenTreeBlk::BlkOpKindUnroll:
1307                 assert(initVal->IsCnsIntOrI());
1308                 if (size >= XMM_REGSIZE_BYTES)
1309                 {
1310                     // Reserve an XMM register to fill it with a pack of 16 init value constants.
1311                     buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
1312                     // Use an XMM register to fill with constants; this may be an AVX instruction, so set the flag.
1313                     SetContainsAVXFlags();
1314                 }
1315 #ifdef _TARGET_X86_
1316                 if ((size & 1) != 0)
1317                 {
1318                     // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
1319                     // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
1320                     // when unrolling, so only allow byteable registers as the source value. (We could
1321                     // consider just using BlkOpKindRepInstr instead.)
1322                     sourceRegMask = allByteRegs();
1323                 }
1324 #endif // _TARGET_X86_
1325                 break;
1326
1327             case GenTreeBlk::BlkOpKindRepInstr:
1328                 // rep stos has the following register requirements:
1329                 // a) The destination address has to be in RDI.
1330                 // b) The fill value has to be in RAX.
1331                 // c) The buffer size will go in RCX.
1332                 dstAddrRegMask = RBM_RDI;
1333                 sourceRegMask  = RBM_RAX;
1334                 blkSizeRegMask = RBM_RCX;
1335                 break;
1336
1337             case GenTreeBlk::BlkOpKindHelper:
1338 #ifdef _TARGET_AMD64_
1339                 // The helper follows the regular AMD64 ABI.
1340                 dstAddrRegMask = RBM_ARG_0;
1341                 sourceRegMask  = RBM_ARG_1;
1342                 blkSizeRegMask = RBM_ARG_2;
1343 #else  // !_TARGET_AMD64_
1344                 dstAddrRegMask     = RBM_RDI;
1345                 sourceRegMask      = RBM_RAX;
1346                 blkSizeRegMask     = RBM_RCX;
1347 #endif // !_TARGET_AMD64_
1348                 break;
1349
1350             default:
1351                 unreached();
1352         }
1353     }
1354     else
1355     {
1356         // CopyObj or CopyBlk
1357         if (source->gtOper == GT_IND)
1358         {
1359             assert(source->isContained());
1360             srcAddrOrFill = source->gtGetOp1();
1361         }
1362         if (blkNode->OperGet() == GT_STORE_OBJ)
1363         {
1364             if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr)
1365             {
1366                 // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
1367                 blkSizeRegMask = RBM_RCX;
1368             }
1369             // The srcAddr must be in a register.  If it was under a GT_IND, we need to subsume all of its
1370             // sources.
1371             sourceRegMask  = RBM_RSI;
1372             dstAddrRegMask = RBM_RDI;
1373         }
1374         else
1375         {
1376             switch (blkNode->gtBlkOpKind)
1377             {
1378                 case GenTreeBlk::BlkOpKindUnroll:
1379                     // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1380                     //
1381                     // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1382                     // But on x86 only RBM_BYTE_REGS could be used as byte registers.  Therefore, exclude
1383                     // RBM_NON_BYTE_REGS from internal candidates.
1384                     if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
1385                     {
1386                         regMaskTP regMask = allRegs(TYP_INT);
1387
1388 #ifdef _TARGET_X86_
1389                         if ((size & 1) != 0)
1390                         {
1391                             regMask &= ~RBM_NON_BYTE_REGS;
1392                         }
1393 #endif
1394                         buildInternalIntRegisterDefForNode(blkNode, regMask);
1395                     }
1396
1397                     if (size >= XMM_REGSIZE_BYTES)
1398                     {
1399                         // If we have a buffer larger than XMM_REGSIZE_BYTES,
1400                         // reserve an XMM register to use it for a
1401                         // series of 16-byte loads and stores.
1402                         buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
1403                         // The loads and stores use an XMM register and may be encoded as AVX
1404                         // instructions, so set the ContainsAVX flag.
1405                         SetContainsAVXFlags();
1406                     }
1407                     break;
1408
1409                 case GenTreeBlk::BlkOpKindRepInstr:
1410                     // rep movs has the following register requirements:
1411                     // a) The dest address has to be in RDI.
1412                     // b) The src address has to be in RSI.
1413                     // c) The buffer size will go in RCX.
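                    // For illustration: "rep movsd/movsq" copies RCX elements from [RSI] to [RDI],
                    // advancing both pointers, which is where the fixed register requirements above come from.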
1414                     dstAddrRegMask = RBM_RDI;
1415                     sourceRegMask  = RBM_RSI;
1416                     blkSizeRegMask = RBM_RCX;
1417                     break;
1418
1419                 case GenTreeBlk::BlkOpKindHelper:
1420 #ifdef _TARGET_AMD64_
1421                     // The helper follows the regular AMD64 ABI.
1422                     dstAddrRegMask = RBM_ARG_0;
1423                     sourceRegMask  = RBM_ARG_1;
1424                     blkSizeRegMask = RBM_ARG_2;
1425 #else  // !_TARGET_AMD64_
1426                     dstAddrRegMask = RBM_RDI;
1427                     sourceRegMask  = RBM_RAX;
1428                     blkSizeRegMask = RBM_RCX;
1429 #endif // !_TARGET_AMD64_
1430                     break;
1431
1432                 default:
1433                     unreached();
1434             }
1435         }
1436         if ((srcAddrOrFill == nullptr) && (sourceRegMask != RBM_NONE))
1437         {
1438             // This is a local source; we'll use a temp register for its address.
1439             assert(source->isContained() && source->OperIsLocal());
1440             buildInternalIntRegisterDefForNode(blkNode, sourceRegMask);
1441         }
1442     }
1443
1444     if ((size != 0) && (blkSizeRegMask != RBM_NONE))
1445     {
1446         // Reserve a temp register for the block size argument.
1447         buildInternalIntRegisterDefForNode(blkNode, blkSizeRegMask);
1448     }
1449
1450     if (!dstAddr->isContained() && !blkNode->IsReverseOp())
1451     {
1452         srcCount++;
1453         BuildUse(dstAddr, dstAddrRegMask);
1454     }
1455     if ((srcAddrOrFill != nullptr) && !srcAddrOrFill->isContained())
1456     {
1457         srcCount++;
1458         BuildUse(srcAddrOrFill, sourceRegMask);
1459     }
1460     if (!dstAddr->isContained() && blkNode->IsReverseOp())
1461     {
1462         srcCount++;
1463         BuildUse(dstAddr, dstAddrRegMask);
1464     }
1465
1466     if (size == 0)
1467     {
1468         assert(blkNode->OperIs(GT_STORE_DYN_BLK));
1469         // The block size argument is the third argument to GT_STORE_DYN_BLK
1470         srcCount++;
1471         GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
1472         BuildUse(blockSize, blkSizeRegMask);
1473     }
1474     buildInternalRegisterUses();
1475     regMaskTP killMask = getKillSetForBlockStore(blkNode);
1476     BuildDefsWithKills(blkNode, 0, RBM_NONE, killMask);
1477     return srcCount;
1478 }
1479
1480 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1481 //------------------------------------------------------------------------
1482 // BuildPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
1483 //
1484 // Arguments:
1485 //    putArgStk - The node of interest
1486 //
1487 // Return Value:
1488 //    The number of sources consumed by this node.
1489 //
1490 int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
1491 {
1492     int srcCount = 0;
1493     if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
1494     {
1495         assert(putArgStk->gtOp1->isContained());
1496
1497         RefPosition* simdTemp   = nullptr;
1498         RefPosition* intTemp    = nullptr;
1499         unsigned     prevOffset = putArgStk->getArgSize();
1500         // We need to iterate over the fields twice; once to determine the need for internal temps,
1501         // and once to actually build the uses.
1502         for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
1503         {
1504             GenTree* const  fieldNode   = current->Current();
1505             const var_types fieldType   = fieldNode->TypeGet();
1506             const unsigned  fieldOffset = current->gtFieldOffset;
1507
1508 #ifdef _TARGET_X86_
1509             assert(fieldType != TYP_LONG);
1510 #endif // _TARGET_X86_
1511
1512 #if defined(FEATURE_SIMD)
1513             // Note that we need to check the GT_FIELD_LIST type, not 'fieldType'. This is because the
1514             // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where
1515             // we "round up" to 16.
1516             if ((current->gtFieldType == TYP_SIMD12) && (simdTemp == nullptr))
1517             {
1518                 simdTemp = buildInternalFloatRegisterDefForNode(putArgStk);
1519             }
1520 #endif // defined(FEATURE_SIMD)
1521
1522 #ifdef _TARGET_X86_
1523             if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push)
1524             {
1525                 // We can treat as a slot any field that is stored at a slot boundary, where the previous
1526                 // field is not in the same slot. (Note that we store the fields in reverse order.)
1527                 const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
1528                 if (intTemp == nullptr)
1529                 {
1530                     intTemp = buildInternalIntRegisterDefForNode(putArgStk);
1531                 }
1532                 if (!fieldIsSlot && varTypeIsByte(fieldType))
1533                 {
1534                     // If this field were a slot--i.e. an integer field that is 4-byte aligned and takes up 4 bytes
1535                     // (including padding)--we could store the whole value rather than just the byte. Since it is not,
1536                     // we need a byte-addressable register for the store. We enforce this requirement on an internal
1537                     // register, which we can also use to copy multiple byte values.
1538                     intTemp->registerAssignment &= allByteRegs();
1539                 }
1540             }
1541 #endif // _TARGET_X86_
1542
1543             if (varTypeIsGC(fieldType))
1544             {
1545                 putArgStk->gtNumberReferenceSlots++;
1546             }
1547             prevOffset = fieldOffset;
1548         }
1549
1550         for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
1551         {
1552             GenTree* const fieldNode = current->Current();
1553             if (!fieldNode->isContained())
1554             {
1555                 BuildUse(fieldNode);
1556                 srcCount++;
1557             }
1558         }
1559         buildInternalRegisterUses();
1560
1561         return srcCount;
1562     }
1563
1564     GenTree*  src  = putArgStk->gtOp1;
1565     var_types type = src->TypeGet();
1566
1567 #if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1568     // For PutArgStk of a TYP_SIMD12, we need an extra register.
1569     if (putArgStk->isSIMD12())
1570     {
1571         buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
1572         BuildUse(putArgStk->gtOp1);
1573         srcCount = 1;
1574         buildInternalRegisterUses();
1575         return srcCount;
1576     }
1577 #endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1578
1579     if (type != TYP_STRUCT)
1580     {
1581         return BuildSimple(putArgStk);
1582     }
1583
1584     GenTree* dst     = putArgStk;
1585     GenTree* srcAddr = nullptr;
1586
1587     // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
1588     // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
1589     // our framework assemblies, so this is the main code generation scheme we'll use.
1590     ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
1591     switch (putArgStk->gtPutArgStkKind)
1592     {
1593         case GenTreePutArgStk::Kind::Push:
1594         case GenTreePutArgStk::Kind::PushAllSlots:
1595         case GenTreePutArgStk::Kind::Unroll:
1596             // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1597             //
1598             // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1599             // But on x86 only RBM_BYTE_REGS can be used as byte registers.  Therefore, exclude
1600             // RBM_NON_BYTE_REGS from internal candidates.
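                 // For example, copying a 13-byte remainder on x86 might be unrolled as (a sketch,
                 // not the exact emitted sequence):
                 //     mov eax, [src]      / mov [dst], eax        ; 4 bytes
                 //     mov eax, [src+4]    / mov [dst+4], eax      ; 4 bytes
                 //     mov eax, [src+8]    / mov [dst+8], eax      ; 4 bytes
                 //     mov al,  [src+12]   / mov [dst+12], al      ; the odd byte needs a byte-addressable reg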
1601             if ((putArgStk->gtNumberReferenceSlots == 0) && (size & (XMM_REGSIZE_BYTES - 1)) != 0)
1602             {
1603                 regMaskTP regMask = allRegs(TYP_INT);
1604
1605 #ifdef _TARGET_X86_
1606                 if ((size % 2) != 0)
1607                 {
1608                     regMask &= ~RBM_NON_BYTE_REGS;
1609                 }
1610 #endif
1611                 buildInternalIntRegisterDefForNode(putArgStk, regMask);
1612             }
1613
1614 #ifdef _TARGET_X86_
1615             if (size >= 8)
1616 #else  // !_TARGET_X86_
1617             if (size >= XMM_REGSIZE_BYTES)
1618 #endif // !_TARGET_X86_
1619             {
1620                 // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
1621                 // or larger than or equal to 8 bytes on x86, reserve an XMM register to use for a
1622                 // series of 16-byte loads and stores.
1623                 buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
1624                 SetContainsAVXFlags();
1625             }
1626             break;
1627
1628         case GenTreePutArgStk::Kind::RepInstr:
1629             buildInternalIntRegisterDefForNode(putArgStk, RBM_RDI);
1630             buildInternalIntRegisterDefForNode(putArgStk, RBM_RCX);
1631             buildInternalIntRegisterDefForNode(putArgStk, RBM_RSI);
1632             break;
1633
1634         default:
1635             unreached();
1636     }
1637
1638     srcCount = BuildOperandUses(src);
1639     buildInternalRegisterUses();
1640     return srcCount;
1641 }
1642 #endif // FEATURE_PUT_STRUCT_ARG_STK
1643
1644 //------------------------------------------------------------------------
1645 // BuildLclHeap: Set the NodeInfo for a GT_LCLHEAP.
1646 //
1647 // Arguments:
1648 //    tree      - The node of interest
1649 //
1650 // Return Value:
1651 //    The number of sources consumed by this node.
1652 //
1653 int LinearScan::BuildLclHeap(GenTree* tree)
1654 {
1655     int srcCount = 1;
1656
1657     // Need a variable number of temp regs (see genLclHeap() in codegenxarch.cpp):
1658     // Here '-' means don't care.
1659     //
1660     //     Size?                    Init Memory?         # temp regs
1661     //      0                            -                  0 (returns 0)
1662     //      const and <=6 reg words      -                  0 (pushes '0')
1663     //      const and >6 reg words       Yes                0 (pushes '0')
1664     //      const and <PageSize          No                 0 (amd64) 1 (x86)
1665     //                                                        (x86: tmpReg for subtracting from esp)
1666     //      const and >=PageSize         No                 2 (regCnt and tmpReg for subtracting from sp)
1667     //      Non-const                    Yes                0 (regCnt=targetReg and pushes '0')
1668     //      Non-const                    No                 2 (regCnt and tmpReg for subtracting from sp)
1669     //
1670     // Note: Here we don't need the internal register to be different from targetReg.
1671     // Rather, we require it to be different from the operand's reg.
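         // For the "const and >=PageSize, No init" row above, the two internal registers roughly
         // correspond to a page-probing loop of the following shape (a sketch; see genLclHeap for
         // the real sequence):
         //     mov  regCnt, alignedSize
         // loop:
         //     test [esp], eax          ; touch the page so the guard page is committed
         //     sub  esp, PAGE_SIZE
         //     sub  regCnt, PAGE_SIZE
         //     jae  loop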
1672
1673     GenTree* size = tree->gtGetOp1();
1674     if (size->IsCnsIntOrI())
1675     {
1676         assert(size->isContained());
1677         srcCount       = 0;
1678         size_t sizeVal = size->gtIntCon.gtIconVal;
1679
1680         if (sizeVal == 0)
1681         {
1682             buildInternalIntRegisterDefForNode(tree);
1683         }
1684         else
1685         {
1686             // Compute the amount of memory to properly STACK_ALIGN.
1687             // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
1688             // This should also help in debugging as we can examine the original size specified with localloc.
1689             sizeVal = AlignUp(sizeVal, STACK_ALIGN);
1690
1691             // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
1692             // we will generate 'push 0'.
1693             assert((sizeVal % REGSIZE_BYTES) == 0);
1694             size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
1695             if (cntRegSizedWords > 6)
1696             {
1697                 if (!compiler->info.compInitMem)
1698                 {
1699                     // No need to initialize allocated stack space.
1700                     if (sizeVal < compiler->eeGetPageSize())
1701                     {
1702 #ifdef _TARGET_X86_
1703                         // x86 needs a register here to avoid generating "sub" on ESP.
1704                         buildInternalIntRegisterDefForNode(tree);
1705 #endif
1706                     }
1707                     else
1708                     {
1709                         // We need two registers: regCnt and RegTmp
1710                         buildInternalIntRegisterDefForNode(tree);
1711                         buildInternalIntRegisterDefForNode(tree);
1712                     }
1713                 }
1714             }
1715         }
1716     }
1717     else
1718     {
1719         if (!compiler->info.compInitMem)
1720         {
1721             buildInternalIntRegisterDefForNode(tree);
1722             buildInternalIntRegisterDefForNode(tree);
1723         }
1724         BuildUse(size);
1725     }
1726     buildInternalRegisterUses();
1727     BuildDef(tree);
1728     return srcCount;
1729 }
1730
1731 //------------------------------------------------------------------------
1732 // BuildModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
1733 //
1734 // Arguments:
1735 //    tree      - The node of interest
1736 //
1737 // Return Value:
1738 //    The number of sources consumed by this node.
1739 //
1740 int LinearScan::BuildModDiv(GenTree* tree)
1741 {
1742     GenTree*     op1           = tree->gtGetOp1();
1743     GenTree*     op2           = tree->gtGetOp2();
1744     regMaskTP    dstCandidates = RBM_NONE;
1745     RefPosition* internalDef   = nullptr;
1746     int          srcCount      = 0;
1747
1748     if (varTypeIsFloating(tree->TypeGet()))
1749     {
1750         return BuildSimple(tree);
1751     }
1752
1753     // Amd64 Div/Idiv instruction:
1754     //    Dividend in RDX:RAX; it computes the
1755     //    Quotient in RAX and the Remainder in RDX.
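         // A sketch of what codegen later emits for the signed case (illustrative only):
         //     mov  rax, op1
         //     cdq / cqo                 ; sign-extend into RDX (zeroed instead for unsigned div)
         //     idiv op2                  ; quotient -> RAX, remainder -> RDX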
1756
1757     if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
1758     {
1759         // We are interested in just the remainder.
1760         // RAX is used as a trashable register during computation of remainder.
1761         dstCandidates = RBM_RDX;
1762     }
1763     else
1764     {
1765         // We are interested in just the quotient.
1766         // RDX is used as a trashable register during computation of the quotient.
1767         dstCandidates = RBM_RAX;
1768     }
1769
1770 #ifdef _TARGET_X86_
1771     if (op1->OperGet() == GT_LONG)
1772     {
1773         assert(op1->isContained());
1774
1775         // To avoid a register move, we would like op1's low part in RAX and its high part in RDX.
1776         GenTree* loVal = op1->gtGetOp1();
1777         GenTree* hiVal = op1->gtGetOp2();
1778         assert(!loVal->isContained() && !hiVal->isContained());
1779
1780         assert(op2->IsCnsIntOrI());
1781         assert(tree->OperGet() == GT_UMOD);
1782
1783         // This situation also requires an internal register.
1784         buildInternalIntRegisterDefForNode(tree);
1785
1786         BuildUse(loVal, RBM_EAX);
1787         BuildUse(hiVal, RBM_EDX);
1788         srcCount = 2;
1789     }
1790     else
1791 #endif
1792     {
1793         // If possible, we would like op1 in RAX to avoid a register move.
1794         RefPosition* op1Use = BuildUse(op1, RBM_EAX);
1795         tgtPrefUse          = op1Use;
1796         srcCount            = 1;
1797     }
1798
1799     srcCount += BuildDelayFreeUses(op2, allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
1800
1801     buildInternalRegisterUses();
1802
1803     regMaskTP killMask = getKillSetForModDiv(tree->AsOp());
1804     BuildDefsWithKills(tree, 1, dstCandidates, killMask);
1805     return srcCount;
1806 }
1807
1808 //------------------------------------------------------------------------
1809 // BuildIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
1810 //
1811 // Arguments:
1812 //    tree      - The node of interest
1813 //
1814 // Return Value:
1815 //    The number of sources consumed by this node.
1816 //
1817 int LinearScan::BuildIntrinsic(GenTree* tree)
1818 {
1819     // Both operand and its result must be of floating point type.
1820     GenTree* op1 = tree->gtGetOp1();
1821     assert(varTypeIsFloating(op1));
1822     assert(op1->TypeGet() == tree->TypeGet());
1823     RefPosition* internalFloatDef = nullptr;
1824
1825     switch (tree->gtIntrinsic.gtIntrinsicId)
1826     {
1827         case CORINFO_INTRINSIC_Abs:
1828             // Abs(float x) = x & 0x7fffffff
1829             // Abs(double x) = x & 0x7fffffff ffffffff
1830
1831             // In case of Abs we need an internal register to hold the mask.
1832
1833             // TODO-XArch-CQ: avoid using an internal register for the mask.
1834             // Andps or andpd both operate on 128-bit operands.
1835             // The data section constant that holds the mask is 64 bits in size.
1836             // Therefore, we need both the operand and the mask to be in an
1837             // xmm register. When we add support in the emitter to emit 128-bit
1838             // data constants and instructions that operate on 128-bit
1839             // memory operands, we can avoid the need for an internal register.
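                 // A sketch of the sequence this sets up for (illustrative only):
                 //     movss/movsd xmmTmp, [mask]    ; mask constant loaded from the data section
                 //     andps/andpd dst, xmmTmp       ; clears the sign bit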
1840             if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs)
1841             {
1842                 internalFloatDef = buildInternalFloatRegisterDefForNode(tree, internalFloatRegCandidates());
1843             }
1844             break;
1845
1846 #ifdef _TARGET_X86_
1847         case CORINFO_INTRINSIC_Cos:
1848         case CORINFO_INTRINSIC_Sin:
1849             NYI_X86("Math intrinsics Cos and Sin");
1850             break;
1851 #endif // _TARGET_X86_
1852
1853         case CORINFO_INTRINSIC_Sqrt:
1854         case CORINFO_INTRINSIC_Round:
1855         case CORINFO_INTRINSIC_Ceiling:
1856         case CORINFO_INTRINSIC_Floor:
1857             break;
1858
1859         default:
1860             // Right now only Abs/Sqrt/Round/Ceiling/Floor are treated as math intrinsics
1861             noway_assert(!"Unsupported math intrinsic");
1862             unreached();
1863             break;
1864     }
1865     assert(tree->gtGetOp2IfPresent() == nullptr);
1866     int srcCount;
1867     if (op1->isContained())
1868     {
1869         srcCount = BuildOperandUses(op1);
1870     }
1871     else
1872     {
1873         tgtPrefUse = BuildUse(op1);
1874         srcCount   = 1;
1875     }
1876     if (internalFloatDef != nullptr)
1877     {
1878         buildInternalRegisterUses();
1879     }
1880     BuildDef(tree);
1881     return srcCount;
1882 }
1883
1884 #ifdef FEATURE_SIMD
1885 //------------------------------------------------------------------------
1886 // BuildSIMD: Set the NodeInfo for a GT_SIMD tree.
1887 //
1888 // Arguments:
1889 //    simdTree   - The GT_SIMD node of interest
1890 //
1891 // Return Value:
1892 //    The number of sources consumed by this node.
1893 //
1894 int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
1895 {
1896     // Only SIMDIntrinsicInit can be contained. Other than that,
1897     // only SIMDIntrinsicOpEquality and SIMDIntrinsicOpInEquality can have 0 dstCount.
1898     int       dstCount      = simdTree->IsValue() ? 1 : 0;
1899     bool      buildUses     = true;
1900     regMaskTP dstCandidates = RBM_NONE;
1901
1902     if (simdTree->isContained())
1903     {
1904         assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicInit);
1905     }
1906     else if (dstCount != 1)
1907     {
1908         assert((simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ||
1909                (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality));
1910     }
1911     SetContainsAVXFlags(simdTree->gtSIMDSize);
1912     GenTree* op1      = simdTree->gtGetOp1();
1913     GenTree* op2      = simdTree->gtGetOp2();
1914     int      srcCount = 0;
1915
1916     switch (simdTree->gtSIMDIntrinsicID)
1917     {
1918         case SIMDIntrinsicInit:
1919         {
1920             // This sets all fields of a SIMD struct to the given value.
1921             // op1 is marked contained if it is either zero, an int constant of all 1's,
1922             // or a float constant with a 16- or 32-byte simdType (the AVX case).
1923             //
1924             // Note that for small int base types, the initVal has been constructed so that
1925             // we can use the full int value.
1926             CLANG_FORMAT_COMMENT_ANCHOR;
1927
1928 #if !defined(_TARGET_64BIT_)
1929             if (op1->OperGet() == GT_LONG)
1930             {
1931                 assert(op1->isContained());
1932                 GenTree* op1lo = op1->gtGetOp1();
1933                 GenTree* op1hi = op1->gtGetOp2();
1934
1935                 if (op1lo->isContained())
1936                 {
1937                     srcCount = 0;
1938                     assert(op1hi->isContained());
1939                     assert((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
1940                            (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)));
1941                 }
1942                 else
1943                 {
1944                     srcCount = 2;
1945                     buildInternalFloatRegisterDefForNode(simdTree);
1946                     setInternalRegsDelayFree = true;
1947                 }
1948
1949                 if (srcCount == 2)
1950                 {
1951                     BuildUse(op1lo, RBM_EAX);
1952                     BuildUse(op1hi, RBM_EDX);
1953                 }
1954                 buildUses = false;
1955             }
1956 #endif // !defined(_TARGET_64BIT_)
1957         }
1958         break;
1959
1960         case SIMDIntrinsicInitN:
1961         {
1962             var_types baseType = simdTree->gtSIMDBaseType;
1963             srcCount           = (short)(simdTree->gtSIMDSize / genTypeSize(baseType));
1964             // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
1965             buildInternalFloatRegisterDefForNode(simdTree);
1966             int initCount = 0;
1967             for (GenTree* list = op1; list != nullptr; list = list->gtGetOp2())
1968             {
1969                 assert(list->OperGet() == GT_LIST);
1970                 GenTree* listItem = list->gtGetOp1();
1971                 assert(listItem->TypeGet() == baseType);
1972                 assert(!listItem->isContained());
1973                 BuildUse(listItem);
1974                 initCount++;
1975             }
1976             assert(initCount == srcCount);
1977             buildUses = false;
1978         }
1979         break;
1980
1981         case SIMDIntrinsicInitArray:
1982             // We have an array and an index, which may be contained.
1983             break;
1984
1985         case SIMDIntrinsicDiv:
1986             // SSE2 has no instruction support for division on integer vectors
1987             noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
1988             break;
1989
1990         case SIMDIntrinsicAbs:
1991             // float/double vectors: Abs gets implemented as a bitwise-AND operation
1992             // with a mask and hence should never be seen here.
1993             //
1994             // Must be a Vector<int>, Vector<short>, or Vector<sbyte>.
1995             assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT ||
1996                    simdTree->gtSIMDBaseType == TYP_BYTE);
1997             assert(compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
1998             break;
1999
2000         case SIMDIntrinsicSqrt:
2001             // SSE2 has no instruction support for sqrt on integer vectors.
2002             noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
2003             break;
2004
2005         case SIMDIntrinsicAdd:
2006         case SIMDIntrinsicSub:
2007         case SIMDIntrinsicMul:
2008         case SIMDIntrinsicBitwiseAnd:
2009         case SIMDIntrinsicBitwiseAndNot:
2010         case SIMDIntrinsicBitwiseOr:
2011         case SIMDIntrinsicBitwiseXor:
2012         case SIMDIntrinsicMin:
2013         case SIMDIntrinsicMax:
2014             // SSE2 32-bit integer multiplication requires two temp regs
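                 // Without SSE4.1's pmulld, a Vector<int> multiply is assembled from pmuludq on the
                 // even and odd lanes plus shuffles to recombine the low 32 bits of each product,
                 // which is why two SIMD temporaries are reserved here (a sketch of the approach,
                 // not the exact sequence).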
2015             if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
2016                 compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
2017             {
2018                 buildInternalFloatRegisterDefForNode(simdTree);
2019                 buildInternalFloatRegisterDefForNode(simdTree);
2020             }
2021             break;
2022
2023         case SIMDIntrinsicEqual:
2024             break;
2025
2026         // SSE2 doesn't support < and <= directly on int vectors.
2027         // Instead we need to use > and >= with swapped operands.
2028         case SIMDIntrinsicLessThan:
2029         case SIMDIntrinsicLessThanOrEqual:
2030             noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
2031             break;
2032
2033         // SIMDIntrinsicGreaterThan is supported here only on non-floating point base type vectors.
2034         // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors;
2035         // instead we need to use < and <= with swapped operands.
2036         case SIMDIntrinsicGreaterThan:
2037             noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
2038             break;
2039
2040         case SIMDIntrinsicOpEquality:
2041         case SIMDIntrinsicOpInEquality:
2042             if (simdTree->gtGetOp2()->isContained())
2043             {
2044                 // If the second operand is contained then ContainCheckSIMD has determined
2045                 // that PTEST can be used. We only need a single source register and no
2046                 // internal registers.
2047             }
2048             else
2049             {
2050                 // Can't use PTEST so we need 2 source registers, 1 internal SIMD register
2051                 // (to hold the result of PCMPEQD or other similar SIMD compare instruction)
2052                 // and one internal INT register (to hold the result of PMOVMSKB).
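                     // Rough shape of the non-PTEST sequence (a sketch):
                     //     pcmpeqd  xmmTmp, op2        ; per-element compare
                     //     pmovmskb intTmp, xmmTmp     ; gather the per-byte results
                     //     cmp      intTmp, 0xFFFF     ; sets the condition flags consumed by the parent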
2053                 buildInternalIntRegisterDefForNode(simdTree);
2054                 buildInternalFloatRegisterDefForNode(simdTree);
2055             }
2056             // These SIMD nodes only set the condition flags.
2057             dstCount = 0;
2058             break;
2059
2060         case SIMDIntrinsicDotProduct:
2061             // Float/Double vectors:
2062             // For SSE, or AVX with 32-byte vectors, we also need an internal register
2063             // as scratch. Further we need the targetReg and internal reg to be distinct
2064             // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
2065             // don't need a tmpReg.
2066             //
2067             // 32-byte integer vector on SSE4/AVX:
2068             // will take advantage of phaddd, which operates only on 128-bit xmm reg.
2069             // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
2070             // registers since targetReg is an int type register.
2071             //
2072             // See genSIMDIntrinsicDotProduct() for details on code sequence generated
2073             // and the need for scratch registers.
2074             if (varTypeIsFloating(simdTree->gtSIMDBaseType))
2075             {
2076                 if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) ||
2077                     (simdTree->gtGetOp1()->TypeGet() == TYP_SIMD32))
2078                 {
2079                     buildInternalFloatRegisterDefForNode(simdTree);
2080                     setInternalRegsDelayFree = true;
2081                 }
2082                 // else don't need scratch reg(s).
2083             }
2084             else
2085             {
2086                 assert(simdTree->gtSIMDBaseType == TYP_INT && compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
2087
2088                 // No need to setInternalRegsDelayFree since targetReg is
2089                 // an int type reg and guaranteed to be different from xmm/ymm
2090                 // regs.
2091                 buildInternalFloatRegisterDefForNode(simdTree);
2092                 if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
2093                 {
2094                     buildInternalFloatRegisterDefForNode(simdTree);
2095                 }
2096             }
2097             break;
2098
2099         case SIMDIntrinsicGetItem:
2100         {
2101             // This implements get_Item method. The sources are:
2102             //  - the source SIMD struct
2103             //  - index (which element to get)
2104             // The result is baseType of SIMD struct.
2105             // op1 may be a contained memory op, but if so we will consume its address.
2106             // op2 may be a contained constant.
2107             op1 = simdTree->gtGetOp1();
2108             op2 = simdTree->gtGetOp2();
2109
2110             if (!op1->isContained())
2111             {
2112                 // If the index is not a constant, we will use the SIMD temp location to store the vector.
2113                 // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
2114                 // can use that in the process of extracting the element.
2115                 //
2116                 // If the index is a constant and base type is a small int we can use pextrw, but on AVX
2117                 // we will need a temp if we are indexing into the upper half of the AVX register.
2118                 // In all other cases with constant index, we need a temp xmm register to extract the
2119                 // element if index is other than zero.
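                     // For example, extracting element 2 of a Vector<int> with a constant index is
                     // roughly (a sketch):
                     //     pshufd xmmTmp, xmmSrc, 0x02   ; move the desired element into lane 0
                     //     movd   reg, xmmTmp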
2120
2121                 if (!op2->IsCnsIntOrI())
2122                 {
2123                     (void)compiler->getSIMDInitTempVarNum();
2124                 }
2125                 else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
2126                 {
2127                     bool needFloatTemp;
2128                     if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
2129                         (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported))
2130                     {
2131                         int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
2132                         needFloatTemp    = (byteShiftCnt >= 16);
2133                     }
2134                     else
2135                     {
2136                         needFloatTemp = !op2->IsIntegralConst(0);
2137                     }
2138
2139                     if (needFloatTemp)
2140                     {
2141                         buildInternalFloatRegisterDefForNode(simdTree);
2142                     }
2143                 }
2144 #ifdef _TARGET_X86_
2145                 // This logic is duplicated from genSIMDIntrinsicGetItem().
2146                 // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
2147                 // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
2148                 // cases will require this, so the non-byteable registers can be excluded.
2149
2150                 var_types baseType = simdTree->gtSIMDBaseType;
2151                 if (op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
2152                 {
2153                     bool     ZeroOrSignExtnReqd = true;
2154                     unsigned baseSize           = genTypeSize(baseType);
2155                     if (baseSize == 1)
2156                     {
2157                         if ((op2->gtIntCon.gtIconVal % 2) == 1)
2158                         {
2159                             ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
2160                         }
2161                     }
2162                     else
2163                     {
2164                         assert(baseSize == 2);
2165                         ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
2166                     }
2167                     if (ZeroOrSignExtnReqd)
2168                     {
2169                         dstCandidates = allByteRegs();
2170                     }
2171                 }
2172 #endif // _TARGET_X86_
2173             }
2174         }
2175         break;
2176
2177         case SIMDIntrinsicSetX:
2178         case SIMDIntrinsicSetY:
2179         case SIMDIntrinsicSetZ:
2180         case SIMDIntrinsicSetW:
2181             // We need an internal integer register for SSE2 codegen
2182             if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
2183             {
2184                 buildInternalIntRegisterDefForNode(simdTree);
2185             }
2186
2187             break;
2188
2189         case SIMDIntrinsicCast:
2190             break;
2191
2192         case SIMDIntrinsicConvertToSingle:
2193             if (simdTree->gtSIMDBaseType == TYP_UINT)
2194             {
2195                 // We need an internal register different from targetReg.
2196                 setInternalRegsDelayFree = true;
2197                 buildInternalFloatRegisterDefForNode(simdTree);
2198                 buildInternalFloatRegisterDefForNode(simdTree);
2199                 // We also need an integer register.
2200                 buildInternalIntRegisterDefForNode(simdTree);
2201             }
2202             break;
2203
2204         case SIMDIntrinsicConvertToInt32:
2205             break;
2206
2207         case SIMDIntrinsicWidenLo:
2208         case SIMDIntrinsicWidenHi:
2209             if (varTypeIsIntegral(simdTree->gtSIMDBaseType))
2210             {
2211                 // We need an internal register different from targetReg.
2212                 setInternalRegsDelayFree = true;
2213                 buildInternalFloatRegisterDefForNode(simdTree);
2214             }
2215             break;
2216
2217         case SIMDIntrinsicConvertToInt64:
2218             // We need an internal register different from targetReg.
2219             setInternalRegsDelayFree = true;
2220             buildInternalFloatRegisterDefForNode(simdTree);
2221             if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
2222             {
2223                 buildInternalFloatRegisterDefForNode(simdTree);
2224             }
2225             // We also need an integer register.
2226             buildInternalIntRegisterDefForNode(simdTree);
2227             break;
2228
2229         case SIMDIntrinsicConvertToDouble:
2230             // We need an internal register different from targetReg.
2231             setInternalRegsDelayFree = true;
2232             buildInternalFloatRegisterDefForNode(simdTree);
2233 #ifdef _TARGET_X86_
2234             if (simdTree->gtSIMDBaseType == TYP_LONG)
2235             {
2236                 buildInternalFloatRegisterDefForNode(simdTree);
2237                 buildInternalFloatRegisterDefForNode(simdTree);
2238             }
2239             else
2240 #endif
2241                 if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) || (simdTree->gtSIMDBaseType == TYP_ULONG))
2242             {
2243                 buildInternalFloatRegisterDefForNode(simdTree);
2244             }
2245             // We also need an integer register.
2246             buildInternalIntRegisterDefForNode(simdTree);
2247             break;
2248
2249         case SIMDIntrinsicNarrow:
2250             // We need an internal register different from targetReg.
2251             setInternalRegsDelayFree = true;
2252             buildInternalFloatRegisterDefForNode(simdTree);
2253             if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->gtSIMDBaseType != TYP_DOUBLE))
2254             {
2255                 buildInternalFloatRegisterDefForNode(simdTree);
2256             }
2257             break;
2258
2259         case SIMDIntrinsicShuffleSSE2:
2260             // Second operand is an integer constant and marked as contained.
2261             assert(simdTree->gtGetOp2()->isContainedIntOrIImmed());
2262             break;
2263
2264         case SIMDIntrinsicGetX:
2265         case SIMDIntrinsicGetY:
2266         case SIMDIntrinsicGetZ:
2267         case SIMDIntrinsicGetW:
2268         case SIMDIntrinsicGetOne:
2269         case SIMDIntrinsicGetZero:
2270         case SIMDIntrinsicGetCount:
2271         case SIMDIntrinsicGetAllOnes:
2272             assert(!"Get intrinsics should not be seen during Lowering.");
2273             unreached();
2274
2275         default:
2276             noway_assert(!"Unimplemented SIMD node type.");
2277             unreached();
2278     }
2279     if (buildUses)
2280     {
2281         assert(!op1->OperIs(GT_LIST));
2282         assert(srcCount == 0);
2283         // This is overly conservative, but is here for zero diffs.
2284         srcCount = BuildRMWUses(simdTree);
2285     }
2286     buildInternalRegisterUses();
2287     if (dstCount == 1)
2288     {
2289         BuildDef(simdTree, dstCandidates);
2290     }
2291     else
2292     {
2293         assert(dstCount == 0);
2294     }
2295     return srcCount;
2296 }
2297 #endif // FEATURE_SIMD
2298
2299 #ifdef FEATURE_HW_INTRINSICS
2300 //------------------------------------------------------------------------
2301 // BuildHWIntrinsic: Set the NodeInfo for a GT_HWIntrinsic tree.
2302 //
2303 // Arguments:
2304 //    intrinsicTree - The GT_HWIntrinsic node of interest
2305 //
2306 // Return Value:
2307 //    The number of sources consumed by this node.
2308 //
2309 int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
2310 {
2311     NamedIntrinsic      intrinsicId = intrinsicTree->gtHWIntrinsicId;
2312     var_types           baseType    = intrinsicTree->gtSIMDBaseType;
2313     InstructionSet      isa         = HWIntrinsicInfo::lookupIsa(intrinsicId);
2314     HWIntrinsicCategory category    = HWIntrinsicInfo::lookupCategory(intrinsicId);
2315     int                 numArgs     = HWIntrinsicInfo::lookupNumArgs(intrinsicTree);
2316
2317     // Set the AVX Flags if this instruction may use VEX encoding for SIMD operations.
2318     // Note that this may be true even if the ISA is not AVX (e.g. for platform-agnostic intrinsics
2319     // or non-AVX intrinsics that will use VEX encoding if it is available on the target).
2320     if (intrinsicTree->isSIMD())
2321     {
2322         SetContainsAVXFlags(intrinsicTree->gtSIMDSize);
2323     }
2324
2325     GenTree* op1    = intrinsicTree->gtGetOp1();
2326     GenTree* op2    = intrinsicTree->gtGetOp2();
2327     GenTree* op3    = nullptr;
2328     GenTree* lastOp = nullptr;
2329
2330     int srcCount = 0;
2331     int dstCount = intrinsicTree->IsValue() ? 1 : 0;
2332
2333     regMaskTP dstCandidates = RBM_NONE;
2334
2335     if (op1 == nullptr)
2336     {
2337         assert(op2 == nullptr);
2338         assert(numArgs == 0);
2339     }
2340     else
2341     {
2342         if (op1->OperIsList())
2343         {
2344             assert(op2 == nullptr);
2345             assert(numArgs >= 3);
2346
2347             GenTreeArgList* argList = op1->AsArgList();
2348
2349             op1     = argList->Current();
2350             argList = argList->Rest();
2351
2352             op2     = argList->Current();
2353             argList = argList->Rest();
2354
2355             op3 = argList->Current();
2356
2357             while (argList->Rest() != nullptr)
2358             {
2359                 argList = argList->Rest();
2360             }
2361
2362             lastOp  = argList->Current();
2363             argList = argList->Rest();
2364
2365             assert(argList == nullptr);
2366         }
2367         else if (op2 != nullptr)
2368         {
2369             assert(numArgs == 2);
2370             lastOp = op2;
2371         }
2372         else
2373         {
2374             assert(numArgs == 1);
2375             lastOp = op1;
2376         }
2377
2378         assert(lastOp != nullptr);
2379
2380         bool buildUses = true;
2381
2382         if ((category == HW_Category_IMM) && !HWIntrinsicInfo::NoJmpTableImm(intrinsicId))
2383         {
2384             if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && !lastOp->isContainedIntOrIImmed())
2385             {
2386                 assert(!lastOp->IsCnsIntOrI());
2387
2388                 // We need two extra registers when lastOp isn't a constant so
2389                 // that the offset into the jump table for the fallback path
2390                 // can be computed.
2391                 buildInternalIntRegisterDefForNode(intrinsicTree);
2392                 buildInternalIntRegisterDefForNode(intrinsicTree);
2393             }
2394         }
2395
2396         // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it
2397         // is not allocated the same register as the target.
2398         bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler);
2399
2400         // Create internal temps, and handle any other special requirements.
2401         // Note that the default case for building uses will handle the RMW flag, but if the uses
2402         // are built in the individual cases, buildUses is set to false, and any RMW handling (delayFree)
2403         // must be handled within the case.
2404         switch (intrinsicId)
2405         {
2406             case NI_Base_Vector128_CreateScalarUnsafe:
2407             case NI_Base_Vector128_ToScalar:
2408             case NI_Base_Vector256_CreateScalarUnsafe:
2409             case NI_Base_Vector256_ToScalar:
2410             {
2411                 assert(numArgs == 1);
2412
2413                 if (varTypeIsFloating(baseType))
2414                 {
2415                     if (op1->isContained())
2416                     {
2417                         srcCount += BuildOperandUses(op1);
2418                     }
2419                     else
2420                     {
2421                         // Either the operand is in memory and needs to be moved
2422                         // into a register of the appropriate size, or it is
2423                         // already in an XMM/YMM register and can stay
2424                         // where it is.
2425
2426                         tgtPrefUse = BuildUse(op1);
2427                         srcCount += 1;
2428                     }
2429
2430                     buildUses = false;
2431                 }
2432                 break;
2433             }
2434
2435             case NI_Base_Vector128_ToVector256:
2436             case NI_Base_Vector128_ToVector256Unsafe:
2437             case NI_Base_Vector256_GetLower:
2438             {
2439                 assert(numArgs == 1);
2440
2441                 if (op1->isContained())
2442                 {
2443                     srcCount += BuildOperandUses(op1);
2444                 }
2445                 else
2446                 {
2447                     // Either the operand is in memory and needs to be moved
2448                     // into a register of the appropriate size, or it is
2449                     // already in an XMM/YMM register and can stay
2450                     // where it is.
2451
2452                     tgtPrefUse = BuildUse(op1);
2453                     srcCount += 1;
2454                 }
2455
2456                 buildUses = false;
2457                 break;
2458             }
2459
2460             case NI_SSE_CompareEqualOrderedScalar:
2461             case NI_SSE_CompareEqualUnorderedScalar:
2462             case NI_SSE_CompareNotEqualOrderedScalar:
2463             case NI_SSE_CompareNotEqualUnorderedScalar:
2464             case NI_SSE2_CompareEqualOrderedScalar:
2465             case NI_SSE2_CompareEqualUnorderedScalar:
2466             case NI_SSE2_CompareNotEqualOrderedScalar:
2467             case NI_SSE2_CompareNotEqualUnorderedScalar:
2468             {
2469                 buildInternalIntRegisterDefForNode(intrinsicTree, allByteRegs());
2470                 setInternalRegsDelayFree = true;
2471                 break;
2472             }
2473
2474             case NI_SSE2_MaskMove:
2475             {
2476                 assert(numArgs == 3);
2477                 assert(!isRMW);
2478
2479                 // MaskMove hardcodes the destination (op3) in DI/EDI/RDI
2480                 srcCount += BuildOperandUses(op1);
2481                 srcCount += BuildOperandUses(op2);
2482                 srcCount += BuildOperandUses(op3, RBM_EDI);
2483
2484                 buildUses = false;
2485                 break;
2486             }
2487
2488             case NI_SSE41_BlendVariable:
2489             {
2490                 assert(numArgs == 3);
2491
2492                 if (!compiler->canUseVexEncoding())
2493                 {
2494                     assert(isRMW);
2495
2496                     // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
2497                     srcCount += BuildOperandUses(op1);
2498                     srcCount += BuildDelayFreeUses(op2);
2499                     srcCount += BuildDelayFreeUses(op3, RBM_XMM0);
2500
2501                     buildUses = false;
2502                 }
2503                 break;
2504             }
2505
2506             case NI_SSE41_TestAllOnes:
2507             {
2508                 buildInternalFloatRegisterDefForNode(intrinsicTree);
2509                 break;
2510             }
2511
2512             case NI_SSE41_Extract:
2513             {
2514                 if (baseType == TYP_FLOAT)
2515                 {
2516                     buildInternalIntRegisterDefForNode(intrinsicTree);
2517                 }
2518 #ifdef _TARGET_X86_
2519                 else if (varTypeIsByte(baseType))
2520                 {
2521                     dstCandidates = allByteRegs();
2522                 }
2523 #endif
2524                 break;
2525             }
2526
2527 #ifdef _TARGET_X86_
2528             case NI_SSE42_Crc32:
2529             case NI_SSE42_X64_Crc32:
2530             {
2531                 // TODO-XArch-Cleanup: Currently we use the BaseType to convey the type of the second argument
2532                 // to the code generator. We may want to encode the overload info in another way.
2533
2534                 assert(numArgs == 2);
2535                 assert(isRMW);
2536
2537                 // CRC32 may operate over "byte" but on x86 only RBM_BYTE_REGS can be used as byte registers.
2538                 srcCount += BuildOperandUses(op1);
2539                 srcCount += BuildDelayFreeUses(op2, varTypeIsByte(baseType) ? allByteRegs() : RBM_NONE);
2540
2541                 buildUses = false;
2542                 break;
2543             }
2544 #endif // _TARGET_X86_
2545
2546             case NI_BMI2_MultiplyNoFlags:
2547             case NI_BMI2_X64_MultiplyNoFlags:
2548             {
2549                 assert(numArgs == 2 || numArgs == 3);
2550                 srcCount += BuildOperandUses(op1, RBM_EDX);
2551                 srcCount += BuildOperandUses(op2);
2552                 if (numArgs == 3)
2553                 {
2554                     // op3's reg should be different from the target reg so that we can
2555                     // store the lower half result after executing the instruction.
2556                     srcCount += BuildDelayFreeUses(op3);
2557                     // Need an internal register different from the dst to take the lower half result.
2558                     buildInternalIntRegisterDefForNode(intrinsicTree);
2559                     setInternalRegsDelayFree = true;
2560                 }
2561                 buildUses = false;
2562                 break;
2563             }
2564
2565             case NI_FMA_MultiplyAdd:
2566             case NI_FMA_MultiplyAddNegated:
2567             case NI_FMA_MultiplyAddNegatedScalar:
2568             case NI_FMA_MultiplyAddScalar:
2569             case NI_FMA_MultiplyAddSubtract:
2570             case NI_FMA_MultiplySubtract:
2571             case NI_FMA_MultiplySubtractAdd:
2572             case NI_FMA_MultiplySubtractNegated:
2573             case NI_FMA_MultiplySubtractNegatedScalar:
2574             case NI_FMA_MultiplySubtractScalar:
2575             {
2576                 assert(numArgs == 3);
2577                 assert(isRMW);
2578
2579                 const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);
2580
2581                 // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
2582                 assert(!copiesUpperBits || !op1->isContained());
2583
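                     // The 132/213/231 numbers in the comments below follow the FMA instruction forms
                     // (e.g. vfmadd213ss dst, src2, src3 computes dst = src2 * dst + src3); they are a
                     // reading aid here -- the actual instruction is selected later by codegen.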
2584                 if (op3->isContained())
2585                 {
2586                     // 213 form: op1 = (op2 * op1) + [op3]
2587
2588                     if (copiesUpperBits)
2589                     {
2590                         tgtPrefUse = BuildUse(op1);
2591
2592                         srcCount += 1;
2593                         srcCount += BuildDelayFreeUses(op2);
2594                     }
2595                     else
2596                     {
2597                         // op1 and op2 are commutative, so don't
2598                         // set either to be tgtPref or delayFree
2599
2600                         srcCount += BuildOperandUses(op1);
2601                         srcCount += BuildOperandUses(op2);
2602                     }
2603
2604                     srcCount += BuildOperandUses(op3);
2605                 }
2606                 else if (op2->isContained())
2607                 {
2608                     // 132 form: op1 = (op1 * op3) + [op2]
2609
2610                     tgtPrefUse = BuildUse(op1);
2611
2612                     srcCount += 1;
2613                     srcCount += BuildOperandUses(op2);
2614                     srcCount += BuildDelayFreeUses(op3);
2615                 }
2616                 else if (op1->isContained())
2617                 {
2618                     // 231 form: op3 = (op2 * op3) + [op1]
2619
2620                     tgtPrefUse = BuildUse(op3);
2621
2622                     srcCount += BuildOperandUses(op1);
2623                     srcCount += BuildDelayFreeUses(op2);
2624                     srcCount += 1;
2625                 }
2626                 else
2627                 {
2628                     // 213 form: op1 = (op2 * op1) + op3
2629
2630                     if (copiesUpperBits)
2631                     {
2632                         tgtPrefUse = BuildUse(op1);
2633
2634                         srcCount += 1;
2635                         srcCount += BuildDelayFreeUses(op2);
2636                     }
2637                     else
2638                     {
2639                         // op1 and op2 are commutative, so don't
2640                         // set either to be tgtPref or delayFree
2641
2642                         srcCount += BuildOperandUses(op1);
2643                         srcCount += BuildOperandUses(op2);
2644                     }
2645
2646                     srcCount += BuildDelayFreeUses(op3);
2647                 }
2648
2649                 buildUses = false;
2650                 break;
2651             }
2652
2653             case NI_AVX2_GatherVector128:
2654             case NI_AVX2_GatherVector256:
2655             {
2656                 assert(numArgs == 3);
2657                 // Any pair of the index, mask, or destination registers should be different
2658                 srcCount += BuildOperandUses(op1);
2659                 srcCount += BuildDelayFreeUses(op2);
2660
2661                 // Get a tmp register for the mask; it will be cleared by the gather instruction.
2662                 buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs());
2663                 setInternalRegsDelayFree = true;
2664
2665                 buildUses = false;
2666                 break;
2667             }
2668
2669             case NI_AVX2_GatherMaskVector128:
2670             case NI_AVX2_GatherMaskVector256:
2671             {
2672                 assert(numArgs == 5);
2673                 // Any pair of the index, mask, or destination registers should be different
2674                 srcCount += BuildOperandUses(op1);
2675                 srcCount += BuildOperandUses(op2);
2676                 srcCount += BuildDelayFreeUses(op3);
2677
2678                 assert(intrinsicTree->gtGetOp1()->OperIsList());
2679                 GenTreeArgList* argList = intrinsicTree->gtGetOp1()->AsArgList();
2680                 GenTree*        op4     = argList->Rest()->Rest()->Rest()->Current();
2681                 srcCount += BuildDelayFreeUses(op4);
2682
2683                 // Get a tmp register for the mask; it will be cleared by the gather instruction.
2684                 buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs());
2685                 setInternalRegsDelayFree = true;
2686
2687                 buildUses = false;
2688                 break;
2689             }
2690
2691             default:
2692             {
2693                 assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END));
2694                 break;
2695             }
2696         }
2697
2698         if (buildUses)
2699         {
2700             assert((numArgs > 0) && (numArgs < 4));
2701
2702             srcCount += BuildOperandUses(op1);
2703
2704             if (op2 != nullptr)
2705             {
2706                 srcCount += (isRMW) ? BuildDelayFreeUses(op2) : BuildOperandUses(op2);
2707
2708                 if (op3 != nullptr)
2709                 {
2710                     srcCount += (isRMW) ? BuildDelayFreeUses(op3) : BuildOperandUses(op3);
2711                 }
2712             }
2713         }
2714
2715         buildInternalRegisterUses();
2716     }
2717
2718     if (dstCount == 1)
2719     {
2720         BuildDef(intrinsicTree, dstCandidates);
2721     }
2722     else
2723     {
2724         assert(dstCount == 0);
2725     }
2726
2727     return srcCount;
2728 }
2729 #endif
2730
2731 //------------------------------------------------------------------------
2732 // BuildCast: Set the NodeInfo for a GT_CAST.
2733 //
2734 // Arguments:
2735 //    cast - The GT_CAST node
2736 //
2737 // Return Value:
2738 //    The number of sources consumed by this node.
2739 //
2740 int LinearScan::BuildCast(GenTreeCast* cast)
2741 {
2742     GenTree* src = cast->gtGetOp1();
2743
2744     const var_types srcType  = genActualType(src->TypeGet());
2745     const var_types castType = cast->gtCastType;
2746
2747     regMaskTP candidates = RBM_NONE;
2748 #ifdef _TARGET_X86_
2749     if (varTypeIsByte(castType))
2750     {
2751         candidates = allByteRegs();
2752     }
2753
2754     assert(!varTypeIsLong(srcType) || (src->OperIs(GT_LONG) && src->isContained()));
2755 #else
2756     // Overflow checking cast from TYP_(U)LONG to TYP_UINT requires a temporary
2757     // register to extract the upper 32 bits of the 64 bit source register.
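         // A sketch of the check codegen emits (illustrative only):
         //     mov  tmp, src
         //     shr  tmp, 32      ; any bits in the upper half mean the value does not fit in a UINT
         //     jne  -> throw OverflowException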
2758     if (cast->gtOverflow() && varTypeIsLong(srcType) && (castType == TYP_UINT))
2759     {
2760         // Here we don't need the internal register to be different from targetReg;
2761         // rather, we require it to be different from the operand's reg.
2762         buildInternalIntRegisterDefForNode(cast);
2763     }
2764 #endif
2765
2766     int srcCount = BuildOperandUses(src, candidates);
2767     buildInternalRegisterUses();
2768     BuildDef(cast, candidates);
2769     return srcCount;
2770 }
2771
2772 //-----------------------------------------------------------------------------------------
2773 // BuildIndir: Specify register requirements for address expression of an indirection operation.
2774 //
2775 // Arguments:
2776 //    indirTree    -   GT_IND or GT_STOREIND gentree node
2777 //
2778 // Return Value:
2779 //    The number of sources consumed by this node.
2780 //
2781 int LinearScan::BuildIndir(GenTreeIndir* indirTree)
2782 {
2783     // If this is the rhs of a block copy (i.e. non-enregisterable struct),
2784     // it has no register requirements.
2785     if (indirTree->TypeGet() == TYP_STRUCT)
2786     {
2787         return 0;
2788     }
2789
2790 #ifdef FEATURE_SIMD
2791     RefPosition* internalFloatDef = nullptr;
2792     if (indirTree->TypeGet() == TYP_SIMD12)
2793     {
2794         // If indirTree is of TYP_SIMD12, addr is not contained. See comment in LowerIndir().
2795         assert(!indirTree->Addr()->isContained());
2796
2797         // Vector3 is read/written as two reads/writes: 8 byte and 4 byte.
2798         // To assemble the vector properly we would need an additional
2799         // XMM register.
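             // e.g. a TYP_SIMD12 load is roughly (a sketch):
             //     movsd xmmDst, [addr]        ; lower 8 bytes
             //     movss xmmTmp, [addr + 8]    ; upper 4 bytes
             //     ...followed by a shuffle to combine them, hence the extra XMM register.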
2800         internalFloatDef = buildInternalFloatRegisterDefForNode(indirTree);
2801
2802         // In case of GT_IND we need an internal register different from targetReg and
2803         // both of the registers are used at the same time.
2804         if (indirTree->OperGet() == GT_IND)
2805         {
2806             setInternalRegsDelayFree = true;
2807         }
2808     }
2809 #endif // FEATURE_SIMD
2810
2811     regMaskTP indirCandidates = RBM_NONE;
2812     int       srcCount        = BuildIndirUses(indirTree, indirCandidates);
2813     if (indirTree->gtOper == GT_STOREIND)
2814     {
2815         GenTree* source = indirTree->gtGetOp2();
2816         if (indirTree->AsStoreInd()->IsRMWMemoryOp())
2817         {
2818             // Because 'source' is contained, we haven't yet determined its special register requirements, if any.
2819             // As it happens, the Shift or Rotate cases are the only ones with special requirements.
2820             assert(source->isContained() && source->OperIsRMWMemOp());
2821             GenTree*      nonMemSource = nullptr;
2822             GenTreeIndir* otherIndir   = nullptr;
2823
2824             if (source->OperIsShiftOrRotate())
2825             {
2826                 srcCount += BuildShiftRotate(source);
2827             }
2828             else
2829             {
2830                 regMaskTP srcCandidates = RBM_NONE;
2831
2832 #ifdef _TARGET_X86_
2833                 // Determine if we need byte regs for the non-mem source, if any.
2834                 // Note that BuildShiftRotate (above) will handle the byte requirement as needed,
2835                 // but STOREIND isn't itself an RMW op, so we have to explicitly set it for that case.
2836
2838
2839                 if (indirTree->AsStoreInd()->IsRMWDstOp1())
2840                 {
2841                     otherIndir = source->gtGetOp1()->AsIndir();
2842                     if (source->OperIsBinary())
2843                     {
2844                         nonMemSource = source->gtGetOp2();
2845                     }
2846                 }
2847                 else if (indirTree->AsStoreInd()->IsRMWDstOp2())
2848                 {
2849                     otherIndir   = source->gtGetOp2()->AsIndir();
2850                     nonMemSource = source->gtGetOp1();
2851                 }
2852                 if ((nonMemSource != nullptr) && !nonMemSource->isContained() && varTypeIsByte(indirTree))
2853                 {
2854                     srcCandidates = RBM_BYTE_REGS;
2855                 }
2856 #endif
2857                 if (otherIndir != nullptr)
2858                 {
2859                     // Any lclVars in the addressing mode of this indirection are contained.
2860                     // If they are marked as lastUse, transfer the last use flag to the store indir.
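                    // (The contained indirection's operands do not get RefPositions of their own,
                    // so a last-use flag left on them would otherwise be lost.)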
2861                     GenTree* base    = otherIndir->Base();
2862                     GenTree* dstBase = indirTree->Base();
2863                     CheckAndMoveRMWLastUse(base, dstBase);
2864                     GenTree* index    = otherIndir->Index();
2865                     GenTree* dstIndex = indirTree->Index();
2866                     CheckAndMoveRMWLastUse(index, dstIndex);
2867                 }
2868                 srcCount += BuildBinaryUses(source->AsOp(), srcCandidates);
2869             }
2870         }
2871         else
2872         {
2873 #ifdef _TARGET_X86_
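            // On x86, only EAX/EBX/ECX/EDX have byte-addressable forms (AL/BL/CL/DL), so the
            // value stored by a byte-sized STOREIND must be allocated to one of those registers.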
2874             if (varTypeIsByte(indirTree) && !source->isContained())
2875             {
2876                 BuildUse(source, allByteRegs());
2877                 srcCount++;
2878             }
2879             else
2880 #endif
2881             {
2882                 srcCount += BuildOperandUses(source);
2883             }
2884         }
2885     }
2886 #ifdef FEATURE_SIMD
2887     if (varTypeIsSIMD(indirTree))
2888     {
2889         SetContainsAVXFlags(genTypeSize(indirTree->TypeGet()));
2890     }
2891     buildInternalRegisterUses();
2892 #endif // FEATURE_SIMD
2893
2894     if (indirTree->gtOper != GT_STOREIND)
2895     {
2896         BuildDef(indirTree);
2897     }
2898     return srcCount;
2899 }
2900
2901 //------------------------------------------------------------------------
2902 // BuildMul: Set register requirements for a multiply operation.
2903 //
2904 // Arguments:
2905 //    tree      - The node of interest
2906 //
2907 // Return Value:
2908 //    The number of sources consumed by this node.
2909 //
2910 int LinearScan::BuildMul(GenTree* tree)
2911 {
2912     assert(tree->OperIsMul());
2913     GenTree* op1 = tree->gtGetOp1();
2914     GenTree* op2 = tree->gtGetOp2();
2915
2916     // Only non-floating point mul has special requirements
2917     if (varTypeIsFloating(tree->TypeGet()))
2918     {
2919         return BuildSimple(tree);
2920     }
2921
2922     int       srcCount      = BuildBinaryUses(tree->AsOp());
2923     int       dstCount      = 1;
2924     regMaskTP dstCandidates = RBM_NONE;
2925
2926     bool isUnsignedMultiply    = ((tree->gtFlags & GTF_UNSIGNED) != 0);
2927     bool requiresOverflowCheck = tree->gtOverflowEx();
2928
2929     // There are three forms of x86 multiply:
2930     // one-op form:     RDX:RAX = RAX * r/m
2931     // two-op form:     reg *= r/m
2932     // three-op form:   reg = r/m * imm
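    // For example (illustrative only):
    //   one-op:    mul  rcx           ; RDX:RAX = RAX * RCX
    //   two-op:    imul rax, rcx      ; RAX = RAX * RCX
    //   three-op:  imul rax, rcx, 17  ; RAX = RCX * 17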
2933
2934     // This special widening 32x32->64 MUL is not used on x64
2935     CLANG_FORMAT_COMMENT_ANCHOR;
2936 #if defined(_TARGET_X86_)
2937     if (tree->OperGet() != GT_MUL_LONG)
2938 #endif
2939     {
2940         assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
2941     }
2942
2943     // We do use the widening multiply to implement
2944     // the overflow checking for unsigned multiply
2945     //
2946     if (isUnsignedMultiply && requiresOverflowCheck)
2947     {
2948         // The only encoding provided is RDX:RAX = RAX * rm
2949         //
2950         // Here we set RAX as the only destination candidate
2951         // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX
2952         //
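        // (For reference: the hardware MUL leaves the upper half of the product in RDX and sets
        // CF/OF when that half is non-zero, which is what the generated overflow check tests.)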
2953         dstCandidates = RBM_RAX;
2954     }
2955     else if (tree->OperGet() == GT_MULHI)
2956     {
2957         // We have to use the encoding RDX:RAX = RAX * rm. Since we only care about the
2958         // upper half of the result, set the destination candidate to RBM_RDX.
2959         dstCandidates = RBM_RDX;
2960     }
2961 #if defined(_TARGET_X86_)
2962     else if (tree->OperGet() == GT_MUL_LONG)
2963     {
2964         // We have to use the encoding RDX:RAX = RAX * rm.
2965         dstCandidates = RBM_RAX | RBM_RDX;
2966         dstCount      = 2;
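        // Two defs are needed because the 64-bit product is produced as two 32-bit halves:
        // the low half in EAX and the high half in EDX.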
2967     }
2968 #endif
2969     GenTree* containedMemOp = nullptr;
2970     if (op1->isContained() && !op1->IsCnsIntOrI())
2971     {
2972         assert(!op2->isContained() || op2->IsCnsIntOrI());
2973         containedMemOp = op1;
2974     }
2975     else if (op2->isContained() && !op2->IsCnsIntOrI())
2976     {
2977         containedMemOp = op2;
2978     }
2979     regMaskTP killMask = getKillSetForMul(tree->AsOp());
2980     BuildDefsWithKills(tree, dstCount, dstCandidates, killMask);
2981     return srcCount;
2982 }
2983
2984 //------------------------------------------------------------------------------
2985 // SetContainsAVXFlags: Set the ContainsAVX flag when the VEX encoding can be used, and
2986 // also set the Contains256bitAVX flag when the SIMD vector size is 32 bytes.
2987 //
2988 // Arguments:
2989 //    sizeOfSIMDVector      - SIMD vector size in bytes; defaults to 0 when there is
2990 //                            no SIMD vector size to report
2991 //
2992 void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0*/)
2993 {
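    // Note: these are sticky, per-method emitter flags; they are consulted downstream, for
    // example when deciding whether vzeroupper is needed to avoid AVX-SSE transition penalties.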
2994     if (compiler->canUseVexEncoding())
2995     {
2996         compiler->getEmitter()->SetContainsAVX(true);
2997         if (sizeOfSIMDVector == 32)
2998         {
2999             compiler->getEmitter()->SetContains256bitAVX(true);
3000         }
3001     }
3002 }
3003
3004 #endif // _TARGET_XARCH_