1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
8 XX Register Requirements for AMD64 XX
10 XX This encapsulates all the logic for setting register requirements for XX
11 XX the AMD64 architecture. XX
14 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
15 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
26 #include "sideeffects.h"
29 //------------------------------------------------------------------------
30 // BuildNode: Build the RefPositions for a node
33 //    tree - the node of interest
36 //    The number of sources consumed by this node.
40 //    LSRA has been initialized.
43 //    RefPositions have been built for all the register defs and uses required
44 //    for this node.
46 int LinearScan::BuildNode(GenTree* tree)
48 assert(!tree->isContained());
49 Interval* prefSrcInterval = nullptr;
52 regMaskTP dstCandidates = RBM_NONE;
53 regMaskTP killMask = RBM_NONE;
54 bool isLocalDefUse = false;
56 // Reset the build-related members of LinearScan.
59 // Set the default dstCount. This may be modified below.
63 if (tree->IsUnusedValue())
73 // A floating-point type generates AVX instructions (vmovss etc.), so set the flag.
74 if (varTypeIsFloating(tree->TypeGet()))
76 SetContainsAVXFlags();
79 switch (tree->OperGet())
82 srcCount = BuildSimple(tree);
86 // Because we do containment analysis before we redo dataflow and identify register
87 // candidates, the containment analysis only uses !lvDoNotEnregister to estimate register
88 // candidates.
89 // If there is a lclVar that is estimated to be a register candidate but
90 // is not, and it was marked regOptional, it should now be marked contained instead.
91 // TODO-XArch-CQ: When this is being called while RefPositions are being created,
92 // use lvLRACandidate here instead.
93 if (tree->IsRegOptional())
95 if (!compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvTracked ||
96 compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvDoNotEnregister)
98 tree->ClearRegOptional();
100 INDEBUG(dumpNodeInfo(tree, dstCandidates, 0, 0));
108 // We handle tracked variables differently from non-tracked ones. If it is tracked,
109 // we will simply add a use of the tracked variable at its parent/consumer.
110 // Otherwise, for a use we need to actually add the appropriate references for loading
111 // or storing the variable.
113 // A tracked variable won't actually get used until the appropriate ancestor tree node
114 // is processed, unless this is marked "isLocalDefUse" because it is a stack-based argument
115 // to a call or an orphaned dead node.
117 LclVarDsc* const varDsc = &compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum];
118 if (isCandidateVar(varDsc))
120 INDEBUG(dumpNodeInfo(tree, dstCandidates, 0, 1));
125 // Need an additional register to read upper 4 bytes of Vector3.
126 if (tree->TypeGet() == TYP_SIMD12)
128 // We need an internal register different from the targetReg in which 'tree' produces its result,
129 // because both the targetReg and the internal reg will be in use at the same time.
130 buildInternalFloatRegisterDefForNode(tree, allSIMDRegs());
131 setInternalRegsDelayFree = true;
132 buildInternalRegisterUses();
139 case GT_STORE_LCL_FLD:
140 case GT_STORE_LCL_VAR:
141 srcCount = BuildStoreLoc(tree->AsLclVarCommon());
145 // These should always be contained. We don't correctly allocate or
146 // generate code for a non-contained GT_FIELD_LIST.
147 noway_assert(!"Non-contained GT_FIELD_LIST");
156 assert(dstCount == 0);
159 case GT_START_PREEMPTGC:
160 // This kills GC refs in callee save regs
162 assert(dstCount == 0);
163 BuildDefsWithKills(tree, 0, RBM_NONE, RBM_NONE);
168 assert(dstCount == 0);
169 killMask = getKillSetForProfilerHook();
170 BuildDefsWithKills(tree, 0, RBM_NONE, killMask);
178 assert(dstCount == 1);
179 assert(!tree->IsReuseRegVal());
180 RefPosition* def = BuildDef(tree);
181 def->getInterval()->isConstant = true;
185 #if !defined(_TARGET_64BIT_)
188 assert(tree->IsUnusedValue()); // Contained nodes are already processed, only unused GT_LONG can reach here.
189 // An unused GT_LONG node needs to consume its sources, but need not produce a register.
190 tree->gtType = TYP_VOID;
191 tree->ClearUnusedValue();
192 isLocalDefUse = false;
195 BuildUse(tree->gtGetOp1());
196 BuildUse(tree->gtGetOp2());
199 #endif // !defined(_TARGET_64BIT_)
210 srcCount = BuildReturn(tree);
211 killMask = getKillSetForReturn();
212 BuildDefsWithKills(tree, 0, RBM_NONE, killMask);
216 assert(dstCount == 0);
217 if (tree->TypeGet() == TYP_VOID)
223 assert(tree->TypeGet() == TYP_INT);
225 BuildUse(tree->gtGetOp1(), RBM_INTRET);
229 // A GT_NOP is a passthrough if it is void or if it has a child, but it
230 // must be considered to produce a dummy value if it has a type but no
231 // child.
234 assert((tree->gtGetOp1() == nullptr) || tree->isContained());
235 if (tree->TypeGet() != TYP_VOID && tree->gtGetOp1() == nullptr)
237 assert(dstCount == 1);
238 BuildUse(tree->gtGetOp1());
243 assert(dstCount == 0);
250 assert(dstCount == 0);
251 GenTree* cmp = tree->gtGetOp1();
252 assert(!cmp->IsValue());
258 assert(dstCount == 0);
263 assert(dstCount == 1);
264 // This defines a byte value (note that on x64 allByteRegs() is defined as RBM_ALLINT).
265 BuildDef(tree, allByteRegs());
270 assert(dstCount == 0);
274 // This should never occur since switch nodes must not be visible at this
275 // point in the JIT.
277 noway_assert(!"Switch must be lowered at this point");
282 assert(dstCount == 1);
286 case GT_SWITCH_TABLE:
288 assert(dstCount == 0);
289 buildInternalIntRegisterDefForNode(tree);
290 srcCount = BuildBinaryUses(tree->AsOp());
291 buildInternalRegisterUses();
292 assert(srcCount == 2);
297 noway_assert(!"We should never hit any assignment operator in lowering");
301 #if !defined(_TARGET_64BIT_)
312 srcCount = BuildBinaryUses(tree->AsOp());
313 assert(dstCount == 1);
318 srcCount = BuildBinaryUses(tree->AsOp());
319 assert(dstCount == 0);
324 // This just turns into a compare of its child with an int + a conditional call.
325 RefPosition* internalDef = buildInternalIntRegisterDefForNode(tree);
326 srcCount = BuildOperandUses(tree->gtGetOp1());
327 buildInternalRegisterUses();
328 killMask = compiler->compHelperCallKillSet(CORINFO_HELP_STOP_FOR_GC);
329 BuildDefsWithKills(tree, 0, RBM_NONE, killMask);
337 srcCount = BuildModDiv(tree->AsOp());
340 #if defined(_TARGET_X86_)
347 srcCount = BuildMul(tree->AsOp());
351 srcCount = BuildIntrinsic(tree->AsOp());
356 srcCount = BuildSIMD(tree->AsSIMD());
358 #endif // FEATURE_SIMD
360 #ifdef FEATURE_HW_INTRINSICS
362 srcCount = BuildHWIntrinsic(tree->AsHWIntrinsic());
364 #endif // FEATURE_HW_INTRINSICS
367 assert(dstCount == 1);
368 srcCount = BuildCast(tree->AsCast());
373 assert(dstCount == 1);
374 tgtPrefUse = BuildUse(tree->gtGetOp1());
382 // SSE instruction set doesn't have an instruction to negate a number.
383 // The recommended way is to xor the float/double number with a bitmask.
384 // The only way to xor is using xorps or xorpd both of which operate on
385 // 128-bit operands. To hold the bit-mask we would need another xmm
386 // register or a 16-byte aligned 128-bit data constant. Right now the emitter
387 // lacks support for emitting such constants or an instruction with a memory
388 // addressing mode referring to a 128-bit operand. For now we use an
389 // internal xmm register to load 32/64-bit bitmask from data section.
390 // Note that by trading additional data section memory (128-bit) we can
391 // save on the need for an internal register and also a memory-to-reg
392 // move.
394 // Note: another option to avoid internal register requirement is by
395 // lowering as GT_SUB(0, src). This will generate code different from
396 // Jit64 and could possibly result in compat issues (?).
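// For illustration, the expected code for a float negate is roughly (a sketch,
// not the exact encoding; the sign-bit mask lives in the data section):
//     movss xmm1, dword ptr [mask_0x80000000]  ; load the bitmask into the internal register
//     xorps xmm0, xmm1                         ; flip the sign bit of the operand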
397 if (varTypeIsFloating(tree))
400 RefPosition* internalDef = buildInternalFloatRegisterDefForNode(tree, internalFloatRegCandidates());
401 srcCount = BuildOperandUses(tree->gtGetOp1());
402 buildInternalRegisterUses();
406 srcCount = BuildOperandUses(tree->gtGetOp1());
412 srcCount = BuildOperandUses(tree->gtGetOp1());
425 srcCount = BuildShiftRotate(tree);
437 srcCount = BuildCmp(tree);
442 assert(dstCount == 1);
443 RefPosition* internalDef = buildInternalIntRegisterDefForNode(tree);
444 srcCount = BuildOperandUses(tree->gtGetOp1());
445 buildInternalRegisterUses();
453 assert(dstCount == 1);
455 // Comparand is preferenced to RAX.
456 // The remaining two operands can be in any reg other than RAX.
457 BuildUse(tree->gtCmpXchg.gtOpLocation, allRegs(TYP_INT) & ~RBM_RAX);
458 BuildUse(tree->gtCmpXchg.gtOpValue, allRegs(TYP_INT) & ~RBM_RAX);
459 BuildUse(tree->gtCmpXchg.gtOpComparand, RBM_RAX);
460 BuildDef(tree, RBM_RAX);
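// For illustration, the expected sequence is roughly:
//     mov  rax, comparand
//     lock cmpxchg [location], value
// cmpxchg implicitly compares RAX with [location] and, on failure, loads the
// original memory value into RAX, which is why both the comparand use and the
// def above are constrained to RBM_RAX.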
467 // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
468 // to special case them.
469 // These tree nodes will have their op1 marked as isDelayFree=true.
470 // That is, op1's reg remains in use until the subsequent instruction.
471 GenTree* addr = tree->gtGetOp1();
472 GenTree* data = tree->gtGetOp2();
473 assert(!addr->isContained());
474 RefPosition* addrUse = BuildUse(addr);
475 setDelayFree(addrUse);
476 tgtPrefUse = addrUse;
477 assert(!data->isContained());
480 assert(dstCount == 1);
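// For illustration, the GT_XADD case is emitted roughly as:
//     lock xadd [addrReg], dataReg   ; dataReg receives the original memory value
// The address register is still being read when the data register is written,
// which is why the address use is marked delay-free above.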
486 srcCount = BuildPutArgReg(tree->AsUnOp());
490 srcCount = BuildCall(tree->AsCall());
491 if (tree->AsCall()->HasMultiRegRetVal())
493 dstCount = tree->AsCall()->GetReturnTypeDesc()->GetReturnRegCount();
499 // For a GT_ADDR, the child node should not be evaluated into a register
500 GenTree* child = tree->gtGetOp1();
501 assert(!isCandidateLocalRef(child));
502 assert(child->isContained());
503 assert(dstCount == 1);
508 #if !defined(FEATURE_PUT_STRUCT_ARG_STK)
513 // These should all be eliminated prior to Lowering.
514 assert(!"Non-store block node in Lowering");
518 #ifdef FEATURE_PUT_STRUCT_ARG_STK
520 srcCount = BuildPutArgStk(tree->AsPutArgStk());
522 #endif // FEATURE_PUT_STRUCT_ARG_STK
526 case GT_STORE_DYN_BLK:
527 srcCount = BuildBlockStore(tree->AsBlk());
531 // Always a passthrough of its child's value.
532 assert(!"INIT_VAL should always be contained");
537 srcCount = BuildLclHeap(tree);
540 case GT_ARR_BOUNDS_CHECK:
543 #endif // FEATURE_SIMD
544 #ifdef FEATURE_HW_INTRINSICS
545 case GT_HW_INTRINSIC_CHK:
546 #endif // FEATURE_HW_INTRINSICS
548 // Consumes arrLen & index - has no result
550 assert(dstCount == 0);
551 srcCount = BuildOperandUses(tree->AsBoundsChk()->gtIndex);
552 srcCount += BuildOperandUses(tree->AsBoundsChk()->gtArrLen);
556 // These must have been lowered to GT_ARR_INDEX
557 noway_assert(!"We should never see a GT_ARR_ELEM after Lowering.");
564 assert(dstCount == 1);
565 assert(!tree->AsArrIndex()->ArrObj()->isContained());
566 assert(!tree->AsArrIndex()->IndexExpr()->isContained());
567 // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
568 // times while the result is being computed.
569 RefPosition* arrObjUse = BuildUse(tree->AsArrIndex()->ArrObj());
570 setDelayFree(arrObjUse);
571 BuildUse(tree->AsArrIndex()->IndexExpr());
578 // This consumes the offset, if any, the arrObj and the effective index,
579 // and produces the flattened offset for this dimension.
580 assert(dstCount == 1);
582 RefPosition* internalDef = nullptr;
583 if (tree->gtArrOffs.gtOffset->isContained())
589 // Here we simply need an internal register, which must be different
590 // from any of the operand's registers, but may be the same as targetReg.
592 internalDef = buildInternalIntRegisterDefForNode(tree);
593 BuildUse(tree->AsArrOffs()->gtOffset);
595 BuildUse(tree->AsArrOffs()->gtIndex);
596 BuildUse(tree->AsArrOffs()->gtArrObj);
597 if (internalDef != nullptr)
599 buildInternalRegisterUses();
606 // The LEA usually passes its operands through to the GT_IND, in which case it will
607 // be contained, but we may be instantiating an address, in which case we set them here.
609 assert(dstCount == 1);
610 if (tree->AsAddrMode()->HasBase())
613 BuildUse(tree->AsAddrMode()->Base());
615 if (tree->AsAddrMode()->HasIndex())
618 BuildUse(tree->AsAddrMode()->Index());
624 if (compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(tree))
626 srcCount = BuildGCWriteBarrier(tree);
629 srcCount = BuildIndir(tree->AsIndir());
634 assert(dstCount == 0);
635 regMaskTP indirCandidates = RBM_NONE;
636 BuildUse(tree->gtGetOp1(), indirCandidates);
642 srcCount = BuildIndir(tree->AsIndir());
643 assert(dstCount == 1);
648 assert(dstCount == 1);
649 BuildDef(tree, RBM_EXCEPTION_OBJECT);
652 #if !FEATURE_EH_FUNCLETS
655 assert(dstCount == 0);
660 // These nodes are eliminated by rationalizer.
661 JITDUMP("Unexpected node %s in Lower.\n", GenTree::OpName(tree->OperGet()));
667 assert(dstCount == 1);
668 RefPosition* internalDef = nullptr;
669 #ifdef _TARGET_64BIT_
670 // On 64-bit we always need a temporary register:
671 // - if the index is `native int` then we need to load the array
672 // length into a register to widen it to `native int`
673 // - if the index is `int` (or smaller) then we need to widen
674 //   it to `long` to perform the address calculation
675 internalDef = buildInternalIntRegisterDefForNode(tree);
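// For illustration, the temp is used roughly as follows for an `int` index
// (a sketch, not the exact emitted sequence):
//     movsxd tmpReg, indexReg                          ; widen the index to native int
//     lea    dstReg, [baseReg + tmpReg*scale + offset] ; compute the element address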
676 #else // !_TARGET_64BIT_
677 assert(!varTypeIsLong(tree->AsIndexAddr()->Index()->TypeGet()));
678 switch (tree->AsIndexAddr()->gtElemSize)
687 internalDef = buildInternalIntRegisterDefForNode(tree);
690 #endif // !_TARGET_64BIT_
691 srcCount = BuildBinaryUses(tree->AsOp());
692 if (internalDef != nullptr)
694 buildInternalRegisterUses();
700 } // end switch (tree->OperGet())
702 // We need to be sure that we've set srcCount and dstCount appropriately.
703 // Note that for XARCH, the maximum number of registers defined is 2.
704 assert((dstCount < 2) || ((dstCount == 2) && tree->IsMultiRegNode()));
705 assert(isLocalDefUse == (tree->IsValue() && tree->IsUnusedValue()));
706 assert(!tree->IsUnusedValue() || (dstCount != 0));
707 assert(dstCount == tree->GetRegisterDstCount());
708 INDEBUG(dumpNodeInfo(tree, dstCandidates, srcCount, dstCount));
712 //------------------------------------------------------------------------
713 // getTgtPrefOperands: Identify whether the operands of an Op should be preferenced to the target.
716 //    tree    - the node of interest.
717 //    prefOp1 - a bool "out" parameter indicating, on return, whether op1 should be preferenced to the target.
718 //    prefOp2 - a bool "out" parameter indicating, on return, whether op2 should be preferenced to the target.
721 //    This has two "out" parameters for returning the results (see above).
724 //    The caller is responsible for initializing the two "out" parameters to false.
726 void LinearScan::getTgtPrefOperands(GenTreeOp* tree, bool& prefOp1, bool& prefOp2)
728 // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
729 // Even then we would like to set isTgtPref on Op1.
730 if (tree->OperIsBinary() && isRMWRegOper(tree))
732 GenTree* op1 = tree->gtGetOp1();
733 GenTree* op2 = tree->gtGetOp2();
735 // If we have a read-modify-write operation, we want to preference op1 to the target,
736 // if it is not contained.
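// For illustration: x86 binary ops are two-operand, so for t = a + b codegen
// emits roughly:
//     mov eax, a   ; only if 'a' is not already in the target register
//     add eax, b   ; eax = eax + b; the destination is also the first source
// Preferencing op1 to the target makes the initial 'mov' unnecessary.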
737 if (!op1->isContained() && !op1->OperIs(GT_LIST))
742 // Commutative opers like add/mul/and/or/xor could reverse the order of operands if it is safe to do so.
743 // In that case we will preference both, to increase the chance of getting a match.
744 if (tree->OperIsCommutative() && op2 != nullptr && !op2->isContained())
751 //------------------------------------------------------------------------------
752 // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format?
755 //    tree - a binary tree node
758 //    Returns true if we can use the read-modify-write instruction form.
761 //    This is used to determine whether to preference the source to the destination register.
763 bool LinearScan::isRMWRegOper(GenTree* tree)
765 // TODO-XArch-CQ: Make this more accurate.
766 // For now, we assume that most binary operators are of the RMW form.
767 assert(tree->OperIsBinary());
769 if (tree->OperIsCompare() || tree->OperIs(GT_CMP) || tree->OperIs(GT_BT))
774 switch (tree->OperGet())
776 // These Opers either support a three op form (e.g. GT_LEA), or do not read/write their first operand
782 case GT_SWITCH_TABLE:
789 // x86/x64 does support a three op multiply when op1 or op2 is a contained immediate
791 return (!tree->gtGetOp2()->isContainedIntOrIImmed() && !tree->gtGetOp1()->isContainedIntOrIImmed());
793 #ifdef FEATURE_HW_INTRINSICS
795 return tree->isRMWHWIntrinsic(compiler);
796 #endif // FEATURE_HW_INTRINSICS
803 // Support for building RefPositions for RMW nodes.
804 int LinearScan::BuildRMWUses(GenTreeOp* node, regMaskTP candidates)
807 GenTree* op1 = node->gtOp1;
808 GenTree* op2 = node->gtGetOp2IfPresent();
809 bool isReverseOp = node->IsReverseOp();
810 regMaskTP op1Candidates = candidates;
811 regMaskTP op2Candidates = candidates;
814 if (varTypeIsByte(node))
816 regMaskTP byteCandidates = (candidates == RBM_NONE) ? allByteRegs() : (candidates & allByteRegs());
817 if (!op1->isContained())
819 assert(byteCandidates != RBM_NONE);
820 op1Candidates = byteCandidates;
822 if (node->OperIsCommutative() && !op2->isContained())
824 assert(byteCandidates != RBM_NONE);
825 op2Candidates = byteCandidates;
828 #endif // _TARGET_X86_
830 bool prefOp1 = false;
831 bool prefOp2 = false;
832 getTgtPrefOperands(node, prefOp1, prefOp2);
833 assert(!prefOp2 || node->OperIsCommutative());
834 assert(!isReverseOp || node->OperIsCommutative());
836 // Determine which operand, if any, should be delayRegFree. Normally, this would be op2,
837 // but if we have a commutative operator and op1 is a contained memory op, it would be op1.
838 // We need to make the delayRegFree operand remain live until the op is complete, by marking
839 // the source(s) associated with op2 as "delayFree".
840 // Note that if op2 of a binary RMW operator is a memory op, even if the operator
841 // is commutative, codegen cannot reverse them.
842 // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
843 // more work to be done to correctly reverse the operands if they involve memory
844 // operands. Also, we may need to handle more cases than GT_IND, especially once
845 // we've modified the register allocator to not require all nodes to be assigned
846 // a register (e.g. a spilled lclVar can often be referenced directly from memory).
847 // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
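// For illustration, consider roughly:
//     add targetReg, dword ptr [baseReg + indexReg]
// The base and index registers are still being read when targetReg is written,
// so their uses must be marked delayFree to keep them from being assigned the
// same register as the target.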
848 GenTree* delayUseOperand = op2;
849 if (node->OperIsCommutative())
851 if (op1->isContained() && op2 != nullptr)
853 delayUseOperand = op1;
855 else if (!op2->isContained() || op2->IsCnsIntOrI())
857 // If we have a commutative operator and op2 is not a memory op, we don't need
858 // to set delayRegFree on either operand because codegen can swap them.
859 delayUseOperand = nullptr;
862 else if (op1->isContained())
864 delayUseOperand = nullptr;
866 if (delayUseOperand != nullptr)
868 assert(!prefOp1 || delayUseOperand != op1);
869 assert(!prefOp2 || delayUseOperand != op2);
881 assert(!op1->isContained());
882 tgtPrefUse = BuildUse(op1, op1Candidates);
885 else if (delayUseOperand == op1)
887 srcCount += BuildDelayFreeUses(op1, op1Candidates);
891 srcCount += BuildOperandUses(op1, op1Candidates);
898 assert(!op2->isContained());
899 tgtPrefUse2 = BuildUse(op2, op2Candidates);
902 else if (delayUseOperand == op2)
904 srcCount += BuildDelayFreeUses(op2, op2Candidates);
908 srcCount += BuildOperandUses(op2, op2Candidates);
914 //------------------------------------------------------------------------
915 // BuildShiftRotate: Set the NodeInfo for a shift or rotate.
918 //    tree - The node of interest
921 //    The number of sources consumed by this node.
923 int LinearScan::BuildShiftRotate(GenTree* tree)
925 // For shift operations, the number of bits to shift by must be stored
926 // in CL if it is not a constant.
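// For illustration, a variable-count shift is emitted roughly as:
//     mov ecx, shiftByReg   ; only if the count is not already in RCX
//     shl targetReg, cl
// which is why the non-constant shiftBy is constrained to RCX below, while the
// source and target avoid RCX.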
929 GenTree* shiftBy = tree->gtGetOp2();
930 GenTree* source = tree->gtGetOp1();
931 regMaskTP srcCandidates = RBM_NONE;
932 regMaskTP dstCandidates = RBM_NONE;
934 // x64 can encode 8 bits of shift amount, but only the low 5 (or 6, for 64-bit operands) are used; the others are masked off.
935 // We will allow whatever can be encoded - hope you know what you are doing.
936 if (shiftBy->isContained())
938 assert(shiftBy->OperIsConst());
942 srcCandidates = allRegs(TYP_INT) & ~RBM_RCX;
943 dstCandidates = allRegs(TYP_INT) & ~RBM_RCX;
946 // Note that Rotate Left/Right instructions don't set ZF and SF flags.
948 // If the operand being shifted is 32 bits, the shift count is masked to 5 bits
949 // ([0..31]) by hardware. Similarly, for 64-bit operands the shift count is
950 // narrowed to [0..63]. If the resulting shift count is zero, then the shift
951 // operation won't modify the flags.
953 // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
954 // if the shift count is known to be non-zero and in the range depending on the
955 // operand size.
956 CLANG_FORMAT_COMMENT_ANCHOR;
959 // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
960 // we can have a three operand form.
961 if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
963 assert((source->OperGet() == GT_LONG) && source->isContained());
965 GenTree* sourceLo = source->gtGetOp1();
966 GenTree* sourceHi = source->gtGetOp2();
967 assert(!sourceLo->isContained() && !sourceHi->isContained());
968 RefPosition* sourceLoUse = BuildUse(sourceLo, srcCandidates);
969 RefPosition* sourceHiUse = BuildUse(sourceHi, srcCandidates);
971 if (!tree->isContained())
973 if (tree->OperGet() == GT_LSH_HI)
975 setDelayFree(sourceLoUse);
979 setDelayFree(sourceHiUse);
985 if (!source->isContained())
987 tgtPrefUse = BuildUse(source, srcCandidates);
992 srcCount += BuildOperandUses(source, srcCandidates);
994 if (!tree->isContained())
996 if (!shiftBy->isContained())
998 srcCount += BuildDelayFreeUses(shiftBy, RBM_RCX);
999 buildKillPositionsForNode(tree, currentLoc + 1, RBM_RCX);
1001 BuildDef(tree, dstCandidates);
1005 if (!shiftBy->isContained())
1007 srcCount += BuildOperandUses(shiftBy, RBM_RCX);
1008 buildKillPositionsForNode(tree, currentLoc + 1, RBM_RCX);
1014 //------------------------------------------------------------------------
1015 // BuildCall: Set the NodeInfo for a call.
1018 //    call - The call node of interest
1021 //    The number of sources consumed by this node.
1023 int LinearScan::BuildCall(GenTreeCall* call)
1025 bool hasMultiRegRetVal = false;
1026 ReturnTypeDesc* retTypeDesc = nullptr;
1029 regMaskTP dstCandidates = RBM_NONE;
1031 assert(!call->isContained());
1032 if (call->TypeGet() != TYP_VOID)
1034 hasMultiRegRetVal = call->HasMultiRegRetVal();
1035 if (hasMultiRegRetVal)
1037 // dst count = number of registers in which the value is returned by call
1038 retTypeDesc = call->GetReturnTypeDesc();
1039 dstCount = retTypeDesc->GetReturnRegCount();
1047 GenTree* ctrlExpr = call->gtControlExpr;
1048 if (call->gtCallType == CT_INDIRECT)
1050 ctrlExpr = call->gtCallAddr;
1053 RegisterType registerType = call->TypeGet();
1055 // Set destination candidates for return value of the call.
1056 CLANG_FORMAT_COMMENT_ANCHOR;
1059 if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
1061 // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
1062 // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
1063 // correct argument registers.
1064 dstCandidates = RBM_PINVOKE_TCB;
1067 #endif // _TARGET_X86_
1068 if (hasMultiRegRetVal)
1070 assert(retTypeDesc != nullptr);
1071 dstCandidates = retTypeDesc->GetABIReturnRegs();
1072 assert((int)genCountBits(dstCandidates) == dstCount);
1074 else if (varTypeIsFloating(registerType))
1077 // The return value will be on the X87 stack, and we will need to move it.
1078 dstCandidates = allRegs(registerType);
1079 #else // !_TARGET_X86_
1080 dstCandidates = RBM_FLOATRET;
1081 #endif // !_TARGET_X86_
1083 else if (registerType == TYP_LONG)
1085 dstCandidates = RBM_LNGRET;
1089 dstCandidates = RBM_INTRET;
1092 // number of args to a call =
1093 // callRegArgs + (callargs - placeholders, setup, etc)
1094 // there is an explicit thisPtr but it is redundant
1096 bool callHasFloatRegArgs = false;
1097 bool isVarArgs = call->IsVarargs();
1099 // First, determine internal registers.
1100 // We will need one for any float arguments to a varArgs call.
1101 for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
1103 GenTree* argNode = list->Current();
1104 if (argNode->OperIsPutArgReg())
1106 HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs);
1108 else if (argNode->OperGet() == GT_FIELD_LIST)
1110 for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1112 assert(entry->Current()->OperIsPutArgReg());
1113 HandleFloatVarArgs(call, entry->Current(), &callHasFloatRegArgs);
1118 // Now, count reg args
1119 for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
1121 // By this point, lowering has ensured that all call arguments are one of the following:
1122 // - an arg setup store
1123 // - an arg placeholder
1129 // Note that this property is statically checked by LinearScan::CheckBlock.
1130 GenTree* argNode = list->Current();
1132 // Each register argument corresponds to one source.
1133 if (argNode->OperIsPutArgReg())
1136 BuildUse(argNode, genRegMask(argNode->gtRegNum));
1138 #ifdef UNIX_AMD64_ABI
1139 else if (argNode->OperGet() == GT_FIELD_LIST)
1141 for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1143 assert(entry->Current()->OperIsPutArgReg());
1145 BuildUse(entry->Current(), genRegMask(entry->Current()->gtRegNum));
1148 #endif // UNIX_AMD64_ABI
1151 // In DEBUG only, check validity with respect to the arg table entry.
1153 fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
1154 assert(curArgTabEntry);
1156 if (curArgTabEntry->regNum == REG_STK)
1158 // late arg that is not passed in a register
1159 assert(argNode->gtOper == GT_PUTARG_STK);
1161 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1162 // If the node is TYP_STRUCT and it is put on the stack with a
1163 // putarg_stk operation, we consume and produce no registers.
1164 // In this case the embedded Obj node should not produce
1165 // registers either, since it is contained.
1166 // Note that if it is a SIMD type the argument will be in a register.
1167 if (argNode->TypeGet() == TYP_STRUCT)
1169 assert(argNode->gtGetOp1() != nullptr && argNode->gtGetOp1()->OperGet() == GT_OBJ);
1170 assert(argNode->gtGetOp1()->isContained());
1172 #endif // FEATURE_PUT_STRUCT_ARG_STK
1175 #ifdef UNIX_AMD64_ABI
1176 if (argNode->OperGet() == GT_FIELD_LIST)
1178 assert(argNode->isContained());
1179 assert(varTypeIsStruct(argNode) || curArgTabEntry->isStruct);
1182 for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
1184 const regNumber argReg = (i == 0) ? curArgTabEntry->regNum : curArgTabEntry->otherRegNum;
1185 assert(entry->Current()->gtRegNum == argReg);
1191 #endif // UNIX_AMD64_ABI
1193 const regNumber argReg = curArgTabEntry->regNum;
1194 assert(argNode->gtRegNum == argReg);
1199 // Now, count stack args
1200 // Note that these need to be computed into a register, but then
1201 // they're just stored to the stack - so the reg doesn't
1202 // need to remain live until the call. In fact, it must not
1203 // because the code generator doesn't actually consider it live,
1204 // so it can't be spilled.
1206 GenTree* args = call->gtCallArgs;
1209 GenTree* arg = args->gtGetOp1();
1210 if (!(arg->gtFlags & GTF_LATE_ARG))
1212 if (arg->IsValue() && !arg->isContained())
1214 assert(arg->IsUnusedValue());
1217 args = args->gtGetOp2();
1220 // Set register requirements on the call target, represented as a control expression.
1221 if (ctrlExpr != nullptr)
1223 regMaskTP ctrlExprCandidates = RBM_NONE;
1225 // In case of fast tail implemented as jmp, make sure that gtControlExpr is
1226 // computed into a register.
1227 if (call->IsFastTailCall())
1229 assert(!ctrlExpr->isContained());
1230 // Fast tail call - make sure that call target is always computed in RAX
1231 // so that epilog sequence can generate "jmp rax" to achieve fast tail call.
1232 ctrlExprCandidates = RBM_RAX;
1235 else if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
1237 // On x86, we need to generate a very specific pattern for indirect VSD calls:
1240 // call dword ptr [eax]
1242 // Where EAX is also used as an argument to the stub dispatch helper. Make
1243 // sure that the call target address is computed into EAX in this case.
1244 assert(ctrlExpr->isIndir() && ctrlExpr->isContained());
1245 ctrlExprCandidates = RBM_VIRTUAL_STUB_TARGET;
1247 #endif // _TARGET_X86_
1250 // If it is a fast tail call, it is already preferenced to use RAX.
1251 // Therefore, there is no need to set src candidates on the call target again.
1252 if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall())
1254 // Don't assign the call target to any of the argument registers because
1255 // we will use them to also pass floating point arguments as required
1256 // by the AMD64 ABI.
1257 ctrlExprCandidates = allRegs(TYP_INT) & ~(RBM_ARG_REGS);
1259 #endif // !FEATURE_VARARG
1260 srcCount += BuildOperandUses(ctrlExpr, ctrlExprCandidates);
1263 buildInternalRegisterUses();
1265 // Now generate defs and kills.
1266 regMaskTP killMask = getKillSetForCall(call);
1267 BuildDefsWithKills(call, dstCount, dstCandidates, killMask);
1271 //------------------------------------------------------------------------
1272 // BuildBlockStore: Set the NodeInfo for a block store.
1275 //    blkNode - The block store node of interest
1278 //    The number of sources consumed by this node.
1280 int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
1282 GenTree* dstAddr = blkNode->Addr();
1283 unsigned size = blkNode->gtBlkSize;
1284 GenTree* source = blkNode->Data();
1287 GenTree* srcAddrOrFill = nullptr;
1288 bool isInitBlk = blkNode->OperIsInitBlkOp();
1290 regMaskTP dstAddrRegMask = RBM_NONE;
1291 regMaskTP sourceRegMask = RBM_NONE;
1292 regMaskTP blkSizeRegMask = RBM_NONE;
1296 GenTree* initVal = source;
1297 if (initVal->OperIsInitVal())
1299 assert(initVal->isContained());
1300 initVal = initVal->gtGetOp1();
1302 srcAddrOrFill = initVal;
1304 switch (blkNode->gtBlkOpKind)
1306 case GenTreeBlk::BlkOpKindUnroll:
1307 assert(initVal->IsCnsIntOrI());
1308 if (size >= XMM_REGSIZE_BYTES)
1310 // Reserve an XMM register to fill it with a pack of 16 init value constants.
1311 buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
1312 // We use an XMM register to fill with constants; this may be an AVX instruction, so set the flag.
1313 SetContainsAVXFlags();
1316 if ((size & 1) != 0)
1318 // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
1319 // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
1320 // when unrolling, so only allow byteable registers as the source value. (We could
1321 // consider just using BlkOpKindRepInstr instead.)
1322 sourceRegMask = allByteRegs();
1324 #endif // _TARGET_X86_
1327 case GenTreeBlk::BlkOpKindRepInstr:
1328 // rep stos has the following register requirements:
1329 // a) The memory address has to be in RDI.
1330 // b) The fill value has to be in RAX.
1331 // c) The buffer size will go in RCX.
1332 dstAddrRegMask = RBM_RDI;
1333 sourceRegMask = RBM_RAX;
1334 blkSizeRegMask = RBM_RCX;
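// For illustration, this case is emitted roughly as:
//     mov rdi, dstAddr
//     mov rax, fillValue
//     mov rcx, byteCount
//     rep stosb              ; or stosd/stosq, depending on size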
1337 case GenTreeBlk::BlkOpKindHelper:
1338 #ifdef _TARGET_AMD64_
1339 // The helper follows the regular AMD64 ABI.
1340 dstAddrRegMask = RBM_ARG_0;
1341 sourceRegMask = RBM_ARG_1;
1342 blkSizeRegMask = RBM_ARG_2;
1343 #else // !_TARGET_AMD64_
1344 dstAddrRegMask = RBM_RDI;
1345 sourceRegMask = RBM_RAX;
1346 blkSizeRegMask = RBM_RCX;
1347 #endif // !_TARGET_AMD64_
1356 // CopyObj or CopyBlk
1357 if (source->gtOper == GT_IND)
1359 assert(source->isContained());
1360 srcAddrOrFill = source->gtGetOp1();
1362 if (blkNode->OperGet() == GT_STORE_OBJ)
1364 if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr)
1366 // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
1367 blkSizeRegMask = RBM_RCX;
1369 // The srcAddr must be in a register. If it was under a GT_IND, we need to subsume all of its
1370 // sources.
1371 sourceRegMask = RBM_RSI;
1372 dstAddrRegMask = RBM_RDI;
1376 switch (blkNode->gtBlkOpKind)
1378 case GenTreeBlk::BlkOpKindUnroll:
1379 // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1381 // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1382 // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude
1383 // RBM_NON_BYTE_REGS from internal candidates.
1384 if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
1386 regMaskTP regMask = allRegs(TYP_INT);
1389 if ((size & 1) != 0)
1391 regMask &= ~RBM_NON_BYTE_REGS;
1394 buildInternalIntRegisterDefForNode(blkNode, regMask);
1397 if (size >= XMM_REGSIZE_BYTES)
1399 // If we have a buffer larger than XMM_REGSIZE_BYTES,
1400 // reserve an XMM register to use it for a
1401 // series of 16-byte loads and stores.
1402 buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
1403 // This uses an XMM reg for the loads and stores, so check whether AVX
1404 // instructions will be used for codegen and set the ContainsAVX flag.
1405 SetContainsAVXFlags();
1409 case GenTreeBlk::BlkOpKindRepInstr:
1410 // rep movs has the following register requirements:
1411 // a) The dest address has to be in RDI.
1412 // b) The src address has to be in RSI.
1413 // c) The buffer size will go in RCX.
1414 dstAddrRegMask = RBM_RDI;
1415 sourceRegMask = RBM_RSI;
1416 blkSizeRegMask = RBM_RCX;
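// For illustration, this copy case is emitted roughly as:
//     mov rsi, srcAddr
//     mov rdi, dstAddr
//     mov rcx, count
//     rep movsq              ; or movsb/movsd, depending on size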
1419 case GenTreeBlk::BlkOpKindHelper:
1420 #ifdef _TARGET_AMD64_
1421 // The helper follows the regular AMD64 ABI.
1422 dstAddrRegMask = RBM_ARG_0;
1423 sourceRegMask = RBM_ARG_1;
1424 blkSizeRegMask = RBM_ARG_2;
1425 #else // !_TARGET_AMD64_
1426 dstAddrRegMask = RBM_RDI;
1427 sourceRegMask = RBM_RAX;
1428 blkSizeRegMask = RBM_RCX;
1429 #endif // !_TARGET_AMD64_
1436 if ((srcAddrOrFill == nullptr) && (sourceRegMask != RBM_NONE))
1438 // This is a local source; we'll use a temp register for its address.
1439 assert(source->isContained() && source->OperIsLocal());
1440 buildInternalIntRegisterDefForNode(blkNode, sourceRegMask);
1444 if ((size != 0) && (blkSizeRegMask != RBM_NONE))
1446 // Reserve a temp register for the block size argument.
1447 buildInternalIntRegisterDefForNode(blkNode, blkSizeRegMask);
1450 if (!dstAddr->isContained() && !blkNode->IsReverseOp())
1453 BuildUse(dstAddr, dstAddrRegMask);
1455 if ((srcAddrOrFill != nullptr) && !srcAddrOrFill->isContained())
1458 BuildUse(srcAddrOrFill, sourceRegMask);
1460 if (!dstAddr->isContained() && blkNode->IsReverseOp())
1463 BuildUse(dstAddr, dstAddrRegMask);
1468 assert(blkNode->OperIs(GT_STORE_DYN_BLK));
1469 // The block size argument is the third operand of GT_STORE_DYN_BLK.
1471 GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
1472 BuildUse(blockSize, blkSizeRegMask);
1474 buildInternalRegisterUses();
1475 regMaskTP killMask = getKillSetForBlockStore(blkNode);
1476 BuildDefsWithKills(blkNode, 0, RBM_NONE, killMask);
1480 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1481 //------------------------------------------------------------------------
1482 // BuildPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
1485 //    tree - The node of interest
1488 //    The number of sources consumed by this node.
1490 int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
1493 if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
1495 assert(putArgStk->gtOp1->isContained());
1497 RefPosition* simdTemp = nullptr;
1498 RefPosition* intTemp = nullptr;
1499 unsigned prevOffset = putArgStk->getArgSize();
1500 // We need to iterate over the fields twice; once to determine the need for internal temps,
1501 // and once to actually build the uses.
1502 for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
1504 GenTree* const fieldNode = current->Current();
1505 const var_types fieldType = fieldNode->TypeGet();
1506 const unsigned fieldOffset = current->gtFieldOffset;
1509 assert(fieldType != TYP_LONG);
1510 #endif // _TARGET_X86_
1512 #if defined(FEATURE_SIMD)
1513 // Note that we need to check the GT_FIELD_LIST type, not 'fieldType'. This is because the
1514 // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where
1515 // we "round up" to 16.
1516 if ((current->gtFieldType == TYP_SIMD12) && (simdTemp == nullptr))
1518 simdTemp = buildInternalFloatRegisterDefForNode(putArgStk);
1520 #endif // defined(FEATURE_SIMD)
1523 if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push)
1525 // We can treat as a slot any field that is stored at a slot boundary, where the previous
1526 // field is not in the same slot. (Note that we store the fields in reverse order.)
1527 const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
1528 if (intTemp == nullptr)
1530 intTemp = buildInternalIntRegisterDefForNode(putArgStk);
1532 if (!fieldIsSlot && varTypeIsByte(fieldType))
1534 // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
1535 // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
1536 // need a byte-addressable register for the store. We will enforce this requirement on an internal
1537 // register, which we can use to copy multiple byte values.
1538 intTemp->registerAssignment &= allByteRegs();
1541 #endif // _TARGET_X86_
1543 if (varTypeIsGC(fieldType))
1545 putArgStk->gtNumberReferenceSlots++;
1547 prevOffset = fieldOffset;
1550 for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
1552 GenTree* const fieldNode = current->Current();
1553 if (!fieldNode->isContained())
1555 BuildUse(fieldNode);
1559 buildInternalRegisterUses();
1564 GenTree* src = putArgStk->gtOp1;
1565 var_types type = src->TypeGet();
1567 #if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1568 // For PutArgStk of a TYP_SIMD12, we need an extra register.
1569 if (putArgStk->isSIMD12())
1571 buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
1572 BuildUse(putArgStk->gtOp1);
1574 buildInternalRegisterUses();
1577 #endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
1579 if (type != TYP_STRUCT)
1581 return BuildSimple(putArgStk);
1584 GenTree* dst = putArgStk;
1585 GenTree* srcAddr = nullptr;
1587 // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
1588 // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
1589 // our framework assemblies, so this is the main code generation scheme we'll use.
1590 ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
1591 switch (putArgStk->gtPutArgStkKind)
1593 case GenTreePutArgStk::Kind::Push:
1594 case GenTreePutArgStk::Kind::PushAllSlots:
1595 case GenTreePutArgStk::Kind::Unroll:
1596 // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1598 // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1599 // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude
1600 // RBM_NON_BYTE_REGS from internal candidates.
1601 if ((putArgStk->gtNumberReferenceSlots == 0) && (size & (XMM_REGSIZE_BYTES - 1)) != 0)
1603 regMaskTP regMask = allRegs(TYP_INT);
1606 if ((size % 2) != 0)
1608 regMask &= ~RBM_NON_BYTE_REGS;
1611 buildInternalIntRegisterDefForNode(putArgStk, regMask);
1616 #else // !_TARGET_X86_
1617 if (size >= XMM_REGSIZE_BYTES)
1618 #endif // !_TARGET_X86_
1620 // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
1621 // or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a
1622 // series of 16-byte loads and stores.
1623 buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
1624 SetContainsAVXFlags();
1628 case GenTreePutArgStk::Kind::RepInstr:
1629 buildInternalIntRegisterDefForNode(putArgStk, RBM_RDI);
1630 buildInternalIntRegisterDefForNode(putArgStk, RBM_RCX);
1631 buildInternalIntRegisterDefForNode(putArgStk, RBM_RSI);
1638 srcCount = BuildOperandUses(src);
1639 buildInternalRegisterUses();
1642 #endif // FEATURE_PUT_STRUCT_ARG_STK
1644 //------------------------------------------------------------------------
1645 // BuildLclHeap: Set the NodeInfo for a GT_LCLHEAP.
1648 //    tree - The node of interest
1651 //    The number of sources consumed by this node.
1653 int LinearScan::BuildLclHeap(GenTree* tree)
1657 // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
1658 // Here '-' means don't care.
1660 //     Size?                    Init Memory?    # temp regs
1661 //      0                            -           0 (returns 0)
1662 //      const and <=6 reg words      -           0 (pushes '0')
1663 //      const and >6 reg words       Yes         0 (pushes '0')
1664 //      const and <PageSize          No          0 (amd64), 1 (x86)
1665 //                                                 (x86: tmpReg for subtracting from esp)
1666 //      const and >=PageSize         No          2 (regCnt and tmpReg for subtracting from sp)
1667 //      Non-const                    Yes         0 (regCnt=targetReg and pushes '0')
1668 //      Non-const                    No          2 (regCnt and tmpReg for subtracting from sp)
1670 // Note: Here we don't need the internal register to be different from targetReg.
1671 // Rather, we require it to be different from the operand's reg.
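// For illustration, the small constant-size case with initialized memory is
// emitted roughly as:
//     push 0   ; repeated cntRegSizedWords times; each push both zeroes and
//              ; allocates REGSIZE_BYTES of stack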
1673 GenTree* size = tree->gtGetOp1();
1674 if (size->IsCnsIntOrI())
1676 assert(size->isContained());
1678 size_t sizeVal = size->gtIntCon.gtIconVal;
1682 buildInternalIntRegisterDefForNode(tree);
1686 // Compute the amount of memory to properly STACK_ALIGN.
1687 // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
1688 // This should also help in debugging as we can examine the original size specified with localloc.
1689 sizeVal = AlignUp(sizeVal, STACK_ALIGN);
1691 // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
1692 // we will generate 'push 0'.
1693 assert((sizeVal % REGSIZE_BYTES) == 0);
1694 size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
1695 if (cntRegSizedWords > 6)
1697 if (!compiler->info.compInitMem)
1699 // No need to initialize allocated stack space.
1700 if (sizeVal < compiler->eeGetPageSize())
1703 // x86 needs a register here to avoid generating "sub" on ESP.
1704 buildInternalIntRegisterDefForNode(tree);
1709 // We need two registers: regCnt and tmpReg.
1710 buildInternalIntRegisterDefForNode(tree);
1711 buildInternalIntRegisterDefForNode(tree);
1719 if (!compiler->info.compInitMem)
1721 buildInternalIntRegisterDefForNode(tree);
1722 buildInternalIntRegisterDefForNode(tree);
1726 buildInternalRegisterUses();
1731 //------------------------------------------------------------------------
1732 // BuildModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
1735 //    tree - The node of interest
1738 //    The number of sources consumed by this node.
1740 int LinearScan::BuildModDiv(GenTree* tree)
1742 GenTree* op1 = tree->gtGetOp1();
1743 GenTree* op2 = tree->gtGetOp2();
1744 regMaskTP dstCandidates = RBM_NONE;
1745 RefPosition* internalDef = nullptr;
1748 if (varTypeIsFloating(tree->TypeGet()))
1750 return BuildSimple(tree);
1753 // Amd64 Div/Idiv instruction:
1754 //    Dividend is in RDX:RAX and it computes
1755 //    Quotient in RAX, Remainder in RDX
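// For illustration, a signed 32-bit division is emitted roughly as:
//     mov  eax, dividend   ; only if it is not already in RAX
//     cdq                  ; sign-extend EAX into EDX
//     idiv divisorReg      ; quotient -> EAX, remainder -> EDX
// (An unsigned division zeroes EDX with 'xor edx, edx' and uses 'div' instead.)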
1757 if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
1759 // We are interested in just the remainder.
1760 // RAX is used as a trashable register during computation of remainder.
1761 dstCandidates = RBM_RDX;
1765 // We are interested in just the quotient.
1766 // RDX gets used as trashable register during computation of quotient
1767 dstCandidates = RBM_RAX;
1771 if (op1->OperGet() == GT_LONG)
1773 assert(op1->isContained());
1775 // To avoid a register move, we would like to have op1's low part in RAX and its high part in RDX.
1776 GenTree* loVal = op1->gtGetOp1();
1777 GenTree* hiVal = op1->gtGetOp2();
1778 assert(!loVal->isContained() && !hiVal->isContained());
1780 assert(op2->IsCnsIntOrI());
1781 assert(tree->OperGet() == GT_UMOD);
1783 // This situation also requires an internal register.
1784 buildInternalIntRegisterDefForNode(tree);
1786 BuildUse(loVal, RBM_EAX);
1787 BuildUse(hiVal, RBM_EDX);
1793 // If possible, we would like to have op1 in RAX to avoid a register move.
1794 RefPosition* op1Use = BuildUse(op1, RBM_EAX);
1795 tgtPrefUse = op1Use;
1799 srcCount += BuildDelayFreeUses(op2, allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
1801 buildInternalRegisterUses();
1803 regMaskTP killMask = getKillSetForModDiv(tree->AsOp());
1804 BuildDefsWithKills(tree, 1, dstCandidates, killMask);
1808 //------------------------------------------------------------------------
1809 // BuildIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
1812 //    tree - The node of interest
1815 //    The number of sources consumed by this node.
1817 int LinearScan::BuildIntrinsic(GenTree* tree)
1819 // Both the operand and its result must be of floating point type.
1820 GenTree* op1 = tree->gtGetOp1();
1821 assert(varTypeIsFloating(op1));
1822 assert(op1->TypeGet() == tree->TypeGet());
1823 RefPosition* internalFloatDef = nullptr;
1825 switch (tree->gtIntrinsic.gtIntrinsicId)
1827 case CORINFO_INTRINSIC_Abs:
1828 // Abs(float x) = x & 0x7fffffff
1829 // Abs(double x) = x & 0x7fffffff ffffffff
1831 // In case of Abs we need an internal register to hold the mask.
1833 // TODO-XArch-CQ: avoid using an internal register for the mask.
1834 // Andps or andpd both will operate on 128-bit operands.
1835 // The data section constant to hold the mask is a 64-bit size.
1836 // Therefore, we need both the operand and mask to be in
1837 // xmm register. When we add support in emitter to emit 128-bit
1838 // data constants and instructions that operate on 128-bit
1839 // memory operands we can avoid the need for an internal register.
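// For illustration, the expected sequence for a double is roughly:
//     movsd xmm1, qword ptr [mask_0x7fffffffffffffff]  ; load the mask into the internal reg
//     andpd xmm0, xmm1                                 ; clear the sign bit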
1840 if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs)
1842 internalFloatDef = buildInternalFloatRegisterDefForNode(tree, internalFloatRegCandidates());
1847 case CORINFO_INTRINSIC_Cos:
1848 case CORINFO_INTRINSIC_Sin:
1849 NYI_X86("Math intrinsics Cos and Sin");
1851 #endif // _TARGET_X86_
1853 case CORINFO_INTRINSIC_Sqrt:
1854 case CORINFO_INTRINSIC_Round:
1855 case CORINFO_INTRINSIC_Ceiling:
1856 case CORINFO_INTRINSIC_Floor:
1860 // Right now only Abs/Sqrt/Round/Ceiling/Floor are treated as math intrinsics.
1861 noway_assert(!"Unsupported math intrinsic");
1865 assert(tree->gtGetOp2IfPresent() == nullptr);
1867 if (op1->isContained())
1869 srcCount = BuildOperandUses(op1);
1873 tgtPrefUse = BuildUse(op1);
1876 if (internalFloatDef != nullptr)
1878 buildInternalRegisterUses();
1885 //------------------------------------------------------------------------
1886 // BuildSIMD: Set the NodeInfo for a GT_SIMD tree.
1889 //    tree - The GT_SIMD node of interest
1892 //    The number of sources consumed by this node.
1894 int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
1896 // Only SIMDIntrinsicInit can be contained. Other than that,
1897 // only SIMDIntrinsicOpEquality and SIMDIntrinsicOpInEquality can have 0 dstCount.
1898 int dstCount = simdTree->IsValue() ? 1 : 0;
1899 bool buildUses = true;
1900 regMaskTP dstCandidates = RBM_NONE;
1902 if (simdTree->isContained())
1904 assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicInit);
1906 else if (dstCount != 1)
1908 assert((simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ||
1909 (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality));
1911 SetContainsAVXFlags(simdTree->gtSIMDSize);
1912 GenTree* op1 = simdTree->gtGetOp1();
1913 GenTree* op2 = simdTree->gtGetOp2();
1916 switch (simdTree->gtSIMDIntrinsicID)
1918 case SIMDIntrinsicInit:
1920 // This sets all fields of a SIMD struct to the given value.
1921 // Mark op1 as contained if it is either zero or an int constant of all 1's,
1922 // or a float constant with a 16 or 32 byte simdType (AVX case)
1924 // Note that for small int base types, the initVal has been constructed so that
1925 // we can use the full int value.
1926 CLANG_FORMAT_COMMENT_ANCHOR;
1928 #if !defined(_TARGET_64BIT_)
1929 if (op1->OperGet() == GT_LONG)
1931 assert(op1->isContained());
1932 GenTree* op1lo = op1->gtGetOp1();
1933 GenTree* op1hi = op1->gtGetOp2();
1935 if (op1lo->isContained())
1938 assert(op1hi->isContained());
1939 assert((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
1940 (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)));
1945 buildInternalFloatRegisterDefForNode(simdTree);
1946 setInternalRegsDelayFree = true;
1951 BuildUse(op1lo, RBM_EAX);
1952 BuildUse(op1hi, RBM_EDX);
1956 #endif // !defined(_TARGET_64BIT_)
1960 case SIMDIntrinsicInitN:
1962 var_types baseType = simdTree->gtSIMDBaseType;
1963 srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(baseType));
1964 // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
1965 buildInternalFloatRegisterDefForNode(simdTree);
1967 for (GenTree* list = op1; list != nullptr; list = list->gtGetOp2())
1969 assert(list->OperGet() == GT_LIST);
1970 GenTree* listItem = list->gtGetOp1();
1971 assert(listItem->TypeGet() == baseType);
1972 assert(!listItem->isContained());
1976 assert(initCount == srcCount);
1981 case SIMDIntrinsicInitArray:
1982 // We have an array and an index, which may be contained.
1985 case SIMDIntrinsicDiv:
1986 // SSE2 has no instruction support for division on integer vectors
1987 noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
1990 case SIMDIntrinsicAbs:
1991 // float/double vectors: This gets implemented as a bitwise-AND operation
1992 // with a mask and hence should never be seen here.
1994 // Must be a Vector<int>, Vector<short>, or Vector<sbyte>
1995 assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT ||
1996 simdTree->gtSIMDBaseType == TYP_BYTE);
1997 assert(compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
2000 case SIMDIntrinsicSqrt:
2001 // SSE2 has no instruction support for sqrt on integer vectors.
2002 noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
2005 case SIMDIntrinsicAdd:
2006 case SIMDIntrinsicSub:
2007 case SIMDIntrinsicMul:
2008 case SIMDIntrinsicBitwiseAnd:
2009 case SIMDIntrinsicBitwiseAndNot:
2010 case SIMDIntrinsicBitwiseOr:
2011 case SIMDIntrinsicBitwiseXor:
2012 case SIMDIntrinsicMin:
2013 case SIMDIntrinsicMax:
2014 // SSE2 32-bit integer multiplication requires two temp regs
2015 if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
2016 compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
2018 buildInternalFloatRegisterDefForNode(simdTree);
2019 buildInternalFloatRegisterDefForNode(simdTree);
2023 case SIMDIntrinsicEqual:
2026 // SSE2 doesn't support < and <= directly on int vectors.
2027 // Instead we need to use > and >= with swapped operands.
2028 case SIMDIntrinsicLessThan:
2029 case SIMDIntrinsicLessThanOrEqual:
2030 noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
2033 // SIMDIntrinsicEqual is supported only on non-floating point base type vectors.
2034 // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors.
2035 // Instead we need to use < and <= with swapped operands.
2036 case SIMDIntrinsicGreaterThan:
2037 noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
2040 case SIMDIntrinsicOpEquality:
2041 case SIMDIntrinsicOpInEquality:
2042 if (simdTree->gtGetOp2()->isContained())
2044 // If the second operand is contained then ContainCheckSIMD has determined
2045 // that PTEST can be used. We only need a single source register and no
2046 // internal registers.
2050 // Can't use PTEST so we need 2 source registers, 1 internal SIMD register
2051 // (to hold the result of PCMPEQD or other similar SIMD compare instruction)
2052 // and one internal INT register (to hold the result of PMOVMSKB).
2053 buildInternalIntRegisterDefForNode(simdTree);
2054 buildInternalFloatRegisterDefForNode(simdTree);
2056 // These SIMD nodes only set the condition flags.
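// For illustration, the non-PTEST sequence is roughly:
//     pcmpeqd  xmmTmp, xmmOp2   ; per-element compare into the SIMD temp
//     pmovmskb intTmp, xmmTmp   ; collect the per-byte masks into the INT temp
//     cmp      intTmp, 0xFFFF   ; all elements equal iff every byte matched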
2060 case SIMDIntrinsicDotProduct:
2061 // Float/Double vectors:
2062 // For SSE, or AVX with 32-byte vectors, we also need an internal register
2063 // as scratch. Further we need the targetReg and internal reg to be distinct
2064 // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
2065 // don't need a tmpReg.
2067 // 32-byte integer vector on SSE4/AVX:
2068 // will take advantage of phaddd, which operates only on 128-bit xmm reg.
2069 // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
2070 // registers since targetReg is an int type register.
2072 // See genSIMDIntrinsicDotProduct() for details on code sequence generated
2073 // and the need for scratch registers.
2074 if (varTypeIsFloating(simdTree->gtSIMDBaseType))
2076 if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) ||
2077 (simdTree->gtGetOp1()->TypeGet() == TYP_SIMD32))
2079 buildInternalFloatRegisterDefForNode(simdTree);
2080 setInternalRegsDelayFree = true;
2082 // else don't need scratch reg(s).
2086 assert(simdTree->gtSIMDBaseType == TYP_INT && compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
2088 // No need to setInternalRegsDelayFree since targetReg is an
2089 // int type reg and is guaranteed to be different from the xmm/ymm registers.
2091 buildInternalFloatRegisterDefForNode(simdTree);
2092 if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
2094 buildInternalFloatRegisterDefForNode(simdTree);
2099 case SIMDIntrinsicGetItem:
2101 // This implements get_Item method. The sources are:
2102 // - the source SIMD struct
2103 // - index (which element to get)
2104 // The result is the baseType of the SIMD struct.
2105 // op1 may be a contained memory op, but if so we will consume its address.
2106 // op2 may be a contained constant.
2107 op1 = simdTree->gtGetOp1();
2108 op2 = simdTree->gtGetOp2();
2110 if (!op1->isContained())
2112 // If the index is not a constant, we will use the SIMD temp location to store the vector.
2113 // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
2114 // can use that in the process of extracting the element.
2116 // If the index is a constant and the base type is a small int, we can use pextrw,
2117 // but on AVX we will need a temp if we are indexing into the upper half of the AVX register.
2118 // In all other cases with constant index, we need a temp xmm register to extract the
2119 // element if index is other than zero.
2121 if (!op2->IsCnsIntOrI())
2123 (void)compiler->getSIMDInitTempVarNum();
2125 else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
2128 if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
2129 (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported))
2131 int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
2132 needFloatTemp = (byteShiftCnt >= 16);
2136 needFloatTemp = !op2->IsIntegralConst(0);
2141 buildInternalFloatRegisterDefForNode(simdTree);
2145 // This logic is duplicated from genSIMDIntrinsicGetItem().
2146 // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
2147 // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
2148 // cases will require this, so the non-byteable registers can be excluded.
2150 var_types baseType = simdTree->gtSIMDBaseType;
2151 if (op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
2153 bool ZeroOrSignExtnReqd = true;
2154 unsigned baseSize = genTypeSize(baseType);
2157 if ((op2->gtIntCon.gtIconVal % 2) == 1)
2159 ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
2164 assert(baseSize == 2);
2165 ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
2167 if (ZeroOrSignExtnReqd)
2169 dstCandidates = allByteRegs();
2172 #endif // _TARGET_X86_
2177 case SIMDIntrinsicSetX:
2178 case SIMDIntrinsicSetY:
2179 case SIMDIntrinsicSetZ:
2180 case SIMDIntrinsicSetW:
2181 // We need an internal integer register for SSE2 codegen
2182 if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
2184 buildInternalIntRegisterDefForNode(simdTree);
2189 case SIMDIntrinsicCast:

        case SIMDIntrinsicConvertToSingle:
            if (simdTree->gtSIMDBaseType == TYP_UINT)
            {
                // We need an internal register different from targetReg.
                setInternalRegsDelayFree = true;
                buildInternalFloatRegisterDefForNode(simdTree);
                buildInternalFloatRegisterDefForNode(simdTree);
                // We also need an integer register.
                buildInternalIntRegisterDefForNode(simdTree);
            }
            break;

        case SIMDIntrinsicConvertToInt32:
            break;

        case SIMDIntrinsicWidenLo:
        case SIMDIntrinsicWidenHi:
            if (varTypeIsIntegral(simdTree->gtSIMDBaseType))
            {
                // We need an internal register different from targetReg.
                setInternalRegsDelayFree = true;
                buildInternalFloatRegisterDefForNode(simdTree);
            }
            break;

        case SIMDIntrinsicConvertToInt64:
            // We need an internal register different from targetReg.
            setInternalRegsDelayFree = true;
            buildInternalFloatRegisterDefForNode(simdTree);
            if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
            {
                buildInternalFloatRegisterDefForNode(simdTree);
            }
            // We also need an integer register.
            buildInternalIntRegisterDefForNode(simdTree);
            break;

        case SIMDIntrinsicConvertToDouble:
            // We need an internal register different from targetReg.
            setInternalRegsDelayFree = true;
            buildInternalFloatRegisterDefForNode(simdTree);
#ifdef _TARGET_X86_
            if (simdTree->gtSIMDBaseType == TYP_LONG)
            {
                buildInternalFloatRegisterDefForNode(simdTree);
                buildInternalFloatRegisterDefForNode(simdTree);
            }
            else
#endif
            if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) || (simdTree->gtSIMDBaseType == TYP_ULONG))
            {
                buildInternalFloatRegisterDefForNode(simdTree);
            }
            // We also need an integer register.
            buildInternalIntRegisterDefForNode(simdTree);
            break;

        case SIMDIntrinsicNarrow:
            // We need an internal register different from targetReg.
            setInternalRegsDelayFree = true;
            buildInternalFloatRegisterDefForNode(simdTree);
            if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->gtSIMDBaseType != TYP_DOUBLE))
            {
                buildInternalFloatRegisterDefForNode(simdTree);
            }
            break;

        case SIMDIntrinsicShuffleSSE2:
            // Second operand is an integer constant and marked as contained.
            assert(simdTree->gtGetOp2()->isContainedIntOrIImmed());
            break;

        case SIMDIntrinsicGetX:
        case SIMDIntrinsicGetY:
        case SIMDIntrinsicGetZ:
        case SIMDIntrinsicGetW:
        case SIMDIntrinsicGetOne:
        case SIMDIntrinsicGetZero:
        case SIMDIntrinsicGetCount:
        case SIMDIntrinsicGetAllOnes:
            assert(!"Get intrinsics should not be seen during Lowering.");
            unreached();

        default:
            noway_assert(!"Unimplemented SIMD node type.");
            unreached();
    }
    if (buildUses)
    {
        assert(!op1->OperIs(GT_LIST));
        assert(srcCount == 0);
        // This is overly conservative, but is here for zero diffs.
        srcCount = BuildRMWUses(simdTree);
    }
    buildInternalRegisterUses();
    if (dstCount == 1)
    {
        BuildDef(simdTree, dstCandidates);
    }
    else
    {
        assert(dstCount == 0);
    }
    return srcCount;
}
#endif // FEATURE_SIMD

#ifdef FEATURE_HW_INTRINSICS
//------------------------------------------------------------------------
// BuildHWIntrinsic: Set the NodeInfo for a GT_HWIntrinsic tree.
//
// Arguments:
//    intrinsicTree - The GT_HWIntrinsic node of interest
//
// Return Value:
//    The number of sources consumed by this node.
//
int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
{
    NamedIntrinsic      intrinsicId = intrinsicTree->gtHWIntrinsicId;
    var_types           baseType    = intrinsicTree->gtSIMDBaseType;
    InstructionSet      isa         = HWIntrinsicInfo::lookupIsa(intrinsicId);
    HWIntrinsicCategory category    = HWIntrinsicInfo::lookupCategory(intrinsicId);
    int                 numArgs     = HWIntrinsicInfo::lookupNumArgs(intrinsicTree);

    // Set the AVX flags if this instruction may use VEX encoding for SIMD operations.
    // Note that this may be true even if the ISA is not AVX (e.g. for platform-agnostic intrinsics
    // or non-AVX intrinsics that will use VEX encoding if it is available on the target).
    if (intrinsicTree->isSIMD())
    {
        SetContainsAVXFlags(intrinsicTree->gtSIMDSize);
    }

    GenTree* op1    = intrinsicTree->gtGetOp1();
    GenTree* op2    = intrinsicTree->gtGetOp2();
    GenTree* op3    = nullptr;
    GenTree* lastOp = nullptr;

    int srcCount = 0;
    int dstCount = intrinsicTree->IsValue() ? 1 : 0;

    regMaskTP dstCandidates = RBM_NONE;

    if (op1 == nullptr)
    {
        assert(op2 == nullptr);
        assert(numArgs == 0);
    }
    else
    {
        if (op1->OperIsList())
        {
            assert(op2 == nullptr);
            assert(numArgs >= 3);

            GenTreeArgList* argList = op1->AsArgList();

            op1     = argList->Current();
            argList = argList->Rest();

            op2     = argList->Current();
            argList = argList->Rest();

            op3 = argList->Current();

            while (argList->Rest() != nullptr)
            {
                argList = argList->Rest();
            }

            lastOp  = argList->Current();
            argList = argList->Rest();

            assert(argList == nullptr);
        }
        else if (op2 != nullptr)
        {
            assert(numArgs == 2);
            lastOp = op2;
        }
        else
        {
            assert(numArgs == 1);
            lastOp = op1;
        }

        assert(lastOp != nullptr);

        bool buildUses = true;

        if ((category == HW_Category_IMM) && !HWIntrinsicInfo::NoJmpTableImm(intrinsicId))
        {
            if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && !lastOp->isContainedIntOrIImmed())
            {
                assert(!lastOp->IsCnsIntOrI());

                // We need two extra registers when lastOp isn't a constant so that the
                // offset into the jump table for the fallback path can be computed.
                buildInternalIntRegisterDefForNode(intrinsicTree);
                buildInternalIntRegisterDefForNode(intrinsicTree);
            }
        }
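
        // Illustration: for an IMM-category intrinsic like Sse2.ShiftLeftLogical(v, count), a
        // non-constant count cannot be encoded as an immediate, so codegen falls back to a jump
        // table of per-immediate instruction variants; the two internal registers are used to
        // compute the table base address and the offset into it.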

        // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it
        // is not allocated the same register as the target.
        bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler);

        // Create internal temps, and handle any other special requirements.
        // Note that the default case for building uses will handle the RMW flag, but if the uses
        // are built in the individual cases, buildUses is set to false, and any RMW handling (delayFree)
        // must be handled within the case.
        switch (intrinsicId)
        {
            case NI_Base_Vector128_CreateScalarUnsafe:
            case NI_Base_Vector128_ToScalar:
            case NI_Base_Vector256_CreateScalarUnsafe:
            case NI_Base_Vector256_ToScalar:
            {
                assert(numArgs == 1);

                if (varTypeIsFloating(baseType))
                {
                    if (op1->isContained())
                    {
                        srcCount += BuildOperandUses(op1);
                    }
                    else
                    {
                        // We will either be in memory and need to be moved
                        // into a register of the appropriate size, or we
                        // are already in an XMM/YMM register and can stay
                        // where we are.
                        tgtPrefUse = BuildUse(op1);
                        srcCount += 1;
                    }

                    buildUses = false;
                }
                break;
            }
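
            // Note: tgtPrefUse records a preference that this use and the node's def be assigned
            // the same register; when the preference is honored, the move between op1 and the
            // result becomes a same-register move that codegen can elide.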

            case NI_Base_Vector128_ToVector256:
            case NI_Base_Vector128_ToVector256Unsafe:
            case NI_Base_Vector256_GetLower:
            {
                assert(numArgs == 1);

                if (op1->isContained())
                {
                    srcCount += BuildOperandUses(op1);
                }
                else
                {
                    // We will either be in memory and need to be moved
                    // into a register of the appropriate size, or we
                    // are already in an XMM/YMM register and can stay
                    // where we are.
                    tgtPrefUse = BuildUse(op1);
                    srcCount += 1;
                }

                buildUses = false;
                break;
            }

            case NI_SSE_CompareEqualOrderedScalar:
            case NI_SSE_CompareEqualUnorderedScalar:
            case NI_SSE_CompareNotEqualOrderedScalar:
            case NI_SSE_CompareNotEqualUnorderedScalar:
            case NI_SSE2_CompareEqualOrderedScalar:
            case NI_SSE2_CompareEqualUnorderedScalar:
            case NI_SSE2_CompareNotEqualOrderedScalar:
            case NI_SSE2_CompareNotEqualUnorderedScalar:
            {
                buildInternalIntRegisterDefForNode(intrinsicTree, allByteRegs());
                setInternalRegsDelayFree = true;
                break;
            }

            case NI_SSE2_MaskMove:
            {
                assert(numArgs == 3);
                assert(!isRMW);

                // MaskMove hardcodes the destination (op3) in DI/EDI/RDI
                srcCount += BuildOperandUses(op1);
                srcCount += BuildOperandUses(op2);
                srcCount += BuildOperandUses(op3, RBM_EDI);

                buildUses = false;
                break;
            }
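
            // Illustration: maskmovdqu takes its store address implicitly in (R/E)DI, e.g.
            //   maskmovdqu xmm1, xmm2    ; store the bytes of xmm1 selected by xmm2 to [rdi]
            // which is why op3 is constrained to RBM_EDI above.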

            case NI_SSE41_BlendVariable:
            {
                assert(numArgs == 3);

                if (!compiler->canUseVexEncoding())
                {
                    assert(isRMW);

                    // SSE4.1 blendv* hardcodes the mask vector (op3) in XMM0
                    srcCount += BuildOperandUses(op1);
                    srcCount += BuildDelayFreeUses(op2);
                    srcCount += BuildDelayFreeUses(op3, RBM_XMM0);

                    buildUses = false;
                }
                break;
            }
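
            // Illustration: the legacy SSE4.1 encoding, e.g.
            //   blendvps xmm1, xmm2/m128, <XMM0>
            // reads its mask from XMM0 implicitly; the VEX form (vblendvps) encodes the mask
            // register explicitly, so VEX-capable targets take the default build path instead.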

            case NI_SSE41_TestAllOnes:
            {
                buildInternalFloatRegisterDefForNode(intrinsicTree);
                break;
            }

            case NI_SSE41_Extract:
            {
                if (baseType == TYP_FLOAT)
                {
                    buildInternalIntRegisterDefForNode(intrinsicTree);
                }
#ifdef _TARGET_X86_
                else if (varTypeIsByte(baseType))
                {
                    dstCandidates = allByteRegs();
                }
#endif
                break;
            }

#ifdef _TARGET_X86_
            case NI_SSE42_Crc32:
            case NI_SSE42_X64_Crc32:
            {
                // TODO-XArch-Cleanup: Currently we use the BaseType to bring the type of the second argument
                // to the code generator. We may want to encode the overload info in another way.

                assert(numArgs == 2);
                assert(isRMW);

                // CRC32 may operate over "byte", but on x86 only RBM_BYTE_REGS can be used as byte registers.
                srcCount += BuildOperandUses(op1);
                srcCount += BuildDelayFreeUses(op2, varTypeIsByte(baseType) ? allByteRegs() : RBM_NONE);

                buildUses = false;
                break;
            }
#endif // _TARGET_X86_
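
            // Illustration: the byte form of the instruction, e.g.
            //   crc32 eax, bl
            // can only encode AL/BL/CL/DL as its 8-bit source on x86, hence the allByteRegs()
            // candidate set for op2 when baseType is a byte type.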

            case NI_BMI2_MultiplyNoFlags:
            case NI_BMI2_X64_MultiplyNoFlags:
            {
                assert(numArgs == 2 || numArgs == 3);
                srcCount += BuildOperandUses(op1, RBM_EDX);
                srcCount += BuildOperandUses(op2);
                if (numArgs == 3)
                {
                    // op3's reg should be different from the target reg, since it is used to
                    // store the lower half of the result after executing the instruction.
                    srcCount += BuildDelayFreeUses(op3);
                    // Need an internal register different from the dst to take the lower half of the result.
                    buildInternalIntRegisterDefForNode(intrinsicTree);
                    setInternalRegsDelayFree = true;
                }
                buildUses = false;
                break;
            }
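
            // Illustration: mulx takes one multiplicand implicitly in (R/E)DX, e.g.
            //   mulx r9, r8, rcx    ; r9:r8 = rdx * rcx, without touching the flags
            // which is why op1 is constrained to RBM_EDX above.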

            case NI_FMA_MultiplyAdd:
            case NI_FMA_MultiplyAddNegated:
            case NI_FMA_MultiplyAddNegatedScalar:
            case NI_FMA_MultiplyAddScalar:
            case NI_FMA_MultiplyAddSubtract:
            case NI_FMA_MultiplySubtract:
            case NI_FMA_MultiplySubtractAdd:
            case NI_FMA_MultiplySubtractNegated:
            case NI_FMA_MultiplySubtractNegatedScalar:
            case NI_FMA_MultiplySubtractScalar:
            {
                assert(numArgs == 3);
                assert(isRMW);

                const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);

                // Intrinsics with CopyUpperBits semantics cannot have op1 be contained.
                assert(!copiesUpperBits || !op1->isContained());

                if (op3->isContained())
                {
                    // 213 form: op1 = (op2 * op1) + [op3]

                    if (copiesUpperBits)
                    {
                        tgtPrefUse = BuildUse(op1);

                        srcCount += 1;
                        srcCount += BuildDelayFreeUses(op2);
                    }
                    else
                    {
                        // op1 and op2 are commutative, so don't
                        // set either to be tgtPref or delayFree.
                        srcCount += BuildOperandUses(op1);
                        srcCount += BuildOperandUses(op2);
                    }

                    srcCount += BuildOperandUses(op3);
                }
                else if (op2->isContained())
                {
                    // 132 form: op1 = (op1 * op3) + [op2]

                    tgtPrefUse = BuildUse(op1);

                    srcCount += 1;
                    srcCount += BuildOperandUses(op2);
                    srcCount += BuildDelayFreeUses(op3);
                }
                else if (op1->isContained())
                {
                    // 231 form: op3 = (op2 * op3) + [op1]

                    tgtPrefUse = BuildUse(op3);

                    srcCount += 1;
                    srcCount += BuildOperandUses(op1);
                    srcCount += BuildDelayFreeUses(op2);
                }
                else
                {
                    // 213 form: op1 = (op2 * op1) + op3

                    if (copiesUpperBits)
                    {
                        tgtPrefUse = BuildUse(op1);

                        srcCount += 1;
                        srcCount += BuildDelayFreeUses(op2);
                    }
                    else
                    {
                        // op1 and op2 are commutative, so don't
                        // set either to be tgtPref or delayFree.
                        srcCount += BuildOperandUses(op1);
                        srcCount += BuildOperandUses(op2);
                    }

                    srcCount += BuildDelayFreeUses(op3);
                }

                buildUses = false;
                break;
            }
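
            // Illustration: the three FMA encodings differ in which operand is the accumulator
            // and which may come from memory; for MultiplyAdd:
            //   vfmadd213ps xmm1, xmm2, xmm3/m128   ; xmm1 = (xmm2 * xmm1) + xmm3
            //   vfmadd132ps xmm1, xmm2, xmm3/m128   ; xmm1 = (xmm1 * xmm3) + xmm2
            //   vfmadd231ps xmm1, xmm2, xmm3/m128   ; xmm1 = (xmm2 * xmm3) + xmm1
            // The containment checks above pick the form whose memory operand matches.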

            case NI_AVX2_GatherVector128:
            case NI_AVX2_GatherVector256:
            {
                assert(numArgs == 3);
                // Any pair of the index, mask, or destination registers should be different.
                srcCount += BuildOperandUses(op1);
                srcCount += BuildDelayFreeUses(op2);

                // Get a tmp register for the mask that will be cleared by the gather instructions.
                buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs());
                setInternalRegsDelayFree = true;

                buildUses = false;
                break;
            }
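
            // Note: the AVX2 gather instructions consume the mask register, leaving it zeroed
            // when the gather completes, so the mask is materialized into a disposable internal
            // register; the hardware also requires the destination, index, and mask registers to
            // be pairwise distinct, hence the delay-free uses above.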

            case NI_AVX2_GatherMaskVector128:
            case NI_AVX2_GatherMaskVector256:
            {
                assert(numArgs == 5);
                // Any pair of the index, mask, or destination registers should be different.
                srcCount += BuildOperandUses(op1);
                srcCount += BuildOperandUses(op2);
                srcCount += BuildDelayFreeUses(op3);

                assert(intrinsicTree->gtGetOp1()->OperIsList());
                GenTreeArgList* argList = intrinsicTree->gtGetOp1()->AsArgList();
                GenTree*        op4     = argList->Rest()->Rest()->Rest()->Current();
                srcCount += BuildDelayFreeUses(op4);

                // Get a tmp register for the mask that will be cleared by the gather instructions.
                buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs());
                setInternalRegsDelayFree = true;

                buildUses = false;
                break;
            }

            default:
            {
                assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END));
                break;
            }
        }

        if (buildUses)
        {
            assert((numArgs > 0) && (numArgs < 4));

            srcCount += BuildOperandUses(op1);

            if (op2 != nullptr)
            {
                srcCount += (isRMW) ? BuildDelayFreeUses(op2) : BuildOperandUses(op2);
            }

            if (op3 != nullptr)
            {
                srcCount += (isRMW) ? BuildDelayFreeUses(op3) : BuildOperandUses(op3);
            }
        }

        buildInternalRegisterUses();
    }

    if (dstCount == 1)
    {
        BuildDef(intrinsicTree, dstCandidates);
    }
    else
    {
        assert(dstCount == 0);
    }

    return srcCount;
}
#endif // FEATURE_HW_INTRINSICS

//------------------------------------------------------------------------
// BuildCast: Set the NodeInfo for a GT_CAST.
//
// Arguments:
//    cast - The GT_CAST node
//
// Return Value:
//    The number of sources consumed by this node.
//
int LinearScan::BuildCast(GenTreeCast* cast)
{
    GenTree* src = cast->gtGetOp1();

    const var_types srcType  = genActualType(src->TypeGet());
    const var_types castType = cast->gtCastType;

    regMaskTP candidates = RBM_NONE;
#ifdef _TARGET_X86_
    if (varTypeIsByte(castType))
    {
        candidates = allByteRegs();
    }

    assert(!varTypeIsLong(srcType) || (src->OperIs(GT_LONG) && src->isContained()));
#else
    // Overflow checking cast from TYP_(U)LONG to TYP_UINT requires a temporary
    // register to extract the upper 32 bits of the 64 bit source register.
    if (cast->gtOverflow() && varTypeIsLong(srcType) && (castType == TYP_UINT))
    {
        // The internal register here doesn't need to be different from targetReg;
        // it only needs to be different from the operand's register.
        buildInternalIntRegisterDefForNode(cast);
    }
#endif
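
    // Illustrative sketch only (the actual sequence is chosen by the cast codegen): verifying
    // that a 64-bit value fits in 32 bits can copy the source into the internal register and
    // test the upper half, e.g.
    //   mov tmp, src
    //   shr tmp, 32
    //   jne throwOverflow
    // so tmp must not share the operand's register, while sharing targetReg is harmless.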

    int srcCount = BuildOperandUses(src, candidates);
    buildInternalRegisterUses();
    BuildDef(cast, candidates);
    return srcCount;
}

//-----------------------------------------------------------------------------------------
// BuildIndir: Specify register requirements for the address expression of an indirection operation.
//
// Arguments:
//    indirTree - GT_IND or GT_STOREIND gentree node
//
// Return Value:
//    The number of sources consumed by this node.
//
int LinearScan::BuildIndir(GenTreeIndir* indirTree)
{
    // If this is the rhs of a block copy (i.e. non-enregisterable struct),
    // it has no register requirements.
    if (indirTree->TypeGet() == TYP_STRUCT)
    {
        return 0;
    }

#ifdef FEATURE_SIMD
    RefPosition* internalFloatDef = nullptr;
    if (indirTree->TypeGet() == TYP_SIMD12)
    {
        // If indirTree is of TYP_SIMD12, addr is not contained. See comment in LowerIndir().
        assert(!indirTree->Addr()->isContained());

        // Vector3 is read/written as two reads/writes: 8 byte and 4 byte.
        // To assemble the vector properly we would need an additional
        // XMM register.
        internalFloatDef = buildInternalFloatRegisterDefForNode(indirTree);

        // In the case of GT_IND we need an internal register different from targetReg, since
        // both of the registers are used at the same time.
        if (indirTree->OperGet() == GT_IND)
        {
            setInternalRegsDelayFree = true;
        }
    }
#endif // FEATURE_SIMD

    regMaskTP indirCandidates = RBM_NONE;
    int       srcCount        = BuildIndirUses(indirTree, indirCandidates);
    if (indirTree->gtOper == GT_STOREIND)
    {
        GenTree* source = indirTree->gtGetOp2();
        if (indirTree->AsStoreInd()->IsRMWMemoryOp())
        {
            // Because 'source' is contained, we haven't yet determined its special register requirements, if any.
            // As it happens, the Shift or Rotate cases are the only ones with special requirements.
            assert(source->isContained() && source->OperIsRMWMemOp());
            GenTree*      nonMemSource = nullptr;
            GenTreeIndir* otherIndir   = nullptr;
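
            // Note: an RMW memory op is a contained store of the form x = x <op> y over a single
            // address, e.g. GT_STOREIND(addr, GT_ADD(GT_IND(addr), y)), which codegen emits as
            // one instruction such as "add dword ptr [rax], ecx".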

            if (source->OperIsShiftOrRotate())
            {
                srcCount += BuildShiftRotate(source);
            }
            else
            {
                regMaskTP srcCandidates = RBM_NONE;

                if (indirTree->AsStoreInd()->IsRMWDstOp1())
                {
                    otherIndir = source->gtGetOp1()->AsIndir();
                    if (source->OperIsBinary())
                    {
                        nonMemSource = source->gtGetOp2();
                    }
                }
                else if (indirTree->AsStoreInd()->IsRMWDstOp2())
                {
                    otherIndir   = source->gtGetOp2()->AsIndir();
                    nonMemSource = source->gtGetOp1();
                }

#ifdef _TARGET_X86_
                // Determine if we need byte regs for the non-mem source, if any.
                // Note that BuildShiftRotate (above) will handle the byte requirement as needed,
                // but STOREIND isn't itself an RMW op, so we have to explicitly set it for that case.
                if ((nonMemSource != nullptr) && !nonMemSource->isContained() && varTypeIsByte(indirTree))
                {
                    srcCandidates = RBM_BYTE_REGS;
                }
#endif // _TARGET_X86_

                if (otherIndir != nullptr)
                {
                    // Any lclVars in the addressing mode of this indirection are contained.
                    // If they are marked as lastUse, transfer the last use flag to the store indir.
                    GenTree* base    = otherIndir->Base();
                    GenTree* dstBase = indirTree->Base();
                    CheckAndMoveRMWLastUse(base, dstBase);
                    GenTree* index    = otherIndir->Index();
                    GenTree* dstIndex = indirTree->Index();
                    CheckAndMoveRMWLastUse(index, dstIndex);
                }
                srcCount += BuildBinaryUses(source->AsOp(), srcCandidates);
            }
        }
        else
        {
#ifdef _TARGET_X86_
            if (varTypeIsByte(indirTree) && !source->isContained())
            {
                BuildUse(source, allByteRegs());
                srcCount++;
            }
            else
#endif
            {
                srcCount += BuildOperandUses(source);
            }
        }
    }

#ifdef FEATURE_SIMD
    if (varTypeIsSIMD(indirTree))
    {
        SetContainsAVXFlags(genTypeSize(indirTree->TypeGet()));
    }
    buildInternalRegisterUses();
#endif // FEATURE_SIMD

    if (indirTree->gtOper != GT_STOREIND)
    {
        BuildDef(indirTree);
    }
    return srcCount;
}

//------------------------------------------------------------------------
// BuildMul: Set the NodeInfo for a multiply.
//
// Arguments:
//    tree - The node of interest
//
// Return Value:
//    The number of sources consumed by this node.
//
int LinearScan::BuildMul(GenTree* tree)
{
    assert(tree->OperIsMul());
    GenTree* op1 = tree->gtGetOp1();
    GenTree* op2 = tree->gtGetOp2();

    // Only non-floating-point multiplies have special requirements.
    if (varTypeIsFloating(tree->TypeGet()))
    {
        return BuildSimple(tree);
    }

    int       srcCount      = BuildBinaryUses(tree->AsOp());
    int       dstCount      = 1;
    regMaskTP dstCandidates = RBM_NONE;

    bool isUnsignedMultiply    = ((tree->gtFlags & GTF_UNSIGNED) != 0);
    bool requiresOverflowCheck = tree->gtOverflowEx();

    // There are three forms of x86 multiply:
    //   one-op form:   RDX:RAX = RAX * r/m
    //   two-op form:   reg *= r/m
    //   three-op form: reg = r/m * imm
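    //
    // For illustration, these correspond to instruction forms such as:
    //   mul  dword ptr [rcx]            ; one-op:   EDX:EAX = EAX * [rcx]
    //   imul ecx, dword ptr [rdx]       ; two-op:   ecx = ecx * [rdx]
    //   imul ecx, dword ptr [rdx], 17   ; three-op: ecx = [rdx] * 17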

    // This special widening 32x32->64 MUL is not used on x64.
    CLANG_FORMAT_COMMENT_ANCHOR;
#if defined(_TARGET_X86_)
    if (tree->OperGet() != GT_MUL_LONG)
#endif
    {
        assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
    }

    // We do use the widening multiply to implement
    // the overflow checking for unsigned multiply.
    //
    if (isUnsignedMultiply && requiresOverflowCheck)
    {
        // The only encoding provided is RDX:RAX = RAX * rm.
        //
        // Here we set RAX as the only destination candidate.
        // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX.
        //
        dstCandidates = RBM_RAX;
    }
    else if (tree->OperGet() == GT_MULHI)
    {
        // Have to use the encoding RDX:RAX = RAX * rm. Since we only care about the
        // upper 32 bits of the result, set the destination candidate to REG_RDX.
        dstCandidates = RBM_RDX;
    }
#if defined(_TARGET_X86_)
    else if (tree->OperGet() == GT_MUL_LONG)
    {
        // Have to use the encoding RDX:RAX = RAX * rm.
        dstCandidates = RBM_RAX | RBM_RDX;
    }
#endif
    GenTree* containedMemOp = nullptr;
    if (op1->isContained() && !op1->IsCnsIntOrI())
    {
        assert(!op2->isContained() || op2->IsCnsIntOrI());
        containedMemOp = op1;
    }
    else if (op2->isContained() && !op2->IsCnsIntOrI())
    {
        containedMemOp = op2;
    }
    regMaskTP killMask = getKillSetForMul(tree->AsOp());
    BuildDefsWithKills(tree, dstCount, dstCandidates, killMask);
    return srcCount;
}

//------------------------------------------------------------------------------
// SetContainsAVXFlags: Set the ContainsAVX flag when VEX encoding can be used, and
// also set the Contains256bitAVX flag when the SIMD vector size is 32 bytes.
//
// Arguments:
//    sizeOfSIMDVector - SIMD vector size in bytes; zero for non-SIMD operations
//
void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0 */)
{
    if (compiler->canUseVexEncoding())
    {
        compiler->getEmitter()->SetContainsAVX(true);
        if (sizeOfSIMDVector == 32)
        {
            compiler->getEmitter()->SetContains256bitAVX(true);
        }
    }
}

#endif // _TARGET_XARCH_