// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                   Register Requirements for AMD64                         XX
XX                                                                           XX
XX  This encapsulates all the logic for setting register requirements for   XX
XX  the AMD64 architecture.                                                  XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator

#include "sideeffects.h"
//------------------------------------------------------------------------
// BuildNode: Set register requirements for a node
//    treeNode - the node of interest
//    LSRA has been initialized and there is a TreeNodeInfo node
//    already allocated and initialized for every tree in the IR.
//    Every TreeNodeInfo instance has the right annotations on register
//    requirements needed by LSRA to build the Interval Table (source,
//    destination and internal [temp] register counts).
void LinearScan::BuildNode(GenTree* tree)
TreeNodeInfo* info = currentNodeInfo;
assert(!tree->isContained());
if (tree->IsUnusedValue())
info->isLocalDefUse = true;
// A floating type generates AVX instructions (vmovss etc.), so set the flag.
SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet()));
switch (tree->OperGet())
// Because we do containment analysis before we redo dataflow and identify register
// candidates, the containment analysis only uses !lvDoNotEnregister to estimate register
// candidates.
// If a lclVar was estimated to be a register candidate but is not, and it was
// marked regOptional, it should now be marked contained instead.
// TODO-XArch-CQ: When this is being called while RefPositions are being created,
// use lvLRACandidate here instead.
if (tree->IsRegOptional())
if (!compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvTracked ||
    compiler->lvaTable[tree->AsLclVarCommon()->gtLclNum].lvDoNotEnregister)
tree->ClearRegOptional();
// Need an additional register to read the upper 4 bytes of Vector3.
if (tree->TypeGet() == TYP_SIMD12)
// We need an internal register different from the targetReg in which 'tree' produces its result,
// because both targetReg and the internal reg will be in use at the same time.
info->internalFloatCount = 1;
info->isInternalRegDelayFree = true;
info->setInternalCandidates(this, allSIMDRegs());
case GT_STORE_LCL_FLD:
case GT_STORE_LCL_VAR:
BuildStoreLoc(tree->AsLclVarCommon());
// These should always be contained. We don't correctly allocate or
// generate code for a non-contained GT_FIELD_LIST.
noway_assert(!"Non-contained GT_FIELD_LIST");
assert(info->dstCount == 0);
assert(info->dstCount == 1);
#if !defined(_TARGET_64BIT_)
assert(tree->IsUnusedValue()); // Contained nodes are already processed, only unused GT_LONG can reach here.
// An unused GT_LONG node needs to consume its sources, but need not produce a register.
tree->gtType = TYP_VOID;
tree->ClearUnusedValue();
info->isLocalDefUse = false;
appendLocationInfoToList(tree->gtGetOp1());
appendLocationInfoToList(tree->gtGetOp2());
#endif // !defined(_TARGET_64BIT_)
assert(info->dstCount == 0);
assert(info->dstCount == 0);
if (tree->TypeGet() == TYP_VOID)
assert(tree->TypeGet() == TYP_INT);
info->setSrcCandidates(this, RBM_INTRET);
LocationInfoListNode* locationInfo = getLocationInfo(tree->gtOp.gtOp1);
locationInfo->info.setSrcCandidates(this, RBM_INTRET);
useList.Append(locationInfo);
// A GT_NOP is a passthrough if it is void or if it has a child, but it must
// be considered to produce a dummy value if it has a type but no child.
assert((tree->gtOp.gtOp1 == nullptr) || tree->isContained());
if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
assert(info->dstCount == 1);
assert(info->dstCount == 0);
assert(info->dstCount == 0);
GenTree* cmp = tree->gtGetOp1();
assert(!cmp->IsValue());
assert(info->dstCount == 0);
assert(info->dstCount == 1);
info->setDstCandidates(this, RBM_BYTE_REGS);
#endif // _TARGET_X86_
assert(info->dstCount == 0);
// This should never occur since switch nodes must not be visible at this
// point in the JIT.
noway_assert(!"Switch must be lowered at this point");
assert(info->dstCount == 1);
case GT_SWITCH_TABLE:
info->internalIntCount = 1;
assert(info->dstCount == 0);
info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
assert(info->srcCount == 2);
noway_assert(!"We should never hit any assignment operator in lowering");
#if !defined(_TARGET_64BIT_)
info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
// This just turns into a compare of its child with an int + a conditional call.
info->srcCount = GetOperandInfo(tree->gtOp.gtOp1);
assert(info->dstCount == 0);
info->internalIntCount = 1;
info->setInternalCandidates(this, allRegs(TYP_INT));
BuildModDiv(tree->AsOp());
#if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND)
BuildMul(tree->AsOp());
BuildIntrinsic(tree->AsOp());
BuildSIMD(tree->AsSIMD());
#endif // FEATURE_SIMD
#ifdef FEATURE_HW_INTRINSICS
BuildHWIntrinsic(tree->AsHWIntrinsic());
#endif // FEATURE_HW_INTRINSICS
LocationInfoListNode* locationInfo = getLocationInfo(tree->gtOp.gtOp1);
locationInfo->info.isTgtPref = true;
useList.Append(locationInfo);
info->srcCount = GetOperandInfo(tree->gtOp.gtOp1);
// The SSE instruction set doesn't have an instruction to negate a number.
// The recommended way is to xor the float/double number with a bitmask.
// The only way to xor is using xorps or xorpd, both of which operate on
// 128-bit operands. To hold the bit-mask we would need another xmm
// register or a 16-byte aligned 128-bit data constant. Right now the emitter
// lacks the support for emitting such constants or instructions with a mem
// addressing mode referring to a 128-bit operand. For now we use an
// internal xmm register to load the 32/64-bit bitmask from the data section.
// Note that by trading additional data section memory (128-bit) we can
// save on the need for an internal register and also a memory-to-reg
// move.
// Note: another option to avoid the internal register requirement is to
// lower this as GT_SUB(0, src). This would generate code different from
// Jit64 and could possibly result in compat issues (?).
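// For reference, the negation is performed by flipping the sign bit, e.g.:
//   xorps xmm0, xmm1   ; xmm1 holds 0x80000000 in each 32-bit lane (float)
//   xorpd xmm0, xmm1   ; xmm1 holds 0x8000000000000000 in each 64-bit lane (double)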
if (varTypeIsFloating(tree))
info->internalFloatCount = 1;
info->setInternalCandidates(this, internalFloatRegCandidates());
info->srcCount = GetOperandInfo(tree->gtOp.gtOp1);
(void)BuildShiftRotate(tree);
appendLocationInfoToList(tree->gtOp.gtOp1);
assert(info->dstCount == 1);
info->internalIntCount = 1;
assert(info->dstCount == 1);
// The comparand is preferenced to RAX.
// The remaining two operands can be in any reg other than RAX.
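// (cmpxchg implicitly compares RAX with its memory operand: if they are equal it
// stores the value operand there, otherwise it loads the memory operand into RAX.
// Either way, the comparand and the result both live in RAX.)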
LocationInfoListNode* locationInfo = getLocationInfo(tree->gtCmpXchg.gtOpLocation);
locationInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~RBM_RAX);
useList.Append(locationInfo);
LocationInfoListNode* valueInfo = getLocationInfo(tree->gtCmpXchg.gtOpValue);
valueInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~RBM_RAX);
useList.Append(valueInfo);
info->setDstCandidates(this, RBM_RAX);
LocationInfoListNode* comparandInfo = getLocationInfo(tree->gtCmpXchg.gtOpComparand);
comparandInfo->info.setSrcCandidates(this, RBM_RAX);
useList.Append(comparandInfo);
info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
assert(info->dstCount == ((tree->TypeGet() == TYP_VOID) ? 0 : 1));
BuildPutArgReg(tree->AsUnOp());
BuildCall(tree->AsCall());
// For a GT_ADDR, the child node should not be evaluated into a register
GenTree* child = tree->gtOp.gtOp1;
assert(!isCandidateLocalRef(child));
assert(child->isContained());
assert(info->dstCount == 1);
#if !defined(FEATURE_PUT_STRUCT_ARG_STK)
// These should all be eliminated prior to Lowering.
assert(!"Non-store block node in Lowering");
#ifdef FEATURE_PUT_STRUCT_ARG_STK
BuildPutArgStk(tree->AsPutArgStk());
#endif // FEATURE_PUT_STRUCT_ARG_STK
case GT_STORE_DYN_BLK:
BuildBlockStore(tree->AsBlk());
// Always a passthrough of its child's value.
assert(!"INIT_VAL should always be contained");
case GT_ARR_BOUNDS_CHECK:
#endif // FEATURE_SIMD
#ifdef FEATURE_HW_INTRINSICS
case GT_HW_INTRINSIC_CHK:
#endif // FEATURE_HW_INTRINSICS
// Consumes arrLen & index - has no result
assert(info->dstCount == 0);
info->srcCount = GetOperandInfo(tree->AsBoundsChk()->gtIndex);
info->srcCount += GetOperandInfo(tree->AsBoundsChk()->gtArrLen);
// These must have been lowered to GT_ARR_INDEX
noway_assert(!"We should never see a GT_ARR_ELEM after Lowering.");
assert(info->dstCount == 1);
assert(!tree->AsArrIndex()->ArrObj()->isContained());
assert(!tree->AsArrIndex()->IndexExpr()->isContained());
// For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
// times while the result is being computed.
LocationInfoListNode* arrObjInfo = getLocationInfo(tree->AsArrIndex()->ArrObj());
arrObjInfo->info.isDelayFree = true;
useList.Append(arrObjInfo);
useList.Append(getLocationInfo(tree->AsArrIndex()->IndexExpr()));
info->hasDelayFreeSrc = true;
// This consumes the offset, if any, the arrObj and the effective index,
// and produces the flattened offset for this dimension.
assert(info->dstCount == 1);
if (tree->gtArrOffs.gtOffset->isContained())
// Here we simply need an internal register, which must be different
// from any of the operands' registers, but may be the same as targetReg.
info->internalIntCount = 1;
appendLocationInfoToList(tree->AsArrOffs()->gtOffset);
appendLocationInfoToList(tree->AsArrOffs()->gtIndex);
appendLocationInfoToList(tree->AsArrOffs()->gtArrObj);
// The LEA usually passes its operands through to the GT_IND, in which case it will
// be contained, but we may be instantiating an address, in which case we set them here.
assert(info->dstCount == 1);
if (tree->AsAddrMode()->HasBase())
appendLocationInfoToList(tree->AsAddrMode()->Base());
if (tree->AsAddrMode()->HasIndex())
appendLocationInfoToList(tree->AsAddrMode()->Index());
if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
BuildGCWriteBarrier(tree);
BuildIndir(tree->AsIndir());
assert(info->dstCount == 0);
appendLocationInfoToList(tree->gtOp.gtOp1);
BuildIndir(tree->AsIndir());
assert(info->dstCount == 1);
assert(info->dstCount == 1);
info->setDstCandidates(this, RBM_EXCEPTION_OBJECT);
#if !FEATURE_EH_FUNCLETS
assert(info->dstCount == 0);
// These nodes are eliminated by rationalizer.
JITDUMP("Unexpected node %s in Lower.\n", GenTree::OpName(tree->OperGet()));
assert(info->dstCount == 1);
info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
if (tree->AsIndexAddr()->Index()->TypeGet() == TYP_I_IMPL)
info->internalIntCount = 1;
switch (tree->AsIndexAddr()->gtElemSize)
info->internalIntCount = 1;
} // end switch (tree->OperGet())
// If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
// Even then we would like to set isTgtPref on Op1.
if (tree->OperIsBinary() && info->srcCount >= 1)
if (isRMWRegOper(tree))
GenTree* op1 = tree->gtOp.gtOp1;
GenTree* op2 = tree->gtOp.gtOp2;
// Commutative opers like add/mul/and/or/xor could reverse the order of
// operands if it is safe to do so. In such a case we would like op2 to be
// target preferenced instead of op1.
if (tree->OperIsCommutative() && op1->isContained() && op2 != nullptr)
op2 = tree->gtOp.gtOp1;
// If we have a read-modify-write operation, we want to preference op1 to the target,
// if it is not contained.
if (!op1->isContained() && !op1->OperIs(GT_LIST))
useList.GetTreeNodeInfo(op1).isTgtPref = true;
// Is this a non-commutative operator, or is op2 a contained memory op?
// In either case, we need to make op2 remain live until the op is complete, by marking
// the source(s) associated with op2 as "delayFree" if this node defines a register.
// Note that if op2 of a binary RMW operator is a memory op, even if the operator
// is commutative, codegen cannot reverse them.
// TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
// more work to be done to correctly reverse the operands if they involve memory
// operands. Also, we may need to handle more cases than GT_IND, especially once
// we've modified the register allocator to not require all nodes to be assigned
// a register (e.g. a spilled lclVar can often be referenced directly from memory).
// Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
GenTree* delayUseSrc = nullptr;
// TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
// to special case them.
if (tree->OperGet() == GT_XADD || tree->OperGet() == GT_XCHG || tree->OperGet() == GT_LOCKADD)
// These tree nodes will have their op1 marked as isDelayFree=true.
// Hence these tree nodes should have a Def position so that op1's reg
// gets freed at DefLoc+1.
if (tree->TypeGet() == TYP_VOID)
// Right now a GT_XADD node could be morphed into a
// GT_LOCKADD of TYP_VOID. See gtExtractSideEffList().
// Note that it is advantageous to use GT_LOCKADD
// instead of GT_XADD, as the former uses "lock add",
// which allows its second operand to be a contained
// immediate, whereas the xadd instruction requires its
// second operand to be in a register.
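// For example:
//   lock add dword ptr [rcx], 1     ; an immediate source operand is allowed
//   lock xadd dword ptr [rcx], eax  ; xadd's source must be a register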
assert(info->dstCount == 0);
// Give it an artificial type and mark it as an unused value.
// This results in a Def position created but not considered consumed by its parent node.
tree->gtType = TYP_INT;
info->isLocalDefUse = true;
tree->SetUnusedValue();
assert(info->dstCount != 0);
else if ((info->dstCount != 0) && (op2 != nullptr) &&
         (!tree->OperIsCommutative() || (op2->isContained() && !op2->IsCnsIntOrI())))
if ((delayUseSrc != nullptr) && CheckAndSetDelayFree(delayUseSrc))
info->hasDelayFreeSrc = true;
BuildCheckByteable(tree);
// We need to be sure that we've set info->srcCount and info->dstCount appropriately
assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));
assert(info->isLocalDefUse == (tree->IsValue() && tree->IsUnusedValue()));
assert(!tree->IsUnusedValue() || (info->dstCount != 0));
assert(info->dstCount == tree->GetRegisterDstCount());
//---------------------------------------------------------------------
// CheckAndSetDelayFree - Set isDelayFree on the given operand or its child(ren), if appropriate
//    delayUseSrc - a node that may have a delayed use
//    True iff the node or one of its children has been marked isDelayFree
//    Only register operands should be marked isDelayFree, not contained immediates or memory.
bool LinearScan::CheckAndSetDelayFree(GenTree* delayUseSrc)
// If delayUseSrc is an indirection and it doesn't produce a result, then we need to set 'delayFree'
// on the base & index, if any.
// Otherwise, we set it on delayUseSrc itself.
bool returnValue = false;
if (delayUseSrc->isContained())
// If delayUseSrc is a non-Indir contained node (e.g. a local) there's no register use to delay.
if (delayUseSrc->isIndir())
GenTree* base = delayUseSrc->AsIndir()->Base();
GenTree* index = delayUseSrc->AsIndir()->Index();
if ((base != nullptr) && !base->isContained())
useList.GetTreeNodeInfo(base).isDelayFree = true;
if (index != nullptr)
assert(!index->isContained());
useList.GetTreeNodeInfo(index).isDelayFree = true;
useList.GetTreeNodeInfo(delayUseSrc).isDelayFree = true;
//------------------------------------------------------------------------
// BuildCheckByteable: Check the tree to see if "byte-able" registers are
// required, and set the tree node info accordingly.
//    tree - The node of interest
void LinearScan::BuildCheckByteable(GenTree* tree)
TreeNodeInfo* info = currentNodeInfo;
// Exclude RBM_NON_BYTE_REGS from the dst candidates of the tree node and the src candidates
// of its operands if the tree node is a byte type.
//
// Though this looks conservative in theory, in practice we could not think of a case where
// the logic below leads to an overly conservative register specification. If we ever find
// such a case, this logic will need to be fine-tuned for it.
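// (For context: on x86, only EAX/EBX/ECX/EDX expose byte-addressable low registers;
// ESI/EDI/EBP/ESP do not, which is what RBM_NON_BYTE_REGS captures.)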
if (ExcludeNonByteableRegisters(tree))
if (info->dstCount > 0)
regMask = info->getDstCandidates(this);
assert(regMask != RBM_NONE);
info->setDstCandidates(this, regMask & ~RBM_NON_BYTE_REGS);
if (tree->OperIsSimple())
GenTree* op = tree->gtOp.gtOp1;
// We need byte registers on the operands of most simple operators that produce a byte result.
// However, indirections are simple operators but do not require their address in a byte register.
if ((op != nullptr) && !tree->OperIsIndir())
// No need to set src candidates on a contained child operand.
if (!op->isContained())
TreeNodeInfo& op1Info = useList.GetTreeNodeInfo(op);
regMask = op1Info.getSrcCandidates(this);
assert(regMask != RBM_NONE);
op1Info.setSrcCandidates(this, regMask & ~RBM_NON_BYTE_REGS);
if (tree->OperIsBinary() && (tree->gtOp.gtOp2 != nullptr))
op = tree->gtOp.gtOp2;
if (!op->isContained())
TreeNodeInfo& op2Info = useList.GetTreeNodeInfo(op);
regMask = op2Info.getSrcCandidates(this);
assert(regMask != RBM_NONE);
op2Info.setSrcCandidates(this, regMask & ~RBM_NON_BYTE_REGS);
#endif //_TARGET_X86_
//------------------------------------------------------------------------------
// isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format?
//    tree - a binary tree node
//    Returns true if we can use the read-modify-write instruction form
//    This is used to determine whether to preference the source to the destination register.
bool LinearScan::isRMWRegOper(GenTree* tree)
// TODO-XArch-CQ: Make this more accurate.
// For now, we assume that most binary operators are of the RMW form.
assert(tree->OperIsBinary());
if (tree->OperIsCompare() || tree->OperIs(GT_CMP))
switch (tree->OperGet())
// These opers either support a three-op form (e.g. GT_LEA), or do not read/write their first operand
// x86/x64 does support a three-op multiply when op2|op1 is a contained immediate
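// (The three-operand form is "imul r32, r/m32, imm", e.g. "imul eax, ecx, 8",
// whose destination is distinct from both sources, so it is not RMW.)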
return (!tree->gtOp.gtOp2->isContainedIntOrIImmed() && !tree->gtOp.gtOp1->isContainedIntOrIImmed());
#ifdef FEATURE_HW_INTRINSICS
return tree->isRMWHWIntrinsic(compiler);
#endif // FEATURE_HW_INTRINSICS
//------------------------------------------------------------------------
// BuildShiftRotate: Set the NodeInfo for a shift or rotate.
//    tree - The node of interest
int LinearScan::BuildShiftRotate(GenTree* tree)
TreeNodeInfo* info = currentNodeInfo;
// For shift operations, the number of bits to shift must be placed in CL
// whenever the shift count is not a constant.
GenTree* shiftBy = tree->gtOp.gtOp2;
GenTree* source = tree->gtOp.gtOp1;
LocationInfoListNode* shiftByInfo = nullptr;
// x64 can encode 8 bits of shift count, but the hardware uses only the low 5 (32-bit
// operands) or 6 (64-bit operands) bits; the others are masked off.
// We will allow whatever can be encoded - hope you know what you are doing.
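// For example, "shl rax, 65" encodes the immediate 65 but actually shifts by
// 65 & 63 == 1, and "shl eax, cl" likewise uses only cl & 31.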
if (shiftBy->isContained())
srcCount += GetOperandInfo(source);
shiftByInfo = getLocationInfo(shiftBy);
shiftByInfo->info.setSrcCandidates(this, RBM_RCX);
info->setDstCandidates(this, allRegs(TYP_INT) & ~RBM_RCX);
LocationInfoListNode* sourceInfo;
srcCount += GetOperandInfo(source, &sourceInfo);
for (; sourceInfo != nullptr; sourceInfo = sourceInfo->Next())
sourceInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~RBM_RCX);
// Note that the Rotate Left/Right instructions don't set ZF and SF flags.
//
// If the operand being shifted is 32 bits, then the upper three bits of the
// 8-bit shift count are masked off by hardware to get the actual shift count.
// Similarly, for 64-bit operands the shift count is narrowed to [0..63]. If the
// resulting shift count is zero, then the shift operation won't modify flags.
//
// TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
// if the shift count is known to be non-zero and in the range depending on the
// operand size.
CLANG_FORMAT_COMMENT_ANCHOR;
// The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
// we can have a three operand form. Increment the srcCount.
if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
assert((source->OperGet() == GT_LONG) && source->isContained());
GenTree* sourceLo = source->gtOp.gtOp1;
LocationInfoListNode* sourceLoInfo = useList.Begin();
LocationInfoListNode* sourceHiInfo = useList.GetSecond(INDEBUG(source->gtGetOp2()));
info->hasDelayFreeSrc = true;
if (tree->OperGet() == GT_LSH_HI)
sourceLoInfo->info.isDelayFree = true;
sourceHiInfo->info.isDelayFree = true;
if (shiftByInfo != nullptr)
if (tree->IsReverseOp())
useList.Prepend(shiftByInfo);
useList.Append(shiftByInfo);
if (!tree->isContained())
info->srcCount = srcCount;
//------------------------------------------------------------------------
// BuildCall: Set the NodeInfo for a call.
//    call - The call node of interest
void LinearScan::BuildCall(GenTreeCall* call)
TreeNodeInfo* info = currentNodeInfo;
bool hasMultiRegRetVal = false;
ReturnTypeDesc* retTypeDesc = nullptr;
assert(!call->isContained());
if (call->TypeGet() != TYP_VOID)
hasMultiRegRetVal = call->HasMultiRegRetVal();
if (hasMultiRegRetVal)
// dst count = number of registers in which the value is returned by call
retTypeDesc = call->GetReturnTypeDesc();
info->dstCount = retTypeDesc->GetReturnRegCount();
assert(info->dstCount == 1);
assert(info->dstCount == 0);
GenTree* ctrlExpr = call->gtControlExpr;
LocationInfoListNode* ctrlExprInfo = nullptr;
if (call->gtCallType == CT_INDIRECT)
ctrlExpr = call->gtCallAddr;
// If this is a varargs call, we will clear the internal candidates in case we need
// to reserve some integer registers for copying float args.
// We have to do this because otherwise the default candidates are allRegs, and adding
// the individual specific registers will have no effect.
if (call->IsVarargs())
info->setInternalCandidates(this, RBM_NONE);
RegisterType registerType = call->TypeGet();
// Set destination candidates for return value of the call.
CLANG_FORMAT_COMMENT_ANCHOR;
if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
// The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
// TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
// correct argument registers.
info->setDstCandidates(this, RBM_PINVOKE_TCB);
#endif // _TARGET_X86_
if (hasMultiRegRetVal)
assert(retTypeDesc != nullptr);
info->setDstCandidates(this, retTypeDesc->GetABIReturnRegs());
else if (varTypeIsFloating(registerType))
// The return value will be on the X87 stack, and we will need to move it.
info->setDstCandidates(this, allRegs(registerType));
#else // !_TARGET_X86_
info->setDstCandidates(this, RBM_FLOATRET);
#endif // !_TARGET_X86_
else if (registerType == TYP_LONG)
info->setDstCandidates(this, RBM_LNGRET);
info->setDstCandidates(this, RBM_INTRET);
// The number of args to a call =
//     callRegArgs + (callargs - placeholders, setup, etc.)
// There is an explicit thisPtr, but it is redundant.
bool callHasFloatRegArgs = false;
bool isVarArgs = call->IsVarargs();
// First, count reg args
for (GenTree* list = call->gtCallLateArgs; list; list = list->MoveNext())
assert(list->OperIsList());
// By this point, lowering has ensured that all call arguments are one of the following:
// - an arg setup store
// - an arg placeholder
// Note that this property is statically checked by LinearScan::CheckBlock.
GenTree* argNode = list->Current();
// Each register argument corresponds to one source.
if (argNode->OperIsPutArgReg())
HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs);
appendLocationInfoToList(argNode);
#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
else if (argNode->OperGet() == GT_FIELD_LIST)
for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
assert(entry->Current()->OperIsPutArgReg());
HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs);
appendLocationInfoToList(entry->Current());
#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
// In DEBUG only, check validity with respect to the arg table entry.
fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
assert(curArgTabEntry);
if (curArgTabEntry->regNum == REG_STK)
// A late arg that is not passed in a register.
assert(argNode->gtOper == GT_PUTARG_STK);
#ifdef FEATURE_PUT_STRUCT_ARG_STK
// If the node is TYP_STRUCT and it is put on the stack with a
// putarg_stk operation, we consume and produce no registers.
// In this case the embedded Obj node should not produce
// registers either, since it is contained.
// Note that if it is a SIMD type the argument will be in a register.
if (argNode->TypeGet() == TYP_STRUCT)
assert(argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_OBJ);
assert(argNode->gtOp.gtOp1->isContained());
#endif // FEATURE_PUT_STRUCT_ARG_STK
#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
if (argNode->OperGet() == GT_FIELD_LIST)
assert(argNode->isContained());
assert(varTypeIsStruct(argNode) || curArgTabEntry->isStruct);
for (GenTreeFieldList* entry = argNode->AsFieldList(); entry != nullptr; entry = entry->Rest())
const regNumber argReg = (i == 0) ? curArgTabEntry->regNum : curArgTabEntry->otherRegNum;
assert(entry->Current()->gtRegNum == argReg);
#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
const regNumber argReg = curArgTabEntry->regNum;
assert(argNode->gtRegNum == argReg);
// Now, count stack args.
// Note that these need to be computed into a register, but then
// they're just stored to the stack - so the reg doesn't
// need to remain live until the call. In fact, it must not
// because the code generator doesn't actually consider it live,
// so it can't be spilled.
GenTree* args = call->gtCallArgs;
GenTree* arg = args->gtOp.gtOp1;
if (!(arg->gtFlags & GTF_LATE_ARG))
if (arg->IsValue() && !arg->isContained())
// argInfo->isLocalDefUse = true;
assert(arg->IsUnusedValue());
// assert(argInfo->dstCount == 0);
args = args->gtOp.gtOp2;
// Set reg requirements on the call target, represented as a control sequence.
if (ctrlExpr != nullptr)
int ctrlExprCount = GetOperandInfo(ctrlExpr);
if (ctrlExprCount != 0)
assert(ctrlExprCount == 1);
ctrlExprInfo = useList.Last();
// In case of a fast tail call implemented as jmp, make sure that gtControlExpr is
// computed into a register.
if (call->IsFastTailCall())
assert(!ctrlExpr->isContained() && ctrlExprInfo != nullptr);
// Fast tail call - make sure that the call target is always computed in RAX
// so that the epilog sequence can generate "jmp rax" to achieve the fast tail call.
ctrlExprInfo->info.setSrcCandidates(this, RBM_RAX);
else if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
// On x86, we need to generate a very specific pattern for indirect VSD calls:
//    call dword ptr [eax]
// Where EAX is also used as an argument to the stub dispatch helper. Make
// sure that the call target address is computed into EAX in this case.
assert(ctrlExprInfo != nullptr);
assert(ctrlExpr->isIndir() && ctrlExpr->isContained());
ctrlExprInfo->info.setSrcCandidates(this, RBM_VIRTUAL_STUB_TARGET);
#endif // _TARGET_X86_
// If it is a fast tail call, the call target has already been preferenced to RAX.
// Therefore, there is no need to set src candidates on it again.
if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExprInfo != nullptr))
// Don't assign the call target to any of the argument registers because
// we will use them to also pass floating point arguments as required
// by the Amd64 ABI.
ctrlExprInfo->info.setSrcCandidates(this, allRegs(TYP_INT) & ~(RBM_ARG_REGS));
#endif // !FEATURE_VARARG
//------------------------------------------------------------------------
// BuildBlockStore: Set the NodeInfo for a block store.
//    blkNode - The block store node of interest
void LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
TreeNodeInfo* info = currentNodeInfo;
GenTree* dstAddr = blkNode->Addr();
unsigned size = blkNode->gtBlkSize;
GenTree* source = blkNode->Data();
LocationInfoListNode* dstAddrInfo = nullptr;
LocationInfoListNode* sourceInfo = nullptr;
LocationInfoListNode* sizeInfo = nullptr;
// Sources are dest address, initVal or source.
// We may require an additional source or temp register for the size.
if (!dstAddr->isContained())
dstAddrInfo = getLocationInfo(dstAddr);
assert(info->dstCount == 0);
info->setInternalCandidates(this, RBM_NONE);
GenTree* srcAddrOrFill = nullptr;
bool isInitBlk = blkNode->OperIsInitBlkOp();
regMaskTP dstAddrRegMask = RBM_NONE;
regMaskTP sourceRegMask = RBM_NONE;
regMaskTP blkSizeRegMask = RBM_NONE;
GenTree* initVal = source;
if (initVal->OperIsInitVal())
assert(initVal->isContained());
initVal = initVal->gtGetOp1();
srcAddrOrFill = initVal;
if (!initVal->isContained())
sourceInfo = getLocationInfo(initVal);
switch (blkNode->gtBlkOpKind)
case GenTreeBlk::BlkOpKindUnroll:
assert(initVal->IsCnsIntOrI());
if (size >= XMM_REGSIZE_BYTES)
// Reserve an XMM register to fill it with a pack of 16 init value constants.
info->internalFloatCount = 1;
info->setInternalCandidates(this, internalFloatRegCandidates());
// Filling with constants uses an XMM register; if AVX instructions may be used
// for codegen, set the ContainsAVX flag.
SetContainsAVXFlags();
if ((size & 1) != 0)
// On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
// a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
// when unrolling, so only allow byteable registers as the source value. (We could
// consider just using BlkOpKindRepInstr instead.)
sourceRegMask = RBM_BYTE_REGS;
#endif // _TARGET_X86_
case GenTreeBlk::BlkOpKindRepInstr:
// rep stos has the following register requirements:
// a) The memory address has to be in RDI.
// b) The fill value has to be in RAX.
// c) The buffer size will go in RCX.
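// (i.e. the generated sequence is of the form:
//      mov rdi, dstAddr / mov eax, fillValue / mov rcx, count / rep stos)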
dstAddrRegMask = RBM_RDI;
sourceRegMask = RBM_RAX;
blkSizeRegMask = RBM_RCX;
case GenTreeBlk::BlkOpKindHelper:
#ifdef _TARGET_AMD64_
// The helper follows the regular AMD64 ABI.
dstAddrRegMask = RBM_ARG_0;
sourceRegMask = RBM_ARG_1;
blkSizeRegMask = RBM_ARG_2;
#else // !_TARGET_AMD64_
dstAddrRegMask = RBM_RDI;
sourceRegMask = RBM_RAX;
blkSizeRegMask = RBM_RCX;
#endif // !_TARGET_AMD64_
// CopyObj or CopyBlk
if (source->gtOper == GT_IND)
assert(source->isContained());
srcAddrOrFill = source->gtGetOp1();
if (!srcAddrOrFill->isContained())
sourceInfo = getLocationInfo(srcAddrOrFill);
if (blkNode->OperGet() == GT_STORE_OBJ)
if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr)
// We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
blkSizeRegMask = RBM_RCX;
// The srcAddr must be in a register. If it was under a GT_IND, we need to subsume all of its
// sources.
sourceRegMask = RBM_RSI;
dstAddrRegMask = RBM_RDI;
switch (blkNode->gtBlkOpKind)
case GenTreeBlk::BlkOpKindUnroll:
// If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
//
// x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
// But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude
// RBM_NON_BYTE_REGS from internal candidates.
if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
info->internalIntCount++;
regMaskTP regMask = allRegs(TYP_INT);
if ((size & 1) != 0)
regMask &= ~RBM_NON_BYTE_REGS;
info->setInternalCandidates(this, regMask);
if (size >= XMM_REGSIZE_BYTES)
// If we have a buffer larger than XMM_REGSIZE_BYTES,
// reserve an XMM register to use it for a
// series of 16-byte loads and stores.
info->internalFloatCount = 1;
info->addInternalCandidates(this, internalFloatRegCandidates());
// This uses an XMM reg for the loads and stores, so check whether AVX
// instructions are used for codegen and, if so, set the ContainsAVX flag.
SetContainsAVXFlags();
case GenTreeBlk::BlkOpKindRepInstr:
// rep movs has the following register requirements:
// a) The dest address has to be in RDI.
// b) The src address has to be in RSI.
// c) The buffer size will go in RCX.
dstAddrRegMask = RBM_RDI;
sourceRegMask = RBM_RSI;
blkSizeRegMask = RBM_RCX;
case GenTreeBlk::BlkOpKindHelper:
#ifdef _TARGET_AMD64_
// The helper follows the regular AMD64 ABI.
dstAddrRegMask = RBM_ARG_0;
sourceRegMask = RBM_ARG_1;
blkSizeRegMask = RBM_ARG_2;
#else // !_TARGET_AMD64_
dstAddrRegMask = RBM_RDI;
sourceRegMask = RBM_RAX;
blkSizeRegMask = RBM_RCX;
#endif // !_TARGET_AMD64_
if (dstAddrInfo != nullptr)
if (dstAddrRegMask != RBM_NONE)
dstAddrInfo->info.setSrcCandidates(this, dstAddrRegMask);
useList.Append(dstAddrInfo);
if (sourceRegMask != RBM_NONE)
if (sourceInfo != nullptr)
sourceInfo->info.setSrcCandidates(this, sourceRegMask);
// This is a local source; we'll use a temp register for its address.
info->addInternalCandidates(this, sourceRegMask);
info->internalIntCount++;
if (sourceInfo != nullptr)
useList.Add(sourceInfo, blkNode->IsReverseOp());
if (blkNode->OperIs(GT_STORE_DYN_BLK))
// The block size argument is a third argument to GT_STORE_DYN_BLK
GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
sizeInfo = getLocationInfo(blockSize);
useList.Add(sizeInfo, blkNode->AsDynBlk()->gtEvalSizeFirst);
if (blkSizeRegMask != RBM_NONE)
// Reserve a temp register for the block size argument.
info->addInternalCandidates(this, blkSizeRegMask);
info->internalIntCount++;
// The block size argument is a third argument to GT_STORE_DYN_BLK
assert((blkNode->gtOper == GT_STORE_DYN_BLK) && (sizeInfo != nullptr));
info->setSrcCount(3);
sizeInfo->info.setSrcCandidates(this, blkSizeRegMask);
#ifdef FEATURE_PUT_STRUCT_ARG_STK
//------------------------------------------------------------------------
// BuildPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
//    tree - The node of interest
void LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
TreeNodeInfo* info = currentNodeInfo;
assert(info->dstCount == 0);
if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
putArgStk->gtOp1->SetContained();
unsigned fieldCount = 0;
bool needsByteTemp = false;
bool needsSimdTemp = false;
unsigned prevOffset = putArgStk->getArgSize();
for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
GenTree* const fieldNode = current->Current();
const var_types fieldType = fieldNode->TypeGet();
const unsigned fieldOffset = current->gtFieldOffset;
assert(fieldType != TYP_LONG);
#if defined(FEATURE_SIMD)
// Note that we need to check the GT_FIELD_LIST type, not 'fieldType'. This is because the
// GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where
// we "round up" to 16.
if (current->gtFieldType == TYP_SIMD12)
needsSimdTemp = true;
#endif // defined(FEATURE_SIMD)
// We can treat as a slot any field that is stored at a slot boundary, where the previous
// field is not in the same slot. (Note that we store the fields in reverse order.)
const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
if (varTypeIsByte(fieldType))
// If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
// (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
// need a byte-addressable register for the store. We will enforce this requirement on an internal
// register, which we can use to copy multiple byte values.
needsByteTemp = true;
if (varTypeIsGC(fieldType))
putArgStk->gtNumberReferenceSlots++;
prevOffset = fieldOffset;
if (!fieldNode->isContained())
appendLocationInfoToList(fieldNode);
if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push)
// If any of the fields cannot be stored with an actual push, we may need a temporary
// register to load the value before storing it to the stack location.
info->internalIntCount = 1;
regMaskTP regMask = allRegs(TYP_INT);
regMask &= ~RBM_NON_BYTE_REGS;
info->setInternalCandidates(this, regMask);
#if defined(FEATURE_SIMD)
// For PutArgStk of a TYP_SIMD12, we need a SIMD temp register.
assert(info->dstCount == 0);
info->internalFloatCount += 1;
info->addInternalCandidates(this, allSIMDRegs());
#endif // defined(FEATURE_SIMD)
#endif // _TARGET_X86_
GenTree* src = putArgStk->gtOp1;
var_types type = src->TypeGet();
#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
// For PutArgStk of a TYP_SIMD12, we need an extra register.
if (putArgStk->isSIMD12())
appendLocationInfoToList(putArgStk->gtOp1);
info->internalFloatCount = 1;
info->setInternalCandidates(this, allSIMDRegs());
#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
if (type != TYP_STRUCT)
BuildSimple(putArgStk);
GenTree* dst = putArgStk;
GenTree* srcAddr = nullptr;
info->srcCount = GetOperandInfo(src);
// If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
// Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
// our framework assemblies, so this is the main code generation scheme we'll use.
ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
switch (putArgStk->gtPutArgStkKind)
case GenTreePutArgStk::Kind::Push:
case GenTreePutArgStk::Kind::PushAllSlots:
case GenTreePutArgStk::Kind::Unroll:
// If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
//
// x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
// But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude
// RBM_NON_BYTE_REGS from internal candidates.
if ((putArgStk->gtNumberReferenceSlots == 0) && (size & (XMM_REGSIZE_BYTES - 1)) != 0)
info->internalIntCount++;
regMaskTP regMask = allRegs(TYP_INT);
if ((size % 2) != 0)
regMask &= ~RBM_NON_BYTE_REGS;
info->setInternalCandidates(this, regMask);
#else // !_TARGET_X86_
if (size >= XMM_REGSIZE_BYTES)
#endif // !_TARGET_X86_
// If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
// or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a
// series of 16-byte loads and stores.
info->internalFloatCount = 1;
info->addInternalCandidates(this, internalFloatRegCandidates());
SetContainsAVXFlags();
case GenTreePutArgStk::Kind::RepInstr:
info->internalIntCount += 3;
info->setInternalCandidates(this, (RBM_RDI | RBM_RCX | RBM_RSI));
#endif // FEATURE_PUT_STRUCT_ARG_STK
//------------------------------------------------------------------------
// BuildLclHeap: Set the NodeInfo for a GT_LCLHEAP.
//    tree - The node of interest
void LinearScan::BuildLclHeap(GenTree* tree)
TreeNodeInfo* info = currentNodeInfo;
assert(info->dstCount == 1);
// Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
// Here '-' means don't care.
//
//     Size?                    Init Memory?    # temp regs
//     0                        -               0 (returns 0)
//     const and <=6 reg words  -               0 (pushes '0')
//     const and >6 reg words   Yes             0 (pushes '0')
//     const and <PageSize      No              0 (amd64) 1 (x86)
//                                                (x86: tmpReg for subtracting from esp)
//     const and >=PageSize     No              2 (regCnt and tmpReg for subtracting from sp)
//     Non-const                Yes             0 (regCnt=targetReg and pushes '0')
//     Non-const                No              2 (regCnt and tmpReg for subtracting from sp)
//
// Note: Here we don't need the internal register to be different from targetReg.
// Rather, require it to be different from the operand's reg.
GenTree* size = tree->gtOp.gtOp1;
if (size->IsCnsIntOrI())
assert(size->isContained());
size_t sizeVal = size->gtIntCon.gtIconVal;
info->internalIntCount = 0;
// Compute the amount of memory to properly STACK_ALIGN.
// Note: The GenTree node is not updated here as it is cheap to recompute the stack-aligned size.
// This should also help in debugging as we can examine the original size specified with localloc.
sizeVal = AlignUp(sizeVal, STACK_ALIGN);
// For small allocations up to 6 pointer-sized words (i.e. 48 bytes of localloc)
// we will generate 'push 0'.
assert((sizeVal % REGSIZE_BYTES) == 0);
size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
if (cntRegSizedWords <= 6)
info->internalIntCount = 0;
else if (!compiler->info.compInitMem)
// No need to initialize the allocated stack space.
if (sizeVal < compiler->eeGetPageSize())
info->internalIntCount = 1; // x86 needs a register here to avoid generating "sub" on ESP.
#else // !_TARGET_X86_
info->internalIntCount = 0;
#endif // !_TARGET_X86_
// We need two registers: regCnt and RegTmp.
info->internalIntCount = 2;
// >6 reg words and we need to zero-initialize the allocated stack space.
info->internalIntCount = 0;
appendLocationInfoToList(size);
if (!compiler->info.compInitMem)
info->internalIntCount = 2;
info->internalIntCount = 0;
//------------------------------------------------------------------------
// BuildModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
//    tree - The node of interest
void LinearScan::BuildModDiv(GenTree* tree)
TreeNodeInfo* info = currentNodeInfo;
GenTree* op1 = tree->gtGetOp1();
GenTree* op2 = tree->gtGetOp2();
assert(info->dstCount == 1);
if (varTypeIsFloating(tree->TypeGet()))
info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
// The Amd64 div/idiv instructions take the dividend in RDX:RAX (high bits in RDX)
// and compute:
//     Quotient in RAX, Remainder in RDX
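// For example, a signed 64-bit divide is emitted as:
//   cqo        ; sign-extend RAX into RDX:RAX
//   idiv rcx   ; quotient -> RAX, remainder -> RDX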
if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
// We are interested in just the remainder.
// RAX is used as a trashable register during computation of the remainder.
info->setDstCandidates(this, RBM_RDX);
// We are interested in just the quotient.
// RDX gets used as a trashable register during computation of the quotient.
info->setDstCandidates(this, RBM_RAX);
if (op1->OperGet() == GT_LONG)
assert(op1->isContained());
// To avoid a reg move we would like to have op1's low part in RAX and its high part in RDX.
GenTree* loVal = op1->gtGetOp1();
GenTree* hiVal = op1->gtGetOp2();
assert(op2->IsCnsIntOrI());
assert(tree->OperGet() == GT_UMOD);
// This situation also requires an internal register.
info->internalIntCount = 1;
info->setInternalCandidates(this, allRegs(TYP_INT));
LocationInfoListNode* loValInfo = getLocationInfo(loVal);
LocationInfoListNode* hiValInfo = getLocationInfo(hiVal);
loValInfo->info.setSrcCandidates(this, RBM_EAX);
hiValInfo->info.setSrcCandidates(this, RBM_EDX);
useList.Append(loValInfo);
useList.Append(hiValInfo);
// If possible we would like to have op1 in RAX to avoid a register move.
LocationInfoListNode* op1Info = getLocationInfo(op1);
op1Info->info.setSrcCandidates(this, RBM_RAX);
useList.Append(op1Info);
LocationInfoListNode* op2Info;
info->srcCount += GetOperandInfo(op2, &op2Info);
for (; op2Info != nullptr; op2Info = op2Info->Next())
op2Info->info.setSrcCandidates(this, allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
//------------------------------------------------------------------------
// BuildIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
//    tree - The node of interest
void LinearScan::BuildIntrinsic(GenTree* tree)
TreeNodeInfo* info = currentNodeInfo;
// Both operand and its result must be of floating point type.
GenTree* op1 = tree->gtGetOp1();
assert(varTypeIsFloating(op1));
assert(op1->TypeGet() == tree->TypeGet());
info->srcCount = GetOperandInfo(op1);
assert(info->dstCount == 1);
switch (tree->gtIntrinsic.gtIntrinsicId)
case CORINFO_INTRINSIC_Sqrt:
case CORINFO_INTRINSIC_Abs:
// Abs(float x)  = x & 0x7fffffff
// Abs(double x) = x & 0x7fffffffffffffff
//
// In case of Abs we need an internal register to hold the mask.
//
// TODO-XArch-CQ: avoid using an internal register for the mask.
// Andps or andpd both will operate on 128-bit operands.
// The data section constant to hold the mask is a 64-bit size.
// Therefore, we need both the operand and the mask to be in an
// xmm register. When we add support in the emitter to emit 128-bit
// data constants and instructions that operate on 128-bit
// memory operands we can avoid the need for an internal register.
if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs)
info->internalFloatCount = 1;
info->setInternalCandidates(this, internalFloatRegCandidates());
case CORINFO_INTRINSIC_Cos:
case CORINFO_INTRINSIC_Sin:
NYI_X86("Math intrinsics Cos and Sin");
#endif // _TARGET_X86_
case CORINFO_INTRINSIC_Round:
case CORINFO_INTRINSIC_Ceiling:
case CORINFO_INTRINSIC_Floor:
#if defined(LEGACY_BACKEND)
NYI_X86("Math intrinsics Round, Ceiling, and Floor");
#endif // LEGACY_BACKEND
// Right now only Sqrt/Abs are treated as math intrinsics.
noway_assert(!"Unsupported math intrinsic");
//------------------------------------------------------------------------
// BuildSIMD: Set the NodeInfo for a GT_SIMD tree.
//    tree - The GT_SIMD node of interest
void LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
TreeNodeInfo* info = currentNodeInfo;
// Only SIMDIntrinsicInit can be contained. Other than that,
// only SIMDIntrinsicOpEquality and SIMDIntrinsicOpInEquality can have 0 dstCount.
if (simdTree->isContained())
assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicInit);
else if (info->dstCount != 1)
assert((simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ||
       (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality));
SetContainsAVXFlags(true, simdTree->gtSIMDSize);
GenTree* op1 = simdTree->gtOp.gtOp1;
GenTree* op2 = simdTree->gtOp.gtOp2;
if (!op1->OperIs(GT_LIST))
info->srcCount += GetOperandInfo(op1);
if ((op2 != nullptr) && !op2->isContained())
info->srcCount += GetOperandInfo(op2);
switch (simdTree->gtSIMDIntrinsicID)
case SIMDIntrinsicInit:
// This sets all fields of a SIMD struct to the given value.
// op1 will have been marked contained if it is either zero or an int constant of
// all 1's, or a float constant with a 16- or 32-byte simdType (AVX case).
//
// We should never see small int base type vectors except for zero initialization.
assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0));
#if !defined(_TARGET_64BIT_)
if (op1->OperGet() == GT_LONG)
assert(op1->isContained());
GenTree* op1lo = op1->gtGetOp1();
GenTree* op1hi = op1->gtGetOp2();
if (op1lo->isContained())
assert(op1hi->isContained());
assert((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
       (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)));
assert(info->srcCount == 0);
assert(info->srcCount == 2);
info->internalFloatCount = 1;
info->setInternalCandidates(this, allSIMDRegs());
info->isInternalRegDelayFree = true;
#endif // !defined(_TARGET_64BIT_)
case SIMDIntrinsicInitN:
var_types baseType = simdTree->gtSIMDBaseType;
info->srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(baseType));
for (GenTree* list = op1; list != nullptr; list = list->gtGetOp2())
assert(list->OperGet() == GT_LIST);
GenTree* listItem = list->gtGetOp1();
assert(listItem->TypeGet() == baseType);
assert(!listItem->isContained());
appendLocationInfoToList(listItem);
assert(initCount == info->srcCount);
// Need an internal register to stitch together all the values into a single vector in a SIMD reg.
info->internalFloatCount = 1;
info->setInternalCandidates(this, allSIMDRegs());
case SIMDIntrinsicInitArray:
// We have an array and an index, which may be contained.
assert(info->srcCount == (simdTree->gtGetOp2()->isContained() ? 1 : 2));
case SIMDIntrinsicDiv:
// SSE2 has no instruction support for division on integer vectors.
noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
assert(info->srcCount == 2);
case SIMDIntrinsicAbs:
// For float/double vectors this gets implemented as a bitwise-And operation
// with a mask, and hence we should never see it here.
// This must be a Vector<int>, Vector<short> or Vector<sbyte>.
assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT ||
       simdTree->gtSIMDBaseType == TYP_BYTE);
assert(compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
assert(info->srcCount == 1);
case SIMDIntrinsicSqrt:
// SSE2 has no instruction support for sqrt on integer vectors.
noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
assert(info->srcCount == 1);
case SIMDIntrinsicAdd:
case SIMDIntrinsicSub:
case SIMDIntrinsicMul:
case SIMDIntrinsicBitwiseAnd:
case SIMDIntrinsicBitwiseAndNot:
case SIMDIntrinsicBitwiseOr:
case SIMDIntrinsicBitwiseXor:
case SIMDIntrinsicMin:
case SIMDIntrinsicMax:
assert(info->srcCount == 2);
// SSE2 32-bit integer multiplication requires two temp regs.
if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
    compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
info->internalFloatCount = 2;
info->setInternalCandidates(this, allSIMDRegs());
case SIMDIntrinsicEqual:
assert(info->srcCount == 2);
// SSE2 doesn't support < and <= directly on int vectors.
// Instead we need to use > and >= with swapped operands.
case SIMDIntrinsicLessThan:
case SIMDIntrinsicLessThanOrEqual:
assert(info->srcCount == 2);
noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
// SIMDIntrinsicGreaterThan is supported only on non-floating point base type vectors.
// SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors.
// Instead we need to use < and <= with swapped operands.
case SIMDIntrinsicGreaterThan:
noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
assert(info->srcCount == 2);
case SIMDIntrinsicOpEquality:
case SIMDIntrinsicOpInEquality:
if (simdTree->gtGetOp2()->isContained())
// If the second operand is contained then ContainCheckSIMD has determined
// that PTEST can be used. We only need a single source register and no
// internal registers.
assert(info->srcCount == 1);
// Can't use PTEST so we need 2 source registers, 1 internal SIMD register
// (to hold the result of PCMPEQD or other similar SIMD compare instruction)
// and one internal INT register (to hold the result of PMOVMSKB).
assert(info->srcCount == 2);
info->internalFloatCount = 1;
info->setInternalCandidates(this, allSIMDRegs());
info->internalIntCount = 1;
info->addInternalCandidates(this, allRegs(TYP_INT));
// These SIMD nodes only set the condition flags.
case SIMDIntrinsicDotProduct:
// Float/Double vectors:
// For SSE, or AVX with 32-byte vectors, we also need an internal register
// as scratch. Further we need the targetReg and internal reg to be distinct
// registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
// don't need a tmpReg.
//
// 32-byte integer vector on SSE4/AVX:
// will take advantage of phaddd, which operates only on 128-bit xmm reg.
// This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
// registers since targetReg is an int type register.
//
// See genSIMDIntrinsicDotProduct() for details on the code sequence generated
// and the need for scratch registers.
if (varTypeIsFloating(simdTree->gtSIMDBaseType))
if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) ||
    (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32))
info->internalFloatCount = 1;
info->isInternalRegDelayFree = true;
info->setInternalCandidates(this, allSIMDRegs());
// else don't need scratch reg(s).
assert(simdTree->gtSIMDBaseType == TYP_INT && compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported);
// No need to set isInternalRegDelayFree since targetReg is an int type
// reg and is guaranteed to be different from the xmm/ymm registers.
info->internalFloatCount = (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) ? 2 : 1;
info->setInternalCandidates(this, allSIMDRegs());
assert(info->srcCount == 2);
case SIMDIntrinsicGetItem:
// This implements the get_Item method. The sources are:
// - the source SIMD struct
// - index (which element to get)
// The result is the baseType of the SIMD struct.
// op1 may be a contained memory op, but if so we will consume its address.
// op2 may be a contained constant.
op1 = simdTree->gtOp.gtOp1;
op2 = simdTree->gtOp.gtOp2;
if (!op1->isContained())
// If the index is not a constant, we will use the SIMD temp location to store the vector.
// Otherwise, if the baseType is floating point, the targetReg will be an xmm reg and we
// can use that in the process of extracting the element.
//
// If the index is a constant and the base type is a small int, we can use pextrw, but on AVX
// we will need a temp if we are indexing into the upper half of the AVX register.
// In all other cases with a constant index, we need a temp xmm register to extract the
// element if the index is other than zero.
if (!op2->IsCnsIntOrI())
(void)compiler->getSIMDInitTempVarNum();
else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
    (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported))
int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
needFloatTemp = (byteShiftCnt >= 16);
needFloatTemp = !op2->IsIntegralConst(0);
info->internalFloatCount = 1;
info->setInternalCandidates(this, allSIMDRegs());
        case SIMDIntrinsicSetX:
        case SIMDIntrinsicSetY:
        case SIMDIntrinsicSetZ:
        case SIMDIntrinsicSetW:
            assert(info->srcCount == 2);

            // We need an internal integer register for SSE2 codegen.
            if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
            {
                info->internalIntCount = 1;
                info->setInternalCandidates(this, allRegs(TYP_INT));
            }
            break;
        case SIMDIntrinsicCast:
            assert(info->srcCount == 1);
            break;
        case SIMDIntrinsicConvertToSingle:
            assert(info->srcCount == 1);
            if (simdTree->gtSIMDBaseType == TYP_UINT)
            {
                // We need an internal register different from targetReg.
                info->isInternalRegDelayFree = true;
                info->internalIntCount       = 1;
                info->internalFloatCount     = 2;
                info->setInternalCandidates(this, allSIMDRegs() | allRegs(TYP_INT));
            }
            break;
        case SIMDIntrinsicConvertToInt32:
            assert(info->srcCount == 1);
            break;
        case SIMDIntrinsicWidenLo:
        case SIMDIntrinsicWidenHi:
            assert(info->srcCount == 1);
            if (varTypeIsIntegral(simdTree->gtSIMDBaseType))
            {
                // We need an internal register different from targetReg.
                info->isInternalRegDelayFree = true;
                info->internalFloatCount     = 1;
                info->setInternalCandidates(this, allSIMDRegs());
            }
            break;
        case SIMDIntrinsicConvertToInt64:
            assert(info->srcCount == 1);
            // We need an internal register different from targetReg.
            info->isInternalRegDelayFree = true;
            info->internalIntCount       = 1;
            if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
            {
                info->internalFloatCount = 2;
            }
            else
            {
                info->internalFloatCount = 1;
            }
            info->setInternalCandidates(this, allSIMDRegs() | allRegs(TYP_INT));
            break;
        case SIMDIntrinsicConvertToDouble:
            assert(info->srcCount == 1);
            // We need an internal register different from targetReg.
            info->isInternalRegDelayFree = true;
            info->internalIntCount       = 1;

            if (simdTree->gtSIMDBaseType == TYP_LONG)
            {
                info->internalFloatCount = 3;
            }
            else if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) || (simdTree->gtSIMDBaseType == TYP_ULONG))
            {
                info->internalFloatCount = 2;
            }
            else
            {
                info->internalFloatCount = 1;
            }
            info->setInternalCandidates(this, allSIMDRegs() | allRegs(TYP_INT));
            break;
        case SIMDIntrinsicNarrow:
            assert(info->srcCount == 2);
            // We need an internal register different from targetReg.
            info->isInternalRegDelayFree = true;
            if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->gtSIMDBaseType != TYP_DOUBLE))
            {
                info->internalFloatCount = 2;
            }
            else
            {
                info->internalFloatCount = 1;
            }
            info->setInternalCandidates(this, allSIMDRegs());
            break;
        case SIMDIntrinsicShuffleSSE2:
            assert(info->srcCount == 1);
            // The second operand is an integer constant and marked as contained.
            assert(simdTree->gtOp.gtOp2->isContainedIntOrIImmed());
            break;
        case SIMDIntrinsicGetX:
        case SIMDIntrinsicGetY:
        case SIMDIntrinsicGetZ:
        case SIMDIntrinsicGetW:
        case SIMDIntrinsicGetOne:
        case SIMDIntrinsicGetZero:
        case SIMDIntrinsicGetCount:
        case SIMDIntrinsicGetAllOnes:
            assert(!"Get intrinsics should not be seen during Lowering.");
            unreached();

        default:
            noway_assert(!"Unimplemented SIMD node type.");
            unreached();
    }
}
#endif // FEATURE_SIMD
#ifdef FEATURE_HW_INTRINSICS
//------------------------------------------------------------------------
// BuildHWIntrinsic: Set the NodeInfo for a GT_HWIntrinsic tree.
//
// Arguments:
//    tree - The GT_HWIntrinsic node of interest
//
// Return Value:
//    None.
//
void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
{
    TreeNodeInfo*       info        = currentNodeInfo;
    NamedIntrinsic      intrinsicID = intrinsicTree->gtHWIntrinsicId;
    var_types           baseType    = intrinsicTree->gtSIMDBaseType;
    InstructionSet      isa         = Compiler::isaOfHWIntrinsic(intrinsicID);
    HWIntrinsicCategory category    = Compiler::categoryOfHWIntrinsic(intrinsicID);
    HWIntrinsicFlag     flags       = Compiler::flagsOfHWIntrinsic(intrinsicID);
    int                 numArgs     = Compiler::numArgsOfHWIntrinsic(intrinsicTree);

    if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2)
    {
        SetContainsAVXFlags(true, 32);
    }

    GenTree* op1   = intrinsicTree->gtOp.gtOp1;
    GenTree* op2   = intrinsicTree->gtOp.gtOp2;
    info->srcCount = 0;

    if (op1 != nullptr)
    {
        if (op1->OperIsList())
        {
            for (GenTreeArgList* list = op1->AsArgList(); list != nullptr; list = list->Rest())
            {
                info->srcCount += GetOperandInfo(list->Current());
            }
        }
        else
        {
            info->srcCount += GetOperandInfo(op1);
        }
    }

    if (op2 != nullptr)
    {
        info->srcCount += GetOperandInfo(op2);
    }
    if ((category == HW_Category_IMM) && ((flags & HW_Flag_NoJmpTableIMM) == 0))
    {
        GenTree* lastOp = Compiler::lastOpOfHWIntrinsic(intrinsicTree, numArgs);
        assert(lastOp != nullptr);

        if (Compiler::isImmHWIntrinsic(intrinsicID, lastOp) && !lastOp->isContainedIntOrIImmed())
        {
            assert(!lastOp->IsCnsIntOrI());

            // We need two extra registers when lastOp isn't a constant, so that the
            // offset into the jump table for the fallback path can be computed.
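            // For example (illustrative): Sse2.ShiftLeftLogical(v, n) with a
            // non-constant 'n' cannot encode 'n' as an immediate, so codegen falls
            // back to a jump table over the possible immediate values; the two
            // internal int registers hold the table base and the computed offset.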
            info->internalIntCount = 2;
            info->setInternalCandidates(this, allRegs(TYP_INT));
        }
    }
    // Check for "srcCount >= 2" to match against 3+ operand nodes where one is constant
    if ((op2 == nullptr) && (info->srcCount >= 2) && intrinsicTree->isRMWHWIntrinsic(compiler))
    {
        // TODO-XArch-CQ: This is currently done in order to handle intrinsics which have more than
        // two arguments but which still have RMW semantics (such as NI_SSE41_Insert). We should make
        // this handling more general and move it back out to LinearScan::BuildNode.
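        // For example (an illustrative sketch): NI_SSE41_Insert(op1, op2, imm) is emitted as
        //   movaps   targetReg, op1Reg
        //   insertps targetReg, op2Reg, imm
        // so op2 must not be assigned the same register as targetReg; marking it
        // delay-free keeps it alive until the result is defined.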
        assert(numArgs > 2);
        LocationInfoListNode* op2Info = useList.Begin()->Next();
        op2Info->info.isDelayFree     = true;
        info->hasDelayFreeSrc         = true;
    }
    switch (intrinsicID)
    {
        case NI_SSE_CompareEqualOrderedScalar:
        case NI_SSE_CompareEqualUnorderedScalar:
        case NI_SSE_CompareNotEqualOrderedScalar:
        case NI_SSE_CompareNotEqualUnorderedScalar:
        case NI_SSE2_CompareEqualOrderedScalar:
        case NI_SSE2_CompareEqualUnorderedScalar:
        case NI_SSE2_CompareNotEqualOrderedScalar:
        case NI_SSE2_CompareNotEqualUnorderedScalar:
            info->internalIntCount = 1;
            info->setInternalCandidates(this, RBM_BYTE_REGS);
            info->isInternalRegDelayFree = true;
            break;
        case NI_SSE_SetScalarVector128:
        case NI_SSE2_SetScalarVector128:
            // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
            info->internalFloatCount = 1;
            info->setInternalCandidates(this, allSIMDRegs());
            info->isInternalRegDelayFree = true;
            break;
        case NI_SSE_ConvertToSingle:
        case NI_SSE_StaticCast:
        case NI_SSE2_ConvertToDouble:
        case NI_AVX_ExtendToVector256:
        case NI_AVX_GetLowerHalf:
        case NI_AVX_StaticCast:
        {
            assert(info->srcCount == 1);
            assert(info->dstCount == 1);
            useList.Last()->info.isTgtPref = true;
            break;
        }
        case NI_AVX_SetAllVector256:
        {
            if (varTypeIsIntegral(baseType))
            {
                info->internalFloatCount = 1;
                if (!compiler->compSupports(InstructionSet_AVX2) && varTypeIsByte(baseType))
                {
                    info->internalFloatCount += 1;
                }
                info->setInternalCandidates(this, allSIMDRegs());
            }
            break;
        }
        case NI_SSE2_MaskMove:
        {
            // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI.
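            // (The underlying maskmovdqu instruction writes its masked bytes to the
            // address implicitly taken from DI/EDI/RDI, which is why op3 is forced
            // into RBM_EDI here.)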
            LocationInfoListNode* op3Info = useList.Begin()->Next()->Next();
            op3Info->info.setSrcCandidates(this, RBM_EDI);
            break;
        }
        case NI_SSE41_BlendVariable:
            if (!compiler->canUseVexEncoding())
            {
                // SSE4.1 blendv* hardcodes the mask vector (op3) in XMM0.
                LocationInfoListNode* op2Info = useList.Begin()->Next();
                LocationInfoListNode* op3Info = op2Info->Next();
                op2Info->info.isDelayFree     = true;
                op3Info->info.isDelayFree     = true;
                op3Info->info.setSrcCandidates(this, RBM_XMM0);
                info->hasDelayFreeSrc = true;
            }
            break;
        case NI_SSE41_TestAllOnes:
        {
            info->internalFloatCount = 1;
            info->setInternalCandidates(this, allSIMDRegs());
            break;
        }
        case NI_SSE41_Extract:
            if (baseType == TYP_FLOAT)
            {
                info->internalIntCount += 1;
            }
#ifdef _TARGET_X86_
            else if (varTypeIsByte(baseType))
            {
                info->setDstCandidates(this, RBM_BYTE_REGS);
            }
#endif // _TARGET_X86_
            break;

#ifdef _TARGET_X86_
        case NI_SSE42_Crc32:
        {
            // CRC32 may operate over "byte", but on x86 only RBM_BYTE_REGS can be used as byte registers.
            //
            // TODO - currently we use the BaseType to bring the type of the second argument
            // to the code generator. We may encode the overload info in another way.
            var_types srcType = intrinsicTree->gtSIMDBaseType;
            if (varTypeIsByte(srcType))
            {
                LocationInfoListNode* op2Info = useList.GetSecond(INDEBUG(intrinsicTree->gtGetOp2()));
                op2Info->info.setSrcCandidates(this, RBM_BYTE_REGS);
            }
            break;
        }
#endif // _TARGET_X86_
        default:
            assert((intrinsicID > NI_HW_INTRINSIC_START) && (intrinsicID < NI_HW_INTRINSIC_END));
            break;
    }
}
#endif // FEATURE_HW_INTRINSICS
//------------------------------------------------------------------------
// BuildCast: Set the NodeInfo for a GT_CAST.
//
// Arguments:
//    tree - The node of interest
//
// Return Value:
//    None.
//
void LinearScan::BuildCast(GenTree* tree)
{
    TreeNodeInfo* info = currentNodeInfo;
    // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register.
    //         see CodeGen::genIntToIntCast()

    // Non-overflow casts to/from float/double are done using SSE2 instructions,
    // which allow the source operand to be either a reg or a memop. Given the
    // fact that casts from small int to float/double are done as two-level casts,
    // the source operand is always guaranteed to be of size 4 or 8 bytes.
    var_types castToType = tree->CastToType();
    GenTree*  castOp     = tree->gtCast.CastOp();
    var_types castOpType = castOp->TypeGet();

    info->srcCount = GetOperandInfo(castOp);
    assert(info->dstCount == 1);
    if (tree->gtFlags & GTF_UNSIGNED)
    {
        castOpType = genUnsignedType(castOpType);
    }
    // Some overflow checks need a temp reg:
    // - GT_CAST from INT64/UINT64 to UINT32
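    // For example (an illustrative sketch of the overflow path), casting a 64-bit
    // value down to UINT32 checks that the source fits in 32 bits, roughly:
    //   mov tmpReg, 0xFFFFFFFF
    //   cmp srcReg, tmpReg
    //   ja  <throw overflow>
    // where tmpReg is the internal register requested below.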
    if (tree->gtOverflow() && (castToType == TYP_UINT))
    {
        if (genTypeSize(castOpType) == 8)
        {
            // Here we don't need the internal register to be different from targetReg;
            // rather, it must be different from the operand's reg.
            info->internalIntCount = 1;
        }
    }
}
//-----------------------------------------------------------------------------------------
// BuildIndir: Specify register requirements for the address expression of an indirection operation.
//
// Arguments:
//    indirTree - GT_IND or GT_STOREIND gentree node
//
void LinearScan::BuildIndir(GenTreeIndir* indirTree)
{
    TreeNodeInfo* info = currentNodeInfo;
    // If this is the rhs of a block copy (i.e. a non-enregisterable struct),
    // it has no register requirements.
    if (indirTree->TypeGet() == TYP_STRUCT)
    {
        return;
    }
    int indirSrcCount = GetIndirInfo(indirTree);
    if (indirTree->gtOper == GT_STOREIND)
    {
        GenTree* source = indirTree->gtOp.gtOp2;
        if (indirTree->AsStoreInd()->IsRMWMemoryOp())
        {
            // Because 'source' is contained, we haven't yet determined its special register requirements, if any.
            // As it happens, the Shift or Rotate cases are the only ones with special requirements.
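            // For example (illustrative): a read-modify-write tree such as
            //   GT_STOREIND(addr, GT_ADD(GT_IND(addr), x))
            // is emitted as a single "add [addr], x", so the contained source is
            // consumed as part of the store itself rather than producing a value.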
            assert(source->isContained() && source->OperIsRMWMemOp());
            GenTree*      nonMemSource = nullptr;
            GenTreeIndir* otherIndir   = nullptr;

            if (source->OperIsShiftOrRotate())
            {
                info->srcCount += BuildShiftRotate(source);
            }
            else
            {
                info->srcCount += appendBinaryLocationInfoToList(source->AsOp());
            }
            if (indirTree->AsStoreInd()->IsRMWDstOp1())
            {
                otherIndir = source->gtGetOp1()->AsIndir();
                if (source->OperIsBinary())
                {
                    nonMemSource = source->gtOp.gtOp2;
                }
            }
            else if (indirTree->AsStoreInd()->IsRMWDstOp2())
            {
                otherIndir   = source->gtGetOp2()->AsIndir();
                nonMemSource = source->gtOp.gtOp1;
            }
            if (otherIndir != nullptr)
            {
                // Any lclVars in the addressing mode of this indirection are contained.
                // If they are marked as lastUse, transfer the last use flag to the store indir.
                GenTree* base    = otherIndir->Base();
                GenTree* dstBase = indirTree->Base();
                CheckAndMoveRMWLastUse(base, dstBase);
                GenTree* index    = otherIndir->Index();
                GenTree* dstIndex = indirTree->Index();
                CheckAndMoveRMWLastUse(index, dstIndex);
            }
            if (nonMemSource != nullptr)
            {
                assert(!nonMemSource->isContained() || (!nonMemSource->isMemoryOp() && !nonMemSource->IsLocal()));
#ifdef _TARGET_X86_
                if (varTypeIsByte(indirTree) && !nonMemSource->isContained())
                {
                    // If the storeInd is of TYP_BYTE, restrict the source to byteable registers.
                    TreeNodeInfo& nonMemSourceInfo = useList.GetTreeNodeInfo(nonMemSource);
                    regMaskTP     regMask          = nonMemSourceInfo.getSrcCandidates(this);
                    regMask &= ~RBM_NON_BYTE_REGS;
                    assert(regMask != RBM_NONE);
                    nonMemSourceInfo.setSrcCandidates(this, regMask);
                }
#endif // _TARGET_X86_
            }
        }
        else
        {
#ifdef _TARGET_X86_
            if (varTypeIsByte(indirTree) && !source->isContained())
            {
                // If the storeInd is of TYP_BYTE, restrict the source to byteable registers.
                LocationInfoListNode* sourceInfo = getLocationInfo(source);
                regMaskTP             regMask    = sourceInfo->info.getSrcCandidates(this);
                regMask &= ~RBM_NON_BYTE_REGS;
                assert(regMask != RBM_NONE);
                sourceInfo->info.setSrcCandidates(this, regMask);
                useList.Append(sourceInfo);
            }
            else
#endif // _TARGET_X86_
            {
                info->srcCount += GetOperandInfo(source);
            }
        }
    }
    info->srcCount += indirSrcCount;
#ifdef FEATURE_SIMD
    if (indirTree->TypeGet() == TYP_SIMD12)
    {
        // If indirTree is of TYP_SIMD12, addr is not contained. See comment in LowerIndir().
        assert(!indirTree->Addr()->isContained());

        // Vector3 is read/written as two reads/writes: 8 bytes and 4 bytes.
        // To assemble the vector properly we would need an additional
        // XMM register.
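        // For example (an illustrative sketch), a TYP_SIMD12 load comes out roughly as:
        //   movsd xmm0, qword ptr [addr]       ; lower 8 bytes
        //   movss xmm1, dword ptr [addr+8]     ; upper 4 bytes, in the internal register
        // followed by combining the two halves into the target register.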
        info->internalFloatCount = 1;

        // In the case of GT_IND we need an internal register different from targetReg,
        // since both of the registers are in use at the same time.
        if (indirTree->OperGet() == GT_IND)
        {
            info->isInternalRegDelayFree = true;
        }

        info->setInternalCandidates(this, allSIMDRegs());
    }
#endif // FEATURE_SIMD

    assert(indirTree->Addr()->gtOper != GT_ARR_ELEM);
}
//------------------------------------------------------------------------
// BuildMul: Set the NodeInfo for a multiply.
//
// Arguments:
//    tree - The node of interest
//
// Return Value:
//    None.
//
void LinearScan::BuildMul(GenTree* tree)
{
    TreeNodeInfo* info = currentNodeInfo;
#if defined(_TARGET_X86_)
    assert(tree->OperIs(GT_MUL, GT_MULHI, GT_MUL_LONG));
#else
    assert(tree->OperIs(GT_MUL, GT_MULHI));
#endif
    GenTree* op1   = tree->gtOp.gtOp1;
    GenTree* op2   = tree->gtOp.gtOp2;
    info->srcCount = appendBinaryLocationInfoToList(tree->AsOp());
    assert(info->dstCount == 1);

    // Case of float/double mul.
    if (varTypeIsFloating(tree->TypeGet()))
    {
        return;
    }
    bool isUnsignedMultiply    = ((tree->gtFlags & GTF_UNSIGNED) != 0);
    bool requiresOverflowCheck = tree->gtOverflowEx();

    // There are three forms of x86 multiply:
    //   one-op form:   RDX:RAX = RAX * r/m
    //   two-op form:   reg *= r/m
    //   three-op form: reg = r/m * imm
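    // For example (illustrative):
    //   mul  ecx            ; one-op form:   EDX:EAX = EAX * ECX
    //   imul eax, ecx       ; two-op form:   EAX = EAX * ECX
    //   imul eax, ecx, 12   ; three-op form: EAX = ECX * 12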
    // This special widening 32x32->64 MUL is not used on x64
    CLANG_FORMAT_COMMENT_ANCHOR;
#if defined(_TARGET_X86_)
    if (tree->OperGet() != GT_MUL_LONG)
#endif
    {
        assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
    }

    // We do use the widening multiply to implement
    // the overflow checking for unsigned multiply.
    if (isUnsignedMultiply && requiresOverflowCheck)
    {
        // The only encoding provided is RDX:RAX = RAX * rm.
        //
        // Here we set RAX as the only destination candidate.
        // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX.
        info->setDstCandidates(this, RBM_RAX);
    }
    else if (tree->OperGet() == GT_MULHI)
    {
        // Have to use the encoding: RDX:RAX = RAX * rm. Since we only care about the
        // upper 32 bits of the result, set the destination candidate to REG_RDX.
        info->setDstCandidates(this, RBM_RDX);
    }
#if defined(_TARGET_X86_)
    else if (tree->OperGet() == GT_MUL_LONG)
    {
        // Have to use the encoding: RDX:RAX = RAX * rm.
        info->setDstCandidates(this, RBM_RAX);
    }
#endif
    GenTree* containedMemOp = nullptr;
    if (op1->isContained() && !op1->IsCnsIntOrI())
    {
        assert(!op2->isContained() || op2->IsCnsIntOrI());
        containedMemOp = op1;
    }
    else if (op2->isContained() && !op2->IsCnsIntOrI())
    {
        containedMemOp = op2;
    }
    if ((containedMemOp != nullptr) && CheckAndSetDelayFree(containedMemOp))
    {
        info->hasDelayFreeSrc = true;
    }
}
//------------------------------------------------------------------------------
// SetContainsAVXFlags: Set the ContainsAVX flag when the type is a floating point
// type, and also set the Contains256bitAVX flag when the SIMD vector size is 32 bytes.
//
// Arguments:
//    isFloatingPointType - true if it is a floating point type
//    sizeOfSIMDVector    - the SIMD vector size
//
void LinearScan::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
{
    if (isFloatingPointType && compiler->canUseVexEncoding())
    {
        compiler->getEmitter()->SetContainsAVX(true);
        if (sizeOfSIMDVector == 32)
        {
            compiler->getEmitter()->SetContains256bitAVX(true);
        }
    }
}
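// Note: these emitter flags record whether AVX (and specifically 256-bit AVX) code
// is being emitted; among other things, the emitter uses them to decide where a
// vzeroupper is needed to avoid AVX-SSE transition penalties.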
#ifdef _TARGET_X86_
//------------------------------------------------------------------------
// ExcludeNonByteableRegisters: Determines whether we need to exclude
// non-byteable registers for the given node.
//
// Arguments:
//    tree - The node of interest
//
// Return Value:
//    true if we need to exclude non-byteable registers; false otherwise.
//
bool LinearScan::ExcludeNonByteableRegisters(GenTree* tree)
{
    // Example1: GT_STOREIND(byte, addr, op2) - a storeind of a byte-sized value from op2 into mem 'addr'.
    // The storeind itself will not produce any value and hence dstCount=0. But op2 could be a TYP_INT
    // value. In this case we need to exclude esi/edi from the src candidates of op2.
    if (varTypeIsByte(tree))
    {
        return true;
    }
    // Example2: GT_CAST(int <- bool <- int) - here the type of the GT_CAST node is int and castToType is bool.
    else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType()))
    {
        return true;
    }
    else if (tree->OperIsCompare() || tree->OperIs(GT_CMP))
    {
        GenTree* op1 = tree->gtGetOp1();
        GenTree* op2 = tree->gtGetOp2();

        // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
        // ubyte as the result of the comparison and if the result needs to be materialized into a reg,
        // it simply zero extends it to TYP_INT size. Here is an example of the generated code:
        //     cmp dl, byte ptr[addr mode]
        //     movzx edx, dl
        if (varTypeIsByte(op1) && varTypeIsByte(op2))
        {
            return true;
        }
        // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses
        // ubyte as the result of the comparison and if the result needs to be materialized into a reg,
        // it simply zero extends it to TYP_INT size.
        else if (varTypeIsByte(op1) && op2->IsCnsIntOrI())
        {
            return true;
        }
        // Example5: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses
        // ubyte as the result of the comparison and if the result needs to be materialized into a reg,
        // it simply zero extends it to TYP_INT size.
        else if (op1->IsCnsIntOrI() && varTypeIsByte(op2))
        {
            return true;
        }
        else
        {
            return false;
        }
    }
#ifdef FEATURE_SIMD
    else if (tree->OperGet() == GT_SIMD)
    {
        GenTreeSIMD* simdNode = tree->AsSIMD();
        switch (simdNode->gtSIMDIntrinsicID)
        {
            case SIMDIntrinsicOpEquality:
            case SIMDIntrinsicOpInEquality:
                // We manifest it into a byte register, so the target must be byteable.
                return true;
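                // (The boolean result is typically materialized as "sete al; movzx eax, al",
                // and setcc can only write to a byteable register on x86.)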
            case SIMDIntrinsicGetItem:
            {
                // This logic is duplicated from genSIMDIntrinsicGetItem().
                // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
                // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
                // cases will require this, so the non-byteable registers can be excluded.

                GenTree*  op1      = simdNode->gtGetOp1();
                GenTree*  op2      = simdNode->gtGetOp2();
                var_types baseType = simdNode->gtSIMDBaseType;
                if (!isContainableMemoryOp(op1) && op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
                {
                    bool     ZeroOrSignExtnReqd = true;
                    unsigned baseSize           = genTypeSize(baseType);
                    if (baseSize == 1)
                    {
                        if ((op2->gtIntCon.gtIconVal % 2) == 1)
                        {
                            ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
                        }
                    }
                    else
                    {
                        assert(baseSize == 2);
                        ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
                    }
                    return ZeroOrSignExtnReqd;
                }
                break;
            }
            default:
                break;
        }
    }
#endif // FEATURE_SIMD

    return false;
}
#endif // _TARGET_X86_

#endif // _TARGET_XARCH_

#endif // !LEGACY_BACKEND