1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
8 XX Lowering for AMD64 XX
10 XX This encapsulates all the logic for lowering trees for the AMD64 XX
11 XX architecture. For a more detailed view of what is lowering, please XX
12 XX take a look at Lower.cpp XX
15 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
16 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
24 #ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator
29 #include "sideeffects.h"
32 // xarch supports both ROL and ROR instructions so no lowering is required.
33 void Lowering::LowerRotate(GenTreePtr tree)
37 //------------------------------------------------------------------------
38 // LowerStoreLoc: Lower a store of a lclVar
41 // storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR)
45 // - Setting the appropriate candidates for a store of a multi-reg call return value.
46 // - Requesting an internal register for SIMD12 stores.
47 // - Handling of contained immediates and widening operations of unsigneds.
49 void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
51 TreeNodeInfo* info = &(storeLoc->gtLsraInfo);
53 // Is this the case of var = call where call is returning
54 // a value in multiple return registers?
55 GenTree* op1 = storeLoc->gtGetOp1();
56 if (op1->IsMultiRegCall())
58 // backend expects to see this case only for store lclvar.
59 assert(storeLoc->OperGet() == GT_STORE_LCL_VAR);
61 // srcCount = number of registers in which the value is returned by call
62 GenTreeCall* call = op1->AsCall();
63 ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
64 info->srcCount = retTypeDesc->GetReturnRegCount();
66 // Call node srcCandidates = Bitwise-OR(allregs(GetReturnRegType(i))) for all i=0..RetRegCount-1
67 regMaskTP srcCandidates = m_lsra->allMultiRegCallNodeRegs(call);
68 op1->gtLsraInfo.setSrcCandidates(m_lsra, srcCandidates);
73 if (varTypeIsSIMD(storeLoc))
75 if (op1->IsCnsIntOrI())
78 MakeSrcContained(storeLoc, op1);
80 else if ((storeLoc->TypeGet() == TYP_SIMD12) && (storeLoc->OperGet() == GT_STORE_LCL_FLD))
82 // Need an additional register to extract upper 4 bytes of Vector3.
83 info->internalFloatCount = 1;
84 info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
86 // In this case don't mark the operand as contained as we want it to
87 // be evaluated into an xmm register
91 #endif // FEATURE_SIMD
93 // If the source is a containable immediate, make it contained, unless it is
94 // an int-size or larger store of zero to memory, because we can generate smaller code
95 // by zeroing a register and then storing it.
96 if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(storeLoc)))
98 MakeSrcContained(storeLoc, op1);
101 // Try to widen the ops if they are going into a local var.
102 if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (storeLoc->gtOp1->gtOper == GT_CNS_INT))
104 GenTreeIntCon* con = storeLoc->gtOp1->AsIntCon();
105 ssize_t ival = con->gtIconVal;
107 unsigned varNum = storeLoc->gtLclNum;
108 LclVarDsc* varDsc = comp->lvaTable + varNum;
110 if (varDsc->lvIsSIMDType())
112 noway_assert(storeLoc->gtType != TYP_STRUCT);
114 unsigned size = genTypeSize(storeLoc);
115 // If we are storing a constant into a local variable
116 // we extend the size of the store here
117 if ((size < 4) && !varTypeIsStruct(varDsc))
119 if (!varTypeIsUnsigned(varDsc))
121 if (genTypeSize(storeLoc) == 1)
123 if ((ival & 0x7f) != ival)
125 ival = ival | 0xffffff00;
130 assert(genTypeSize(storeLoc) == 2);
131 if ((ival & 0x7fff) != ival)
133 ival = ival | 0xffff0000;
138 // A local stack slot is at least 4 bytes in size, regardless of
139 // what the local var is typed as, so auto-promote it here
140 // unless it is a field of a promoted struct
141 // TODO-XArch-CQ: if the field is promoted shouldn't we also be able to do this?
142 if (!varDsc->lvIsStructField)
144 storeLoc->gtType = TYP_INT;
145 con->SetIconValue(ival);
152 * Takes care of annotating the register requirements
153 * for every TreeNodeInfo struct that maps to each tree node.
155 * LSRA Has been initialized and there is a TreeNodeInfo node
156 * already allocated and initialized for every tree in the IR.
158 * Every TreeNodeInfo instance has the right annotations on register
159 * requirements needed by LSRA to build the Interval Table (source,
160 * destination and internal [temp] register counts).
161 * This code is refactored originally from LSRA.
163 void Lowering::TreeNodeInfoInit(GenTree* tree)
165 LinearScan* l = m_lsra;
166 Compiler* compiler = comp;
168 TreeNodeInfo* info = &(tree->gtLsraInfo);
170 switch (tree->OperGet())
176 TreeNodeInfoInitSimple(tree);
184 // Need an additional register to read upper 4 bytes of Vector3.
185 if (tree->TypeGet() == TYP_SIMD12)
187 // We need an internal register different from targetReg in which 'tree' produces its result
188 // because both targetReg and internal reg will be in use at the same time.
189 info->internalFloatCount = 1;
190 info->isInternalRegDelayFree = true;
191 info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
196 case GT_STORE_LCL_FLD:
197 case GT_STORE_LCL_VAR:
199 if (tree->gtGetOp1()->OperGet() == GT_LONG)
204 #endif // _TARGET_X86_
209 LowerStoreLoc(tree->AsLclVarCommon());
213 noway_assert(!"box should not exist here");
214 // The result of 'op1' is also the final result
226 GenTreePtr firstOperand;
227 GenTreePtr secondOperand;
228 if (tree->gtFlags & GTF_REVERSE_OPS)
230 firstOperand = tree->gtOp.gtOp2;
231 secondOperand = tree->gtOp.gtOp1;
235 firstOperand = tree->gtOp.gtOp1;
236 secondOperand = tree->gtOp.gtOp2;
238 if (firstOperand->TypeGet() != TYP_VOID)
240 firstOperand->gtLsraInfo.isLocalDefUse = true;
241 firstOperand->gtLsraInfo.dstCount = 0;
243 if (tree->TypeGet() == TYP_VOID && secondOperand->TypeGet() != TYP_VOID)
245 secondOperand->gtLsraInfo.isLocalDefUse = true;
246 secondOperand->gtLsraInfo.dstCount = 0;
268 #if !defined(_TARGET_64BIT_)
271 if ((tree->gtLIRFlags & LIR::Flags::IsUnusedValue) != 0)
273 // An unused GT_LONG node needs to consume its sources.
285 #endif // !defined(_TARGET_64BIT_)
295 TreeNodeInfoInitReturn(tree);
299 if (tree->TypeGet() == TYP_VOID)
306 assert(tree->TypeGet() == TYP_INT);
311 info->setSrcCandidates(l, RBM_INTRET);
312 tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, RBM_INTRET);
316 // A GT_NOP is either a passthrough (if it is void, or if it has
317 // a child), but must be considered to produce a dummy value if it
318 // has a type but no child
321 if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
336 GenTree* cmp = tree->gtGetOp1();
337 l->clearDstCount(cmp);
340 // Say we have the following IR
341 // simdCompareResult = GT_SIMD((In)Equality, v1, v2)
342 // integerCompareResult = GT_EQ/NE(simdCompareResult, true/false)
343 // GT_JTRUE(integerCompareResult)
345 // In this case we don't need to generate code fo GT_EQ_/NE, since SIMD (In)Equality
346 // intrinsic would set or clear Zero flag.
348 genTreeOps cmpOper = cmp->OperGet();
349 if (cmpOper == GT_EQ || cmpOper == GT_NE)
351 GenTree* cmpOp1 = cmp->gtGetOp1();
352 GenTree* cmpOp2 = cmp->gtGetOp2();
354 if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1)))
356 // clear dstCount on SIMD node to indicate that
357 // result doesn't need to be materialized into a register.
358 l->clearOperandCounts(cmp);
359 l->clearDstCount(cmpOp1);
360 l->clearOperandCounts(cmpOp2);
362 // Codegen of SIMD (in)Equality uses target integer reg
363 // only for setting flags. Target reg is not needed on AVX
364 // when comparing against Vector Zero. In all other cases
365 // we need to reserve an int type internal register, since we
366 // have cleared dstCount.
367 if (compiler->canUseAVX() && cmpOp1->gtGetOp2()->IsIntegralConstVector(0))
369 // We don't need an internal register,since we use vptest
370 // for setting flags.
374 ++(cmpOp1->gtLsraInfo.internalIntCount);
375 regMaskTP internalCandidates = cmpOp1->gtLsraInfo.getInternalCandidates(l);
376 internalCandidates |= l->allRegs(TYP_INT);
377 cmpOp1->gtLsraInfo.setInternalCandidates(l, internalCandidates);
380 // We would have to reverse compare oper in the following cases:
381 // 1) SIMD Equality: Sets Zero flag on equal otherwise clears it.
382 // Therefore, if compare oper is == or != against false(0), we will
383 // be checking opposite of what is required.
385 // 2) SIMD inEquality: Clears Zero flag on true otherwise sets it.
386 // Therefore, if compare oper is == or != against true(1), we will
387 // be checking opposite of what is required.
388 GenTreeSIMD* simdNode = cmpOp1->AsSIMD();
389 if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality)
391 if (cmpOp2->IsIntegralConst(0))
393 cmp->SetOper(GenTree::ReverseRelop(cmpOper));
398 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality);
399 if (cmpOp2->IsIntegralConst(1))
401 cmp->SetOper(GenTree::ReverseRelop(cmpOper));
406 #endif // FEATURE_SIMD
421 // This should never occur since switch nodes must not be visible at this
424 info->dstCount = 0; // To avoid getting uninit errors.
425 noway_assert(!"Switch must be lowered at this point");
433 case GT_SWITCH_TABLE:
435 info->internalIntCount = 1;
442 noway_assert(!"We should never hit any assignment operator in lowering");
447 #if !defined(_TARGET_64BIT_)
455 // SSE2 arithmetic instructions doesn't support the form "op mem, xmm".
456 // Rather they only support "op xmm, mem/xmm" form.
457 if (varTypeIsFloating(tree->TypeGet()))
459 // overflow operations aren't supported on float/double types.
460 assert(!tree->gtOverflow());
462 op1 = tree->gtGetOp1();
463 op2 = tree->gtGetOp2();
465 // No implicit conversions at this stage as the expectation is that
466 // everything is made explicit by adding casts.
467 assert(op1->TypeGet() == op2->TypeGet());
472 if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
474 MakeSrcContained(tree, op2);
476 else if (tree->OperIsCommutative() &&
477 (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1))))
479 // Though we have GT_ADD(op1=memOp, op2=non-memOp, we try to reorder the operands
480 // as long as it is safe so that the following efficient code sequence is generated:
481 // addss/sd targetReg, memOp (if op1Reg == targetReg) OR
482 // movaps targetReg, op2Reg; addss/sd targetReg, [memOp]
485 // movss op1Reg, [memOp]; addss/sd targetReg, Op2Reg (if op1Reg == targetReg) OR
486 // movss op1Reg, [memOp]; movaps targetReg, op1Reg, addss/sd targetReg, Op2Reg
487 MakeSrcContained(tree, op1);
491 // If there are no containable operands, we can make an operand reg optional.
492 SetRegOptionalForBinOp(tree);
502 TreeNodeInfoInitLogicalOp(tree);
506 // this just turns into a compare of its child with an int
507 // + a conditional call
510 if (tree->gtOp.gtOp1->isIndir())
512 MakeSrcContained(tree, tree->gtOp.gtOp1);
514 info->internalIntCount = 1;
515 info->setInternalCandidates(l, l->allRegs(TYP_INT));
522 TreeNodeInfoInitModDiv(tree);
527 #if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND)
530 SetMulOpCounts(tree);
534 TreeNodeInfoInitIntrinsic(tree);
539 TreeNodeInfoInitSIMD(tree);
541 #endif // FEATURE_SIMD
544 TreeNodeInfoInitCast(tree);
552 // SSE instruction set doesn't have an instruction to negate a number.
553 // The recommended way is to xor the float/double number with a bitmask.
554 // The only way to xor is using xorps or xorpd both of which operate on
555 // 128-bit operands. To hold the bit-mask we would need another xmm
556 // register or a 16-byte aligned 128-bit data constant. Right now emitter
557 // lacks the support for emitting such constants or instruction with mem
558 // addressing mode referring to a 128-bit operand. For now we use an
559 // internal xmm register to load 32/64-bit bitmask from data section.
560 // Note that by trading additional data section memory (128-bit) we can
561 // save on the need for an internal register and also a memory-to-reg
564 // Note: another option to avoid internal register requirement is by
565 // lowering as GT_SUB(0, src). This will generate code different from
566 // Jit64 and could possibly result in compat issues (?).
567 if (varTypeIsFloating(tree))
569 info->internalFloatCount = 1;
570 info->setInternalCandidates(l, l->internalFloatRegCandidates());
588 TreeNodeInfoInitShiftRotate(tree);
597 TreeNodeInfoInitCmp(tree);
603 info->internalIntCount = 1;
610 // comparand is preferenced to RAX.
611 // Remaining two operands can be in any reg other than RAX.
612 tree->gtCmpXchg.gtOpComparand->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
613 tree->gtCmpXchg.gtOpLocation->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX);
614 tree->gtCmpXchg.gtOpValue->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX);
615 tree->gtLsraInfo.setDstCandidates(l, RBM_RAX);
622 CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
626 TreeNodeInfoInitCall(tree->AsCall());
631 // For a GT_ADDR, the child node should not be evaluated into a register
632 GenTreePtr child = tree->gtOp.gtOp1;
633 assert(!l->isCandidateLocalRef(child));
634 l->clearDstCount(child);
640 #if !defined(FEATURE_PUT_STRUCT_ARG_STK)
645 // These should all be eliminated prior to Lowering.
646 assert(!"Non-store block node in Lowering");
651 #ifdef FEATURE_PUT_STRUCT_ARG_STK
653 TreeNodeInfoInitPutArgStk(tree->AsPutArgStk());
655 #endif // FEATURE_PUT_STRUCT_ARG_STK
659 case GT_STORE_DYN_BLK:
660 TreeNodeInfoInitBlockStore(tree->AsBlk());
664 TreeNodeInfoInitLclHeap(tree);
667 case GT_ARR_BOUNDS_CHECK:
670 #endif // FEATURE_SIMD
672 GenTreeBoundsChk* node = tree->AsBoundsChk();
673 // Consumes arrLen & index - has no result
678 if (CheckImmedAndMakeContained(tree, node->gtIndex))
680 other = node->gtArrLen;
682 else if (CheckImmedAndMakeContained(tree, node->gtArrLen))
684 other = node->gtIndex;
686 else if (node->gtIndex->isMemoryOp())
688 other = node->gtIndex;
692 other = node->gtArrLen;
695 if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet())
697 if (other->isMemoryOp())
699 MakeSrcContained(tree, other);
703 // We can mark 'other' as reg optional, since it is not contained.
704 SetRegOptional(other);
711 // These must have been lowered to GT_ARR_INDEX
712 noway_assert(!"We should never see a GT_ARR_ELEM in lowering");
720 // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
721 // times while the result is being computed.
722 tree->AsArrIndex()->ArrObj()->gtLsraInfo.isDelayFree = true;
723 info->hasDelayFreeSrc = true;
727 // This consumes the offset, if any, the arrObj and the effective index,
728 // and produces the flattened offset for this dimension.
732 // we don't want to generate code for this
733 if (tree->gtArrOffs.gtOffset->IsIntegralConst(0))
735 MakeSrcContained(tree, tree->gtArrOffs.gtOffset);
739 // Here we simply need an internal register, which must be different
740 // from any of the operand's registers, but may be the same as targetReg.
741 info->internalIntCount = 1;
746 // The LEA usually passes its operands through to the GT_IND, in which case we'll
747 // clear the info->srcCount and info->dstCount later, but we may be instantiating an address,
748 // so we set them here.
750 if (tree->AsAddrMode()->HasBase())
754 if (tree->AsAddrMode()->HasIndex())
765 GenTree* src = tree->gtOp.gtOp2;
767 if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
769 LowerGCWriteBarrier(tree);
773 // If the source is a containable immediate, make it contained, unless it is
774 // an int-size or larger store of zero to memory, because we can generate smaller code
775 // by zeroing a register and then storing it.
776 if (IsContainableImmed(tree, src) &&
777 (!src->IsIntegralConst(0) || varTypeIsSmall(tree) || tree->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR))
779 MakeSrcContained(tree, src);
781 else if (!varTypeIsFloating(tree))
783 // Perform recognition of trees with the following structure:
784 // StoreInd(addr, BinOp(expr, GT_IND(addr)))
785 // to be able to fold this into an instruction of the form
786 // BINOP [addr], register
787 // where register is the actual place where 'expr' is computed.
789 // SSE2 doesn't support RMW form of instructions.
790 if (SetStoreIndOpCountsIfRMWMemOp(tree))
796 SetIndirAddrOpCounts(tree);
803 info->isLocalDefUse = true;
809 SetIndirAddrOpCounts(tree);
815 info->setDstCandidates(l, RBM_EXCEPTION_OBJECT);
818 #if !FEATURE_EH_FUNCLETS
826 // These nodes are eliminated by rationalizer.
827 JITDUMP("Unexpected node %s in Lower.\n", GenTree::NodeName(tree->OperGet()));
830 } // end switch (tree->OperGet())
832 // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
833 // Even then we would like to set isTgtPref on Op1.
834 if (tree->OperIsBinary() && info->srcCount >= 1)
836 if (isRMWRegOper(tree))
838 GenTree* op1 = tree->gtOp.gtOp1;
839 GenTree* op2 = tree->gtOp.gtOp2;
841 // Commutative opers like add/mul/and/or/xor could reverse the order of
842 // operands if it is safe to do so. In such a case we would like op2 to be
843 // target preferenced instead of op1.
844 if (tree->OperIsCommutative() && op1->gtLsraInfo.dstCount == 0 && op2 != nullptr)
847 op2 = tree->gtOp.gtOp1;
850 // If we have a read-modify-write operation, we want to preference op1 to the target.
851 // If op1 is contained, we don't want to preference it, but it won't
852 // show up as a source in that case, so it will be ignored.
853 op1->gtLsraInfo.isTgtPref = true;
855 // Is this a non-commutative operator, or is op2 a contained memory op?
856 // (Note that we can't call IsContained() at this point because it uses exactly the
857 // same information we're currently computing.)
858 // In either case, we need to make op2 remain live until the op is complete, by marking
859 // the source(s) associated with op2 as "delayFree".
860 // Note that if op2 of a binary RMW operator is a memory op, even if the operator
861 // is commutative, codegen cannot reverse them.
862 // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
863 // more work to be done to correctly reverse the operands if they involve memory
864 // operands. Also, we may need to handle more cases than GT_IND, especially once
865 // we've modified the register allocator to not require all nodes to be assigned
866 // a register (e.g. a spilled lclVar can often be referenced directly from memory).
867 // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
869 GenTree* delayUseSrc = nullptr;
870 // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
871 // to special case them.
872 if (tree->OperGet() == GT_XADD || tree->OperGet() == GT_XCHG || tree->OperGet() == GT_LOCKADD)
876 else if ((op2 != nullptr) &&
877 (!tree->OperIsCommutative() || (op2->isMemoryOp() && (op2->gtLsraInfo.srcCount == 0))))
881 if (delayUseSrc != nullptr)
883 // If delayUseSrc is an indirection and it doesn't produce a result, then we need to set "delayFree'
884 // on the base & index, if any.
885 // Otherwise, we set it on delayUseSrc itself.
886 if (delayUseSrc->isIndir() && (delayUseSrc->gtLsraInfo.dstCount == 0))
888 GenTree* base = delayUseSrc->AsIndir()->Base();
889 GenTree* index = delayUseSrc->AsIndir()->Index();
892 base->gtLsraInfo.isDelayFree = true;
894 if (index != nullptr)
896 index->gtLsraInfo.isDelayFree = true;
901 delayUseSrc->gtLsraInfo.isDelayFree = true;
903 info->hasDelayFreeSrc = true;
908 TreeNodeInfoInitCheckByteable(tree);
910 // We need to be sure that we've set info->srcCount and info->dstCount appropriately
911 assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));
914 //------------------------------------------------------------------------
915 // TreeNodeInfoInitCheckByteable: Check the tree to see if "byte-able" registers are
916 // required, and set the tree node info accordingly.
919 // tree - The node of interest
924 void Lowering::TreeNodeInfoInitCheckByteable(GenTree* tree)
927 LinearScan* l = m_lsra;
928 TreeNodeInfo* info = &(tree->gtLsraInfo);
930 // Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands
931 // if the tree node is a byte type.
933 // Though this looks conservative in theory, in practice we could not think of a case where
934 // the below logic leads to conservative register specification. In future when or if we find
935 // one such case, this logic needs to be fine tuned for that case(s).
937 if (ExcludeNonByteableRegisters(tree))
940 if (info->dstCount > 0)
942 regMask = info->getDstCandidates(l);
943 assert(regMask != RBM_NONE);
944 info->setDstCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
947 if (tree->OperIsSimple() && (info->srcCount > 0))
949 // No need to set src candidates on a contained child operand.
950 GenTree* op = tree->gtOp.gtOp1;
951 assert(op != nullptr);
952 bool containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0);
955 regMask = op->gtLsraInfo.getSrcCandidates(l);
956 assert(regMask != RBM_NONE);
957 op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
960 if (tree->OperIsBinary() && (tree->gtOp.gtOp2 != nullptr))
962 op = tree->gtOp.gtOp2;
963 containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0);
966 regMask = op->gtLsraInfo.getSrcCandidates(l);
967 assert(regMask != RBM_NONE);
968 op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
973 #endif //_TARGET_X86_
976 //------------------------------------------------------------------------
977 // TreeNodeInfoInitSimple: Sets the srcCount and dstCount for all the trees
978 // without special handling based on the tree node type.
981 // tree - The node of interest
986 void Lowering::TreeNodeInfoInitSimple(GenTree* tree)
988 TreeNodeInfo* info = &(tree->gtLsraInfo);
989 unsigned kind = tree->OperKind();
990 info->dstCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
991 if (kind & (GTK_CONST | GTK_LEAF))
995 else if (kind & (GTK_SMPOP))
997 if (tree->gtGetOp2() != nullptr)
1012 //------------------------------------------------------------------------
1013 // TreeNodeInfoInitReturn: Set the NodeInfo for a GT_RETURN.
1016 // tree - The node of interest
1021 void Lowering::TreeNodeInfoInitReturn(GenTree* tree)
1023 TreeNodeInfo* info = &(tree->gtLsraInfo);
1024 LinearScan* l = m_lsra;
1025 Compiler* compiler = comp;
1027 #if !defined(_TARGET_64BIT_)
1028 if (tree->TypeGet() == TYP_LONG)
1030 GenTree* op1 = tree->gtGetOp1();
1031 noway_assert(op1->OperGet() == GT_LONG);
1032 GenTree* loVal = op1->gtGetOp1();
1033 GenTree* hiVal = op1->gtGetOp2();
1035 loVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_LO);
1036 hiVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_HI);
1040 #endif // !defined(_TARGET_64BIT_)
1042 GenTree* op1 = tree->gtGetOp1();
1043 regMaskTP useCandidates = RBM_NONE;
1045 info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
1048 #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
1049 if (varTypeIsStruct(tree))
1051 // op1 has to be either an lclvar or a multi-reg returning call
1052 if (op1->OperGet() == GT_LCL_VAR)
1054 GenTreeLclVarCommon* lclVarCommon = op1->AsLclVarCommon();
1055 LclVarDsc* varDsc = &(compiler->lvaTable[lclVarCommon->gtLclNum]);
1056 assert(varDsc->lvIsMultiRegRet);
1058 // Mark var as contained if not enregistrable.
1059 if (!varTypeIsEnregisterableStruct(op1))
1061 MakeSrcContained(tree, op1);
1066 noway_assert(op1->IsMultiRegCall());
1068 ReturnTypeDesc* retTypeDesc = op1->AsCall()->GetReturnTypeDesc();
1069 info->srcCount = retTypeDesc->GetReturnRegCount();
1070 useCandidates = retTypeDesc->GetABIReturnRegs();
1074 #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
1076 // Non-struct type return - determine useCandidates
1077 switch (tree->TypeGet())
1080 useCandidates = RBM_NONE;
1083 useCandidates = RBM_FLOATRET;
1086 useCandidates = RBM_DOUBLERET;
1088 #if defined(_TARGET_64BIT_)
1090 useCandidates = RBM_LNGRET;
1092 #endif // defined(_TARGET_64BIT_)
1094 useCandidates = RBM_INTRET;
1099 if (useCandidates != RBM_NONE)
1101 op1->gtLsraInfo.setSrcCandidates(l, useCandidates);
1106 //------------------------------------------------------------------------
1107 // TreeNodeInfoInitShiftRotate: Set the NodeInfo for a shift or rotate.
1110 // tree - The node of interest
1115 void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree)
1117 TreeNodeInfo* info = &(tree->gtLsraInfo);
1118 LinearScan* l = m_lsra;
1123 // For shift operations, we need that the number
1124 // of bits moved gets stored in CL in case
1125 // the number of bits to shift is not a constant.
1126 GenTreePtr shiftBy = tree->gtOp.gtOp2;
1127 GenTreePtr source = tree->gtOp.gtOp1;
1130 // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
1131 // we can have a three operand form. Increment the srcCount.
1132 if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
1134 assert(source->OperGet() == GT_LONG);
1138 if (tree->OperGet() == GT_LSH_HI)
1140 GenTreePtr sourceLo = source->gtOp.gtOp1;
1141 sourceLo->gtLsraInfo.isDelayFree = true;
1145 GenTreePtr sourceHi = source->gtOp.gtOp2;
1146 sourceHi->gtLsraInfo.isDelayFree = true;
1149 source->gtLsraInfo.hasDelayFreeSrc = true;
1150 info->hasDelayFreeSrc = true;
1154 // x64 can encode 8 bits of shift and it will use 5 or 6. (the others are masked off)
1155 // We will allow whatever can be encoded - hope you know what you are doing.
1156 if (!IsContainableImmed(tree, shiftBy) || (shiftBy->gtIntConCommon.IconValue() > 255) ||
1157 (shiftBy->gtIntConCommon.IconValue() < 0))
1159 source->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX);
1160 shiftBy->gtLsraInfo.setSrcCandidates(l, RBM_RCX);
1161 info->setDstCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX);
1165 MakeSrcContained(tree, shiftBy);
1169 //------------------------------------------------------------------------
1170 // TreeNodeInfoInitCall: Set the NodeInfo for a call.
1173 // call - The call node of interest
1178 void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
1180 TreeNodeInfo* info = &(call->gtLsraInfo);
1181 LinearScan* l = m_lsra;
1182 Compiler* compiler = comp;
1183 bool hasMultiRegRetVal = false;
1184 ReturnTypeDesc* retTypeDesc = nullptr;
1187 if (call->TypeGet() != TYP_VOID)
1189 hasMultiRegRetVal = call->HasMultiRegRetVal();
1190 if (hasMultiRegRetVal)
1192 // dst count = number of registers in which the value is returned by call
1193 retTypeDesc = call->GetReturnTypeDesc();
1194 info->dstCount = retTypeDesc->GetReturnRegCount();
1206 GenTree* ctrlExpr = call->gtControlExpr;
1207 if (call->gtCallType == CT_INDIRECT)
1209 // either gtControlExpr != null or gtCallAddr != null.
1210 // Both cannot be non-null at the same time.
1211 assert(ctrlExpr == nullptr);
1212 assert(call->gtCallAddr != nullptr);
1213 ctrlExpr = call->gtCallAddr;
1216 // Fast tail calls aren't currently supported on x86, but if they ever are, the code
1217 // below that handles indirect VSD calls will need to be fixed.
1218 assert(!call->IsFastTailCall() || !call->IsVirtualStub());
1219 #endif // _TARGET_X86_
1222 // set reg requirements on call target represented as control sequence.
1223 if (ctrlExpr != nullptr)
1225 // we should never see a gtControlExpr whose type is void.
1226 assert(ctrlExpr->TypeGet() != TYP_VOID);
1228 // call can take a Rm op on x64
1231 // In case of fast tail implemented as jmp, make sure that gtControlExpr is
1232 // computed into a register.
1233 if (!call->IsFastTailCall())
1236 // On x86, we need to generate a very specific pattern for indirect VSD calls:
1239 // call dword ptr [eax]
1241 // Where EAX is also used as an argument to the stub dispatch helper. Make
1242 // sure that the call target address is computed into EAX in this case.
1243 if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
1245 assert(ctrlExpr->isIndir());
1247 ctrlExpr->gtGetOp1()->gtLsraInfo.setSrcCandidates(l, RBM_VIRTUAL_STUB_TARGET);
1248 MakeSrcContained(call, ctrlExpr);
1251 #endif // _TARGET_X86_
1252 if (ctrlExpr->isIndir())
1254 MakeSrcContained(call, ctrlExpr);
1259 // Fast tail call - make sure that call target is always computed in RAX
1260 // so that epilog sequence can generate "jmp rax" to achieve fast tail call.
1261 ctrlExpr->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
1265 // If this is a varargs call, we will clear the internal candidates in case we need
1266 // to reserve some integer registers for copying float args.
1267 // We have to do this because otherwise the default candidates are allRegs, and adding
1268 // the individual specific registers will have no effect.
1269 if (call->IsVarargs())
1271 info->setInternalCandidates(l, RBM_NONE);
1274 RegisterType registerType = call->TypeGet();
1276 // Set destination candidates for return value of the call.
1277 CLANG_FORMAT_COMMENT_ANCHOR;
1280 if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
1282 // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
1283 // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
1284 // correct argument registers.
1285 info->setDstCandidates(l, RBM_PINVOKE_TCB);
1288 #endif // _TARGET_X86_
1289 if (hasMultiRegRetVal)
1291 assert(retTypeDesc != nullptr);
1292 info->setDstCandidates(l, retTypeDesc->GetABIReturnRegs());
1294 else if (varTypeIsFloating(registerType))
1297 // The return value will be on the X87 stack, and we will need to move it.
1298 info->setDstCandidates(l, l->allRegs(registerType));
1299 #else // !_TARGET_X86_
1300 info->setDstCandidates(l, RBM_FLOATRET);
1301 #endif // !_TARGET_X86_
1303 else if (registerType == TYP_LONG)
1305 info->setDstCandidates(l, RBM_LNGRET);
1309 info->setDstCandidates(l, RBM_INTRET);
1312 // number of args to a call =
1313 // callRegArgs + (callargs - placeholders, setup, etc)
1314 // there is an explicit thisPtr but it is redundant
1316 // If there is an explicit this pointer, we don't want that node to produce anything
1317 // as it is redundant
1318 if (call->gtCallObjp != nullptr)
1320 GenTreePtr thisPtrNode = call->gtCallObjp;
1322 if (thisPtrNode->gtOper == GT_PUTARG_REG)
1324 l->clearOperandCounts(thisPtrNode);
1325 l->clearDstCount(thisPtrNode->gtOp.gtOp1);
1329 l->clearDstCount(thisPtrNode);
1334 bool callHasFloatRegArgs = false;
1335 #endif // !FEATURE_VARARG
1337 // First, count reg args
1338 for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
1340 assert(list->OperIsList());
1342 GenTreePtr argNode = list->Current();
1344 fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
1345 assert(curArgTabEntry);
1347 if (curArgTabEntry->regNum == REG_STK)
1349 // late arg that is not passed in a register
1351 assert(argNode->gtOper == GT_PUTARG_STK);
1352 argNode->gtLsraInfo.srcCount = 1;
1353 argNode->gtLsraInfo.dstCount = 0;
1355 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1356 // If the node is TYP_STRUCT and it is put on stack with
1357 // putarg_stk operation, we consume and produce no registers.
1358 // In this case the embedded Obj node should not produce
1359 // registers too since it is contained.
1360 // Note that if it is a SIMD type the argument will be in a register.
1361 if (argNode->TypeGet() == TYP_STRUCT)
1363 assert(argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_OBJ);
1364 argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0;
1365 argNode->gtLsraInfo.srcCount = 0;
1367 #endif // FEATURE_PUT_STRUCT_ARG_STK
1371 regNumber argReg = REG_NA;
1372 regMaskTP argMask = RBM_NONE;
1374 bool isOnStack = true;
1375 if (curArgTabEntry->regNum != REG_STK)
1378 var_types argType = argNode->TypeGet();
1381 callHasFloatRegArgs |= varTypeIsFloating(argType);
1382 #endif // !FEATURE_VARARG
1384 argReg = curArgTabEntry->regNum;
1387 // Default case is that we consume one source; modify this later (e.g. for
1388 // promoted structs)
1391 argMask = genRegMask(argReg);
1392 argNode = argNode->gtEffectiveVal();
1395 // If the struct arg is wrapped in CPYBLK the type of the param will be TYP_VOID.
1396 // Use the curArgTabEntry's isStruct to get whether the param is a struct.
1397 if (varTypeIsStruct(argNode) PUT_STRUCT_ARG_STK_ONLY(|| curArgTabEntry->isStruct))
1399 unsigned originalSize = 0;
1400 LclVarDsc* varDsc = nullptr;
1401 if (argNode->gtOper == GT_LCL_VAR)
1403 varDsc = compiler->lvaTable + argNode->gtLclVarCommon.gtLclNum;
1404 originalSize = varDsc->lvSize();
1406 else if (argNode->gtOper == GT_MKREFANY)
1408 originalSize = 2 * TARGET_POINTER_SIZE;
1410 else if (argNode->gtOper == GT_OBJ)
1412 noway_assert(!"GT_OBJ not supported for amd64");
1414 #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
1415 else if (argNode->gtOper == GT_PUTARG_REG)
1417 originalSize = genTypeSize(argNode->gtType);
1419 else if (argNode->gtOper == GT_FIELD_LIST)
1423 // There could be up to 2 PUTARG_REGs in the list
1424 GenTreeFieldList* fieldListPtr = argNode->AsFieldList();
1425 unsigned iterationNum = 0;
1426 for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest())
1428 GenTreePtr putArgRegNode = fieldListPtr->Current();
1429 assert(putArgRegNode->gtOper == GT_PUTARG_REG);
1431 if (iterationNum == 0)
1433 varDsc = compiler->lvaTable + putArgRegNode->gtOp.gtOp1->gtLclVarCommon.gtLclNum;
1434 originalSize = varDsc->lvSize();
1435 assert(originalSize != 0);
1439 // Need an extra source for every node, but the first in the list.
1442 // Get the mask for the second putarg_reg
1443 argMask = genRegMask(curArgTabEntry->otherRegNum);
1446 putArgRegNode->gtLsraInfo.setDstCandidates(l, argMask);
1447 putArgRegNode->gtLsraInfo.setSrcCandidates(l, argMask);
1449 // To avoid redundant moves, have the argument child tree computed in the
1450 // register in which the argument is passed to the call.
1451 putArgRegNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(putArgRegNode));
1455 assert(iterationNum <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
1457 #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
1460 noway_assert(!"Can't predict unsupported TYP_STRUCT arg kind");
1463 unsigned slots = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES;
1464 unsigned remainingSlots = slots;
1468 remainingSlots = slots - 1;
1470 regNumber reg = (regNumber)(argReg + 1);
1471 while (remainingSlots > 0 && reg <= REG_ARG_LAST)
1473 argMask |= genRegMask(reg);
1474 reg = (regNumber)(reg + 1);
1480 short internalIntCount = 0;
1481 if (remainingSlots > 0)
1483 #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
1484 // This TYP_STRUCT argument is also passed in the outgoing argument area
1485 // We need a register to address the TYP_STRUCT
1486 internalIntCount = 1;
1487 #else // FEATURE_UNIX_AMD64_STRUCT_PASSING
1488 // And we may need 2
1489 internalIntCount = 2;
1490 #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
1492 argNode->gtLsraInfo.internalIntCount = internalIntCount;
1494 #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
1495 if (argNode->gtOper == GT_PUTARG_REG)
1497 argNode->gtLsraInfo.setDstCandidates(l, argMask);
1498 argNode->gtLsraInfo.setSrcCandidates(l, argMask);
1500 #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
1504 argNode->gtLsraInfo.setDstCandidates(l, argMask);
1505 argNode->gtLsraInfo.setSrcCandidates(l, argMask);
1508 // To avoid redundant moves, have the argument child tree computed in the
1509 // register in which the argument is passed to the call.
1510 if (argNode->gtOper == GT_PUTARG_REG)
1512 argNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(argNode));
1516 // In the case of a varargs call, the ABI dictates that if we have floating point args,
1517 // we must pass the enregistered arguments in both the integer and floating point registers.
1518 // Since the integer register is not associated with this arg node, we will reserve it as
1519 // an internal register so that it is not used during the evaluation of the call node
1520 // (e.g. for the target).
1521 if (call->IsVarargs() && varTypeIsFloating(argNode))
1523 regNumber targetReg = compiler->getCallArgIntRegister(argReg);
1524 info->setInternalIntCount(info->internalIntCount + 1);
1525 info->addInternalCandidates(l, genRegMask(targetReg));
1527 #endif // FEATURE_VARARG
1530 // Now, count stack args
1531 // Note that these need to be computed into a register, but then
1532 // they're just stored to the stack - so the reg doesn't
1533 // need to remain live until the call. In fact, it must not
1534 // because the code generator doesn't actually consider it live,
1535 // so it can't be spilled.
1537 GenTreePtr args = call->gtCallArgs;
1540 GenTreePtr arg = args->gtOp.gtOp1;
1541 if (!(args->gtFlags & GTF_LATE_ARG))
1543 TreeNodeInfo* argInfo = &(arg->gtLsraInfo);
1544 #if !defined(_TARGET_64BIT_)
1545 if (arg->TypeGet() == TYP_LONG)
1547 assert(arg->OperGet() == GT_LONG);
1548 GenTreePtr loArg = arg->gtGetOp1();
1549 GenTreePtr hiArg = arg->gtGetOp2();
1550 assert((loArg->OperGet() == GT_PUTARG_STK) && (hiArg->OperGet() == GT_PUTARG_STK));
1551 assert((loArg->gtLsraInfo.dstCount == 1) && (hiArg->gtLsraInfo.dstCount == 1));
1552 loArg->gtLsraInfo.isLocalDefUse = true;
1553 hiArg->gtLsraInfo.isLocalDefUse = true;
1556 #endif // !defined(_TARGET_64BIT_)
1558 if (argInfo->dstCount != 0)
1560 argInfo->isLocalDefUse = true;
1563 // If the child of GT_PUTARG_STK is a constant, we don't need a register to
1564 // move it to memory (stack location).
1566 // On AMD64, we don't want to make 0 contained, because we can generate smaller code
1567 // by zeroing a register and then storing it. E.g.:
1569 // mov gword ptr [rsp+28H], rdx
1570 // is 2 bytes smaller than:
1571 // mov gword ptr [rsp+28H], 0
1573 // On x86, we push stack arguments; we don't use 'mov'. So:
1575 // is 1 byte smaller than:
1579 argInfo->dstCount = 0;
1580 if (arg->gtOper == GT_PUTARG_STK)
1582 GenTree* op1 = arg->gtOp.gtOp1;
1583 if (IsContainableImmed(arg, op1)
1584 #if defined(_TARGET_AMD64_)
1585 && !op1->IsIntegralConst(0)
1586 #endif // _TARGET_AMD64_
1589 MakeSrcContained(arg, op1);
1594 args = args->gtOp.gtOp2;
1598 // If it is a fast tail call, it is already preferenced to use RAX.
1599 // Therefore, no need set src candidates on call tgt again.
1600 if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExpr != nullptr))
1602 // Don't assign the call target to any of the argument registers because
1603 // we will use them to also pass floating point arguments as required
1605 ctrlExpr->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_ARG_REGS));
1607 #endif // !FEATURE_VARARG
1610 //------------------------------------------------------------------------
1611 // TreeNodeInfoInitBlockStore: Set the NodeInfo for a block store.
1614 // blkNode - The block store node of interest
1619 void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
1621 GenTree* dstAddr = blkNode->Addr();
1622 unsigned size = blkNode->gtBlkSize;
1623 GenTree* source = blkNode->Data();
1624 LinearScan* l = m_lsra;
1625 Compiler* compiler = comp;
1627 // Sources are dest address, initVal or source.
1628 // We may require an additional source or temp register for the size.
1629 blkNode->gtLsraInfo.srcCount = 2;
1630 blkNode->gtLsraInfo.dstCount = 0;
1631 blkNode->gtLsraInfo.setInternalCandidates(l, RBM_NONE);
1632 GenTreePtr srcAddrOrFill = nullptr;
1633 bool isInitBlk = blkNode->OperIsInitBlkOp();
1635 regMaskTP dstAddrRegMask = RBM_NONE;
1636 regMaskTP sourceRegMask = RBM_NONE;
1637 regMaskTP blkSizeRegMask = RBM_NONE;
1640 // CopyObj or CopyBlk
1641 if ((blkNode->OperGet() == GT_STORE_OBJ) && ((blkNode->AsObj()->gtGcPtrCount == 0) || blkNode->gtBlkOpGcUnsafe))
1643 blkNode->SetOper(GT_STORE_BLK);
1645 if (source->gtOper == GT_IND)
1647 srcAddrOrFill = blkNode->Data()->gtGetOp1();
1648 // We're effectively setting source as contained, but can't call MakeSrcContained, because the
1649 // "inheritance" of the srcCount is to a child not a parent - it would "just work" but could be misleading.
1650 // If srcAddr is already non-contained, we don't need to change it.
1651 if (srcAddrOrFill->gtLsraInfo.getDstCount() == 0)
1653 srcAddrOrFill->gtLsraInfo.setDstCount(1);
1654 srcAddrOrFill->gtLsraInfo.setSrcCount(source->gtLsraInfo.srcCount);
1656 m_lsra->clearOperandCounts(source);
1658 else if (!source->OperIsSIMD())
1660 assert(source->IsLocal());
1661 MakeSrcContained(blkNode, source);
1667 GenTree* initVal = source;
1668 srcAddrOrFill = source;
1669 // If we have an InitBlk with constant block size we can optimize several ways:
1670 // a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes
1671 // we use rep stosb since this reduces the register pressure in LSRA and we have
1672 // roughly the same performance as calling the helper.
1673 // b) If the size is <= INITBLK_UNROLL_LIMIT bytes and the fill byte is a constant,
1674 // we can speed this up by unrolling the loop using SSE2 stores. The reason for
1675 // this threshold is because our last investigation (Fall 2013), more than 95% of initblks
1676 // in our framework assemblies are actually <= INITBLK_UNROLL_LIMIT bytes size, so this is the
1677 // preferred code sequence for the vast majority of cases.
1679 // This threshold will decide from using the helper or let the JIT decide to inline
1680 // a code sequence of its choice.
1681 unsigned helperThreshold = max(INITBLK_STOS_LIMIT, INITBLK_UNROLL_LIMIT);
1683 // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
1684 if (size != 0 && size <= helperThreshold)
1686 // Always favor unrolling vs rep stos.
1687 if (size <= INITBLK_UNROLL_LIMIT && initVal->IsCnsIntOrI())
1689 // The fill value of an initblk is interpreted to hold a
1690 // value of (unsigned int8) however a constant of any size
1691 // may practically reside on the evaluation stack. So extract
1692 // the lower byte out of the initVal constant and replicate
1693 // it to a larger constant whose size is sufficient to support
1694 // the largest width store of the desired inline expansion.
1696 ssize_t fill = initVal->gtIntCon.gtIconVal & 0xFF;
1697 #ifdef _TARGET_AMD64_
1698 if (size < REGSIZE_BYTES)
1700 initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
1704 initVal->gtIntCon.gtIconVal = 0x0101010101010101LL * fill;
1705 initVal->gtType = TYP_LONG;
1707 #else // !_TARGET_AMD64_
1708 initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
1709 #endif // !_TARGET_AMD64_
1711 // In case we have a buffer >= 16 bytes
1712 // we can use SSE2 to do a 128-bit store in a single
1714 if (size >= XMM_REGSIZE_BYTES)
1716 // Reserve an XMM register to fill it with
1717 // a pack of 16 init value constants.
1718 blkNode->gtLsraInfo.internalFloatCount = 1;
1719 blkNode->gtLsraInfo.setInternalCandidates(l, l->internalFloatRegCandidates());
1721 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
1724 if ((size & 1) != 0)
1726 // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
1727 // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
1728 // when unrolling, so only allow byteable registers as the source value. (We could
1729 // consider just using BlkOpKindRepInstr instead.)
1730 sourceRegMask = RBM_BYTE_REGS;
1732 #endif // _TARGET_X86_
1736 // rep stos has the following register requirements:
1737 // a) The memory address to be in RDI.
1738 // b) The fill value has to be in RAX.
1739 // c) The buffer size will go in RCX.
1740 dstAddrRegMask = RBM_RDI;
1741 srcAddrOrFill = initVal;
1742 sourceRegMask = RBM_RAX;
1743 blkSizeRegMask = RBM_RCX;
1744 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
1749 #ifdef _TARGET_AMD64_
1750 // The helper follows the regular AMD64 ABI.
1751 dstAddrRegMask = RBM_ARG_0;
1752 sourceRegMask = RBM_ARG_1;
1753 blkSizeRegMask = RBM_ARG_2;
1754 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
1755 #else // !_TARGET_AMD64_
1756 dstAddrRegMask = RBM_RDI;
1757 sourceRegMask = RBM_RAX;
1758 blkSizeRegMask = RBM_RCX;
1759 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
1760 #endif // !_TARGET_AMD64_
1763 else if (blkNode->gtOper == GT_STORE_OBJ)
1767 GenTreeObj* cpObjNode = blkNode->AsObj();
1769 unsigned slots = cpObjNode->gtSlots;
1772 // CpObj must always have at least one GC-Pointer as a member.
1773 assert(cpObjNode->gtGcPtrCount > 0);
1775 assert(dstAddr->gtType == TYP_BYREF || dstAddr->gtType == TYP_I_IMPL);
1777 CORINFO_CLASS_HANDLE clsHnd = cpObjNode->gtClass;
1778 size_t classSize = comp->info.compCompHnd->getClassSize(clsHnd);
1779 size_t blkSize = roundUp(classSize, TARGET_POINTER_SIZE);
1781 // Currently, the EE always round up a class data structure so
1782 // we are not handling the case where we have a non multiple of pointer sized
1783 // struct. This behavior may change in the future so in order to keeps things correct
1784 // let's assert it just to be safe. Going forward we should simply
1785 // handle this case.
1786 assert(classSize == blkSize);
1787 assert((blkSize / TARGET_POINTER_SIZE) == slots);
1788 assert(cpObjNode->HasGCPtr());
1791 bool IsRepMovsProfitable = false;
1793 // If the destination is not on the stack, let's find out if we
1794 // can improve code size by using rep movsq instead of generating
1795 // sequences of movsq instructions.
1796 if (!dstAddr->OperIsLocalAddr())
1798 // Let's inspect the struct/class layout and determine if it's profitable
1799 // to use rep movsq for copying non-gc memory instead of using single movsq
1800 // instructions for each memory slot.
1802 BYTE* gcPtrs = cpObjNode->gtGcPtrs;
1806 unsigned nonGCSlots = 0;
1807 // Measure a contiguous non-gc area inside the struct and note the maximum.
1808 while (i < slots && gcPtrs[i] == TYPE_GC_NONE)
1814 while (i < slots && gcPtrs[i] != TYPE_GC_NONE)
1819 if (nonGCSlots >= CPOBJ_NONGC_SLOTS_LIMIT)
1821 IsRepMovsProfitable = true;
1824 } while (i < slots);
1826 else if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
1828 IsRepMovsProfitable = true;
1831 // There are two cases in which we need to materialize the
1833 // a) When the destination is on the stack we don't need to use the
1834 // write barrier, we can just simply call rep movsq and get a win in codesize.
1835 // b) If we determine we have contiguous non-gc regions in the struct where it's profitable
1836 // to use rep movsq instead of a sequence of single movsq instructions. According to the
1837 // Intel Manual, the sweet spot for small structs is between 4 to 12 slots of size where
1838 // the entire operation takes 20 cycles and encodes in 5 bytes (moving RCX, and calling rep movsq).
1839 if (IsRepMovsProfitable)
1841 // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
1842 blkSizeRegMask = RBM_RCX;
1843 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
1847 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
1850 dstAddrRegMask = RBM_RDI;
1852 // The srcAddr must be in a register. If it was under a GT_IND, we need to subsume all of its
1854 sourceRegMask = RBM_RSI;
1858 assert((blkNode->OperGet() == GT_STORE_BLK) || (blkNode->OperGet() == GT_STORE_DYN_BLK));
1860 // In case of a CpBlk with a constant size and less than CPBLK_MOVS_LIMIT size
1861 // we can use rep movs to generate code instead of the helper call.
1863 // This threshold will decide between using the helper or let the JIT decide to inline
1864 // a code sequence of its choice.
1865 unsigned helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
1867 // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
1868 if ((size != 0) && (size <= helperThreshold))
1870 // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
1871 // Structs and buffer with sizes <= CPBLK_UNROLL_LIMIT bytes are occurring in more than 95% of
1872 // our framework assemblies, so this is the main code generation scheme we'll use.
1873 if (size <= CPBLK_UNROLL_LIMIT)
1875 // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
1877 // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
1878 // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude
1879 // RBM_NON_BYTE_REGS from internal candidates.
1880 if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
1882 blkNode->gtLsraInfo.internalIntCount++;
1883 regMaskTP regMask = l->allRegs(TYP_INT);
1886 if ((size % 2) != 0)
1888 regMask &= ~RBM_NON_BYTE_REGS;
1891 blkNode->gtLsraInfo.setInternalCandidates(l, regMask);
1894 if (size >= XMM_REGSIZE_BYTES)
1896 // If we have a buffer larger than XMM_REGSIZE_BYTES,
1897 // reserve an XMM register to use it for a
1898 // series of 16-byte loads and stores.
1899 blkNode->gtLsraInfo.internalFloatCount = 1;
1900 blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates());
1903 // If src or dst are on stack, we don't have to generate the address into a register
1904 // because it's just some constant+SP
1905 if (srcAddrOrFill != nullptr && srcAddrOrFill->OperIsLocalAddr())
1907 MakeSrcContained(blkNode, srcAddrOrFill);
1910 if (dstAddr->OperIsLocalAddr())
1912 MakeSrcContained(blkNode, dstAddr);
1915 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
1919 blkNode->gtLsraInfo.setInternalCandidates(l, RBM_NONE);
1920 dstAddrRegMask = RBM_RDI;
1921 sourceRegMask = RBM_RSI;
1922 blkSizeRegMask = RBM_RCX;
1923 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
1926 #ifdef _TARGET_AMD64_
1929 // In case we have a constant integer this means we went beyond
1930 // CPBLK_MOVS_LIMIT bytes of size, still we should never have the case of
1931 // any GC-Pointers in the src struct.
1932 blkNode->gtLsraInfo.setInternalCandidates(l, RBM_NONE);
1933 dstAddrRegMask = RBM_ARG_0;
1934 sourceRegMask = RBM_ARG_1;
1935 blkSizeRegMask = RBM_ARG_2;
1936 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
1938 #elif defined(_TARGET_X86_)
1941 dstAddrRegMask = RBM_RDI;
1942 sourceRegMask = RBM_RSI;
1943 blkSizeRegMask = RBM_RCX;
1944 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
1946 #endif // _TARGET_X86_
1947 assert(blkNode->gtBlkOpKind != GenTreeBlk::BlkOpKindInvalid);
1949 if (dstAddrRegMask != RBM_NONE)
1951 dstAddr->gtLsraInfo.setSrcCandidates(l, dstAddrRegMask);
1953 if (sourceRegMask != RBM_NONE)
1955 if (srcAddrOrFill != nullptr)
1957 srcAddrOrFill->gtLsraInfo.setSrcCandidates(l, sourceRegMask);
1961 // This is a local source; we'll use a temp register for its address.
1962 blkNode->gtLsraInfo.addInternalCandidates(l, sourceRegMask);
1963 blkNode->gtLsraInfo.internalIntCount++;
1966 if (blkSizeRegMask != RBM_NONE)
1970 // Reserve a temp register for the block size argument.
1971 blkNode->gtLsraInfo.addInternalCandidates(l, blkSizeRegMask);
1972 blkNode->gtLsraInfo.internalIntCount++;
1976 // The block size argument is a third argument to GT_STORE_DYN_BLK
1977 noway_assert(blkNode->gtOper == GT_STORE_DYN_BLK);
1978 blkNode->gtLsraInfo.setSrcCount(3);
1979 GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
1980 blockSize->gtLsraInfo.setSrcCandidates(l, blkSizeRegMask);
1985 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1986 //------------------------------------------------------------------------
1987 // TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
1990 // tree - The node of interest
1995 void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk)
1997 TreeNodeInfo* info = &(putArgStk->gtLsraInfo);
1998 LinearScan* l = m_lsra;
2001 if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
2003 putArgStk->gtNumberReferenceSlots = 0;
2004 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Invalid;
2006 GenTreeFieldList* fieldList = putArgStk->gtOp1->AsFieldList();
2008 // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order
2009 // of uses is visible to LSRA.
2010 unsigned fieldCount = 0;
2011 GenTreeFieldList* head = nullptr;
2012 for (GenTreeFieldList *current = fieldList, *next; current != nullptr; current = next)
2014 next = current->Rest();
2016 // First, insert the field node into the sorted list.
2017 GenTreeFieldList* prev = nullptr;
2018 for (GenTreeFieldList* cursor = head;; cursor = cursor->Rest())
2020 // If the offset of the current list node is greater than the offset of the cursor or if we have
2021 // reached the end of the list, insert the current node before the cursor and terminate.
2022 if ((cursor == nullptr) || (current->gtFieldOffset > cursor->gtFieldOffset))
2024 if (prev == nullptr)
2026 assert(cursor == head);
2031 prev->Rest() = current;
2034 current->Rest() = cursor;
2042 info->srcCount = fieldCount;
2045 // In theory, the upper bound for the size of a field list is 8: these constructs only appear when passing the
2046 // collection of lclVars that represent the fields of a promoted struct lclVar, and we do not promote struct
2047 // lclVars with more than 4 fields. If each of these lclVars is of type long, decomposition will split the
2048 // corresponding field list nodes in two, giving an upper bound of 8.
2050 // The reason that this is important is that the algorithm we use above to sort the field list is O(N^2): if
2051 // the maximum size of a field list grows significantly, we will need to reevaluate it.
2052 assert(fieldCount <= 8);
2054 // The sort above may have changed which node is at the head of the list. Update the PUTARG_STK node if
2056 if (head != fieldList)
2058 head->gtFlags |= GTF_FIELD_LIST_HEAD;
2059 fieldList->gtFlags &= ~GTF_FIELD_LIST_HEAD;
2062 head->gtSeqNum = fieldList->gtSeqNum;
2065 head->gtLsraInfo = fieldList->gtLsraInfo;
2066 head->gtClearReg(comp);
2068 BlockRange().InsertAfter(fieldList, head);
2069 BlockRange().Remove(fieldList);
2072 putArgStk->gtOp1 = fieldList;
2075 // Now that the fields have been sorted, initialize the LSRA info.
2076 bool allFieldsAreSlots = true;
2077 unsigned prevOffset = putArgStk->getArgSize();
2078 for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
2080 GenTree* const fieldNode = current->Current();
2081 const var_types fieldType = fieldNode->TypeGet();
2082 const unsigned fieldOffset = current->gtFieldOffset;
2083 assert(fieldType != TYP_LONG);
2085 // TODO-X86-CQ: we could probably improve codegen here by marking all of the operands to field nodes that
2086 // we are going to `push` on the stack as reg-optional.
2088 const bool fieldIsSlot =
2089 varTypeIsIntegralOrI(fieldType) && ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
2092 allFieldsAreSlots = false;
2093 if (varTypeIsByte(fieldType))
2095 // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
2096 // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
2097 // need a byte-addressable register for the store.
2098 fieldNode->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_NON_BYTE_REGS);
2102 if (varTypeIsGC(fieldType))
2104 putArgStk->gtNumberReferenceSlots++;
2107 prevOffset = fieldOffset;
2110 // If all fields of this list are slots, set the copy kind.
2111 if (allFieldsAreSlots)
2113 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::AllSlots;
2117 #endif // _TARGET_X86_
2119 if (putArgStk->TypeGet() != TYP_STRUCT)
2121 TreeNodeInfoInitSimple(putArgStk);
2125 GenTreePtr dst = putArgStk;
2126 GenTreePtr src = putArgStk->gtOp1;
2127 GenTreePtr srcAddr = nullptr;
2129 bool haveLocalAddr = false;
2130 if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND))
2132 srcAddr = src->gtOp.gtOp1;
2133 assert(srcAddr != nullptr);
2134 haveLocalAddr = srcAddr->OperIsLocalAddr();
2138 assert(varTypeIsSIMD(putArgStk));
2141 info->srcCount = src->gtLsraInfo.dstCount;
2144 // In case of a CpBlk we could use a helper call. In case of putarg_stk we
2145 // can't do that since the helper call could kill some already set up outgoing args.
2146 // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj.
2147 // The cpyXXXX code is rather complex and this could cause it to be more complex, but
2148 // it might be the right thing to do.
2150 // This threshold will decide from using the helper or let the JIT decide to inline
2151 // a code sequence of its choice.
2152 ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
2153 ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
2155 // TODO-X86-CQ: The helper call either is not supported on x86 or required more work
2156 // (I don't know which).
2158 // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
2159 // Structs and buffer with sizes <= CPBLK_UNROLL_LIMIT bytes are occurring in more than 95% of
2160 // our framework assemblies, so this is the main code generation scheme we'll use.
2161 if (size <= CPBLK_UNROLL_LIMIT && putArgStk->gtNumberReferenceSlots == 0)
2163 // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
2165 // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
2166 // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude
2167 // RBM_NON_BYTE_REGS from internal candidates.
2168 if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
2170 info->internalIntCount++;
2171 regMaskTP regMask = l->allRegs(TYP_INT);
2174 if ((size % 2) != 0)
2176 regMask &= ~RBM_NON_BYTE_REGS;
2179 info->setInternalCandidates(l, regMask);
2184 #else // !_TARGET_X86_
2185 if (size >= XMM_REGSIZE_BYTES)
2186 #endif // !_TARGET_X86_
2188 // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
2189 // or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a
2190 // series of 16-byte loads and stores.
2191 info->internalFloatCount = 1;
2192 info->addInternalCandidates(l, l->internalFloatRegCandidates());
2195 // If src or dst are on stack, we don't have to generate the address into a register
2196 // because it's just some constant+SP
2197 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
2200 else if (putArgStk->gtNumberReferenceSlots != 0)
2202 // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update
2203 // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
2205 #endif // _TARGET_X86_
2208 info->internalIntCount += 3;
2209 info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI));
2211 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
2214 // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
2215 MakeSrcContained(putArgStk, src);
2219 // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary
2222 // To avoid an assertion in MakeSrcContained, increment the parent's source count beforehand and decrement it
2225 MakeSrcContained(putArgStk, srcAddr);
2229 #endif // FEATURE_PUT_STRUCT_ARG_STK
2231 //------------------------------------------------------------------------
2232 // TreeNodeInfoInitLclHeap: Set the NodeInfo for a GT_LCLHEAP.
2235 // tree - The node of interest
2240 void Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
2242 TreeNodeInfo* info = &(tree->gtLsraInfo);
2243 LinearScan* l = m_lsra;
2244 Compiler* compiler = comp;
2249 // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
2250 // Here '-' means don't care.
2252 // Size? Init Memory? # temp regs
2253 // 0 - 0 (returns 0)
2254 // const and <=6 reg words - 0 (pushes '0')
2255 // const and >6 reg words Yes 0 (pushes '0')
2256 // const and <PageSize No 0 (amd64) 1 (x86)
2257 // (x86:tmpReg for sutracting from esp)
2258 // const and >=PageSize No 2 (regCnt and tmpReg for subtracing from sp)
2259 // Non-const Yes 0 (regCnt=targetReg and pushes '0')
2260 // Non-const No 2 (regCnt and tmpReg for subtracting from sp)
2262 // Note: Here we don't need internal register to be different from targetReg.
2263 // Rather, require it to be different from operand's reg.
2265 GenTreePtr size = tree->gtOp.gtOp1;
2266 if (size->IsCnsIntOrI())
2268 MakeSrcContained(tree, size);
2270 size_t sizeVal = size->gtIntCon.gtIconVal;
2274 info->internalIntCount = 0;
2278 // Compute the amount of memory to properly STACK_ALIGN.
2279 // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
2280 // This should also help in debugging as we can examine the original size specified with localloc.
2281 sizeVal = AlignUp(sizeVal, STACK_ALIGN);
2283 // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
2284 // we will generate 'push 0'.
2285 assert((sizeVal % REGSIZE_BYTES) == 0);
2286 size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
2287 if (cntRegSizedWords <= 6)
2289 info->internalIntCount = 0;
2291 else if (!compiler->info.compInitMem)
2293 // No need to initialize allocated stack space.
2294 if (sizeVal < compiler->eeGetPageSize())
2297 info->internalIntCount = 1; // x86 needs a register here to avoid generating "sub" on ESP.
2298 #else // !_TARGET_X86_
2299 info->internalIntCount = 0;
2300 #endif // !_TARGET_X86_
2304 // We need two registers: regCnt and RegTmp
2305 info->internalIntCount = 2;
2310 // >6 and need to zero initialize allocated stack space.
2311 info->internalIntCount = 0;
2317 if (!compiler->info.compInitMem)
2319 info->internalIntCount = 2;
2323 info->internalIntCount = 0;
2328 //------------------------------------------------------------------------
2329 // TreeNodeInfoInitLogicalOp: Set the NodeInfo for GT_AND/GT_OR/GT_XOR,
2330 // as well as GT_ADD/GT_SUB.
2333 // tree - The node of interest
2338 void Lowering::TreeNodeInfoInitLogicalOp(GenTree* tree)
2340 TreeNodeInfo* info = &(tree->gtLsraInfo);
2341 LinearScan* l = m_lsra;
2343 // We're not marking a constant hanging on the left of the add
2344 // as containable so we assign it to a register having CQ impact.
2345 // TODO-XArch-CQ: Detect this case and support both generating a single instruction
2346 // for GT_ADD(Constant, SomeTree)
2350 GenTree* op1 = tree->gtGetOp1();
2351 GenTree* op2 = tree->gtGetOp2();
2353 // We can directly encode the second operand if it is either a containable constant or a memory-op.
2354 // In case of memory-op, we can encode it directly provided its type matches with 'tree' type.
2355 // This is because during codegen, type of 'tree' is used to determine emit Type size. If the types
2356 // do not match, they get normalized (i.e. sign/zero extended) on load into a register.
2357 bool directlyEncodable = false;
2358 bool binOpInRMW = false;
2359 GenTreePtr operand = nullptr;
2361 if (IsContainableImmed(tree, op2))
2363 directlyEncodable = true;
2368 binOpInRMW = IsBinOpInRMWStoreInd(tree);
2371 if (op2->isMemoryOp() && tree->TypeGet() == op2->TypeGet())
2373 directlyEncodable = true;
2376 else if (tree->OperIsCommutative())
2378 if (IsContainableImmed(tree, op1) ||
2379 (op1->isMemoryOp() && tree->TypeGet() == op1->TypeGet() && IsSafeToContainMem(tree, op1)))
2381 // If it is safe, we can reverse the order of operands of commutative operations for efficient
2383 directlyEncodable = true;
2390 if (directlyEncodable)
2392 assert(operand != nullptr);
2393 MakeSrcContained(tree, operand);
2395 else if (!binOpInRMW)
2397 // If this binary op neither has contained operands, nor is a
2398 // Read-Modify-Write (RMW) operation, we can mark its operands
2400 SetRegOptionalForBinOp(tree);
2404 //------------------------------------------------------------------------
2405 // TreeNodeInfoInitModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
2408 // tree - The node of interest
2413 void Lowering::TreeNodeInfoInitModDiv(GenTree* tree)
2415 TreeNodeInfo* info = &(tree->gtLsraInfo);
2416 LinearScan* l = m_lsra;
2418 GenTree* op1 = tree->gtGetOp1();
2419 GenTree* op2 = tree->gtGetOp2();
2424 switch (tree->OperGet())
2428 if (varTypeIsFloating(tree->TypeGet()))
2430 // No implicit conversions at this stage as the expectation is that
2431 // everything is made explicit by adding casts.
2432 assert(op1->TypeGet() == op2->TypeGet());
2434 if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
2436 MakeSrcContained(tree, op2);
2440 // If there are no containable operands, we can make an operand reg optional.
2441 // SSE2 allows only op2 to be a memory-op.
2442 SetRegOptional(op2);
2453 // Amd64 Div/Idiv instruction:
2454 // Dividend in RAX:RDX and computes
2455 // Quotient in RAX, Remainder in RDX
2457 if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
2459 // We are interested in just the remainder.
2460 // RAX is used as a trashable register during computation of remainder.
2461 info->setDstCandidates(l, RBM_RDX);
2465 // We are interested in just the quotient.
2466 // RDX gets used as trashable register during computation of quotient
2467 info->setDstCandidates(l, RBM_RAX);
2471 if (op1->OperGet() == GT_LONG)
2473 // To avoid reg move would like to have op1's low part in RAX and high part in RDX.
2474 GenTree* loVal = op1->gtGetOp1();
2475 GenTree* hiVal = op1->gtGetOp2();
2477 // Src count is actually 3, so increment.
2478 assert(op2->IsCnsIntOrI());
2481 loVal->gtLsraInfo.setSrcCandidates(l, RBM_EAX);
2482 hiVal->gtLsraInfo.setSrcCandidates(l, RBM_EDX);
2487 // If possible would like to have op1 in RAX to avoid a register move
2488 op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
2491 // divisor can be an r/m, but the memory indirection must be of the same size as the divide
2492 if (op2->isMemoryOp() && (op2->TypeGet() == tree->TypeGet()))
2494 MakeSrcContained(tree, op2);
2498 op2->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
2500 // If there are no containable operands, we can make an operand reg optional.
2501 // Div instruction allows only op2 to be a memory op.
2502 SetRegOptional(op2);
2506 //------------------------------------------------------------------------
2507 // TreeNodeInfoInitIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
2510 // tree - The node of interest
2515 void Lowering::TreeNodeInfoInitIntrinsic(GenTree* tree)
2517 TreeNodeInfo* info = &(tree->gtLsraInfo);
2518 LinearScan* l = m_lsra;
2520 // Both operand and its result must be of floating point type.
2521 GenTree* op1 = tree->gtGetOp1();
2522 assert(varTypeIsFloating(op1));
2523 assert(op1->TypeGet() == tree->TypeGet());
2528 switch (tree->gtIntrinsic.gtIntrinsicId)
2530 case CORINFO_INTRINSIC_Sqrt:
2531 if (op1->isMemoryOp() || op1->IsCnsNonZeroFltOrDbl())
2533 MakeSrcContained(tree, op1);
2537 // Mark the operand as reg optional since codegen can still
2538 // generate code if op1 is on stack.
2539 SetRegOptional(op1);
2543 case CORINFO_INTRINSIC_Abs:
2544 // Abs(float x) = x & 0x7fffffff
2545 // Abs(double x) = x & 0x7ffffff ffffffff
2547 // In case of Abs we need an internal register to hold mask.
2549 // TODO-XArch-CQ: avoid using an internal register for the mask.
2550 // Andps or andpd both will operate on 128-bit operands.
2551 // The data section constant to hold the mask is a 64-bit size.
2552 // Therefore, we need both the operand and mask to be in
2553 // xmm register. When we add support in emitter to emit 128-bit
2554 // data constants and instructions that operate on 128-bit
2555 // memory operands we can avoid the need for an internal register.
2556 if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs)
2558 info->internalFloatCount = 1;
2559 info->setInternalCandidates(l, l->internalFloatRegCandidates());
2564 case CORINFO_INTRINSIC_Cos:
2565 case CORINFO_INTRINSIC_Sin:
2566 case CORINFO_INTRINSIC_Round:
2567 NYI_X86("Math intrinsics Cos, Sin and Round");
2569 #endif // _TARGET_X86_
2572 // Right now only Sqrt/Abs are treated as math intrinsics
2573 noway_assert(!"Unsupported math intrinsic");
2580 //------------------------------------------------------------------------
2581 // TreeNodeInfoInitSIMD: Set the NodeInfo for a GT_SIMD tree.
2584 // tree - The GT_SIMD node of interest
2589 void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
2591 GenTreeSIMD* simdTree = tree->AsSIMD();
2592 TreeNodeInfo* info = &(tree->gtLsraInfo);
2593 LinearScan* lsra = m_lsra;
2595 switch (simdTree->gtSIMDIntrinsicID)
2600 case SIMDIntrinsicInit:
2603 op1 = tree->gtOp.gtOp1;
2605 // This sets all fields of a SIMD struct to the given value.
2606 // Mark op1 as contained if it is either zero or int constant of all 1's,
2607 // or a float constant with 16 or 32 byte simdType (AVX case)
2609 // Should never see small int base type vectors except for zero initialization.
2610 assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0));
2612 if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
2613 (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
2615 MakeSrcContained(tree, tree->gtOp.gtOp1);
2618 else if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) &&
2619 ((simdTree->gtSIMDSize == 16) || (simdTree->gtSIMDSize == 32)))
2621 // Either op1 is a float or dbl constant or an addr
2622 if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr())
2624 MakeSrcContained(tree, tree->gtOp.gtOp1);
2631 case SIMDIntrinsicInitN:
2633 info->srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(simdTree->gtSIMDBaseType));
2635 // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
2636 info->internalFloatCount = 1;
2637 info->setInternalCandidates(lsra, lsra->allSIMDRegs());
2641 case SIMDIntrinsicInitArray:
2642 // We have an array and an index, which may be contained.
2644 CheckImmedAndMakeContained(tree, tree->gtGetOp2());
2647 case SIMDIntrinsicDiv:
2648 // SSE2 has no instruction support for division on integer vectors
2649 noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
2653 case SIMDIntrinsicAbs:
2654 // This gets implemented as bitwise-And operation with a mask
2655 // and hence should never see it here.
2659 case SIMDIntrinsicSqrt:
2660 // SSE2 has no instruction support for sqrt on integer vectors.
2661 noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
2665 case SIMDIntrinsicAdd:
2666 case SIMDIntrinsicSub:
2667 case SIMDIntrinsicMul:
2668 case SIMDIntrinsicBitwiseAnd:
2669 case SIMDIntrinsicBitwiseAndNot:
2670 case SIMDIntrinsicBitwiseOr:
2671 case SIMDIntrinsicBitwiseXor:
2672 case SIMDIntrinsicMin:
2673 case SIMDIntrinsicMax:
2676 // SSE2 32-bit integer multiplication requires two temp regs
2677 if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT)
2679 info->internalFloatCount = 2;
2680 info->setInternalCandidates(lsra, lsra->allSIMDRegs());
2684 case SIMDIntrinsicEqual:
2688 // SSE2 doesn't support < and <= directly on int vectors.
2689 // Instead we need to use > and >= with swapped operands.
2690 case SIMDIntrinsicLessThan:
2691 case SIMDIntrinsicLessThanOrEqual:
2693 noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
2696 // SIMDIntrinsicEqual is supported only on non-floating point base type vectors.
2697 // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors.
2698 // Instead we need to use < and <= with swapped operands.
2699 case SIMDIntrinsicGreaterThan:
2700 noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
2704 case SIMDIntrinsicOpEquality:
2705 case SIMDIntrinsicOpInEquality:
2708 // On AVX, we can generate optimal code for (in)equality
2710 op2 = tree->gtGetOp2();
2711 if (comp->canUseAVX() && op2->IsIntegralConstVector(0))
2713 // On AVX we can use ptest instruction for (in)equality
2714 // against zero to generate optimal code.
2716 // We can safely do the below optimization for integral
2717 // vectors but not for floating-point for the reason
2718 // that we have +0.0 and -0.0 and +0.0 == -0.0
2719 MakeSrcContained(tree, op2);
2724 // Need two SIMD registers as scratch.
2725 // See genSIMDIntrinsicRelOp() for details on code sequence generate and
2726 // the need for two scratch registers.
2728 // Note these intrinsics produce a BOOL result, hence internal float
2729 // registers reserved are guaranteed to be different from target
2730 // integer register without explicitly specifying.
2731 info->internalFloatCount = 2;
2732 info->setInternalCandidates(lsra, lsra->allSIMDRegs());
2736 case SIMDIntrinsicDotProduct:
2737 // Float/Double vectors:
2738 // For SSE, or AVX with 32-byte vectors, we also need an internal register
2739 // as scratch. Further we need the targetReg and internal reg to be distinct
2740 // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
2741 // don't need a tmpReg.
2743 // 32-byte integer vector on AVX:
2744 // will take advantage of phaddd, which operates only on 128-bit xmm reg.
2745 // This would need 2 internal registers since targetReg is an int type
2748 // See genSIMDIntrinsicDotProduct() for details on code sequence generated
2749 // and the need for scratch registers.
2750 if (varTypeIsFloating(simdTree->gtSIMDBaseType))
2752 if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) ||
2753 (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32))
2755 info->internalFloatCount = 1;
2756 info->isInternalRegDelayFree = true;
2757 info->setInternalCandidates(lsra, lsra->allSIMDRegs());
2759 // else don't need scratch reg(s).
2763 assert(simdTree->gtSIMDBaseType == TYP_INT && comp->canUseAVX());
2765 // No need to set isInternalRegDelayFree since targetReg is a
2766 // an int type reg and guaranteed to be different from xmm/ymm
2768 info->internalFloatCount = 2;
2769 info->setInternalCandidates(lsra, lsra->allSIMDRegs());
2774 case SIMDIntrinsicGetItem:
2776 // This implements get_Item method. The sources are:
2777 // - the source SIMD struct
2778 // - index (which element to get)
2779 // The result is baseType of SIMD struct.
2781 op1 = tree->gtOp.gtOp1;
2782 op2 = tree->gtOp.gtOp2;
2784 // If the index is a constant, mark it as contained.
2785 if (CheckImmedAndMakeContained(tree, op2))
2790 if (op1->isMemoryOp())
2792 MakeSrcContained(tree, op1);
2794 // Although GT_IND of TYP_SIMD12 reserves an internal float
2795 // register for reading 4 and 8 bytes from memory and
2796 // assembling them into target XMM reg, it is not required
2798 op1->gtLsraInfo.internalIntCount = 0;
2799 op1->gtLsraInfo.internalFloatCount = 0;
2803 // If the index is not a constant, we will use the SIMD temp location to store the vector.
2804 // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
2805 // can use that in the process of extracting the element.
2807 // If the index is a constant and base type is a small int we can use pextrw, but on AVX
2808 // we will need a temp if are indexing into the upper half of the AVX register.
2809 // In all other cases with constant index, we need a temp xmm register to extract the
2810 // element if index is other than zero.
2812 if (!op2->IsCnsIntOrI())
2814 (void)comp->getSIMDInitTempVarNum();
2816 else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
2819 if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
2820 (comp->getSIMDInstructionSet() == InstructionSet_AVX))
2822 int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
2823 needFloatTemp = (byteShiftCnt >= 16);
2827 needFloatTemp = !op2->IsIntegralConst(0);
2832 info->internalFloatCount = 1;
2833 info->setInternalCandidates(lsra, lsra->allSIMDRegs());
2840 case SIMDIntrinsicSetX:
2841 case SIMDIntrinsicSetY:
2842 case SIMDIntrinsicSetZ:
2843 case SIMDIntrinsicSetW:
2846 // We need an internal integer register for SSE2 codegen
2847 if (comp->getSIMDInstructionSet() == InstructionSet_SSE2)
2849 info->internalIntCount = 1;
2850 info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT));
2855 case SIMDIntrinsicCast:
2859 case SIMDIntrinsicShuffleSSE2:
2861 // Second operand is an integer constant and marked as contained.
2862 op2 = tree->gtOp.gtOp2;
2863 noway_assert(op2->IsCnsIntOrI());
2864 MakeSrcContained(tree, op2);
2867 case SIMDIntrinsicGetX:
2868 case SIMDIntrinsicGetY:
2869 case SIMDIntrinsicGetZ:
2870 case SIMDIntrinsicGetW:
2871 case SIMDIntrinsicGetOne:
2872 case SIMDIntrinsicGetZero:
2873 case SIMDIntrinsicGetCount:
2874 case SIMDIntrinsicGetAllOnes:
2875 assert(!"Get intrinsics should not be seen during Lowering.");
2879 noway_assert(!"Unimplemented SIMD node type.");
2883 #endif // FEATURE_SIMD
2885 //------------------------------------------------------------------------
2886 // TreeNodeInfoInitCast: Set the NodeInfo for a GT_CAST.
2889 // tree - The node of interest
2894 void Lowering::TreeNodeInfoInitCast(GenTree* tree)
2896 TreeNodeInfo* info = &(tree->gtLsraInfo);
2898 // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register.
2899 // see CodeGen::genIntToIntCast()
2904 // Non-overflow casts to/from float/double are done using SSE2 instructions
2905 // and that allow the source operand to be either a reg or memop. Given the
2906 // fact that casts from small int to float/double are done as two-level casts,
2907 // the source operand is always guaranteed to be of size 4 or 8 bytes.
2908 var_types castToType = tree->CastToType();
2909 GenTreePtr castOp = tree->gtCast.CastOp();
2910 var_types castOpType = castOp->TypeGet();
2911 if (tree->gtFlags & GTF_UNSIGNED)
2913 castOpType = genUnsignedType(castOpType);
2916 if (!tree->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(castOpType)))
2919 // If converting to float/double, the operand must be 4 or 8 byte in size.
2920 if (varTypeIsFloating(castToType))
2922 unsigned opSize = genTypeSize(castOpType);
2923 assert(opSize == 4 || opSize == 8);
2927 // U8 -> R8 conversion requires that the operand be in a register.
2928 if (castOpType != TYP_ULONG)
2930 if (castOp->isMemoryOp() || castOp->IsCnsNonZeroFltOrDbl())
2932 MakeSrcContained(tree, castOp);
2936 // Mark castOp as reg optional to indicate codegen
2937 // can still generate code if it is on stack.
2938 SetRegOptional(castOp);
2943 #if !defined(_TARGET_64BIT_)
2944 if (varTypeIsLong(castOpType))
2946 noway_assert(castOp->OperGet() == GT_LONG);
2949 #endif // !defined(_TARGET_64BIT_)
2951 // some overflow checks need a temp reg:
2952 // - GT_CAST from INT64/UINT64 to UINT32
2953 if (tree->gtOverflow() && (castToType == TYP_UINT))
2955 if (genTypeSize(castOpType) == 8)
2957 // Here we don't need internal register to be different from targetReg,
2958 // rather require it to be different from operand's reg.
2959 info->internalIntCount = 1;
2964 void Lowering::LowerGCWriteBarrier(GenTree* tree)
2966 assert(tree->OperGet() == GT_STOREIND);
2968 GenTreeStoreInd* dst = tree->AsStoreInd();
2969 GenTreePtr addr = dst->Addr();
2970 GenTreePtr src = dst->Data();
2972 if (addr->OperGet() == GT_LEA)
2974 // In the case where we are doing a helper assignment, if the dst
2975 // is an indir through an lea, we need to actually instantiate the
2976 // lea in a register
2977 GenTreeAddrMode* lea = addr->AsAddrMode();
2979 int leaSrcCount = 0;
2984 if (lea->HasIndex())
2988 lea->gtLsraInfo.srcCount = leaSrcCount;
2989 lea->gtLsraInfo.dstCount = 1;
2992 bool useOptimizedWriteBarrierHelper = false; // By default, assume no optimized write barriers.
2994 #if NOGC_WRITE_BARRIERS
2996 #if defined(_TARGET_X86_)
2998 useOptimizedWriteBarrierHelper = true; // On x86, use the optimized write barriers by default.
3000 GCInfo::WriteBarrierForm wbf = comp->codeGen->gcInfo.gcIsWriteBarrierCandidate(tree, src);
3001 if (wbf == GCInfo::WBF_NoBarrier_CheckNotHeapInDebug) // This one is always a call to a C++ method.
3003 useOptimizedWriteBarrierHelper = false;
3007 if (useOptimizedWriteBarrierHelper)
3009 // Special write barrier:
3010 // op1 (addr) goes into REG_WRITE_BARRIER (rdx) and
3011 // op2 (src) goes into any int register.
3012 addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER);
3013 src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER_SRC);
3016 #else // !defined(_TARGET_X86_)
3017 #error "NOGC_WRITE_BARRIERS is not supported"
3018 #endif // !defined(_TARGET_X86_)
3020 #endif // NOGC_WRITE_BARRIERS
3022 if (!useOptimizedWriteBarrierHelper)
3024 // For the standard JIT Helper calls:
3025 // op1 (addr) goes into REG_ARG_0 and
3026 // op2 (src) goes into REG_ARG_1
3027 addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_0);
3028 src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_1);
3031 // Both src and dst must reside in a register, which they should since we haven't set
3032 // either of them as contained.
3033 assert(addr->gtLsraInfo.dstCount == 1);
3034 assert(src->gtLsraInfo.dstCount == 1);
3037 //-----------------------------------------------------------------------------------------
3038 // Specify register requirements for address expression of an indirection operation.
3041 // indirTree - GT_IND or GT_STOREIND gentree node
3043 void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
3045 assert(indirTree->isIndir());
3046 // If this is the rhs of a block copy (i.e. non-enregisterable struct),
3047 // it has no register requirements.
3048 if (indirTree->TypeGet() == TYP_STRUCT)
3053 GenTreePtr addr = indirTree->gtGetOp1();
3054 TreeNodeInfo* info = &(indirTree->gtLsraInfo);
3056 GenTreePtr base = nullptr;
3057 GenTreePtr index = nullptr;
3062 // If indirTree is of TYP_SIMD12, don't mark addr as contained
3063 // so that it always get computed to a register. This would
3064 // mean codegen side logic doesn't need to handle all possible
3065 // addr expressions that could be contained.
3067 // TODO-XArch-CQ: handle other addr mode expressions that could be marked
3069 if (indirTree->TypeGet() == TYP_SIMD12)
3071 // Vector3 is read/written as two reads/writes: 8 byte and 4 byte.
3072 // To assemble the vector properly we would need an additional
3074 info->internalFloatCount = 1;
3076 // In case of GT_IND we need an internal register different from targetReg and
3077 // both of the registers are used at the same time.
3078 if (indirTree->OperGet() == GT_IND)
3080 info->isInternalRegDelayFree = true;
3083 info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
3087 #endif // FEATURE_SIMD
3089 if ((indirTree->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0)
3091 // The address of an indirection that requires its address in a reg.
3092 // Skip any further processing that might otherwise make it contained.
3094 else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR))
3096 // These nodes go into an addr mode:
3097 // - GT_CLS_VAR_ADDR turns into a constant.
3098 // - GT_LCL_VAR_ADDR is a stack addr mode.
3100 // make this contained, it turns into a constant that goes into an addr mode
3101 MakeSrcContained(indirTree, addr);
3103 else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp))
3106 // We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address.
3107 // (i.e. those VSD calls for which stub addr is known during JIT compilation time). In this case,
3108 // VM requires us to pass stub addr in REG_VIRTUAL_STUB_PARAM - see LowerVirtualStubCall(). For
3109 // that reason we cannot mark such an addr as contained. Note that this is not an issue for
3110 // indirect VSD calls since morphArgs() is explicitly materializing hidden param as a non-standard
3114 // Note that LowerVirtualStubCall() sets addr->gtRegNum to REG_VIRTUAL_STUB_PARAM and Lowering::doPhase()
3115 // sets destination candidates on such nodes and resets addr->gtRegNum to REG_NA before calling
3116 // TreeNodeInfoInit(). Ideally we should set a flag on addr nodes that shouldn't be marked as contained
3117 // (in LowerVirtualStubCall()), but we don't have any GTF_* flags left for that purpose. As a workaround
3118 // an explicit check is made here.
3120 // On x86, direct VSD is done via a relative branch, and in fact it MUST be contained.
3121 MakeSrcContained(indirTree, addr);
3123 else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr))
3125 MakeSrcContained(indirTree, addr);
3127 else if (comp->codeGen->genCreateAddrMode(addr, -1, true, 0, &rev, &base, &index, &mul, &cns, true /*nogen*/) &&
3128 !AreSourcesPossiblyModifiedLocals(indirTree, base, index))
3130 // An addressing mode will be constructed that may cause some
3131 // nodes to not need a register, and cause others' lifetimes to be extended
3132 // to the GT_IND or even its parent if it's an assignment
3134 assert(base != addr);
3135 m_lsra->clearOperandCounts(addr);
3137 const bool hasBase = base != nullptr;
3138 const bool hasIndex = index != nullptr;
3139 assert(hasBase || hasIndex); // At least one of a base or an index must be present.
3141 // If the addressing mode has both a base and an index, bump its source count by one. If it only has one or the
3142 // other, its source count is already correct (due to the source for the address itself).
3143 if (hasBase && hasIndex)
3148 // Traverse the computation below GT_IND to find the operands
3149 // for the addressing mode, marking the various constants and
3150 // intermediate results as not consuming/producing.
3151 // If the traversal were more complex, we might consider using
3152 // a traversal function, but the addressing mode is only made
3153 // up of simple arithmetic operators, and the code generator
3154 // only traverses one leg of each node.
3156 bool foundBase = !hasBase;
3157 bool foundIndex = !hasIndex;
3158 for (GenTree *child = addr, *nextChild = nullptr; child != nullptr && !child->OperIsLeaf(); child = nextChild)
3160 nextChild = nullptr;
3161 GenTree* op1 = child->gtOp.gtOp1;
3162 GenTree* op2 = (child->OperIsBinary()) ? child->gtOp.gtOp2 : nullptr;
3168 else if (op1 == index)
3174 m_lsra->clearOperandCounts(op1);
3175 if (!op1->OperIsLeaf())
3187 else if (op2 == index)
3193 m_lsra->clearOperandCounts(op2);
3194 if (!op2->OperIsLeaf())
3196 assert(nextChild == nullptr);
3202 assert(foundBase && foundIndex);
3204 else if (addr->gtOper == GT_ARR_ELEM)
3206 // The GT_ARR_ELEM consumes all the indices and produces the offset.
3207 // The array object lives until the mem access.
3208 // We also consume the target register to which the address is
3212 assert(addr->gtLsraInfo.srcCount >= 2);
3213 addr->gtLsraInfo.srcCount -= 1;
3217 void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree)
3219 assert(tree->OperIsCompare());
3221 TreeNodeInfo* info = &(tree->gtLsraInfo);
3227 // If the compare is used by a jump, we just need to set the condition codes. If not, then we need
3228 // to store the result into the low byte of a register, which requires the dst be a byteable register.
3229 // We always set the dst candidates, though, because if this is compare is consumed by a jump, they
3230 // won't be used. We might be able to use GTF_RELOP_JMP_USED to determine this case, but it's not clear
3231 // that flag is maintained until this location (especially for decomposed long compares).
3232 info->setDstCandidates(m_lsra, RBM_BYTE_REGS);
3233 #endif // _TARGET_X86_
3235 GenTreePtr op1 = tree->gtOp.gtOp1;
3236 GenTreePtr op2 = tree->gtOp.gtOp2;
3237 var_types op1Type = op1->TypeGet();
3238 var_types op2Type = op2->TypeGet();
3240 #if !defined(_TARGET_64BIT_)
3241 // Long compares will consume GT_LONG nodes, each of which produces two results.
3242 // Thus for each long operand there will be an additional source.
3243 // TODO-X86-CQ: Mark hiOp2 and loOp2 as contained if it is a constant or a memory op.
3244 if (varTypeIsLong(op1Type))
3248 if (varTypeIsLong(op2Type))
3252 #endif // !defined(_TARGET_64BIT_)
3254 // If either of op1 or op2 is floating point values, then we need to use
3255 // ucomiss or ucomisd to compare, both of which support the following form:
3256 // ucomis[s|d] xmm, xmm/mem
3257 // That is only the second operand can be a memory op.
3259 // Second operand is a memory Op: Note that depending on comparison operator,
3260 // the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or
3261 // op2 can be a memory op depending on the comparison operator.
3262 if (varTypeIsFloating(op1Type))
3264 // The type of the operands has to be the same and no implicit conversions at this stage.
3265 assert(op1Type == op2Type);
3268 if ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0)
3270 // Unordered comparison case
3271 reverseOps = (tree->gtOper == GT_GT || tree->gtOper == GT_GE);
3275 reverseOps = (tree->gtOper == GT_LT || tree->gtOper == GT_LE);
3288 assert(otherOp != nullptr);
3289 if (otherOp->IsCnsNonZeroFltOrDbl())
3291 MakeSrcContained(tree, otherOp);
3293 else if (otherOp->isMemoryOp() && ((otherOp == op2) || IsSafeToContainMem(tree, otherOp)))
3295 MakeSrcContained(tree, otherOp);
3299 // SSE2 allows only otherOp to be a memory-op. Since otherOp is not
3300 // contained, we can mark it reg-optional.
3301 SetRegOptional(otherOp);
3307 // TODO-XArch-CQ: factor out cmp optimization in 'genCondSetFlags' to be used here
3308 // or in other backend.
3310 bool hasShortCast = false;
3311 if (CheckImmedAndMakeContained(tree, op2))
3313 // If the types are the same, or if the constant is of the correct size,
3314 // we can treat the isMemoryOp as contained.
3315 bool op1CanBeContained = (genTypeSize(op1Type) == genTypeSize(op2Type));
3317 // Do we have a short compare against a constant in op2
3319 if (varTypeIsSmall(op1Type))
3321 GenTreeIntCon* con = op2->AsIntCon();
3322 ssize_t ival = con->gtIconVal;
3324 bool isEqualityCompare = (tree->gtOper == GT_EQ || tree->gtOper == GT_NE);
3325 bool useTest = isEqualityCompare && (ival == 0);
3329 ssize_t lo = 0; // minimum imm value allowed for cmp reg,imm
3330 ssize_t hi = 0; // maximum imm value allowed for cmp reg,imm
3331 bool isUnsigned = false;
3336 op1Type = TYP_UBYTE;
3360 if ((ival >= lo) && (ival <= hi))
3362 // We can perform a small compare with the immediate 'ival'
3363 tree->gtFlags |= GTF_RELOP_SMALL;
3364 if (isUnsigned && !isEqualityCompare)
3366 tree->gtFlags |= GTF_UNSIGNED;
3368 // We can treat the isMemoryOp as "contained"
3369 op1CanBeContained = true;
3374 if (op1CanBeContained)
3376 if (op1->isMemoryOp())
3378 MakeSrcContained(tree, op1);
3382 bool op1IsMadeContained = false;
3384 // When op1 is a GT_AND we can often generate a single "test" instruction
3385 // instead of two instructions (an "and" instruction followed by a "cmp"/"test").
3387 // This instruction can only be used for equality or inequality comparisons.
3388 // and we must have a compare against zero.
3390 // If we have a postive test for a single bit we can reverse the condition and
3391 // make the compare be against zero.
3396 // GT_AND GT_CNS (0x100) ==>> GT_AND GT_CNS (0)
3398 // andOp1 GT_CNS (0x100) andOp1 GT_CNS (0x100)
3400 // We will mark the GT_AND node as contained if the tree is an equality compare with zero.
3401 // Additionally, when we do this we also allow for a contained memory operand for "andOp1".
3403 bool isEqualityCompare = (tree->gtOper == GT_EQ || tree->gtOper == GT_NE);
3405 if (isEqualityCompare && (op1->OperGet() == GT_AND))
3407 GenTreePtr andOp2 = op1->gtOp.gtOp2;
3408 if (IsContainableImmed(op1, andOp2))
3410 ssize_t andOp2CnsVal = andOp2->AsIntConCommon()->IconValue();
3411 ssize_t relOp2CnsVal = op2->AsIntConCommon()->IconValue();
3413 if ((relOp2CnsVal == andOp2CnsVal) && isPow2(andOp2CnsVal))
3415 // We have a single bit test, so now we can change the
3416 // tree into the alternative form,
3417 // so that we can generate a test instruction.
3419 // Reverse the equality comparison
3420 tree->SetOperRaw((tree->gtOper == GT_EQ) ? GT_NE : GT_EQ);
3422 // Change the relOp2CnsVal to zero
3424 op2->AsIntConCommon()->SetIconValue(0);
3427 // Now do we have a equality compare with zero?
3429 if (relOp2CnsVal == 0)
3431 // Note that child nodes must be made contained before parent nodes
3433 // Check for a memory operand for op1 with the test instruction
3435 GenTreePtr andOp1 = op1->gtOp.gtOp1;
3436 if (andOp1->isMemoryOp())
3438 // If the type of value memoryOp (andOp1) is not the same as the type of constant
3439 // (andOp2) check to see whether it is safe to mark AndOp1 as contained. For e.g. in
3440 // the following case it is not safe to mark andOp1 as contained
3441 // AndOp1 = signed byte and andOp2 is an int constant of value 512.
3443 // If it is safe, we update the type and value of andOp2 to match with andOp1.
3444 bool containable = (andOp1->TypeGet() == op1->TypeGet());
3447 ssize_t newIconVal = 0;
3449 switch (andOp1->TypeGet())
3454 newIconVal = (signed char)andOp2CnsVal;
3455 containable = FitsIn<signed char>(andOp2CnsVal);
3459 newIconVal = andOp2CnsVal & 0xFF;
3463 newIconVal = (signed short)andOp2CnsVal;
3464 containable = FitsIn<signed short>(andOp2CnsVal);
3467 newIconVal = andOp2CnsVal & 0xFFFF;
3471 newIconVal = (INT32)andOp2CnsVal;
3472 containable = FitsIn<INT32>(andOp2CnsVal);
3475 newIconVal = andOp2CnsVal & 0xFFFFFFFF;
3479 #ifdef _TARGET_64BIT_
3481 newIconVal = (INT64)andOp2CnsVal;
3485 newIconVal = (UINT64)andOp2CnsVal;
3488 #endif //_TARGET_64BIT_
3493 andOp2->gtType = andOp1->TypeGet();
3494 andOp2->AsIntConCommon()->SetIconValue(newIconVal);
3498 // Mark the 'andOp1' memory operand as contained
3499 // Note that for equality comparisons we don't need
3500 // to deal with any signed or unsigned issues.
3503 MakeSrcContained(op1, andOp1);
3506 // Mark the 'op1' (the GT_AND) operand as contained
3507 MakeSrcContained(tree, op1);
3508 op1IsMadeContained = true;
3510 // During Codegen we will now generate "test andOp1, andOp2CnsVal"
3514 else if (op1->OperGet() == GT_CAST)
3516 // If the op1 is a cast operation, and cast type is one byte sized unsigned type,
3517 // we can directly use the number in register, instead of doing an extra cast step.
3518 var_types dstType = op1->CastToType();
3519 bool isUnsignedDst = varTypeIsUnsigned(dstType);
3520 emitAttr castSize = EA_ATTR(genTypeSize(dstType));
3521 GenTreePtr castOp1 = op1->gtOp.gtOp1;
3522 genTreeOps castOp1Oper = castOp1->OperGet();
3523 bool safeOper = false;
3525 // It is not always safe to change the gtType of 'castOp1' to TYP_UBYTE.
3526 // For example when 'castOp1Oper' is a GT_RSZ or GT_RSH then we are shifting
3527 // bits from the left into the lower bits. If we change the type to a TYP_UBYTE
3528 // we will instead generate a byte sized shift operation: shr al, 24
3529 // For the following ALU operations is it safe to change the gtType to the
3532 if ((castOp1Oper == GT_CNS_INT) || (castOp1Oper == GT_CALL) || // the return value from a Call
3533 (castOp1Oper == GT_LCL_VAR) || castOp1->OperIsLogical() || // GT_AND, GT_OR, GT_XOR
3534 castOp1->isMemoryOp()) // isIndir() || isLclField();
3539 if ((castSize == EA_1BYTE) && isUnsignedDst && // Unsigned cast to TYP_UBYTE
3540 safeOper && // Must be a safe operation
3541 !op1->gtOverflow()) // Must not be an overflow checking cast
3543 // Currently all of the Oper accepted as 'safeOper' are
3544 // non-overflow checking operations. If we were to add
3545 // an overflow checking operation then this assert needs
3546 // to be moved above to guard entry to this block.
3548 assert(!castOp1->gtOverflowEx()); // Must not be an overflow checking operation
3550 GenTreePtr removeTreeNode = op1;
3551 tree->gtOp.gtOp1 = castOp1;
3553 castOp1->gtType = TYP_UBYTE;
3555 // trim down the value if castOp1 is an int constant since its type changed to UBYTE.
3556 if (castOp1Oper == GT_CNS_INT)
3558 castOp1->gtIntCon.gtIconVal = (UINT8)castOp1->gtIntCon.gtIconVal;
3561 // TODO-Cleanup: we're within "if (CheckImmedAndMakeContained(tree, op2))", so isn't
3562 // the following condition always true?
3563 if (op2->isContainedIntOrIImmed())
3565 ssize_t val = (ssize_t)op2->AsIntConCommon()->IconValue();
3566 if (val >= 0 && val <= 255)
3568 op2->gtType = TYP_UBYTE;
3569 tree->gtFlags |= GTF_UNSIGNED;
3571 // right now the op1's type is the same as op2's type.
3572 // if op1 is MemoryOp, we should make the op1 as contained node.
3573 if (castOp1->isMemoryOp())
3575 MakeSrcContained(tree, op1);
3576 op1IsMadeContained = true;
3581 BlockRange().Remove(removeTreeNode);
3583 // We've changed the type on op1 to TYP_UBYTE, but we already processed that node. We need to
3584 // go back and mark it byteable.
3585 // TODO-Cleanup: it might be better to move this out of the TreeNodeInfoInit pass to the earlier
3586 // "lower" pass, in which case the byteable check would just fall out. But that is quite
3588 TreeNodeInfoInitCheckByteable(op1);
3594 "TreeNodeInfoInitCmp: Removing a GT_CAST to TYP_UBYTE and changing castOp1->gtType to "
3596 comp->gtDispTreeRange(BlockRange(), tree);
3602 // If not made contained, op1 can be marked as reg-optional.
3603 if (!op1IsMadeContained)
3605 SetRegOptional(op1);
3610 else if (op1Type == op2Type)
3612 if (op2->isMemoryOp())
3614 MakeSrcContained(tree, op2);
3616 else if (op1->isMemoryOp() && IsSafeToContainMem(tree, op1))
3618 MakeSrcContained(tree, op1);
3620 else if (op1->IsCnsIntOrI())
3622 // TODO-CQ: We should be able to support swapping op1 and op2 to generate cmp reg, imm,
3623 // but there is currently an assert in CodeGen::genCompareInt().
3624 // https://github.com/dotnet/coreclr/issues/7270
3625 SetRegOptional(op2);
3629 // One of op1 or op2 could be marked as reg optional
3630 // to indicate that codegen can still generate code
3631 // if one of them is on stack.
3632 SetRegOptional(PreferredRegOptionalOperand(tree));
3635 if (varTypeIsSmall(op1Type) && varTypeIsUnsigned(op1Type))
3637 // Mark the tree as doing unsigned comparison if
3638 // both the operands are small and unsigned types.
3639 // Otherwise we will end up performing a signed comparison
3640 // of two small unsigned values without zero extending them to
3641 // TYP_INT size and which is incorrect.
3642 tree->gtFlags |= GTF_UNSIGNED;
3647 /* Lower GT_CAST(srcType, DstType) nodes.
3649 * Casts from small int type to float/double are transformed as follows:
3650 * GT_CAST(byte, float/double) = GT_CAST(GT_CAST(byte, int32), float/double)
3651 * GT_CAST(sbyte, float/double) = GT_CAST(GT_CAST(sbyte, int32), float/double)
3652 * GT_CAST(int16, float/double) = GT_CAST(GT_CAST(int16, int32), float/double)
3653 * GT_CAST(uint16, float/double) = GT_CAST(GT_CAST(uint16, int32), float/double)
3655 * SSE2 conversion instructions operate on signed integers. casts from Uint32/Uint64
3656 * are morphed as follows by front-end and hence should not be seen here.
3657 * GT_CAST(uint32, float/double) = GT_CAST(GT_CAST(uint32, long), float/double)
3658 * GT_CAST(uint64, float) = GT_CAST(GT_CAST(uint64, double), float)
3661 * Similarly casts from float/double to a smaller int type are transformed as follows:
3662 * GT_CAST(float/double, byte) = GT_CAST(GT_CAST(float/double, int32), byte)
3663 * GT_CAST(float/double, sbyte) = GT_CAST(GT_CAST(float/double, int32), sbyte)
3664 * GT_CAST(float/double, int16) = GT_CAST(GT_CAST(double/double, int32), int16)
3665 * GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(double/double, int32), uint16)
3667 * SSE2 has instructions to convert a float/double vlaue into a signed 32/64-bit
3668 * integer. The above transformations help us to leverage those instructions.
3670 * Note that for the following conversions we still depend on helper calls and
3671 * don't expect to see them here.
3672 * i) GT_CAST(float/double, uint64)
3673 * ii) GT_CAST(float/double, int type with overflow detection)
3675 * TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above.
3676 * There are hardly any occurrences of this conversion operation in platform
3677 * assemblies or in CQ perf benchmarks (1 occurrence in mscorlib, microsoft.jscript,
3678 * 1 occurence in Roslyn and no occurrences in system, system.core, system.numerics
3679 * system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that
3680 * doing this optimization is a win, should consider generating in-lined code.
3682 void Lowering::LowerCast(GenTree* tree)
3684 assert(tree->OperGet() == GT_CAST);
3686 GenTreePtr op1 = tree->gtOp.gtOp1;
3687 var_types dstType = tree->CastToType();
3688 var_types srcType = op1->TypeGet();
3689 var_types tmpType = TYP_UNDEF;
3690 bool srcUns = false;
3692 // force the srcType to unsigned if GT_UNSIGNED flag is set
3693 if (tree->gtFlags & GTF_UNSIGNED)
3695 srcType = genUnsignedType(srcType);
3698 // We should never see the following casts as they are expected to be lowered
3699 // apropriately or converted into helper calls by front-end.
3700 // srcType = float/double dstType = * and overflow detecting cast
3701 // Reason: must be converted to a helper call
3702 // srcType = float/double, dstType = ulong
3703 // Reason: must be converted to a helper call
3704 // srcType = uint dstType = float/double
3705 // Reason: uint -> float/double = uint -> long -> float/double
3706 // srcType = ulong dstType = float
3707 // Reason: ulong -> float = ulong -> double -> float
3708 if (varTypeIsFloating(srcType))
3710 noway_assert(!tree->gtOverflow());
3711 noway_assert(dstType != TYP_ULONG);
3713 else if (srcType == TYP_UINT)
3715 noway_assert(!varTypeIsFloating(dstType));
3717 else if (srcType == TYP_ULONG)
3719 noway_assert(dstType != TYP_FLOAT);
3722 // Case of src is a small type and dst is a floating point type.
3723 if (varTypeIsSmall(srcType) && varTypeIsFloating(dstType))
3725 // These conversions can never be overflow detecting ones.
3726 noway_assert(!tree->gtOverflow());
3729 // case of src is a floating point type and dst is a small type.
3730 else if (varTypeIsFloating(srcType) && varTypeIsSmall(dstType))
3735 if (tmpType != TYP_UNDEF)
3737 GenTreePtr tmp = comp->gtNewCastNode(tmpType, op1, tmpType);
3738 tmp->gtFlags |= (tree->gtFlags & (GTF_UNSIGNED | GTF_OVERFLOW | GTF_EXCEPT));
3740 tree->gtFlags &= ~GTF_UNSIGNED;
3741 tree->gtOp.gtOp1 = tmp;
3742 BlockRange().InsertAfter(op1, tmp);
3746 //----------------------------------------------------------------------------------------------
3747 // Lowering::IsRMWIndirCandidate:
3748 // Returns true if the given operand is a candidate indirection for a read-modify-write
3752 // operand - The operand to consider.
3753 // storeInd - The indirect store that roots the possible RMW operator.
3755 bool Lowering::IsRMWIndirCandidate(GenTree* operand, GenTree* storeInd)
3757 // If the operand isn't an indirection, it's trivially not a candidate.
3758 if (operand->OperGet() != GT_IND)
3763 // If the indirection's source address isn't equivalent to the destination address of the storeIndir, then the
3764 // indirection is not a candidate.
3765 GenTree* srcAddr = operand->gtGetOp1();
3766 GenTree* dstAddr = storeInd->gtGetOp1();
3767 if ((srcAddr->OperGet() != dstAddr->OperGet()) || !IndirsAreEquivalent(operand, storeInd))
3772 // If it is not safe to contain the entire tree rooted at the indirection, then the indirection is not a
3773 // candidate. Crawl the IR from the node immediately preceding the storeIndir until the last node in the
3774 // indirection's tree is visited and check the side effects at each point.
3776 m_scratchSideEffects.Clear();
3778 assert((operand->gtLIRFlags & LIR::Flags::Mark) == 0);
3779 operand->gtLIRFlags |= LIR::Flags::Mark;
3781 unsigned markCount = 1;
3783 for (node = storeInd->gtPrev; markCount > 0; node = node->gtPrev)
3785 assert(node != nullptr);
3787 if ((node->gtLIRFlags & LIR::Flags::Mark) == 0)
3789 m_scratchSideEffects.AddNode(comp, node);
3793 node->gtLIRFlags &= ~LIR::Flags::Mark;
3796 if (m_scratchSideEffects.InterferesWith(comp, node, false))
3798 // The indirection's tree contains some node that can't be moved to the storeInder. The indirection is
3799 // not a candidate. Clear any leftover mark bits and return.
3800 for (; markCount > 0; node = node->gtPrev)
3802 if ((node->gtLIRFlags & LIR::Flags::Mark) != 0)
3804 node->gtLIRFlags &= ~LIR::Flags::Mark;
3811 for (GenTree* nodeOperand : node->Operands())
3813 assert((nodeOperand->gtLIRFlags & LIR::Flags::Mark) == 0);
3814 nodeOperand->gtLIRFlags |= LIR::Flags::Mark;
3820 // At this point we've verified that the operand is an indirection, its address is equivalent to the storeIndir's
3821 // destination address, and that it and the transitive closure of its operand can be safely contained by the
3822 // storeIndir. This indirection is therefore a candidate for an RMW op.
3826 //----------------------------------------------------------------------------------------------
3827 // Returns true if this tree is bin-op of a GT_STOREIND of the following form
3828 // storeInd(subTreeA, binOp(gtInd(subTreeA), subtreeB)) or
3829 // storeInd(subTreeA, binOp(subtreeB, gtInd(subTreeA)) in case of commutative bin-ops
3831 // The above form for storeInd represents a read-modify-write memory binary operation.
3834 // tree - GentreePtr of binOp
3837 // True if 'tree' is part of a RMW memory operation pattern
3839 bool Lowering::IsBinOpInRMWStoreInd(GenTreePtr tree)
3841 // Must be a non floating-point type binary operator since SSE2 doesn't support RMW memory ops
3842 assert(!varTypeIsFloating(tree));
3843 assert(GenTree::OperIsBinary(tree->OperGet()));
3845 // Cheap bail out check before more expensive checks are performed.
3846 // RMW memory op pattern requires that one of the operands of binOp to be GT_IND.
3847 if (tree->gtGetOp1()->OperGet() != GT_IND && tree->gtGetOp2()->OperGet() != GT_IND)
3853 if (!BlockRange().TryGetUse(tree, &use) || use.User()->OperGet() != GT_STOREIND || use.User()->gtGetOp2() != tree)
3858 // Since it is not relatively cheap to recognize RMW memory op pattern, we
3859 // cache the result in GT_STOREIND node so that while lowering GT_STOREIND
3860 // we can use the result.
3861 GenTreePtr indirCandidate = nullptr;
3862 GenTreePtr indirOpSource = nullptr;
3863 return IsRMWMemOpRootedAtStoreInd(use.User(), &indirCandidate, &indirOpSource);
3866 //----------------------------------------------------------------------------------------------
3867 // This method recognizes the case where we have a treeNode with the following structure:
3868 // storeInd(IndirDst, binOp(gtInd(IndirDst), indirOpSource)) OR
3869 // storeInd(IndirDst, binOp(indirOpSource, gtInd(IndirDst)) in case of commutative operations OR
3870 // storeInd(IndirDst, unaryOp(gtInd(IndirDst)) in case of unary operations
3873 // indirDst = memory write of an addr mode (i.e. storeind destination)
3874 // indirSrc = value being written to memory (i.e. storeind source which could either be a binary or unary op)
3875 // indirCandidate = memory read i.e. a gtInd of an addr mode
3876 // indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
3878 // In x86/x64 this storeInd pattern can be effectively encoded in a single instruction of the
3879 // following form in case of integer operations:
3880 // binOp [addressing mode], RegIndirOpSource
3881 // binOp [addressing mode], immediateVal
3882 // where RegIndirOpSource is the register where indirOpSource was computed.
3884 // Right now, we recognize few cases:
3885 // a) The gtInd child is a lea/lclVar/lclVarAddr/clsVarAddr/constant
3886 // b) BinOp is either add, sub, xor, or, and, shl, rsh, rsz.
3887 // c) unaryOp is either not/neg
3889 // Implementation Note: The following routines need to be in sync for RMW memory op optimization
3890 // to be correct and functional.
3891 // IndirsAreEquivalent()
3892 // NodesAreEquivalentLeaves()
3893 // Codegen of GT_STOREIND and genCodeForShiftRMW()
3896 // TODO-CQ: Enable support for more complex indirections (if needed) or use the value numbering
3897 // package to perform more complex tree recognition.
3899 // TODO-XArch-CQ: Add support for RMW of lcl fields (e.g. lclfield binop= source)
3902 // tree - GT_STOREIND node
3903 // outIndirCandidate - out param set to indirCandidate as described above
3904 // ouutIndirOpSource - out param set to indirOpSource as described above
3907 // True if there is a RMW memory operation rooted at a GT_STOREIND tree
3908 // and out params indirCandidate and indirOpSource are set to non-null values.
3909 // Otherwise, returns false with indirCandidate and indirOpSource set to null.
3910 // Also updates flags of GT_STOREIND tree with its RMW status.
3912 bool Lowering::IsRMWMemOpRootedAtStoreInd(GenTreePtr tree, GenTreePtr* outIndirCandidate, GenTreePtr* outIndirOpSource)
3914 assert(!varTypeIsFloating(tree));
3915 assert(outIndirCandidate != nullptr);
3916 assert(outIndirOpSource != nullptr);
3918 *outIndirCandidate = nullptr;
3919 *outIndirOpSource = nullptr;
3921 // Early out if storeInd is already known to be a non-RMW memory op
3922 GenTreeStoreInd* storeInd = tree->AsStoreInd();
3923 if (storeInd->IsNonRMWMemoryOp())
3928 GenTreePtr indirDst = storeInd->gtGetOp1();
3929 GenTreePtr indirSrc = storeInd->gtGetOp2();
3930 genTreeOps oper = indirSrc->OperGet();
3932 // Early out if it is already known to be a RMW memory op
3933 if (storeInd->IsRMWMemoryOp())
3935 if (GenTree::OperIsBinary(oper))
3937 if (storeInd->IsRMWDstOp1())
3939 *outIndirCandidate = indirSrc->gtGetOp1();
3940 *outIndirOpSource = indirSrc->gtGetOp2();
3944 assert(storeInd->IsRMWDstOp2());
3945 *outIndirCandidate = indirSrc->gtGetOp2();
3946 *outIndirOpSource = indirSrc->gtGetOp1();
3948 assert(IndirsAreEquivalent(*outIndirCandidate, storeInd));
3952 assert(GenTree::OperIsUnary(oper));
3953 assert(IndirsAreEquivalent(indirSrc->gtGetOp1(), storeInd));
3954 *outIndirCandidate = indirSrc->gtGetOp1();
3955 *outIndirOpSource = indirSrc->gtGetOp1();
3961 // If reached here means that we do not know RMW status of tree rooted at storeInd
3962 assert(storeInd->IsRMWStatusUnknown());
3964 // Early out if indirDst is not one of the supported memory operands.
3965 if (indirDst->OperGet() != GT_LEA && indirDst->OperGet() != GT_LCL_VAR && indirDst->OperGet() != GT_LCL_VAR_ADDR &&
3966 indirDst->OperGet() != GT_CLS_VAR_ADDR && indirDst->OperGet() != GT_CNS_INT)
3968 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
3972 // We can not use Read-Modify-Write instruction forms with overflow checking instructions
3973 // because we are not allowed to modify the target until after the overflow check.
3974 if (indirSrc->gtOverflowEx())
3976 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
3980 // At this point we can match one of two patterns:
3982 // t_ind = indir t_addr_0
3984 // t_value = binop t_ind, t_other
3986 // storeIndir t_addr_1, t_value
3990 // t_ind = indir t_addr_0
3992 // t_value = unop t_ind
3994 // storeIndir t_addr_1, t_value
3996 // In all cases, we will eventually make the binop that produces t_value and the entire dataflow tree rooted at
3997 // t_ind contained by t_value.
3999 GenTree* indirCandidate = nullptr;
4000 GenTree* indirOpSource = nullptr;
4001 RMWStatus status = STOREIND_RMW_STATUS_UNKNOWN;
4002 if (GenTree::OperIsBinary(oper))
4004 // Return if binary op is not one of the supported operations for RMW of memory.
4005 if (oper != GT_ADD && oper != GT_SUB && oper != GT_AND && oper != GT_OR && oper != GT_XOR &&
4006 !GenTree::OperIsShiftOrRotate(oper))
4008 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
4012 if (GenTree::OperIsShiftOrRotate(oper) && varTypeIsSmall(storeInd))
4014 // In ldind, Integer values smaller than 4 bytes, a boolean, or a character converted to 4 bytes
4015 // by sign or zero-extension as appropriate. If we directly shift the short type data using sar, we
4016 // will lose the sign or zero-extension bits.
4017 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_TYPE);
4021 // In the common case, the second operand to the binop will be the indir candidate.
4022 GenTreeOp* binOp = indirSrc->AsOp();
4023 if (GenTree::OperIsCommutative(oper) && IsRMWIndirCandidate(binOp->gtOp2, storeInd))
4025 indirCandidate = binOp->gtOp2;
4026 indirOpSource = binOp->gtOp1;
4027 status = STOREIND_RMW_DST_IS_OP2;
4029 else if (IsRMWIndirCandidate(binOp->gtOp1, storeInd))
4031 indirCandidate = binOp->gtOp1;
4032 indirOpSource = binOp->gtOp2;
4033 status = STOREIND_RMW_DST_IS_OP1;
4037 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
4041 else if (GenTree::OperIsUnary(oper))
4043 // Nodes other than GT_NOT and GT_NEG are not yet supported.
4044 if (oper != GT_NOT && oper != GT_NEG)
4046 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
4050 if (indirSrc->gtGetOp1()->OperGet() != GT_IND)
4052 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
4056 GenTreeUnOp* unOp = indirSrc->AsUnOp();
4057 if (IsRMWIndirCandidate(unOp->gtOp1, storeInd))
4059 // src and dest are the same in case of unary ops
4060 indirCandidate = unOp->gtOp1;
4061 indirOpSource = unOp->gtOp1;
4062 status = STOREIND_RMW_DST_IS_OP1;
4066 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
4072 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
4076 // By this point we've verified that we have a supported operand with a supported address. Now we need to ensure
4077 // that we're able to move the destination address for the source indirection forwards.
4078 if (!IsSafeToContainMem(storeInd, indirDst))
4080 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
4084 assert(indirCandidate != nullptr);
4085 assert(indirOpSource != nullptr);
4086 assert(status != STOREIND_RMW_STATUS_UNKNOWN);
4088 *outIndirCandidate = indirCandidate;
4089 *outIndirOpSource = indirOpSource;
4090 storeInd->SetRMWStatus(status);
4094 //--------------------------------------------------------------------------------------------
4095 // SetStoreIndOpCountsIfRMWMemOp checks to see if there is a RMW memory operation rooted at
4096 // GT_STOREIND node and if so will mark register requirements for nodes under storeInd so
4097 // that CodeGen will generate a single instruction of the form:
4099 // binOp [addressing mode], reg
4102 // storeInd - GT_STOREIND node
4105 // True, if RMW memory op tree pattern is recognized and op counts are set.
4108 bool Lowering::SetStoreIndOpCountsIfRMWMemOp(GenTreePtr storeInd)
4110 assert(storeInd->OperGet() == GT_STOREIND);
4112 // SSE2 doesn't support RMW on float values
4113 assert(!varTypeIsFloating(storeInd));
4116 // indirDst = memory write of an addr mode (i.e. storeind destination)
4117 // indirSrc = value being written to memory (i.e. storeind source which could a binary/unary op)
4118 // indirCandidate = memory read i.e. a gtInd of an addr mode
4119 // indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
4121 GenTreePtr indirCandidate = nullptr;
4122 GenTreePtr indirOpSource = nullptr;
4124 if (!IsRMWMemOpRootedAtStoreInd(storeInd, &indirCandidate, &indirOpSource))
4126 JITDUMP("Lower of StoreInd didn't mark the node as self contained for reason: %d\n",
4127 storeInd->AsStoreInd()->GetRMWStatus());
4128 DISPTREERANGE(BlockRange(), storeInd);
4132 GenTreePtr indirDst = storeInd->gtGetOp1();
4133 GenTreePtr indirSrc = storeInd->gtGetOp2();
4134 genTreeOps oper = indirSrc->OperGet();
4136 // At this point we have successfully detected a RMW memory op of one of the following forms
4137 // storeInd(indirDst, indirSrc(indirCandidate, indirOpSource)) OR
4138 // storeInd(indirDst, indirSrc(indirOpSource, indirCandidate) in case of commutative operations OR
4139 // storeInd(indirDst, indirSrc(indirCandidate) in case of unary operations
4141 // Here indirSrc = one of the supported binary or unary operation for RMW of memory
4142 // indirCandidate = a GT_IND node
4143 // indirCandidateChild = operand of GT_IND indirCandidate
4145 // The logic below essentially does the following
4146 // set storeInd src count to that of the dst count of indirOpSource
4147 // clear operand counts on indirSrc (i.e. marked as contained and storeInd will generate code for it)
4148 // clear operand counts on indirCandidate
4149 // clear operand counts on indirDst except when it is a GT_LCL_VAR or GT_CNS_INT that doesn't fit within addr
4151 // Increment src count of storeInd to account for the registers required to form indirDst addr mode
4152 // clear operand counts on indirCandidateChild
4154 TreeNodeInfo* info = &(storeInd->gtLsraInfo);
4157 if (GenTree::OperIsBinary(oper))
4159 // On Xarch RMW operations require that the source memory-op be in a register.
4160 assert(!indirOpSource->isMemoryOp() || indirOpSource->gtLsraInfo.dstCount == 1);
4161 JITDUMP("Lower succesfully detected an assignment of the form: *addrMode BinOp= source\n");
4162 info->srcCount = indirOpSource->gtLsraInfo.dstCount;
4166 assert(GenTree::OperIsUnary(oper));
4167 JITDUMP("Lower succesfully detected an assignment of the form: *addrMode = UnaryOp(*addrMode)\n");
4170 DISPTREERANGE(BlockRange(), storeInd);
4172 m_lsra->clearOperandCounts(indirSrc);
4173 m_lsra->clearOperandCounts(indirCandidate);
4175 GenTreePtr indirCandidateChild = indirCandidate->gtGetOp1();
4176 if (indirCandidateChild->OperGet() == GT_LEA)
4178 GenTreeAddrMode* addrMode = indirCandidateChild->AsAddrMode();
4180 if (addrMode->HasBase())
4182 assert(addrMode->Base()->OperIsLeaf());
4183 m_lsra->clearOperandCounts(addrMode->Base());
4187 if (addrMode->HasIndex())
4189 assert(addrMode->Index()->OperIsLeaf());
4190 m_lsra->clearOperandCounts(addrMode->Index());
4194 m_lsra->clearOperandCounts(indirDst);
4198 assert(indirCandidateChild->OperGet() == GT_LCL_VAR || indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR ||
4199 indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR || indirCandidateChild->OperGet() == GT_CNS_INT);
4201 // If it is a GT_LCL_VAR, it still needs the reg to hold the address.
4202 // We would still need a reg for GT_CNS_INT if it doesn't fit within addressing mode base.
4203 // For GT_CLS_VAR_ADDR, we don't need a reg to hold the address, because field address value is known at jit
4204 // time. Also, we don't need a reg for GT_CLS_VAR_ADDR.
4205 if (indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR)
4207 m_lsra->clearOperandCounts(indirDst);
4209 else if (indirCandidateChild->IsCnsIntOrI() && indirCandidateChild->AsIntConCommon()->FitsInAddrBase(comp))
4211 m_lsra->clearOperandCounts(indirDst);
4215 // Need a reg and hence increment src count of storeind
4216 info->srcCount += indirCandidateChild->gtLsraInfo.dstCount;
4219 m_lsra->clearOperandCounts(indirCandidateChild);
4222 if (varTypeIsByte(storeInd))
4224 // If storeInd is of TYP_BYTE, set indirOpSources to byteable registers.
4225 bool containedNode = indirOpSource->gtLsraInfo.dstCount == 0;
4228 regMaskTP regMask = indirOpSource->gtLsraInfo.getSrcCandidates(m_lsra);
4229 assert(regMask != RBM_NONE);
4230 indirOpSource->gtLsraInfo.setSrcCandidates(m_lsra, regMask & ~RBM_NON_BYTE_REGS);
4239 * Takes care of annotating the src and dst register
4240 * requirements for a GT_MUL treenode.
4242 void Lowering::SetMulOpCounts(GenTreePtr tree)
4244 #if defined(_TARGET_X86_)
4245 assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI || tree->OperGet() == GT_MUL_LONG);
4247 assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI);
4249 TreeNodeInfo* info = &(tree->gtLsraInfo);
4254 GenTreePtr op1 = tree->gtOp.gtOp1;
4255 GenTreePtr op2 = tree->gtOp.gtOp2;
4257 // Case of float/double mul.
4258 if (varTypeIsFloating(tree->TypeGet()))
4260 assert(tree->OperGet() == GT_MUL);
4262 if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
4264 MakeSrcContained(tree, op2);
4266 else if (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1)))
4268 // Since GT_MUL is commutative, we will try to re-order operands if it is safe to
4269 // generate more efficient code sequence for the case of GT_MUL(op1=memOp, op2=non-memOp)
4270 MakeSrcContained(tree, op1);
4274 // If there are no containable operands, we can make an operand reg optional.
4275 SetRegOptionalForBinOp(tree);
4280 bool isUnsignedMultiply = ((tree->gtFlags & GTF_UNSIGNED) != 0);
4281 bool requiresOverflowCheck = tree->gtOverflowEx();
4282 bool useLeaEncoding = false;
4283 GenTreePtr memOp = nullptr;
4285 bool hasImpliedFirstOperand = false;
4286 GenTreeIntConCommon* imm = nullptr;
4287 GenTreePtr other = nullptr;
4289 // There are three forms of x86 multiply:
4290 // one-op form: RDX:RAX = RAX * r/m
4291 // two-op form: reg *= r/m
4292 // three-op form: reg = r/m * imm
4294 // This special widening 32x32->64 MUL is not used on x64
4295 #if defined(_TARGET_X86_)
4296 if (tree->OperGet() != GT_MUL_LONG)
4299 assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
4302 // Multiply should never be using small types
4303 assert(!varTypeIsSmall(tree->TypeGet()));
4305 // We do use the widening multiply to implement
4306 // the overflow checking for unsigned multiply
4308 if (isUnsignedMultiply && requiresOverflowCheck)
4310 // The only encoding provided is RDX:RAX = RAX * rm
4312 // Here we set RAX as the only destination candidate
4313 // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX
4315 info->setDstCandidates(m_lsra, RBM_RAX);
4316 hasImpliedFirstOperand = true;
4318 else if (tree->gtOper == GT_MULHI
4319 #if defined(_TARGET_X86_)
4320 || tree->OperGet() == GT_MUL_LONG
4324 // have to use the encoding:RDX:RAX = RAX * rm
4325 info->setDstCandidates(m_lsra, RBM_RAX);
4326 hasImpliedFirstOperand = true;
4328 else if (IsContainableImmed(tree, op2) || IsContainableImmed(tree, op1))
4330 if (IsContainableImmed(tree, op2))
4332 imm = op2->AsIntConCommon();
4337 imm = op1->AsIntConCommon();
4341 // CQ: We want to rewrite this into a LEA
4342 ssize_t immVal = imm->AsIntConCommon()->IconValue();
4343 if (!requiresOverflowCheck && (immVal == 3 || immVal == 5 || immVal == 9))
4345 useLeaEncoding = true;
4348 MakeSrcContained(tree, imm); // The imm is always contained
4349 if (other->isMemoryOp())
4351 memOp = other; // memOp may be contained below
4355 // We allow one operand to be a contained memory operand.
4356 // The memory op type must match with the 'tree' type.
4357 // This is because during codegen we use 'tree' type to derive EmitTypeSize.
4358 // E.g op1 type = byte, op2 type = byte but GT_MUL tree type is int.
4360 if (memOp == nullptr && op2->isMemoryOp())
4365 // To generate an LEA we need to force memOp into a register
4366 // so don't allow memOp to be 'contained'
4368 if (!useLeaEncoding)
4370 if ((memOp != nullptr) && (memOp->TypeGet() == tree->TypeGet()) && IsSafeToContainMem(tree, memOp))
4372 MakeSrcContained(tree, memOp);
4374 else if (imm != nullptr)
4376 // Has a contained immediate operand.
4377 // Only 'other' operand can be marked as reg optional.
4378 assert(other != nullptr);
4379 SetRegOptional(other);
4381 else if (hasImpliedFirstOperand)
4383 // Only op2 can be marke as reg optional.
4384 SetRegOptional(op2);
4388 // If there are no containable operands, we can make either of op1 or op2
4390 SetRegOptionalForBinOp(tree);
4395 //------------------------------------------------------------------------------
4396 // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format
4399 // tree - a binary tree node
4402 // Returns true if we can use the read-modify-write instruction form
4405 // This is used to determine whether to preference the source to the destination register.
4407 bool Lowering::isRMWRegOper(GenTreePtr tree)
4409 // TODO-XArch-CQ: Make this more accurate.
4410 // For now, We assume that most binary operators are of the RMW form.
4411 assert(tree->OperIsBinary());
4413 if (tree->OperIsCompare())
4418 switch (tree->OperGet())
4420 // These Opers either support a three op form (i.e. GT_LEA), or do not read/write their first operand
4428 // x86/x64 does support a three op multiply when op2|op1 is a contained immediate
4430 return (!IsContainableImmed(tree, tree->gtOp.gtOp2) && !IsContainableImmed(tree, tree->gtOp.gtOp1));
4437 // anything is in range for AMD64
4438 bool Lowering::IsCallTargetInRange(void* addr)
4443 // return true if the immediate can be folded into an instruction, for example small enough and non-relocatable
4444 bool Lowering::IsContainableImmed(GenTree* parentNode, GenTree* childNode)
4446 if (!childNode->IsIntCnsFitsInI32())
4451 // At this point we know that it is an int const fits within 4-bytes and hence can safely cast to IntConCommon.
4452 // Icons that need relocation should never be marked as contained immed
4453 if (childNode->AsIntConCommon()->ImmedValNeedsReloc(comp))
4461 //-----------------------------------------------------------------------
4462 // PreferredRegOptionalOperand: returns one of the operands of given
4463 // binary oper that is to be preferred for marking as reg optional.
4465 // Since only one of op1 or op2 can be a memory operand on xarch, only
4466 // one of them have to be marked as reg optional. Since Lower doesn't
4467 // know apriori which of op1 or op2 is not likely to get a register, it
4468 // has to make a guess. This routine encapsulates heuristics that
4469 // guess whether it is likely to be beneficial to mark op1 or op2 as
4474 // tree - a binary-op tree node that is either commutative
4475 // or a compare oper.
4478 // Returns op1 or op2 of tree node that is preferred for
4479 // marking as reg optional.
4481 // Note: if the tree oper is neither commutative nor a compare oper
4482 // then only op2 can be reg optional on xarch and hence no need to
4483 // call this routine.
4484 GenTree* Lowering::PreferredRegOptionalOperand(GenTree* tree)
4486 assert(GenTree::OperIsBinary(tree->OperGet()));
4487 assert(tree->OperIsCommutative() || tree->OperIsCompare());
4489 GenTree* op1 = tree->gtGetOp1();
4490 GenTree* op2 = tree->gtGetOp2();
4491 GenTree* preferredOp = nullptr;
4493 // This routine uses the following heuristics:
4495 // a) If both are tracked locals, marking the one with lower weighted
4496 // ref count as reg-optional would likely be beneficial as it has
4497 // higher probability of not getting a register.
4499 // b) op1 = tracked local and op2 = untracked local: LSRA creates two
4500 // ref positions for op2: a def and use position. op2's def position
4501 // requires a reg and it is allocated a reg by spilling another
4502 // interval (if required) and that could be even op1. For this reason
4503 // it is beneficial to mark op1 as reg optional.
4505 // TODO: It is not always mandatory for a def position of an untracked
4506 // local to be allocated a register if it is on rhs of an assignment
4507 // and its use position is reg-optional and has not been assigned a
4508 // register. Reg optional def positions is currently not yet supported.
4510 // c) op1 = untracked local and op2 = tracked local: marking op1 as
4511 // reg optional is beneficial, since its use position is less likely
4512 // to get a register.
4514 // d) If both are untracked locals (i.e. treated like tree temps by
4515 // LSRA): though either of them could be marked as reg optional,
4516 // marking op1 as reg optional is likely to be beneficial because
4517 // while allocating op2's def position, there is a possibility of
4518 // spilling op1's def and in which case op1 is treated as contained
4519 // memory operand rather than requiring to reload.
4521 // e) If only one of them is a local var, prefer to mark it as
4522 // reg-optional. This is heuristic is based on the results
4523 // obtained against CQ perf benchmarks.
4525 // f) If neither of them are local vars (i.e. tree temps), prefer to
4526 // mark op1 as reg optional for the same reason as mentioned in (d) above.
4527 if (op1->OperGet() == GT_LCL_VAR && op2->OperGet() == GT_LCL_VAR)
4529 LclVarDsc* v1 = comp->lvaTable + op1->AsLclVarCommon()->GetLclNum();
4530 LclVarDsc* v2 = comp->lvaTable + op2->AsLclVarCommon()->GetLclNum();
4532 if (v1->lvTracked && v2->lvTracked)
4534 // Both are tracked locals. The one with lower weight is less likely
4535 // to get a register and hence beneficial to mark the one with lower
4536 // weight as reg optional.
4537 if (v1->lvRefCntWtd < v2->lvRefCntWtd)
4546 else if (v2->lvTracked)
4548 // v1 is an untracked lcl and it is use position is less likely to
4552 else if (v1->lvTracked)
4554 // v2 is an untracked lcl and its def position always
4555 // needs a reg. Hence it is better to mark v1 as
4565 else if (op1->OperGet() == GT_LCL_VAR)
4569 else if (op2->OperGet() == GT_LCL_VAR)
4575 // Neither of the operands is a local, prefer marking
4576 // operand that is evaluated first as reg optional
4577 // since its use position is less likely to get a register.
4578 bool reverseOps = ((tree->gtFlags & GTF_REVERSE_OPS) != 0);
4579 preferredOp = reverseOps ? op2 : op1;
4586 //------------------------------------------------------------------------
4587 // ExcludeNonByteableRegisters: Determines if we need to exclude non-byteable registers for
4591 // tree - The node of interest
4594 // If we need to exclude non-byteable registers
4596 bool Lowering::ExcludeNonByteableRegisters(GenTree* tree)
4598 // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr'
4599 // Storeind itself will not produce any value and hence dstCount=0. But op2 could be TYP_INT
4600 // value. In this case we need to exclude esi/edi from the src candidates of op2.
4601 if (varTypeIsByte(tree))
4605 // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool.
4606 else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType()))
4610 else if (tree->OperIsCompare())
4612 GenTree* op1 = tree->gtGetOp1();
4613 GenTree* op2 = tree->gtGetOp2();
4615 // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
4616 // ubyte as the result of comparison and if the result needs to be materialized into a reg
4617 // simply zero extend it to TYP_INT size. Here is an example of generated code:
4618 // cmp dl, byte ptr[addr mode]
4620 if (varTypeIsByte(op1) && varTypeIsByte(op2))
4624 // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses
4625 // ubyte as the result of the comparison and if the result needs to be materialized into a reg
4626 // simply zero extend it to TYP_INT size.
4627 else if (varTypeIsByte(op1) && op2->IsCnsIntOrI())
4631 // Example4: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses
4632 // ubyte as the result of the comparison and if the result needs to be materialized into a reg
4633 // simply zero extend it to TYP_INT size.
4634 else if (op1->IsCnsIntOrI() && varTypeIsByte(op2))
4648 #endif // _TARGET_X86_
4650 #endif // _TARGET_XARCH_
4652 #endif // !LEGACY_BACKEND