1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
4 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
5 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XX Postconditions (for the nodes currently handled): XX
12 XX - All operands requiring a register are explicit in the graph XX
14 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
15 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
25 #if !defined(TARGET_64BIT)
26 #include "decomposelongs.h"
27 #endif // !defined(TARGET_64BIT)
29 //------------------------------------------------------------------------
30 // MakeSrcContained: Make "childNode" a contained node
33 // parentNode - is a non-leaf node that can contain its 'childNode'
34 // childNode - is an op that will now be contained by its parent.
37 // If 'childNode' it has any existing sources, they will now be sources for the parent.
39 void Lowering::MakeSrcContained(GenTree* parentNode, GenTree* childNode) const
41 assert(!parentNode->OperIsLeaf());
42 assert(childNode->canBeContained());
44 childNode->SetContained();
45 assert(childNode->isContained());
48 if (IsContainableMemoryOp(childNode))
50 // Verify caller of this method checked safety.
52 const bool isSafeToContainMem = IsSafeToContainMem(parentNode, childNode);
54 if (!isSafeToContainMem)
56 JITDUMP("** Unsafe mem containment of [%06u] in [%06u}\n", comp->dspTreeID(childNode),
57 comp->dspTreeID(parentNode));
58 assert(isSafeToContainMem);
64 //------------------------------------------------------------------------
65 // MakeSrcRegOptional: Make "childNode" a regOptional node
68 // parentNode - is a non-leaf node that can regOptional its 'childNode'
69 // childNode - is an op that will now be regOptional to its parent.
71 void Lowering::MakeSrcRegOptional(GenTree* parentNode, GenTree* childNode) const
73 assert(!parentNode->OperIsLeaf());
75 childNode->SetRegOptional();
76 assert(childNode->IsRegOptional());
79 // Verify caller of this method checked safety.
81 const bool isSafeToMarkRegOptional = IsSafeToMarkRegOptional(parentNode, childNode);
83 if (!isSafeToMarkRegOptional)
85 JITDUMP("** Unsafe regOptional of [%06u] in [%06u}\n", comp->dspTreeID(childNode), comp->dspTreeID(parentNode));
86 assert(isSafeToMarkRegOptional);
91 //------------------------------------------------------------------------
92 // TryMakeSrcContainedOrRegOptional: Tries to make "childNode" a contained or regOptional node
95 // parentNode - is a non-leaf node that can contain or regOptional its 'childNode'
96 // childNode - is an op that will now be contained or regOptional to its parent.
98 void Lowering::TryMakeSrcContainedOrRegOptional(GenTree* parentNode, GenTree* childNode) const
100 // HWIntrinsic nodes should use TryGetContainableHWIntrinsicOp and its relevant handling
101 assert(!parentNode->OperIsHWIntrinsic());
103 if (IsContainableMemoryOp(childNode) && IsSafeToContainMem(parentNode, childNode))
105 MakeSrcContained(parentNode, childNode);
107 else if (IsSafeToMarkRegOptional(parentNode, childNode))
109 MakeSrcRegOptional(parentNode, childNode);
113 //------------------------------------------------------------------------
114 // CheckImmedAndMakeContained: Checks if the 'childNode' is a containable immediate
115 // and, if so, makes it contained.
118 // parentNode - is any non-leaf node
119 // childNode - is an child op of 'parentNode'
122 // true if we are able to make childNode a contained immediate
124 bool Lowering::CheckImmedAndMakeContained(GenTree* parentNode, GenTree* childNode)
126 assert(!parentNode->OperIsLeaf());
127 // If childNode is a containable immediate
128 if (IsContainableImmed(parentNode, childNode))
130 // then make it contained within the parentNode
131 MakeSrcContained(parentNode, childNode);
137 //------------------------------------------------------------------------
138 // IsInvariantInRange: Check if a node is invariant in the specified range. In
139 // other words, can 'node' be moved to right before 'endExclusive' without its
140 // computation changing values?
144 // endExclusive - The exclusive end of the range to check invariance for.
147 // True if 'node' can be evaluated at any point between its current
148 // location and 'endExclusive' without giving a different result; otherwise
151 bool Lowering::IsInvariantInRange(GenTree* node, GenTree* endExclusive) const
153 assert((node != nullptr) && (endExclusive != nullptr));
155 // Quick early-out for unary cases
157 if (node->gtNext == endExclusive)
162 if (node->OperConsumesFlags())
167 m_scratchSideEffects.Clear();
168 m_scratchSideEffects.AddNode(comp, node);
170 for (GenTree* cur = node->gtNext; cur != endExclusive; cur = cur->gtNext)
172 assert((cur != nullptr) && "Expected first node to precede end node");
173 const bool strict = true;
174 if (m_scratchSideEffects.InterferesWith(comp, cur, strict))
183 //------------------------------------------------------------------------
184 // IsInvariantInRange: Check if a node is invariant in the specified range,
185 // ignoring conflicts with one particular node.
189 // endExclusive - The exclusive end of the range to check invariance for.
190 // ignoreNode - A node to ignore interference checks with, for example
191 // because it will retain its relative order with 'node'.
194 // True if 'node' can be evaluated at any point between its current location
195 // and 'endExclusive' without giving a different result; otherwise false.
197 bool Lowering::IsInvariantInRange(GenTree* node, GenTree* endExclusive, GenTree* ignoreNode) const
199 assert((node != nullptr) && (endExclusive != nullptr));
201 if (ignoreNode == nullptr)
203 return IsInvariantInRange(node, endExclusive);
206 if ((node->gtNext == endExclusive) || ((node->gtNext == ignoreNode) && (node->gtNext->gtNext == endExclusive)))
211 if (node->OperConsumesFlags())
216 m_scratchSideEffects.Clear();
217 m_scratchSideEffects.AddNode(comp, node);
219 for (GenTree* cur = node->gtNext; cur != endExclusive; cur = cur->gtNext)
221 assert((cur != nullptr) && "Expected first node to precede end node");
222 if (cur == ignoreNode)
227 const bool strict = true;
228 if (m_scratchSideEffects.InterferesWith(comp, cur, strict))
237 //------------------------------------------------------------------------
238 // IsRangeInvariantInRange: Check if a range of nodes are invariant in the
242 // rangeStart - The first node.
243 // rangeEnd - The last node.
244 // endExclusive - The exclusive end of the range to check invariance for.
245 // ignoreNode - A node to ignore interference checks with, for example
246 // because it will retain its relative order with 'node'.
249 // True if the range can be evaluated at any point between its current location
250 // and 'endExclusive' without giving a different result; otherwise false.
253 // Note that the range is treated as a unit and no pairwise interference
254 // checks between nodes in the range are performed.
256 bool Lowering::IsRangeInvariantInRange(GenTree* rangeStart,
258 GenTree* endExclusive,
259 GenTree* ignoreNode) const
261 assert((rangeStart != nullptr) && (rangeEnd != nullptr));
263 if ((rangeEnd->gtNext == endExclusive) ||
264 ((ignoreNode != nullptr) && (rangeEnd->gtNext == ignoreNode) && (rangeEnd->gtNext->gtNext == endExclusive)))
269 if (rangeStart->OperConsumesFlags())
274 m_scratchSideEffects.Clear();
275 GenTree* cur = rangeStart;
278 m_scratchSideEffects.AddNode(comp, cur);
286 assert((cur != nullptr) && "Expected rangeStart to precede rangeEnd");
289 for (GenTree* cur = rangeEnd->gtNext; cur != endExclusive; cur = cur->gtNext)
291 assert((cur != nullptr) && "Expected first node to precede end node");
292 if (cur == ignoreNode)
297 const bool strict = true;
298 if (m_scratchSideEffects.InterferesWith(comp, cur, strict))
307 //------------------------------------------------------------------------
308 // IsSafeToContainMem: Checks for conflicts between childNode and parentNode,
309 // and returns 'true' iff memory operand childNode can be contained in parentNode.
312 // parentNode - any non-leaf node
313 // childNode - some node that is an input to `parentNode`
316 // true if it is safe to make childNode a contained memory operand.
318 bool Lowering::IsSafeToContainMem(GenTree* parentNode, GenTree* childNode) const
320 return IsInvariantInRange(childNode, parentNode);
323 //------------------------------------------------------------------------
324 // IsSafeToContainMem: Checks for conflicts between childNode and grandParentNode
325 // and returns 'true' iff memory operand childNode can be contained in grandParentNode.
328 // grandParentNode - any non-leaf node
329 // parentNode - parent of `childNode` and an input to `grandParentNode`
330 // childNode - some node that is an input to `parentNode`
333 // true if it is safe to make childNode a contained memory operand.
335 bool Lowering::IsSafeToContainMem(GenTree* grandparentNode, GenTree* parentNode, GenTree* childNode) const
337 return IsInvariantInRange(childNode, grandparentNode, parentNode);
340 //------------------------------------------------------------------------
341 // IsSafeToMarkRegOptional: Check whether it is safe to mark 'childNode' as
342 // reg-optional in 'parentNode'.
345 // parentNode - parent of 'childNode'
346 // childNode - some node that is an input to `parentNode`
349 // True if it is safe to mark childNode as reg-optional; otherwise false.
352 // Unlike containment, reg-optionality can only rarely introduce new
353 // conflicts, because reg-optionality mostly does not cause the child node
354 // to be evaluated at a new point in time:
356 // 1. For LIR edges (i.e. anything that isn't GT_LCL_VAR) reg-optionality
357 // indicates that if the edge was spilled to a temp at its def, the parent
358 // node can use it directly from its spill location without reloading it
359 // into a register first. This is always safe as as spill temps cannot
362 // For example, an indirection can be marked reg-optional even if there
363 // is interference between it and its parent; the indirection will still
364 // be evaluated at its original position, but if the value is spilled to
365 // stack, then reg-optionality can allow using the value from the spill
366 // location directly. Similarly, GT_LCL_FLD nodes are never register
367 // candidates and can be handled the same way.
369 // 2. For GT_LCL_VAR reg-optionality indicates that the node can use the
370 // local directly from its home location. IR invariants guarantee that the
371 // local is not defined between its LIR location and the parent node (see
372 // CheckLclVarSemanticsHelper). That means the only case where it could
373 // interfere is due to it being address exposed. So this is the only unsafe
376 bool Lowering::IsSafeToMarkRegOptional(GenTree* parentNode, GenTree* childNode) const
378 if (!childNode->OperIs(GT_LCL_VAR))
380 // LIR edges never interfere. This includes GT_LCL_FLD, see the remarks above.
384 LclVarDsc* dsc = comp->lvaGetDesc(childNode->AsLclVarCommon());
385 if (!dsc->IsAddressExposed())
387 // Safe by IR invariants (no assignments occur between parent and node).
391 // We expect this to have interference as otherwise we could have marked it
392 // contained instead of reg-optional.
396 //------------------------------------------------------------------------
397 // LowerNode: this is the main entry point for Lowering.
400 // node - the node we are lowering.
403 // next node in the transformed node sequence that needs to be lowered.
405 GenTree* Lowering::LowerNode(GenTree* node)
407 assert(node != nullptr);
408 switch (node->gtOper)
412 LowerIndir(node->AsIndir());
416 LowerStoreIndirCommon(node->AsStoreInd());
421 GenTree* next = LowerAdd(node->AsOp());
429 #if !defined(TARGET_64BIT)
440 if (comp->opts.OptimizationEnabled() && node->OperIs(GT_AND))
442 GenTree* nextNode = nullptr;
443 if (TryLowerAndNegativeOne(node->AsOp(), &nextNode))
447 assert(nextNode == nullptr);
450 return LowerBinaryArithmetic(node->AsOp());
455 #if defined(TARGET_X86) || defined(TARGET_ARM64)
458 return LowerMul(node->AsOp());
462 if (!LowerUnsignedDivOrMod(node->AsOp()))
464 ContainCheckDivOrMod(node->AsOp());
470 return LowerSignedDivOrMod(node);
473 return LowerSwitch(node);
477 GenTree* newNode = LowerCall(node);
478 if (newNode != nullptr)
494 return LowerCompare(node);
497 return LowerJTrue(node->AsOp());
501 ContainCheckNeg(node->AsOp());
505 return LowerSelect(node->AsConditional());
508 ContainCheckSelect(node->AsOp());
512 LowerJmpMethod(node);
516 LowerRet(node->AsUnOp());
520 ContainCheckReturnTrap(node->AsOp());
528 ContainCheckBitCast(node);
531 #if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
532 case GT_BOUNDS_CHECK:
533 ContainCheckBoundsChk(node->AsBoundsChk());
535 #endif // defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
537 case GT_ARR_ELEM: // Lowered by fgMorphArrayOps()
538 case GT_MDARR_LENGTH:
539 case GT_MDARR_LOWER_BOUND:
540 // Lowered by fgSimpleLowering()
552 ContainCheckShiftRotate(node->AsOp());
554 #endif // !TARGET_64BIT
559 #if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
560 LowerShift(node->AsOp());
562 ContainCheckShiftRotate(node->AsOp());
567 if (node->AsBlk()->Data()->IsCall())
569 LowerStoreSingleRegCallStruct(node->AsBlk());
573 case GT_STORE_DYN_BLK:
574 LowerBlockStoreCommon(node->AsBlk());
583 ContainCheckIntrinsic(node->AsOp());
585 #endif // TARGET_XARCH
587 #ifdef FEATURE_HW_INTRINSICS
589 return LowerHWIntrinsic(node->AsHWIntrinsic());
590 #endif // FEATURE_HW_INTRINSICS
594 // We should only encounter this for lclVars that are lvDoNotEnregister.
595 verifyLclFldDoNotEnregister(node->AsLclVarCommon()->GetLclNum());
601 GenTreeLclVar* lclNode = node->AsLclVar();
602 WidenSIMD12IfNecessary(lclNode);
603 LclVarDsc* varDsc = comp->lvaGetDesc(lclNode);
605 // The consumer of this node must check compatibility of the fields.
606 // This merely checks whether it is possible for this to be a multireg node.
607 if (lclNode->IsMultiRegLclVar())
609 if (!varDsc->lvPromoted ||
610 (comp->lvaGetPromotionType(varDsc) != Compiler::PROMOTION_TYPE_INDEPENDENT) ||
611 (varDsc->lvFieldCnt > MAX_MULTIREG_COUNT))
613 lclNode->ClearMultiReg();
614 if (lclNode->TypeIs(TYP_STRUCT))
616 comp->lvaSetVarDoNotEnregister(lclNode->GetLclNum() DEBUGARG(DoNotEnregisterReason::BlockOp));
623 case GT_STORE_LCL_VAR:
624 WidenSIMD12IfNecessary(node->AsLclVarCommon());
627 case GT_STORE_LCL_FLD:
628 LowerStoreLocCommon(node->AsLclVarCommon());
631 #if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
633 CheckImmedAndMakeContained(node, node->AsCmpXchg()->gtOpComparand);
639 CheckImmedAndMakeContained(node, node->AsOp()->gtOp2);
641 #elif defined(TARGET_XARCH)
645 if (node->IsUnusedValue())
647 node->ClearUnusedValue();
648 // Make sure the types are identical, since the node type is changed to VOID
649 // CodeGen relies on op2's type to determine the instruction size.
650 // Note that the node type cannot be a small int but the data operand can.
651 assert(genActualType(node->gtGetOp2()->TypeGet()) == node->TypeGet());
652 node->SetOper(GT_LOCKADD);
653 node->gtType = TYP_VOID;
654 CheckImmedAndMakeContained(node, node->gtGetOp2());
660 node->gtGetOp1()->SetRegOptional();
665 const GenTreeLclVarCommon* lclAddr = node->AsLclVarCommon();
666 const LclVarDsc* varDsc = comp->lvaGetDesc(lclAddr);
667 if (!varDsc->lvDoNotEnregister)
669 // TODO-Cleanup: this is definitely not the best place for this detection,
670 // but for now it is the easiest. Move it to morph.
671 comp->lvaSetVarDoNotEnregister(lclAddr->GetLclNum() DEBUGARG(DoNotEnregisterReason::LclAddrNode));
676 #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
679 LowerBswapOp(node->AsOp());
681 #endif // FEATURE_HW_INTRINSICS && TARGET_XARCH
690 /** -- Switch Lowering --
691 * The main idea of switch lowering is to keep transparency of the register requirements of this node
692 * downstream in LSRA. Given that the switch instruction is inherently a control statement which in the JIT
693 * is represented as a simple tree node, at the time we actually generate code for it we end up
694 * generating instructions that actually modify the flow of execution that imposes complicated
695 * register requirement and lifetimes.
697 * So, for the purpose of LSRA, we want to have a more detailed specification of what a switch node actually
698 * means and more importantly, which and when do we need a register for each instruction we want to issue
699 * to correctly allocate them downstream.
701 * For this purpose, this procedure performs switch lowering in two different ways:
703 * a) Represent the switch statement as a zero-index jump table construct. This means that for every destination
704 * of the switch, we will store this destination in an array of addresses and the code generator will issue
705 * a data section where this array will live and will emit code that based on the switch index, will indirect and
706 * jump to the destination specified in the jump table.
708 * For this transformation we introduce a new GT node called GT_SWITCH_TABLE that is a specialization of the switch
709 * node for jump table based switches.
710 * The overall structure of a GT_SWITCH_TABLE is:
713 * |_________ localVar (a temporary local that holds the switch index)
714 * |_________ jumpTable (this is a special node that holds the address of the jump table array)
716 * Now, the way we morph a GT_SWITCH node into this lowered switch table node form is the following:
718 * Input: GT_SWITCH (inside a basic block whose Branch Type is BBJ_SWITCH)
719 * |_____ expr (an arbitrarily complex GT_NODE that represents the switch index)
721 * This gets transformed into the following statements inside a BBJ_COND basic block (the target would be
722 * the default case of the switch in case the conditional is evaluated to true).
724 * ----- original block, transformed
725 * GT_STORE_LCL_VAR tempLocal (a new temporary local variable used to store the switch index)
726 * |_____ expr (the index expression)
731 * |___ Int_Constant (This constant is the index of the default case
732 * that happens to be the highest index in the jump table).
733 * |___ tempLocal (The local variable were we stored the index expression).
735 * ----- new basic block
738 * |_____ jumpTable (a new jump table node that now LSRA can allocate registers for explicitly
739 * and LinearCodeGen will be responsible to generate downstream).
741 * This way there are no implicit temporaries.
743 * b) For small-sized switches, we will actually morph them into a series of conditionals of the form
744 * if (case falls into the default){ goto jumpTable[size]; // last entry in the jump table is the default case }
745 * (For the default case conditional, we'll be constructing the exact same code as the jump table case one).
746 * else if (case == firstCase){ goto jumpTable[1]; }
747 * else if (case == secondCase) { goto jumptable[2]; } and so on.
749 * This transformation is of course made in JIT-IR, not downstream to CodeGen level, so this way we no longer
750 * require internal temporaries to maintain the index we're evaluating plus we're using existing code from
751 * LinearCodeGen to implement this instead of implement all the control flow constructs using InstrDscs and
752 * InstrGroups downstream.
755 GenTree* Lowering::LowerSwitch(GenTree* node)
759 BasicBlock** jumpTab;
761 assert(node->gtOper == GT_SWITCH);
763 // The first step is to build the default case conditional construct that is
764 // shared between both kinds of expansion of the switch node.
766 // To avoid confusion, we'll alias m_block to originalSwitchBB
767 // that represents the node we're morphing.
768 BasicBlock* originalSwitchBB = m_block;
769 LIR::Range& switchBBRange = LIR::AsRange(originalSwitchBB);
771 // jumpCnt is the number of elements in the jump table array.
772 // jumpTab is the actual pointer to the jump table array.
773 // targetCnt is the number of unique targets in the jump table array.
774 jumpCnt = originalSwitchBB->bbJumpSwt->bbsCount;
775 jumpTab = originalSwitchBB->bbJumpSwt->bbsDstTab;
776 targetCnt = originalSwitchBB->NumSucc(comp);
778 // GT_SWITCH must be a top-level node with no use.
782 assert(!switchBBRange.TryGetUse(node, &use));
786 JITDUMP("Lowering switch " FMT_BB ", %d cases\n", originalSwitchBB->bbNum, jumpCnt);
788 // Handle a degenerate case: if the switch has only a default case, just convert it
789 // to an unconditional branch. This should only happen in minopts or with debuggable
793 JITDUMP("Lowering switch " FMT_BB ": single target; converting to BBJ_ALWAYS\n", originalSwitchBB->bbNum);
794 noway_assert(comp->opts.OptimizationDisabled());
795 if (originalSwitchBB->bbNext == jumpTab[0])
797 originalSwitchBB->bbJumpKind = BBJ_NONE;
798 originalSwitchBB->bbJumpDest = nullptr;
802 originalSwitchBB->bbJumpKind = BBJ_ALWAYS;
803 originalSwitchBB->bbJumpDest = jumpTab[0];
805 // Remove extra predecessor links if there was more than one case.
806 for (unsigned i = 1; i < jumpCnt; ++i)
808 (void)comp->fgRemoveRefPred(jumpTab[i], originalSwitchBB);
811 // We have to get rid of the GT_SWITCH node but a child might have side effects so just assign
812 // the result of the child subtree to a temp.
813 GenTree* rhs = node->AsOp()->gtOp1;
815 unsigned lclNum = comp->lvaGrabTemp(true DEBUGARG("Lowering is creating a new local variable"));
816 comp->lvaTable[lclNum].lvType = rhs->TypeGet();
818 GenTreeLclVar* store = comp->gtNewStoreLclVarNode(lclNum, rhs);
820 switchBBRange.InsertAfter(node, store);
821 switchBBRange.Remove(node);
826 noway_assert(jumpCnt >= 2);
828 // Spill the argument to the switch node into a local so that it can be used later.
829 LIR::Use use(switchBBRange, &(node->AsOp()->gtOp1), node);
830 ReplaceWithLclVar(use);
832 // GT_SWITCH(indexExpression) is now two statements:
833 // 1. a statement containing temp = indexExpression
834 // 2. and a statement with GT_SWITCH(temp)
836 assert(node->gtOper == GT_SWITCH);
837 GenTree* temp = node->AsOp()->gtOp1;
838 assert(temp->gtOper == GT_LCL_VAR);
839 unsigned tempLclNum = temp->AsLclVarCommon()->GetLclNum();
840 var_types tempLclType = temp->TypeGet();
842 BasicBlock* defaultBB = jumpTab[jumpCnt - 1];
843 BasicBlock* followingBB = originalSwitchBB->bbNext;
845 /* Is the number of cases right for a test and jump switch? */
846 const bool fFirstCaseFollows = (followingBB == jumpTab[0]);
847 const bool fDefaultFollows = (followingBB == defaultBB);
849 unsigned minSwitchTabJumpCnt = 2; // table is better than just 2 cmp/jcc
851 // This means really just a single cmp/jcc (aka a simple if/else)
852 if (fFirstCaseFollows || fDefaultFollows)
854 minSwitchTabJumpCnt++;
857 #if defined(TARGET_ARM)
858 // On ARM for small switch tables we will
859 // generate a sequence of compare and branch instructions
860 // because the code to load the base of the switch
861 // table is huge and hideous due to the relocation... :(
862 minSwitchTabJumpCnt += 2;
865 // Once we have the temporary variable, we construct the conditional branch for
866 // the default case. As stated above, this conditional is being shared between
867 // both GT_SWITCH lowering code paths.
868 // This condition is of the form: if (temp > jumpTableLength - 2){ goto jumpTable[jumpTableLength - 1]; }
869 GenTree* gtDefaultCaseCond = comp->gtNewOperNode(GT_GT, TYP_INT, comp->gtNewLclvNode(tempLclNum, tempLclType),
870 comp->gtNewIconNode(jumpCnt - 2, genActualType(tempLclType)));
872 // Make sure we perform an unsigned comparison, just in case the switch index in 'temp'
873 // is now less than zero 0 (that would also hit the default case).
874 gtDefaultCaseCond->gtFlags |= GTF_UNSIGNED;
876 GenTree* gtDefaultCaseJump = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, gtDefaultCaseCond);
877 gtDefaultCaseJump->gtFlags = node->gtFlags;
879 LIR::Range condRange = LIR::SeqTree(comp, gtDefaultCaseJump);
880 switchBBRange.InsertAtEnd(std::move(condRange));
882 BasicBlock* afterDefaultCondBlock = comp->fgSplitBlockAfterNode(originalSwitchBB, condRange.LastNode());
884 // afterDefaultCondBlock is now the switch, and all the switch targets have it as a predecessor.
885 // originalSwitchBB is now a BBJ_NONE, and there is a predecessor edge in afterDefaultCondBlock
886 // representing the fall-through flow from originalSwitchBB.
887 assert(originalSwitchBB->bbJumpKind == BBJ_NONE);
888 assert(originalSwitchBB->bbNext == afterDefaultCondBlock);
889 assert(afterDefaultCondBlock->bbJumpKind == BBJ_SWITCH);
890 assert(afterDefaultCondBlock->bbJumpSwt->bbsHasDefault);
891 assert(afterDefaultCondBlock->isEmpty()); // Nothing here yet.
893 // The GT_SWITCH code is still in originalSwitchBB (it will be removed later).
895 // Turn originalSwitchBB into a BBJ_COND.
896 originalSwitchBB->bbJumpKind = BBJ_COND;
897 originalSwitchBB->bbJumpDest = jumpTab[jumpCnt - 1];
899 // Fix the pred for the default case: the default block target still has originalSwitchBB
900 // as a predecessor, but the fgSplitBlockAfterStatement() moved all predecessors to point
901 // to afterDefaultCondBlock.
902 FlowEdge* oldEdge = comp->fgRemoveRefPred(jumpTab[jumpCnt - 1], afterDefaultCondBlock);
903 comp->fgAddRefPred(jumpTab[jumpCnt - 1], originalSwitchBB, oldEdge);
905 bool useJumpSequence = jumpCnt < minSwitchTabJumpCnt;
907 if (TargetOS::IsUnix && TargetArchitecture::IsArm32)
909 // Force using an inlined jumping instead switch table generation.
910 // Switch jump table is generated with incorrect values in NativeAOT case,
911 // so any large switch will crash after loading to PC any such value.
912 // I think this is due to the fact that we use absolute addressing
913 // instead of relative. But in NativeAOT is used as a rule relative
914 // addressing when we generate an executable.
915 // See also https://github.com/dotnet/runtime/issues/8683
916 // Also https://github.com/dotnet/coreclr/pull/13197
917 useJumpSequence = useJumpSequence || comp->IsTargetAbi(CORINFO_NATIVEAOT_ABI);
920 // If we originally had 2 unique successors, check to see whether there is a unique
921 // non-default case, in which case we can eliminate the switch altogether.
922 // Note that the single unique successor case is handled above.
923 BasicBlock* uniqueSucc = nullptr;
926 uniqueSucc = jumpTab[0];
927 noway_assert(jumpCnt >= 2);
928 for (unsigned i = 1; i < jumpCnt - 1; i++)
930 if (jumpTab[i] != uniqueSucc)
932 uniqueSucc = nullptr;
937 if (uniqueSucc != nullptr)
939 // If the unique successor immediately follows this block, we have nothing to do -
940 // it will simply fall-through after we remove the switch, below.
941 // Otherwise, make this a BBJ_ALWAYS.
942 // Now, fixup the predecessor links to uniqueSucc. In the original jumpTab:
943 // jumpTab[i-1] was the default target, which we handled above,
944 // jumpTab[0] is the first target, and we'll leave that predecessor link.
945 // Remove any additional predecessor links to uniqueSucc.
946 for (unsigned i = 1; i < jumpCnt - 1; ++i)
948 assert(jumpTab[i] == uniqueSucc);
949 (void)comp->fgRemoveRefPred(uniqueSucc, afterDefaultCondBlock);
951 if (afterDefaultCondBlock->bbNext == uniqueSucc)
953 afterDefaultCondBlock->bbJumpKind = BBJ_NONE;
954 afterDefaultCondBlock->bbJumpDest = nullptr;
958 afterDefaultCondBlock->bbJumpKind = BBJ_ALWAYS;
959 afterDefaultCondBlock->bbJumpDest = uniqueSucc;
962 // If the number of possible destinations is small enough, we proceed to expand the switch
963 // into a series of conditional branches, otherwise we follow the jump table based switch
965 else if (useJumpSequence || comp->compStressCompile(Compiler::STRESS_SWITCH_CMP_BR_EXPANSION, 50))
967 // Lower the switch into a series of compare and branch IR trees.
969 // In this case we will morph the node in the following way:
970 // 1. Generate a JTRUE statement to evaluate the default case. (This happens above.)
971 // 2. Start splitting the switch basic block into subsequent basic blocks, each of which will contain
972 // a statement that is responsible for performing a comparison of the table index and conditional
975 JITDUMP("Lowering switch " FMT_BB ": using compare/branch expansion\n", originalSwitchBB->bbNum);
977 // We'll use 'afterDefaultCondBlock' for the first conditional. After that, we'll add new
978 // blocks. If we end up not needing it at all (say, if all the non-default cases just fall through),
980 bool fUsedAfterDefaultCondBlock = false;
981 BasicBlock* currentBlock = afterDefaultCondBlock;
982 LIR::Range* currentBBRange = &LIR::AsRange(currentBlock);
984 // Walk to entries 0 to jumpCnt - 1. If a case target follows, ignore it and let it fall through.
985 // If no case target follows, the last one doesn't need to be a compare/branch: it can be an
986 // unconditional branch.
987 bool fAnyTargetFollows = false;
988 for (unsigned i = 0; i < jumpCnt - 1; ++i)
990 assert(currentBlock != nullptr);
992 // Remove the switch from the predecessor list of this case target's block.
993 // We'll add the proper new predecessor edge later.
994 FlowEdge* oldEdge = comp->fgRemoveRefPred(jumpTab[i], afterDefaultCondBlock);
996 if (jumpTab[i] == followingBB)
998 // This case label follows the switch; let it fall through.
999 fAnyTargetFollows = true;
1003 // We need a block to put in the new compare and/or branch.
1004 // If we haven't used the afterDefaultCondBlock yet, then use that.
1005 if (fUsedAfterDefaultCondBlock)
1007 BasicBlock* newBlock = comp->fgNewBBafter(BBJ_NONE, currentBlock, true);
1008 comp->fgAddRefPred(newBlock, currentBlock); // The fall-through predecessor.
1009 currentBlock = newBlock;
1010 currentBBRange = &LIR::AsRange(currentBlock);
1014 assert(currentBlock == afterDefaultCondBlock);
1015 fUsedAfterDefaultCondBlock = true;
1018 // We're going to have a branch, either a conditional or unconditional,
1019 // to the target. Set the target.
1020 currentBlock->bbJumpDest = jumpTab[i];
1022 // Wire up the predecessor list for the "branch" case.
1023 comp->fgAddRefPred(jumpTab[i], currentBlock, oldEdge);
1025 if (!fAnyTargetFollows && (i == jumpCnt - 2))
1027 // We're processing the last one, and there is no fall through from any case
1028 // to the following block, so we can use an unconditional branch to the final
1029 // case: there is no need to compare against the case index, since it's
1030 // guaranteed to be taken (since the default case was handled first, above).
1032 currentBlock->bbJumpKind = BBJ_ALWAYS;
1036 // Otherwise, it's a conditional branch. Set the branch kind, then add the
1037 // condition statement.
1038 currentBlock->bbJumpKind = BBJ_COND;
1040 // Now, build the conditional statement for the current case that is
1045 // |____ (switchIndex) (The temp variable)
1046 // |____ (ICon) (The actual case constant)
1047 GenTree* gtCaseCond = comp->gtNewOperNode(GT_EQ, TYP_INT, comp->gtNewLclvNode(tempLclNum, tempLclType),
1048 comp->gtNewIconNode(i, genActualType(tempLclType)));
1049 GenTree* gtCaseBranch = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, gtCaseCond);
1050 LIR::Range caseRange = LIR::SeqTree(comp, gtCaseBranch);
1051 currentBBRange->InsertAtEnd(std::move(caseRange));
1055 if (fAnyTargetFollows)
1057 // There is a fall-through to the following block. In the loop
1058 // above, we deleted all the predecessor edges from the switch.
1059 // In this case, we need to add one back.
1060 comp->fgAddRefPred(currentBlock->bbNext, currentBlock);
1063 if (!fUsedAfterDefaultCondBlock)
1065 // All the cases were fall-through! We don't need this block.
1066 // Convert it from BBJ_SWITCH to BBJ_NONE and unset the BBF_DONT_REMOVE flag
1067 // so fgRemoveBlock() doesn't complain.
1068 JITDUMP("Lowering switch " FMT_BB ": all switch cases were fall-through\n", originalSwitchBB->bbNum);
1069 assert(currentBlock == afterDefaultCondBlock);
1070 assert(currentBlock->bbJumpKind == BBJ_SWITCH);
1071 currentBlock->bbJumpKind = BBJ_NONE;
1072 currentBlock->bbFlags &= ~BBF_DONT_REMOVE;
1073 comp->fgRemoveBlock(currentBlock, /* unreachable */ false); // It's an empty block.
1078 // At this point the default case has already been handled and we need to generate a jump
1079 // table based switch or a bit test based switch at the end of afterDefaultCondBlock. Both
1080 // switch variants need the switch value so create the necessary LclVar node here.
1081 GenTree* switchValue = comp->gtNewLclvNode(tempLclNum, tempLclType);
1082 LIR::Range& switchBlockRange = LIR::AsRange(afterDefaultCondBlock);
1083 switchBlockRange.InsertAtEnd(switchValue);
1085 // Try generating a bit test based switch first,
1086 // if that's not possible a jump table based switch will be generated.
1087 if (!TryLowerSwitchToBitTest(jumpTab, jumpCnt, targetCnt, afterDefaultCondBlock, switchValue))
1089 JITDUMP("Lowering switch " FMT_BB ": using jump table expansion\n", originalSwitchBB->bbNum);
1092 if (tempLclType != TYP_I_IMPL)
1094 // SWITCH_TABLE expects the switch value (the index into the jump table) to be TYP_I_IMPL.
1095 // Note that the switch value is unsigned so the cast should be unsigned as well.
1096 switchValue = comp->gtNewCastNode(TYP_I_IMPL, switchValue, true, TYP_U_IMPL);
1097 switchBlockRange.InsertAtEnd(switchValue);
1101 GenTree* switchTable = comp->gtNewJmpTableNode();
1102 GenTree* switchJump = comp->gtNewOperNode(GT_SWITCH_TABLE, TYP_VOID, switchValue, switchTable);
1103 switchBlockRange.InsertAfter(switchValue, switchTable, switchJump);
1105 // this block no longer branches to the default block
1106 afterDefaultCondBlock->bbJumpSwt->removeDefault();
1109 comp->fgInvalidateSwitchDescMapEntry(afterDefaultCondBlock);
1112 GenTree* next = node->gtNext;
1114 // Get rid of the GT_SWITCH(temp).
1115 switchBBRange.Remove(node->AsOp()->gtOp1);
1116 switchBBRange.Remove(node);
1121 //------------------------------------------------------------------------
1122 // TryLowerSwitchToBitTest: Attempts to transform a jump table switch into a bit test.
1125 // jumpTable - The jump table
1126 // jumpCount - The number of blocks in the jump table
1127 // targetCount - The number of distinct blocks in the jump table
1128 // bbSwitch - The switch block
1129 // switchValue - A LclVar node that provides the switch value
1132 // true if the switch has been lowered to a bit test
1135 // If the jump table contains less than 32 (64 on 64 bit targets) entries and there
1136 // are at most 2 distinct jump targets then the jump table can be converted to a word
1137 // of bits where a 0 bit corresponds to one jump target and a 1 bit corresponds to the
1138 // other jump target. Instead of the indirect jump a BT-JCC sequence is used to jump
1139 // to the appropriate target:
1140 // mov eax, 245 ; jump table converted to a "bit table"
1141 // bt eax, ebx ; ebx is supposed to contain the switch value
1146 // Such code is both shorter and faster (in part due to the removal of a memory load)
1147 // than the traditional jump table base code. And of course, it also avoids the need
1148 // to emit the jump table itself that can reach up to 256 bytes (for 64 entries).
1150 bool Lowering::TryLowerSwitchToBitTest(
1151 BasicBlock* jumpTable[], unsigned jumpCount, unsigned targetCount, BasicBlock* bbSwitch, GenTree* switchValue)
1153 #ifndef TARGET_XARCH
1154 // Other architectures may use this if they substitute GT_BT with equivalent code.
1157 assert(jumpCount >= 2);
1158 assert(targetCount >= 2);
1159 assert(bbSwitch->bbJumpKind == BBJ_SWITCH);
1160 assert(switchValue->OperIs(GT_LCL_VAR));
1163 // Quick check to see if it's worth going through the jump table. The bit test switch supports
1164 // up to 2 targets but targetCount also includes the default block so we need to allow 3 targets.
1165 // We'll ensure that there are only 2 targets when building the bit table.
1168 if (targetCount > 3)
1174 // The number of bits in the bit table is the same as the number of jump table entries. But the
1175 // jump table also includes the default target (at the end) so we need to ignore it. The default
1176 // has already been handled by a JTRUE(GT(switchValue, jumpCount - 2)) that LowerSwitch generates.
1179 const unsigned bitCount = jumpCount - 1;
1181 if (bitCount > (genTypeSize(TYP_I_IMPL) * 8))
1187 // Build a bit table where a bit set to 0 corresponds to bbCase0 and a bit set to 1 corresponds to
1188 // bbCase1. Simply use the first block in the jump table as bbCase1, later we can invert the bit
1189 // table and/or swap the blocks if it's beneficial.
1192 BasicBlock* bbCase0 = nullptr;
1193 BasicBlock* bbCase1 = jumpTable[0];
1194 size_t bitTable = 1;
1196 for (unsigned bitIndex = 1; bitIndex < bitCount; bitIndex++)
1198 if (jumpTable[bitIndex] == bbCase1)
1200 bitTable |= (size_t(1) << bitIndex);
1202 else if (bbCase0 == nullptr)
1204 bbCase0 = jumpTable[bitIndex];
1206 else if (jumpTable[bitIndex] != bbCase0)
1208 // If it's neither bbCase0 nor bbCase1 then it means we have 3 targets. There can't be more
1209 // than 3 because of the check at the start of the function.
1210 assert(targetCount == 3);
1216 // One of the case blocks has to follow the switch block. This requirement could be avoided
1217 // by adding a BBJ_ALWAYS block after the switch block but doing that sometimes negatively
1218 // impacts register allocation.
1221 if ((bbSwitch->bbNext != bbCase0) && (bbSwitch->bbNext != bbCase1))
1228 // See if we can avoid a 8 byte immediate on 64 bit targets. If all upper 32 bits are 1
1229 // then inverting the bit table will make them 0 so that the table now fits in 32 bits.
1230 // Note that this does not change the number of bits in the bit table, it just takes
1231 // advantage of the fact that loading a 32 bit immediate into a 64 bit register zero
1232 // extends the immediate value to 64 bit.
1235 if (~bitTable <= UINT32_MAX)
1237 bitTable = ~bitTable;
1238 std::swap(bbCase0, bbCase1);
1243 // Rewire the blocks as needed and figure out the condition to use for JCC.
1246 GenCondition bbSwitchCondition;
1247 bbSwitch->bbJumpKind = BBJ_COND;
1249 comp->fgRemoveAllRefPreds(bbCase1, bbSwitch);
1250 comp->fgRemoveAllRefPreds(bbCase0, bbSwitch);
1252 if (bbSwitch->bbNext == bbCase0)
1254 // GenCondition::C generates JC so we jump to bbCase1 when the bit is set
1255 bbSwitchCondition = GenCondition::C;
1256 bbSwitch->bbJumpDest = bbCase1;
1258 comp->fgAddRefPred(bbCase0, bbSwitch);
1259 comp->fgAddRefPred(bbCase1, bbSwitch);
1263 assert(bbSwitch->bbNext == bbCase1);
1265 // GenCondition::NC generates JNC so we jump to bbCase0 when the bit is not set
1266 bbSwitchCondition = GenCondition::NC;
1267 bbSwitch->bbJumpDest = bbCase0;
1269 comp->fgAddRefPred(bbCase0, bbSwitch);
1270 comp->fgAddRefPred(bbCase1, bbSwitch);
1274 // Append BT(bitTable, switchValue) and JCC(condition) to the switch block.
1277 var_types bitTableType = (bitCount <= (genTypeSize(TYP_INT) * 8)) ? TYP_INT : TYP_LONG;
1278 GenTree* bitTableIcon = comp->gtNewIconNode(bitTable, bitTableType);
1279 GenTree* bitTest = comp->gtNewOperNode(GT_BT, TYP_VOID, bitTableIcon, switchValue);
1280 bitTest->gtFlags |= GTF_SET_FLAGS;
1281 GenTreeCC* jcc = comp->gtNewCC(GT_JCC, TYP_VOID, bbSwitchCondition);
1283 LIR::AsRange(bbSwitch).InsertAfter(switchValue, bitTableIcon, bitTest, jcc);
1286 #endif // TARGET_XARCH
1289 void Lowering::ReplaceArgWithPutArgOrBitcast(GenTree** argSlot, GenTree* putArgOrBitcast)
1291 assert(argSlot != nullptr);
1292 assert(*argSlot != nullptr);
1293 assert(putArgOrBitcast->OperIsPutArg() || putArgOrBitcast->OperIs(GT_BITCAST));
1295 GenTree* arg = *argSlot;
1297 // Replace the argument with the putarg/copy
1298 *argSlot = putArgOrBitcast;
1299 putArgOrBitcast->AsOp()->gtOp1 = arg;
1301 // Insert the putarg/copy into the block
1302 BlockRange().InsertAfter(arg, putArgOrBitcast);
1305 //------------------------------------------------------------------------
1306 // NewPutArg: rewrites the tree to put an arg in a register or on the stack.
1309 // call - the call whose arg is being rewritten.
1310 // arg - the arg being rewritten.
1311 // callArg - the CallArg for the argument.
1312 // type - the type of the argument.
1315 // The new tree that was created to put the arg in the right place
1316 // or the incoming arg if the arg tree was not rewritten.
1319 // call, arg, and info must be non-null.
1322 // For System V systems with native struct passing (i.e. UNIX_AMD64_ABI defined)
1323 // this method allocates a single GT_PUTARG_REG for 1 eightbyte structs and a GT_FIELD_LIST of two GT_PUTARG_REGs
1324 // for two eightbyte structs. For STK passed structs the method generates GT_PUTARG_STK tree.
1326 GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, CallArg* callArg, var_types type)
1328 assert(call != nullptr);
1329 assert(arg != nullptr);
1330 assert(callArg != nullptr);
1332 GenTree* putArg = nullptr;
1334 bool isOnStack = (callArg->AbiInfo.GetRegNum() == REG_STK);
1336 #if FEATURE_ARG_SPLIT
1337 // Struct can be split into register(s) and stack on ARM
1338 if (compFeatureArgSplit() && callArg->AbiInfo.IsSplit())
1340 assert(arg->OperIs(GT_BLK, GT_FIELD_LIST) || arg->OperIsLocalRead());
1341 // TODO: Need to check correctness for FastTailCall
1342 if (call->IsFastTailCall())
1345 NYI_ARM("lower: struct argument by fast tail call");
1346 #endif // TARGET_ARM
1349 const unsigned slotNumber = callArg->AbiInfo.ByteOffset / TARGET_POINTER_SIZE;
1350 const bool putInIncomingArgArea = call->IsFastTailCall();
1352 putArg = new (comp, GT_PUTARG_SPLIT) GenTreePutArgSplit(arg, callArg->AbiInfo.ByteOffset,
1353 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1354 callArg->AbiInfo.GetStackByteSize(),
1356 callArg->AbiInfo.NumRegs, call, putInIncomingArgArea);
1358 GenTreePutArgSplit* argSplit = putArg->AsPutArgSplit();
1359 for (unsigned regIndex = 0; regIndex < callArg->AbiInfo.NumRegs; regIndex++)
1361 argSplit->SetRegNumByIdx(callArg->AbiInfo.GetRegNum(regIndex), regIndex);
1364 if (arg->OperIs(GT_FIELD_LIST))
1366 unsigned regIndex = 0;
1367 for (GenTreeFieldList::Use& use : arg->AsFieldList()->Uses())
1369 if (regIndex >= callArg->AbiInfo.NumRegs)
1373 var_types regType = use.GetNode()->TypeGet();
1374 // Account for the possibility that float fields may be passed in integer registers.
1375 if (varTypeIsFloating(regType) && !genIsValidFloatReg(argSplit->GetRegNumByIdx(regIndex)))
1377 regType = (regType == TYP_FLOAT) ? TYP_INT : TYP_LONG;
1379 argSplit->m_regType[regIndex] = regType;
1383 // Clear the register assignment on the fieldList node, as these are contained.
1384 arg->SetRegNum(REG_NA);
1388 ClassLayout* layout = arg->GetLayout(comp);
1390 // Set type of registers
1391 for (unsigned index = 0; index < callArg->AbiInfo.NumRegs; index++)
1393 argSplit->m_regType[index] = layout->GetGCPtrType(index);
1398 #endif // FEATURE_ARG_SPLIT
1402 #if FEATURE_MULTIREG_ARGS
1403 if ((callArg->AbiInfo.NumRegs > 1) && (arg->OperGet() == GT_FIELD_LIST))
1405 unsigned int regIndex = 0;
1406 for (GenTreeFieldList::Use& use : arg->AsFieldList()->Uses())
1408 regNumber argReg = callArg->AbiInfo.GetRegNum(regIndex);
1409 GenTree* curOp = use.GetNode();
1410 var_types curTyp = curOp->TypeGet();
1412 // Create a new GT_PUTARG_REG node with op1
1413 GenTree* newOper = comp->gtNewPutArgReg(curTyp, curOp, argReg);
1415 // Splice in the new GT_PUTARG_REG node in the GT_FIELD_LIST
1416 ReplaceArgWithPutArgOrBitcast(&use.NodeRef(), newOper);
1420 // Just return arg. The GT_FIELD_LIST is not replaced.
1421 // Nothing more to do.
1425 #endif // FEATURE_MULTIREG_ARGS
1427 putArg = comp->gtNewPutArgReg(type, arg, callArg->AbiInfo.GetRegNum());
1432 // Mark this one as tail call arg if it is a fast tail call.
1433 // This provides the info to put this argument in in-coming arg area slot
1434 // instead of in out-going arg area slot.
1435 CLANG_FORMAT_COMMENT_ANCHOR;
1438 // Make sure state is correct. The PUTARG_STK has TYP_VOID, as it doesn't produce
1439 // a result. So the type of its operand must be the correct type to push on the stack.
1440 callArg->CheckIsStruct();
1443 if ((arg->OperGet() != GT_FIELD_LIST))
1445 #if defined(FEATURE_SIMD) && defined(FEATURE_PUT_STRUCT_ARG_STK)
1446 if (type == TYP_SIMD12)
1448 #if !defined(TARGET_64BIT)
1449 assert(callArg->AbiInfo.ByteSize == 12);
1450 #else // TARGET_64BIT
1451 if (compMacOsArm64Abi())
1453 assert(callArg->AbiInfo.ByteSize == 12);
1457 assert(callArg->AbiInfo.ByteSize == 16);
1459 #endif // TARGET_64BIT
1462 #endif // defined(FEATURE_SIMD) && defined(FEATURE_PUT_STRUCT_ARG_STK)
1464 assert(genActualType(arg->TypeGet()) == type);
1467 const unsigned slotNumber = callArg->AbiInfo.ByteOffset / TARGET_POINTER_SIZE;
1468 const bool putInIncomingArgArea = call->IsFastTailCall();
1471 new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, TYP_VOID, arg, callArg->AbiInfo.ByteOffset,
1472 #ifdef FEATURE_PUT_STRUCT_ARG_STK
1473 callArg->AbiInfo.GetStackByteSize(),
1475 call, putInIncomingArgArea);
1477 #if defined(DEBUG) && defined(FEATURE_PUT_STRUCT_ARG_STK)
1478 if (callArg->AbiInfo.IsStruct)
1480 // We use GT_BLK only for non-SIMD struct arguments.
1481 if (arg->OperIs(GT_BLK))
1483 assert(!varTypeIsSIMD(arg));
1485 else if (!arg->TypeIs(TYP_STRUCT))
1488 assert((callArg->AbiInfo.GetStackSlotsNumber() == 1) ||
1489 ((arg->TypeGet() == TYP_DOUBLE) && (callArg->AbiInfo.GetStackSlotsNumber() == 2)));
1491 assert(varTypeIsSIMD(arg) || (callArg->AbiInfo.GetStackSlotsNumber() == 1));
1495 #endif // defined(DEBUG) && defined(FEATURE_PUT_STRUCT_ARG_STK)
1499 JITDUMP("new node is : ");
1506 //------------------------------------------------------------------------
1507 // LowerArg: Lower one argument of a call. This entails splicing a "putarg" node between
1508 // the argument evaluation and the call. This is the point at which the source is
1509 // consumed and the value transitions from control of the register allocator to the calling
1513 // call - The call node
1514 // callArg - Call argument
1515 // late - Whether it is the late arg that is being lowered.
1520 void Lowering::LowerArg(GenTreeCall* call, CallArg* callArg, bool late)
1522 GenTree** ppArg = late ? &callArg->LateNodeRef() : &callArg->EarlyNodeRef();
1523 GenTree* arg = *ppArg;
1524 assert(arg != nullptr);
1526 JITDUMP("lowering arg : ");
1528 assert(arg->IsValue());
1530 var_types type = genActualType(arg);
1532 #if defined(FEATURE_SIMD)
1533 #if defined(TARGET_X86)
1534 // Non-param TYP_SIMD12 local var nodes are massaged in Lower to TYP_SIMD16 to match their
1535 // allocated size (see lvSize()). However, when passing the variables as arguments, and
1536 // storing the variables to the outgoing argument area on the stack, we must use their
1537 // actual TYP_SIMD12 type, so exactly 12 bytes is allocated and written.
1538 if (type == TYP_SIMD16)
1540 if ((arg->OperGet() == GT_LCL_VAR) || (arg->OperGet() == GT_STORE_LCL_VAR))
1542 const LclVarDsc* varDsc = comp->lvaGetDesc(arg->AsLclVarCommon());
1543 type = varDsc->lvType;
1545 else if (arg->OperIs(GT_HWINTRINSIC))
1547 GenTreeJitIntrinsic* jitIntrinsic = reinterpret_cast<GenTreeJitIntrinsic*>(arg);
1549 // For HWIntrinsic, there are some intrinsics like ExtractVector128 which have
1550 // a gtType of TYP_SIMD16 but a SimdSize of 32, so we need to include that in
1551 // the assert below.
1553 assert((jitIntrinsic->GetSimdSize() == 12) || (jitIntrinsic->GetSimdSize() == 16) ||
1554 (jitIntrinsic->GetSimdSize() == 32));
1556 if (jitIntrinsic->GetSimdSize() == 12)
1562 #elif defined(TARGET_AMD64)
1563 // TYP_SIMD8 parameters that are passed as longs
1564 if (type == TYP_SIMD8 && genIsValidIntReg(callArg->AbiInfo.GetRegNum()))
1566 GenTree* bitcast = comp->gtNewBitCastNode(TYP_LONG, arg);
1567 BlockRange().InsertAfter(arg, bitcast);
1569 *ppArg = arg = bitcast;
1572 #endif // defined(TARGET_X86)
1573 #endif // defined(FEATURE_SIMD)
1575 // If we hit this we are probably double-lowering.
1576 assert(!arg->OperIsPutArg());
1578 #if !defined(TARGET_64BIT)
1579 if (varTypeIsLong(type))
1581 noway_assert(arg->OperIs(GT_LONG));
1582 GenTreeFieldList* fieldList = new (comp, GT_FIELD_LIST) GenTreeFieldList();
1583 fieldList->AddFieldLIR(comp, arg->AsOp()->gtGetOp1(), 0, TYP_INT);
1584 fieldList->AddFieldLIR(comp, arg->AsOp()->gtGetOp2(), 4, TYP_INT);
1585 GenTree* newArg = NewPutArg(call, fieldList, callArg, type);
1587 if (callArg->AbiInfo.GetRegNum() != REG_STK)
1589 assert(callArg->AbiInfo.NumRegs == 2);
1590 // In the register argument case, NewPutArg replaces the original field list args with new
1591 // GT_PUTARG_REG nodes, inserts them in linear order and returns the field list. So the
1592 // only thing left to do is to insert the field list itself in linear order.
1593 assert(newArg == fieldList);
1594 BlockRange().InsertBefore(arg, newArg);
1598 // For longs, we will replace the GT_LONG with a GT_FIELD_LIST, and put that under a PUTARG_STK.
1599 // Although the hi argument needs to be pushed first, that will be handled by the general case,
1600 // in which the fields will be reversed.
1601 assert(callArg->AbiInfo.GetStackSlotsNumber() == 2);
1602 newArg->SetRegNum(REG_STK);
1603 BlockRange().InsertBefore(arg, fieldList, newArg);
1607 BlockRange().Remove(arg);
1610 #endif // !defined(TARGET_64BIT)
1613 #if defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
1614 if (call->IsVarargs() || comp->opts.compUseSoftFP)
1616 // For vararg call or on armel, reg args should be all integer.
1617 // Insert copies as needed to move float value to integer register.
1618 GenTree* newNode = LowerFloatArg(ppArg, callArg);
1619 if (newNode != nullptr)
1621 type = newNode->TypeGet();
1624 #endif // TARGET_ARMARCH || TARGET_LOONGARCH64 || TARGET_RISCV64
1626 GenTree* putArg = NewPutArg(call, arg, callArg, type);
1628 // In the case of register passable struct (in one or two registers)
1629 // the NewPutArg returns a new node (GT_PUTARG_REG or a GT_FIELD_LIST with two GT_PUTARG_REGs.)
1630 // If an extra node is returned, splice it in the right place in the tree.
1633 ReplaceArgWithPutArgOrBitcast(ppArg, putArg);
1639 if (arg->OperIsPutArgStk() || arg->OperIsPutArgSplit())
1641 LowerPutArgStkOrSplit(arg->AsPutArgStk());
1645 #if defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
1646 //------------------------------------------------------------------------
1647 // LowerFloatArg: Lower float call arguments on the arm/LoongArch64 platform.
1650 // arg - The arg node
1651 // callArg - call argument info
1654 // Return nullptr, if no transformation was done;
1655 // return arg if there was in place transformation;
1656 // return a new tree if the root was changed.
1659 // This must handle scalar float arguments as well as GT_FIELD_LISTs
1660 // with floating point fields.
1662 GenTree* Lowering::LowerFloatArg(GenTree** pArg, CallArg* callArg)
1664 GenTree* arg = *pArg;
1665 if (callArg->AbiInfo.GetRegNum() != REG_STK)
1667 if (arg->OperIs(GT_FIELD_LIST))
1669 // Transform fields that are passed as registers in place.
1670 regNumber currRegNumber = callArg->AbiInfo.GetRegNum();
1671 unsigned regIndex = 0;
1672 for (GenTreeFieldList::Use& use : arg->AsFieldList()->Uses())
1674 if (regIndex >= callArg->AbiInfo.NumRegs)
1678 GenTree* node = use.GetNode();
1679 if (varTypeUsesFloatReg(node))
1681 GenTree* intNode = LowerFloatArgReg(node, currRegNumber);
1682 assert(intNode != nullptr);
1684 ReplaceArgWithPutArgOrBitcast(&use.NodeRef(), intNode);
1687 if (node->TypeGet() == TYP_DOUBLE)
1689 currRegNumber = REG_NEXT(REG_NEXT(currRegNumber));
1694 currRegNumber = REG_NEXT(currRegNumber);
1698 // List fields were replaced in place.
1701 else if (varTypeUsesFloatReg(arg))
1703 GenTree* intNode = LowerFloatArgReg(arg, callArg->AbiInfo.GetRegNum());
1704 assert(intNode != nullptr);
1705 ReplaceArgWithPutArgOrBitcast(pArg, intNode);
1712 //------------------------------------------------------------------------
1713 // LowerFloatArgReg: Lower the float call argument node that is passed via register.
1716 // arg - The arg node
1717 // regNum - register number
1720 // Return new bitcast node, that moves float to int register.
1722 GenTree* Lowering::LowerFloatArgReg(GenTree* arg, regNumber regNum)
1724 assert(varTypeUsesFloatReg(arg));
1726 var_types floatType = arg->TypeGet();
1727 var_types intType = (floatType == TYP_FLOAT) ? TYP_INT : TYP_LONG;
1728 GenTree* intArg = comp->gtNewBitCastNode(intType, arg);
1729 intArg->SetRegNum(regNum);
1732 if (floatType == TYP_DOUBLE)
1734 // A special case when we introduce TYP_LONG
1735 // during lowering for arm32 softFP to pass double
1736 // in int registers.
1737 assert(comp->opts.compUseSoftFP);
1739 regNumber nextReg = REG_NEXT(regNum);
1740 intArg->AsMultiRegOp()->gtOtherReg = nextReg;
1747 // do lowering steps for each arg of a call
1748 void Lowering::LowerArgsForCall(GenTreeCall* call)
1750 JITDUMP("args:\n======\n");
1751 for (CallArg& arg : call->gtArgs.EarlyArgs())
1753 LowerArg(call, &arg, false);
1756 JITDUMP("\nlate:\n======\n");
1757 for (CallArg& arg : call->gtArgs.LateArgs())
1759 LowerArg(call, &arg, true);
1763 // helper that create a node representing a relocatable physical address computation
1764 GenTree* Lowering::AddrGen(ssize_t addr)
1766 // this should end up in codegen as : instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, reg, addr)
1767 GenTree* result = comp->gtNewIconHandleNode(addr, GTF_ICON_FTN_ADDR);
1771 // variant that takes a void*
1772 GenTree* Lowering::AddrGen(void* addr)
1774 return AddrGen((ssize_t)addr);
1777 //------------------------------------------------------------------------
1778 // LowerCallMemmove: Replace Buffer.Memmove(DST, SRC, CNS_SIZE) with a GT_STORE_BLK:
1780 // * STORE_BLK struct<CNS_SIZE> (copy) (Unroll)
1781 // +--* LCL_VAR byref dst
1783 // \--* LCL_VAR byref src
1786 // tree - GenTreeCall node to replace with STORE_BLK
1789 // nullptr if no changes were made
1791 GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
1793 JITDUMP("Considering Memmove [%06d] for unrolling.. ", comp->dspTreeID(call))
1794 assert(comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove);
1796 assert(call->gtArgs.CountUserArgs() == 3);
1798 if (comp->info.compHasNextCallRetAddr)
1800 JITDUMP("compHasNextCallRetAddr=true so we won't be able to remove the call - bail out.\n")
1804 GenTree* lengthArg = call->gtArgs.GetUserArgByIndex(2)->GetNode();
1805 if (lengthArg->IsIntegralConst())
1807 ssize_t cnsSize = lengthArg->AsIntCon()->IconValue();
1808 JITDUMP("Size=%ld.. ", (LONG)cnsSize);
1809 // TODO-CQ: drop the whole thing in case of 0
1810 if ((cnsSize > 0) && (cnsSize <= (ssize_t)comp->getUnrollThreshold(Compiler::UnrollKind::Memmove)))
1812 JITDUMP("Accepted for unrolling!\nOld tree:\n")
1815 GenTree* dstAddr = call->gtArgs.GetUserArgByIndex(0)->GetNode();
1816 GenTree* srcAddr = call->gtArgs.GetUserArgByIndex(1)->GetNode();
1818 // TODO-CQ: Try to create an addressing mode
1819 GenTreeIndir* srcBlk = comp->gtNewIndir(TYP_STRUCT, srcAddr);
1820 srcBlk->SetContained();
1822 GenTreeBlk* storeBlk = new (comp, GT_STORE_BLK)
1823 GenTreeBlk(GT_STORE_BLK, TYP_STRUCT, dstAddr, srcBlk, comp->typGetBlkLayout((unsigned)cnsSize));
1824 storeBlk->gtFlags |= (GTF_IND_UNALIGNED | GTF_ASG | GTF_EXCEPT | GTF_GLOB_REF);
1826 // TODO-CQ: Use GenTreeBlk::BlkOpKindUnroll here if srcAddr and dstAddr don't overlap, thus, we can
1827 // unroll this memmove as memcpy - it doesn't require lots of temp registers
1828 storeBlk->gtBlkOpKind = GenTreeBlk::BlkOpKindUnrollMemmove;
1830 BlockRange().InsertBefore(call, srcBlk);
1831 BlockRange().InsertBefore(call, storeBlk);
1832 BlockRange().Remove(lengthArg);
1833 BlockRange().Remove(call);
1835 // Remove all non-user args (e.g. r2r cell)
1836 for (CallArg& arg : call->gtArgs.Args())
1838 if (arg.IsArgAddedLate())
1840 arg.GetNode()->SetUnusedValue();
1844 JITDUMP("\nNew tree:\n")
1850 JITDUMP("Size is either 0 or too big to unroll.\n")
1855 JITDUMP("size is not a constant.\n")
1860 //------------------------------------------------------------------------
1861 // LowerCallMemcmp: Replace SpanHelpers.SequenceEqual)(left, right, CNS_SIZE)
1862 // with a series of merged comparisons (via GT_IND nodes)
1865 // tree - GenTreeCall node to unroll as memcmp
1868 // nullptr if no changes were made
1870 GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
1872 JITDUMP("Considering Memcmp [%06d] for unrolling.. ", comp->dspTreeID(call))
1873 assert(comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_SpanHelpers_SequenceEqual);
1874 assert(call->gtArgs.CountUserArgs() == 3);
1875 assert(TARGET_POINTER_SIZE == 8);
1877 if (!comp->opts.OptimizationEnabled())
1879 JITDUMP("Optimizations aren't allowed - bail out.\n")
1883 if (comp->info.compHasNextCallRetAddr)
1885 JITDUMP("compHasNextCallRetAddr=true so we won't be able to remove the call - bail out.\n")
1889 GenTree* lengthArg = call->gtArgs.GetUserArgByIndex(2)->GetNode();
1890 if (lengthArg->IsIntegralConst())
1892 ssize_t cnsSize = lengthArg->AsIntCon()->IconValue();
1893 JITDUMP("Size=%ld.. ", (LONG)cnsSize);
1894 // TODO-CQ: drop the whole thing in case of 0
1897 GenTree* lArg = call->gtArgs.GetUserArgByIndex(0)->GetNode();
1898 GenTree* rArg = call->gtArgs.GetUserArgByIndex(1)->GetNode();
1900 ssize_t MaxUnrollSize = comp->IsBaselineSimdIsaSupported() ? 32 : 16;
1902 #if defined(FEATURE_SIMD) && defined(TARGET_XARCH)
1903 if (comp->IsBaselineVector512IsaSupportedOpportunistically())
1905 MaxUnrollSize = 128;
1907 else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2))
1909 // We need AVX2 for NI_Vector256_op_Equality, fallback to Vector128 if only AVX is available
1914 if (cnsSize <= MaxUnrollSize)
1916 unsigned loadWidth = 1 << BitOperations::Log2((unsigned)cnsSize);
1920 loadType = TYP_UBYTE;
1922 else if (loadWidth == 2)
1924 loadType = TYP_USHORT;
1926 else if (loadWidth == 4)
1930 else if ((loadWidth == 8) || (MaxUnrollSize == 16))
1933 loadType = TYP_LONG;
1936 else if ((loadWidth == 16) || (MaxUnrollSize == 32))
1939 loadType = TYP_SIMD16;
1942 else if ((loadWidth == 32) || (MaxUnrollSize == 64))
1945 loadType = TYP_SIMD32;
1947 else if ((loadWidth == 64) || (MaxUnrollSize == 128))
1950 loadType = TYP_SIMD64;
1952 #endif // TARGET_XARCH
1953 #endif // FEATURE_SIMD
1958 var_types actualLoadType = genActualType(loadType);
1960 GenTree* result = nullptr;
1962 auto newBinaryOp = [](Compiler* comp, genTreeOps oper, var_types type, GenTree* op1,
1963 GenTree* op2) -> GenTree* {
1965 if (varTypeIsSIMD(op1))
1967 if (GenTree::OperIsCmpCompare(oper))
1969 assert(type == TYP_INT);
1970 return comp->gtNewSimdCmpOpAllNode(oper, TYP_BOOL, op1, op2, CORINFO_TYPE_NATIVEUINT,
1973 return comp->gtNewSimdBinOpNode(oper, op1->TypeGet(), op1, op2, CORINFO_TYPE_NATIVEUINT,
1977 return comp->gtNewOperNode(oper, type, op1, op2);
1980 // loadWidth == cnsSize means a single load is enough for both args
1981 if (loadWidth == (unsigned)cnsSize)
1983 // We're going to emit something like the following:
1985 // bool result = *(int*)leftArg == *(int*)rightArg
1987 // ^ in the given example we unroll for length=4
1989 GenTree* lIndir = comp->gtNewIndir(loadType, lArg);
1990 GenTree* rIndir = comp->gtNewIndir(loadType, rArg);
1991 result = newBinaryOp(comp, GT_EQ, TYP_INT, lIndir, rIndir);
1993 BlockRange().InsertAfter(lArg, lIndir);
1994 BlockRange().InsertAfter(rArg, rIndir);
1995 BlockRange().InsertBefore(call, result);
1999 // First, make both args multi-use:
2002 bool lFoundUse = BlockRange().TryGetUse(lArg, &lArgUse);
2003 bool rFoundUse = BlockRange().TryGetUse(rArg, &rArgUse);
2004 assert(lFoundUse && rFoundUse);
2005 GenTree* lArgClone = comp->gtNewLclvNode(lArgUse.ReplaceWithLclVar(comp), genActualType(lArg));
2006 GenTree* rArgClone = comp->gtNewLclvNode(rArgUse.ReplaceWithLclVar(comp), genActualType(rArg));
2007 BlockRange().InsertBefore(call, lArgClone, rArgClone);
2009 // We're going to emit something like the following:
2011 // bool result = ((*(int*)leftArg ^ *(int*)rightArg) |
2012 // (*(int*)(leftArg + 1) ^ *((int*)(rightArg + 1)))) == 0;
2014 // ^ in the given example we unroll for length=5
2022 // | | | \--* LCL_VAR byref V1
2024 // | | \--* LCL_VAR byref V2
2027 // | | \--* ADD byref
2028 // | | +--* LCL_VAR byref V1
2029 // | | \--* CNS_INT int 1
2032 // | +--* LCL_VAR byref V2
2033 // | \--* CNS_INT int 1
2034 // \--* CNS_INT int 0
2036 GenTree* l1Indir = comp->gtNewIndir(loadType, lArgUse.Def());
2037 GenTree* r1Indir = comp->gtNewIndir(loadType, rArgUse.Def());
2038 GenTree* lXor = newBinaryOp(comp, GT_XOR, actualLoadType, l1Indir, r1Indir);
2039 GenTree* l2Offs = comp->gtNewIconNode(cnsSize - loadWidth, TYP_I_IMPL);
2040 GenTree* l2AddOffs = newBinaryOp(comp, GT_ADD, lArg->TypeGet(), lArgClone, l2Offs);
2041 GenTree* l2Indir = comp->gtNewIndir(loadType, l2AddOffs);
2042 GenTree* r2Offs = comp->gtCloneExpr(l2Offs); // offset is the same
2043 GenTree* r2AddOffs = newBinaryOp(comp, GT_ADD, rArg->TypeGet(), rArgClone, r2Offs);
2044 GenTree* r2Indir = comp->gtNewIndir(loadType, r2AddOffs);
2045 GenTree* rXor = newBinaryOp(comp, GT_XOR, actualLoadType, l2Indir, r2Indir);
2046 GenTree* resultOr = newBinaryOp(comp, GT_OR, actualLoadType, lXor, rXor);
2047 GenTree* zeroCns = comp->gtNewZeroConNode(actualLoadType);
2048 result = newBinaryOp(comp, GT_EQ, TYP_INT, resultOr, zeroCns);
2050 BlockRange().InsertAfter(rArgClone, l1Indir, r1Indir, l2Offs, l2AddOffs);
2051 BlockRange().InsertAfter(l2AddOffs, l2Indir, r2Offs, r2AddOffs, r2Indir);
2052 BlockRange().InsertAfter(r2Indir, lXor, rXor, resultOr, zeroCns);
2053 BlockRange().InsertAfter(zeroCns, result);
2056 JITDUMP("\nUnrolled to:\n");
2060 if (BlockRange().TryGetUse(call, &use))
2062 use.ReplaceWith(result);
2066 result->SetUnusedValue();
2068 BlockRange().Remove(lengthArg);
2069 BlockRange().Remove(call);
2071 // Remove all non-user args (e.g. r2r cell)
2072 for (CallArg& arg : call->gtArgs.Args())
2074 if (!arg.IsUserArg())
2076 arg.GetNode()->SetUnusedValue();
2084 JITDUMP("Size is either 0 or too big to unroll.\n")
2089 JITDUMP("size is not a constant.\n")
2094 // do lowering steps for a call
2096 // - adding the placement nodes (either stack or register variety) for arguments
2097 // - lowering the expression that calculates the target address
2098 // - adding nodes for other operations that occur after the call sequence starts and before
2099 // control transfer occurs (profiling and tail call helpers, pinvoke incantations)
2101 GenTree* Lowering::LowerCall(GenTree* node)
2103 GenTreeCall* call = node->AsCall();
2105 JITDUMP("lowering call (before):\n");
2106 DISPTREERANGE(BlockRange(), call);
2109 // All runtime lookups are expected to be expanded in fgExpandRuntimeLookups
2110 assert(!call->IsExpRuntimeLookup());
2112 // Also, always expand static cctor helper for NativeAOT, see
2113 // https://github.com/dotnet/runtime/issues/68278#issuecomment-1543322819
2114 if (comp->IsTargetAbi(CORINFO_NATIVEAOT_ABI) && comp->IsStaticHelperEligibleForExpansion(call))
2116 assert(call->gtInitClsHnd == nullptr);
2119 #if defined(TARGET_AMD64) || defined(TARGET_ARM64)
2120 if (call->gtCallMoreFlags & GTF_CALL_M_SPECIAL_INTRINSIC)
2122 GenTree* newNode = nullptr;
2123 NamedIntrinsic ni = comp->lookupNamedIntrinsic(call->gtCallMethHnd);
2124 if (ni == NI_System_Buffer_Memmove)
2126 newNode = LowerCallMemmove(call);
2128 else if (ni == NI_System_SpanHelpers_SequenceEqual)
2130 newNode = LowerCallMemcmp(call);
2133 if (newNode != nullptr)
2135 return newNode->gtNext;
2140 call->ClearOtherRegs();
2141 LowerArgsForCall(call);
2143 // note that everything generated from this point might run AFTER the outgoing args are placed
2144 GenTree* controlExpr = nullptr;
2145 bool callWasExpandedEarly = false;
2147 // for x86, this is where we record ESP for checking later to make sure stack is balanced
2149 // Check for Delegate.Invoke(). If so, we inline it. We get the
2150 // target-object and target-function from the delegate-object, and do
2151 // an indirect call.
2152 if (call->IsDelegateInvoke())
2154 controlExpr = LowerDelegateInvoke(call);
2158 // Virtual and interface calls
2159 switch (call->gtFlags & GTF_CALL_VIRT_KIND_MASK)
2161 case GTF_CALL_VIRT_STUB:
2162 controlExpr = LowerVirtualStubCall(call);
2165 case GTF_CALL_VIRT_VTABLE:
2166 assert(call->IsVirtualVtable());
2167 if (!call->IsExpandedEarly())
2169 assert(call->gtControlExpr == nullptr);
2170 controlExpr = LowerVirtualVtableCall(call);
2174 callWasExpandedEarly = true;
2175 controlExpr = call->gtControlExpr;
2179 case GTF_CALL_NONVIRT:
2180 if (call->IsUnmanaged())
2182 controlExpr = LowerNonvirtPinvokeCall(call);
2184 else if (call->gtCallType == CT_INDIRECT)
2186 controlExpr = LowerIndirectNonvirtCall(call);
2190 controlExpr = LowerDirectCall(call);
2195 noway_assert(!"strange call type");
2200 // Indirect calls should always go through GenTreeCall::gtCallAddr and
2201 // should never have a control expression as well.
2202 assert((call->gtCallType != CT_INDIRECT) || (controlExpr == nullptr));
2204 if (call->IsTailCallViaJitHelper())
2206 // Either controlExpr or gtCallAddr must contain real call target.
2207 if (controlExpr == nullptr)
2209 assert(call->gtCallType == CT_INDIRECT);
2210 assert(call->gtCallAddr != nullptr);
2211 controlExpr = call->gtCallAddr;
2214 controlExpr = LowerTailCallViaJitHelper(call, controlExpr);
2217 // Check if we need to thread a newly created controlExpr into the LIR
2219 if ((controlExpr != nullptr) && !callWasExpandedEarly)
2221 LIR::Range controlExprRange = LIR::SeqTree(comp, controlExpr);
2223 JITDUMP("results of lowering call:\n");
2224 DISPRANGE(controlExprRange);
2226 ContainCheckRange(controlExprRange);
2228 BlockRange().InsertBefore(call, std::move(controlExprRange));
2229 call->gtControlExpr = controlExpr;
2232 if (comp->opts.IsCFGEnabled())
2237 if (call->IsFastTailCall())
2239 // Lower fast tail call can introduce new temps to set up args correctly for Callee.
2240 // This involves patching LCL_VAR and LCL_VAR_ADDR nodes holding Caller stack args
2241 // and replacing them with a new temp. Control expr also can contain nodes that need
2243 // Therefore lower fast tail call must be done after controlExpr is inserted into LIR.
2244 // There is one side effect which is flipping the order of PME and control expression
2245 // since LowerFastTailCall calls InsertPInvokeMethodEpilog.
2246 LowerFastTailCall(call);
2250 if (!call->IsHelperCall(comp, CORINFO_HELP_VALIDATE_INDIRECT_CALL))
2252 RequireOutgoingArgSpace(call, call->gtArgs.OutgoingArgsStackSize());
2256 if (varTypeIsStruct(call))
2258 LowerCallStruct(call);
2261 ContainCheckCallOperands(call);
2262 JITDUMP("lowering call (after):\n");
2263 DISPTREERANGE(BlockRange(), call);
2268 // Inserts profiler hook, GT_PROF_HOOK for a tail call node.
2271 // We need to insert this after all nested calls, but before all the arguments to this call have been set up.
2272 // To do this, we look for the first GT_PUTARG_STK or GT_PUTARG_REG, and insert the hook immediately before
2273 // that. If there are no args, then it should be inserted before the call node.
2276 // * stmtExpr void (top level) (IL 0x000...0x010)
2277 // arg0 SETUP | /--* argPlace ref REG NA $c5
2278 // this in rcx | | /--* argPlace ref REG NA $c1
2279 // | | | /--* call ref System.Globalization.CultureInfo.get_InvariantCulture $c2
2280 // arg1 SETUP | | +--* st.lclVar ref V02 tmp1 REG NA $c2
2281 // | | | /--* lclVar ref V02 tmp1 u : 2 (last use) REG NA $c2
2282 // arg1 in rdx | | +--* putarg_reg ref REG NA
2283 // | | | /--* lclVar ref V00 arg0 u : 2 (last use) REG NA $80
2284 // this in rcx | | +--* putarg_reg ref REG NA
2285 // | | /--* call nullcheck ref System.String.ToLower $c5
2286 // | | { * stmtExpr void (embedded)(IL 0x000... ? ? ? )
2287 // | | { \--* prof_hook void REG NA
2288 // arg0 in rcx | +--* putarg_reg ref REG NA
2289 // control expr | +--* const(h) long 0x7ffe8e910e98 ftn REG NA
2290 // \--* call void System.Runtime.Remoting.Identity.RemoveAppNameOrAppGuidIfNecessary $VN.Void
2292 // In this case, the GT_PUTARG_REG src is a nested call. We need to put the instructions after that call
2293 // (as shown). We assume that of all the GT_PUTARG_*, only the first one can have a nested call.
2296 // Insert the profiler hook immediately before the call. The profiler hook will preserve
2297 // all argument registers (ECX, EDX), but nothing else.
2300 // callNode - tail call node
2301 // insertionPoint - if non-null, insert the profiler hook before this point.
2302 // If null, insert the profiler hook before args are setup
2303 // but after all arg side effects are computed.
2305 void Lowering::InsertProfTailCallHook(GenTreeCall* call, GenTree* insertionPoint)
2307 assert(call->IsTailCall());
2308 assert(comp->compIsProfilerHookNeeded());
2310 #if defined(TARGET_X86)
2312 if (insertionPoint == nullptr)
2314 insertionPoint = call;
2317 #else // !defined(TARGET_X86)
2319 if (insertionPoint == nullptr)
2321 insertionPoint = FindEarliestPutArg(call);
2323 if (insertionPoint == nullptr)
2325 insertionPoint = call;
2329 #endif // !defined(TARGET_X86)
2331 assert(insertionPoint != nullptr);
2332 JITDUMP("Inserting profiler tail call before [%06u]\n", comp->dspTreeID(insertionPoint));
2334 GenTree* profHookNode = new (comp, GT_PROF_HOOK) GenTree(GT_PROF_HOOK, TYP_VOID);
2335 BlockRange().InsertBefore(insertionPoint, profHookNode);
2338 //------------------------------------------------------------------------
2339 // FindEarliestPutArg: Find the earliest direct PUTARG operand of a call node in
2346 // A PUTARG_* node that is the earliest of the call, or nullptr if the call
2347 // has no arguments.
2349 GenTree* Lowering::FindEarliestPutArg(GenTreeCall* call)
2351 size_t numMarkedNodes = 0;
2352 for (CallArg& arg : call->gtArgs.Args())
2354 if (arg.GetEarlyNode() != nullptr)
2356 numMarkedNodes += MarkPutArgNodes(arg.GetEarlyNode());
2359 if (arg.GetLateNode() != nullptr)
2361 numMarkedNodes += MarkPutArgNodes(arg.GetLateNode());
2365 if (numMarkedNodes <= 0)
2370 GenTree* node = call;
2373 node = node->gtPrev;
2375 assert((node != nullptr) && "Reached beginning of basic block while looking for marked nodes");
2377 if ((node->gtLIRFlags & LIR::Flags::Mark) != 0)
2379 node->gtLIRFlags &= ~LIR::Flags::Mark;
2382 } while (numMarkedNodes > 0);
2384 assert(node->OperIsPutArg());
2388 //------------------------------------------------------------------------
2389 // MarkPutArgNodes: Mark all direct operand PUTARG nodes with a LIR mark.
2392 // node - the node (either a field list or PUTARG node)
2395 // The number of marks added.
2397 size_t Lowering::MarkPutArgNodes(GenTree* node)
2399 assert(node->OperIsPutArg() || node->OperIsFieldList());
2402 if (node->OperIsFieldList())
2404 for (GenTreeFieldList::Use& operand : node->AsFieldList()->Uses())
2406 assert(operand.GetNode()->OperIsPutArg());
2407 result += MarkPutArgNodes(operand.GetNode());
2412 assert((node->gtLIRFlags & LIR::Flags::Mark) == 0);
2413 node->gtLIRFlags |= LIR::Flags::Mark;
2420 //------------------------------------------------------------------------
2421 // LowerFastTailCall: Lower a call node dispatched as a fast tailcall (epilog +
2425 // call - the call node that is being dispatched as a fast tailcall.
2428 // call must be non-null.
2431 // For fast tail calls it is necessary to set up stack args in the incoming
2432 // arg stack space area. When args passed also come from this area we may
2433 // run into problems because we may end up overwriting the stack slot before
2434 // using it. For example, for foo(a, b) { return bar(b, a); }, if a and b
2435 // are on incoming arg stack space in foo they need to be swapped in this
2436 // area for the call to bar. This function detects this situation and
2437 // introduces a temp when an outgoing argument would overwrite a later-used
2438 // incoming argument.
2440 // This function also handles inserting necessary profiler hooks and pinvoke
2441 // method epilogs in case there are inlined pinvokes.
2442 void Lowering::LowerFastTailCall(GenTreeCall* call)
2444 #if FEATURE_FASTTAILCALL
2445 // Tail call restrictions i.e. conditions under which tail prefix is ignored.
2446 // Most of these checks are already done by importer or fgMorphTailCall().
2447 // This serves as a double sanity check.
2448 assert((comp->info.compFlags & CORINFO_FLG_SYNCH) == 0); // tail calls from synchronized methods
2449 assert(!comp->opts.IsReversePInvoke()); // tail calls reverse pinvoke
2450 assert(!call->IsUnmanaged()); // tail calls to unamanaged methods
2451 assert(!comp->compLocallocUsed); // tail call from methods that also do localloc
2454 assert(!comp->getNeedsGSSecurityCookie()); // jit64 compat: tail calls from methods that need GS check
2455 #endif // TARGET_AMD64
2457 // We expect to see a call that meets the following conditions
2458 assert(call->IsFastTailCall());
2460 // VM cannot use return address hijacking when A() and B() tail call each
2461 // other in mutual recursion. Therefore, this block is reachable through
2462 // a GC-safe point or the whole method is marked as fully interruptible.
2465 // optReachWithoutCall() depends on the fact that loop headers blocks
2466 // will have a block number > fgLastBB. These loop headers gets added
2467 // after dominator computation and get skipped by OptReachWithoutCall().
2468 // The below condition cannot be asserted in lower because fgSimpleLowering()
2469 // can add a new basic block for range check failure which becomes
2470 // fgLastBB with block number > loop header block number.
2471 // assert((comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT) ||
2472 // !comp->optReachWithoutCall(comp->fgFirstBB, comp->compCurBB) || comp->GetInterruptible());
2474 // If PInvokes are in-lined, we have to remember to execute PInvoke method epilog anywhere that
2475 // a method returns. This is a case of caller method has both PInvokes and tail calls.
2476 if (comp->compMethodRequiresPInvokeFrame())
2478 InsertPInvokeMethodEpilog(comp->compCurBB DEBUGARG(call));
2481 // Args for tail call are setup in incoming arg area. The gc-ness of args of
2482 // caller and callee (which being tail called) may not match. Therefore, everything
2483 // from arg setup until the epilog need to be non-interruptible by GC. This is
2484 // achieved by inserting GT_START_NONGC before the very first GT_PUTARG_STK node
2485 // of call is setup. Note that once a stack arg is setup, it cannot have nested
2486 // calls subsequently in execution order to setup other args, because the nested
2487 // call could over-write the stack arg that is setup earlier.
2488 ArrayStack<GenTree*> putargs(comp->getAllocator(CMK_ArrayStack));
2490 for (CallArg& arg : call->gtArgs.EarlyArgs())
2492 if (arg.GetEarlyNode()->OperIs(GT_PUTARG_STK))
2494 putargs.Push(arg.GetEarlyNode());
2498 for (CallArg& arg : call->gtArgs.LateArgs())
2500 if (arg.GetLateNode()->OperIs(GT_PUTARG_STK))
2502 putargs.Push(arg.GetLateNode());
2506 GenTree* startNonGCNode = nullptr;
2507 if (!putargs.Empty())
2509 // Get the earliest operand of the first PUTARG_STK node. We will make
2510 // the required copies of args before this node.
2512 GenTree* insertionPoint = BlockRange().GetTreeRange(putargs.Bottom(), &unused).FirstNode();
2513 // Insert GT_START_NONGC node before we evaluate the PUTARG_STK args.
2514 // Note that if there are no args to be setup on stack, no need to
2515 // insert GT_START_NONGC node.
2516 startNonGCNode = new (comp, GT_START_NONGC) GenTree(GT_START_NONGC, TYP_VOID);
2517 BlockRange().InsertBefore(insertionPoint, startNonGCNode);
2519 // Gc-interruptability in the following case:
2520 // foo(a, b, c, d, e) { bar(a, b, c, d, e); }
2521 // bar(a, b, c, d, e) { foo(a, b, d, d, e); }
2523 // Since the instruction group starting from the instruction that sets up first
2524 // stack arg to the end of the tail call is marked as non-gc interruptible,
2525 // this will form a non-interruptible tight loop causing gc-starvation. To fix
2526 // this we insert GT_NO_OP as embedded stmt before GT_START_NONGC, if the method
2527 // has a single basic block and is not a GC-safe point. The presence of a single
2528 // nop outside non-gc interruptible region will prevent gc starvation.
2529 if ((comp->fgBBcount == 1) && !(comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT))
2531 assert(comp->fgFirstBB == comp->compCurBB);
2532 GenTree* noOp = new (comp, GT_NO_OP) GenTree(GT_NO_OP, TYP_VOID);
2533 BlockRange().InsertBefore(startNonGCNode, noOp);
2536 // Since this is a fast tailcall each PUTARG_STK will place the argument in the
2537 // _incoming_ arg space area. This will effectively overwrite our already existing
2538 // incoming args that live in that area. If we have later uses of those args, this
2539 // is a problem. We introduce a defensive copy into a temp here of those args that
2540 // potentially may cause problems.
2541 for (int i = 0; i < putargs.Height(); i++)
2543 GenTreePutArgStk* put = putargs.Bottom(i)->AsPutArgStk();
2545 unsigned int overwrittenStart = put->getArgOffset();
2546 unsigned int overwrittenEnd = overwrittenStart + put->GetStackByteSize();
2547 int baseOff = -1; // Stack offset of first arg on stack
2549 for (unsigned callerArgLclNum = 0; callerArgLclNum < comp->info.compArgsCount; callerArgLclNum++)
2551 LclVarDsc* callerArgDsc = comp->lvaGetDesc(callerArgLclNum);
2553 if (callerArgDsc->lvIsRegArg)
2558 unsigned int argStart;
2559 unsigned int argEnd;
2560 #if defined(TARGET_AMD64)
2561 if (TargetOS::IsWindows)
2563 // On Windows x64, the argument position determines the stack slot uniquely, and even the
2564 // register args take up space in the stack frame (shadow space).
2565 argStart = callerArgLclNum * TARGET_POINTER_SIZE;
2566 argEnd = argStart + static_cast<unsigned int>(callerArgDsc->lvArgStackSize());
2569 #endif // TARGET_AMD64
2571 assert(callerArgDsc->GetStackOffset() != BAD_STK_OFFS);
2575 baseOff = callerArgDsc->GetStackOffset();
2578 // On all ABIs where we fast tail call the stack args should come in order.
2579 assert(baseOff <= callerArgDsc->GetStackOffset());
2581 // Compute offset of this stack argument relative to the first stack arg.
2582 // This will be its offset into the incoming arg space area.
2583 argStart = static_cast<unsigned int>(callerArgDsc->GetStackOffset() - baseOff);
2584 argEnd = argStart + comp->lvaLclSize(callerArgLclNum);
2587 // If ranges do not overlap then this PUTARG_STK will not mess up the arg.
2588 if ((overwrittenEnd <= argStart) || (overwrittenStart >= argEnd))
2593 // Codegen cannot handle a partially overlapping copy. For
2594 // example, if we have
2595 // bar(S16 stack, S32 stack2)
2596 // foo(S32 stack, S32 stack2) { bar(..., stack) }
2597 // then we may end up having to move 'stack' in foo 16 bytes
2598 // ahead. It is possible that this PUTARG_STK is the only use,
2599 // in which case we will need to introduce a temp, so look for
2600 // uses starting from it. Note that we assume that in-place
2602 GenTree* lookForUsesFrom = put->gtNext;
2603 if (overwrittenStart != argStart)
2605 lookForUsesFrom = insertionPoint;
2608 RehomeArgForFastTailCall(callerArgLclNum, insertionPoint, lookForUsesFrom, call);
2609 // The above call can introduce temps and invalidate the pointer.
2610 callerArgDsc = comp->lvaGetDesc(callerArgLclNum);
2612 // For promoted locals we have more work to do as its fields could also have been invalidated.
2613 if (!callerArgDsc->lvPromoted)
2618 unsigned int fieldsFirst = callerArgDsc->lvFieldLclStart;
2619 unsigned int fieldsEnd = fieldsFirst + callerArgDsc->lvFieldCnt;
2620 for (unsigned int j = fieldsFirst; j < fieldsEnd; j++)
2622 RehomeArgForFastTailCall(j, insertionPoint, lookForUsesFrom, call);
2628 // Insert GT_PROF_HOOK node to emit profiler tail call hook. This should be
2629 // inserted before the args are setup but after the side effects of args are
2630 // computed. That is, GT_PROF_HOOK node needs to be inserted before GT_START_NONGC
2631 // node if one exists.
2632 if (comp->compIsProfilerHookNeeded())
2634 InsertProfTailCallHook(call, startNonGCNode);
2637 #else // !FEATURE_FASTTAILCALL
2639 // Platform does not implement fast tail call mechanism. This cannot be
2640 // reached because we always choose to do a tailcall via helper on those
2641 // platforms (or no tailcall at all).
2646 //------------------------------------------------------------------------
2647 // RehomeArgForFastTailCall: Introduce temps for args that may be overwritten
2648 // during fast tailcall sequence.
2651 // lclNum - the lcl num of the arg that will be overwritten.
2652 // insertTempBefore - the node at which to copy the arg into a temp.
2653 // lookForUsesStart - the node where to start scanning and replacing uses of
2654 // the arg specified by lclNum.
2655 // callNode - the call node that is being dispatched as a fast tailcall.
2658 // all args must be non-null.
2661 // This function scans for uses of the arg specified by lclNum starting
2662 // from the lookForUsesStart node. If it finds any uses it introduces a temp
2663 // for this argument and updates uses to use this instead. In the situation
2664 // where it introduces a temp it can thus invalidate pointers to other
2667 void Lowering::RehomeArgForFastTailCall(unsigned int lclNum,
2668 GenTree* insertTempBefore,
2669 GenTree* lookForUsesStart,
2670 GenTreeCall* callNode)
2672 unsigned int tmpLclNum = BAD_VAR_NUM;
2673 for (GenTree* treeNode = lookForUsesStart; treeNode != callNode; treeNode = treeNode->gtNext)
2675 if (!treeNode->OperIsLocal() && !treeNode->OperIs(GT_LCL_ADDR))
2680 GenTreeLclVarCommon* lcl = treeNode->AsLclVarCommon();
2682 if (lcl->GetLclNum() != lclNum)
2687 // Create tmp and use it in place of callerArgDsc
2688 if (tmpLclNum == BAD_VAR_NUM)
2690 tmpLclNum = comp->lvaGrabTemp(true DEBUGARG("Fast tail call lowering is creating a new local variable"));
2692 LclVarDsc* callerArgDsc = comp->lvaGetDesc(lclNum);
2693 var_types tmpTyp = genActualType(callerArgDsc->TypeGet());
2694 comp->lvaTable[tmpLclNum].lvType = tmpTyp;
2695 // TODO-CQ: I don't see why we should copy doNotEnreg.
2696 comp->lvaTable[tmpLclNum].lvDoNotEnregister = callerArgDsc->lvDoNotEnregister;
2698 comp->lvaTable[tmpLclNum].SetDoNotEnregReason(callerArgDsc->GetDoNotEnregReason());
2703 if (tmpTyp == TYP_LONG)
2705 GenTree* loResult = comp->gtNewLclFldNode(lclNum, TYP_INT, 0);
2706 GenTree* hiResult = comp->gtNewLclFldNode(lclNum, TYP_INT, 4);
2707 value = new (comp, GT_LONG) GenTreeOp(GT_LONG, TYP_LONG, loResult, hiResult);
2710 #endif // TARGET_ARM
2712 value = comp->gtNewLclvNode(lclNum, tmpTyp);
2715 if (tmpTyp == TYP_STRUCT)
2717 comp->lvaSetStruct(tmpLclNum, comp->lvaGetDesc(lclNum)->GetLayout(), false);
2719 GenTreeLclVar* storeLclVar = comp->gtNewStoreLclVarNode(tmpLclNum, value);
2720 BlockRange().InsertBefore(insertTempBefore, LIR::SeqTree(comp, storeLclVar));
2721 ContainCheckRange(value, storeLclVar);
2722 LowerNode(storeLclVar);
2725 lcl->SetLclNum(tmpLclNum);
2729 //------------------------------------------------------------------------
2730 // LowerTailCallViaJitHelper: lower a call via the tailcall JIT helper. Morph
2731 // has already inserted tailcall helper special arguments. This function inserts
2732 // actual data for some placeholders. This function is only used on x86.
2735 // tail.call(<function args>, int numberOfOldStackArgs, int dummyNumberOfNewStackArgs, int flags, void* dummyArg)
2737 // JIT_TailCall(<function args>, int numberOfOldStackArgsWords, int numberOfNewStackArgsWords, int flags, void*
2739 // Note that the special arguments are on the stack, whereas the function arguments follow the normal convention.
2741 // Also inserts PInvoke method epilog if required.
2744 // call - The call node
2745 // callTarget - The real call target. This is used to replace the dummyArg during lowering.
2748 // Returns control expression tree for making a call to helper Jit_TailCall.
2750 GenTree* Lowering::LowerTailCallViaJitHelper(GenTreeCall* call, GenTree* callTarget)
2752 // Tail call restrictions i.e. conditions under which tail prefix is ignored.
2753 // Most of these checks are already done by importer or fgMorphTailCall().
2754 // This serves as a double sanity check.
2755 assert((comp->info.compFlags & CORINFO_FLG_SYNCH) == 0); // tail calls from synchronized methods
2756 assert(!call->IsUnmanaged()); // tail calls to unamanaged methods
2757 assert(!comp->compLocallocUsed); // tail call from methods that also do localloc
2759 // We expect to see a call that meets the following conditions
2760 assert(call->IsTailCallViaJitHelper());
2761 assert(callTarget != nullptr);
2763 // The TailCall helper call never returns to the caller and is not GC interruptible.
2764 // Therefore the block containing the tail call should be a GC safe point to avoid
2765 // GC starvation. It is legal for the block to be unmarked iff the entry block is a
2766 // GC safe point, as the entry block trivially dominates every reachable block.
2767 assert((comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT) || (comp->fgFirstBB->bbFlags & BBF_GC_SAFE_POINT));
2769 // If PInvokes are in-lined, we have to remember to execute PInvoke method epilog anywhere that
2770 // a method returns. This is a case of caller method has both PInvokes and tail calls.
2771 if (comp->compMethodRequiresPInvokeFrame())
2773 InsertPInvokeMethodEpilog(comp->compCurBB DEBUGARG(call));
2776 // Remove gtCallAddr from execution order if present.
2777 if (call->gtCallType == CT_INDIRECT)
2779 assert(call->gtCallAddr != nullptr);
2782 LIR::ReadOnlyRange callAddrRange = BlockRange().GetTreeRange(call->gtCallAddr, &isClosed);
2785 BlockRange().Remove(std::move(callAddrRange));
2788 // The callTarget tree needs to be sequenced.
2789 LIR::Range callTargetRange = LIR::SeqTree(comp, callTarget);
2791 // Verify the special args are what we expect, and replace the dummy args with real values.
2792 // We need to figure out the size of the outgoing stack arguments, not including the special args.
2793 // The number of 4-byte words is passed to the helper for the incoming and outgoing argument sizes.
2794 // This number is exactly the next slot number in the call's argument info struct.
2795 unsigned nNewStkArgsBytes = call->gtArgs.OutgoingArgsStackSize();
2796 const int wordSize = 4;
2797 unsigned nNewStkArgsWords = nNewStkArgsBytes / wordSize;
2798 assert(nNewStkArgsWords >= 4); // There must be at least the four special stack args.
2799 nNewStkArgsWords -= 4;
2801 unsigned numArgs = call->gtArgs.CountArgs();
2803 // arg 0 == callTarget.
2804 CallArg* argEntry = call->gtArgs.GetArgByIndex(numArgs - 1);
2805 assert(argEntry != nullptr);
2806 GenTree* arg0 = argEntry->GetEarlyNode()->AsPutArgStk()->gtGetOp1();
2808 ContainCheckRange(callTargetRange);
2809 BlockRange().InsertAfter(arg0, std::move(callTargetRange));
2812 LIR::ReadOnlyRange secondArgRange = BlockRange().GetTreeRange(arg0, &isClosed);
2814 BlockRange().Remove(std::move(secondArgRange));
2816 argEntry->GetEarlyNode()->AsPutArgStk()->gtOp1 = callTarget;
2819 argEntry = call->gtArgs.GetArgByIndex(numArgs - 2);
2820 assert(argEntry != nullptr);
2821 GenTree* arg1 = argEntry->GetEarlyNode()->AsPutArgStk()->gtGetOp1();
2822 assert(arg1->gtOper == GT_CNS_INT);
2824 ssize_t tailCallHelperFlags = 1 | // always restore EDI,ESI,EBX
2825 (call->IsVirtualStub() ? 0x2 : 0x0); // Stub dispatch flag
2826 arg1->AsIntCon()->gtIconVal = tailCallHelperFlags;
2828 // arg 2 == numberOfNewStackArgsWords
2829 argEntry = call->gtArgs.GetArgByIndex(numArgs - 3);
2830 assert(argEntry != nullptr);
2831 GenTree* arg2 = argEntry->GetEarlyNode()->AsPutArgStk()->gtGetOp1();
2832 assert(arg2->gtOper == GT_CNS_INT);
2834 arg2->AsIntCon()->gtIconVal = nNewStkArgsWords;
2837 // arg 3 == numberOfOldStackArgsWords
2838 argEntry = call->gtArgs.GetArgByIndex(numArgs - 4);
2839 assert(argEntry != nullptr);
2840 GenTree* arg3 = argEntry->GetEarlyNode()->AsPutArgStk()->gtGetOp1();
2841 assert(arg3->gtOper == GT_CNS_INT);
2844 // Transform this call node into a call to Jit tail call helper.
2845 call->gtCallType = CT_HELPER;
2846 call->gtCallMethHnd = comp->eeFindHelper(CORINFO_HELP_TAILCALL);
2847 call->gtFlags &= ~GTF_CALL_VIRT_KIND_MASK;
2849 // Lower this as if it were a pure helper call.
2850 call->gtCallMoreFlags &= ~(GTF_CALL_M_TAILCALL | GTF_CALL_M_TAILCALL_VIA_JIT_HELPER);
2851 GenTree* result = LowerDirectCall(call);
2853 // Now add back tail call flags for identifying this node as tail call dispatched via helper.
2854 call->gtCallMoreFlags |= GTF_CALL_M_TAILCALL | GTF_CALL_M_TAILCALL_VIA_JIT_HELPER;
2856 #ifdef PROFILING_SUPPORTED
2857 // Insert profiler tail call hook if needed.
2858 // Since we don't know the insertion point, pass null for second param.
2859 if (comp->compIsProfilerHookNeeded())
2861 InsertProfTailCallHook(call, nullptr);
2863 #endif // PROFILING_SUPPORTED
2868 //------------------------------------------------------------------------
2869 // LowerCFGCall: Potentially lower a call to use control-flow guard. This
2870 // expands indirect calls into either a validate+call sequence or to a dispatch
2871 // helper taking the original target in a special register.
2874 // call - The call node
2876 void Lowering::LowerCFGCall(GenTreeCall* call)
2878 assert(!call->IsHelperCall(comp, CORINFO_HELP_DISPATCH_INDIRECT_CALL));
2879 if (call->IsHelperCall(comp, CORINFO_HELP_VALIDATE_INDIRECT_CALL))
2884 GenTree* callTarget = call->gtCallType == CT_INDIRECT ? call->gtCallAddr : call->gtControlExpr;
2885 if (callTarget == nullptr)
2887 assert((call->gtCallType != CT_INDIRECT) && (!call->IsVirtual() || call->IsVirtualStubRelativeIndir()));
2888 if (!call->IsVirtual())
2890 // Direct call with stashed address
2894 // This is a VSD call with the call target being null because we are
2895 // supposed to load it from the indir cell. Due to CFG we will need
2896 // this address twice, and at least on ARM64 we do not want to
2897 // materialize the constant both times.
2898 CallArg* indirCellArg = call->gtArgs.FindWellKnownArg(WellKnownArg::VirtualStubCell);
2899 assert((indirCellArg != nullptr) && indirCellArg->GetNode()->OperIs(GT_PUTARG_REG));
2901 GenTreeOp* putArgNode = indirCellArg->GetNode()->AsOp();
2902 LIR::Use indirCellArgUse(BlockRange(), &putArgNode->gtOp1, putArgNode);
2904 // On non-xarch, we create a local even for constants. On xarch cloning
2905 // the constant is better since it can be contained in the load below.
2906 bool cloneConsts = false;
2911 GenTree* indirCellClone;
2913 if (indirCellArgUse.Def()->OperIs(GT_LCL_VAR) || (cloneConsts && indirCellArgUse.Def()->IsCnsIntOrI()))
2915 indirCellClone = comp->gtClone(indirCellArgUse.Def());
2919 unsigned newLcl = indirCellArgUse.ReplaceWithLclVar(comp);
2920 indirCellClone = comp->gtNewLclvNode(newLcl, TYP_I_IMPL);
2923 callTarget = Ind(indirCellClone);
2924 LIR::Range controlExprRange = LIR::SeqTree(comp, callTarget);
2925 ContainCheckRange(controlExprRange);
2927 BlockRange().InsertBefore(call, std::move(controlExprRange));
2928 call->gtControlExpr = callTarget;
2932 if (callTarget->IsIntegralConst())
2934 // This is a direct call, no CFG check is necessary.
2939 CFGCallKind cfgKind = call->GetCFGCallKind();
2943 case CFGCallKind::ValidateAndCall:
2945 // To safely apply CFG we need to generate a very specific pattern:
2946 // in particular, it is a safety issue to allow the JIT to reload
2947 // the call target from memory between calling
2948 // CORINFO_HELP_VALIDATE_INDIRECT_CALL and the target. This is
2949 // something that would easily occur in debug codegen if we
2950 // produced high-level IR. Instead we will use a GT_PHYSREG node
2951 // to get the target back from the register that contains the target.
2953 // Additionally, the validator does not preserve all arg registers,
2954 // so we have to move all GT_PUTARG_REG nodes that would otherwise
2955 // be trashed ahead. The JIT also has an internal invariant that
2956 // once GT_PUTARG nodes start to appear in LIR, the call is coming
2957 // up. To avoid breaking this invariant we move _all_ GT_PUTARG
2958 // nodes (in particular, GC info reporting relies on this).
2960 // To sum up, we end up transforming
2962 // ta... = <early args>
2963 // tb... = <late args>
2965 // GT_CALL tc, ta..., tb...
2969 // ta... = <early args> (without GT_PUTARG_* nodes)
2971 // GT_CALL CORINFO_HELP_VALIDATE_INDIRECT_CALL, tb
2972 // tc = GT_PHYSREG REG_VALIDATE_INDIRECT_CALL_ADDR (preserved by helper)
2973 // td = <moved GT_PUTARG_* nodes>
2974 // GT_CALL tb, ta..., td..
2977 GenTree* regNode = PhysReg(REG_VALIDATE_INDIRECT_CALL_ADDR, TYP_I_IMPL);
2979 bool gotUse = BlockRange().TryGetUse(callTarget, &useOfTar);
2981 useOfTar.ReplaceWith(regNode);
2983 // Add the call to the validator. Use a placeholder for the target while we
2984 // morph, sequence and lower, to avoid redoing that for the actual target.
2985 GenTree* targetPlaceholder = comp->gtNewZeroConNode(callTarget->TypeGet());
2986 GenTreeCall* validate = comp->gtNewHelperCallNode(CORINFO_HELP_VALIDATE_INDIRECT_CALL, TYP_VOID);
2988 NewCallArg::Primitive(targetPlaceholder).WellKnown(WellKnownArg::ValidateIndirectCallTarget);
2989 validate->gtArgs.PushFront(comp, newArg);
2991 comp->fgMorphTree(validate);
2993 LIR::Range validateRange = LIR::SeqTree(comp, validate);
2994 GenTree* validateFirst = validateRange.FirstNode();
2995 GenTree* validateLast = validateRange.LastNode();
2996 // Insert the validator with the call target before the late args.
2997 BlockRange().InsertBefore(call, std::move(validateRange));
2999 // Swap out the target
3000 gotUse = BlockRange().TryGetUse(targetPlaceholder, &useOfTar);
3002 useOfTar.ReplaceWith(callTarget);
3003 targetPlaceholder->SetUnusedValue();
3005 LowerRange(validateFirst, validateLast);
3007 // Insert the PHYSREG node that we must load right after validation.
3008 BlockRange().InsertAfter(validate, regNode);
3011 // Finally move all GT_PUTARG_* nodes
3012 for (CallArg& arg : call->gtArgs.EarlyArgs())
3014 GenTree* node = arg.GetEarlyNode();
3015 // Non-value nodes in early args are setup nodes for late args.
3016 if (node->IsValue())
3018 assert(node->OperIsPutArg() || node->OperIsFieldList());
3019 MoveCFGCallArg(call, node);
3023 for (CallArg& arg : call->gtArgs.LateArgs())
3025 GenTree* node = arg.GetLateNode();
3026 assert(node->OperIsPutArg() || node->OperIsFieldList());
3027 MoveCFGCallArg(call, node);
3031 case CFGCallKind::Dispatch:
3033 #ifdef REG_DISPATCH_INDIRECT_CALL_ADDR
3034 // Now insert the call target as an extra argument.
3036 NewCallArg callTargetNewArg =
3037 NewCallArg::Primitive(callTarget).WellKnown(WellKnownArg::DispatchIndirectCallTarget);
3038 CallArg* targetArg = call->gtArgs.PushBack(comp, callTargetNewArg);
3039 targetArg->SetEarlyNode(nullptr);
3040 targetArg->SetLateNode(callTarget);
3041 call->gtArgs.PushLateBack(targetArg);
3043 // Set up ABI information for this arg.
3044 targetArg->AbiInfo.ArgType = callTarget->TypeGet();
3045 targetArg->AbiInfo.SetRegNum(0, REG_DISPATCH_INDIRECT_CALL_ADDR);
3046 targetArg->AbiInfo.NumRegs = 1;
3047 targetArg->AbiInfo.SetByteSize(TARGET_POINTER_SIZE, TARGET_POINTER_SIZE, false, false);
3049 // Lower the newly added args now that call is updated
3050 LowerArg(call, targetArg, true /* late */);
3052 // Finally update the call to be a helper call
3053 call->gtCallType = CT_HELPER;
3054 call->gtCallMethHnd = Compiler::eeFindHelper(CORINFO_HELP_DISPATCH_INDIRECT_CALL);
3055 call->gtFlags &= ~GTF_CALL_VIRT_KIND_MASK;
3056 #ifdef FEATURE_READYTORUN
3057 call->gtEntryPoint.addr = nullptr;
3058 call->gtEntryPoint.accessType = IAT_VALUE;
3061 // Now relower the call target
3062 call->gtControlExpr = LowerDirectCall(call);
3064 if (call->gtControlExpr != nullptr)
3066 LIR::Range dispatchControlExprRange = LIR::SeqTree(comp, call->gtControlExpr);
3068 ContainCheckRange(dispatchControlExprRange);
3069 BlockRange().InsertBefore(call, std::move(dispatchControlExprRange));
3072 assert(!"Unexpected CFGCallKind::Dispatch for platform without dispatcher");
3081 //------------------------------------------------------------------------
3082 // IsCFGCallArgInvariantInRange: A cheap version of IsInvariantInRange to check
3083 // if a node is invariant in the specified range. In other words, can 'node' be
3084 // moved to right before 'endExclusive' without its computation changing
3089 // endExclusive - The exclusive end of the range to check invariance for.
3091 bool Lowering::IsCFGCallArgInvariantInRange(GenTree* node, GenTree* endExclusive)
3093 assert(node->Precedes(endExclusive));
3095 if (node->IsInvariant())
3100 if (!node->IsValue())
3105 if (node->OperIsLocal())
3107 GenTreeLclVarCommon* lcl = node->AsLclVarCommon();
3108 LclVarDsc* desc = comp->lvaGetDesc(lcl);
3109 if (desc->IsAddressExposed())
3114 // Currently, non-address exposed locals have the property that their
3115 // use occurs at the user, so no further interference check is
3123 //------------------------------------------------------------------------
3124 // MoveCFGCallArg: Given a call that will be CFG transformed using the
3125 // validate+call scheme, and an argument GT_PUTARG_* or GT_FIELD_LIST node,
3126 // move that node right before the call.
3129 // call - The call that is being CFG transformed
3130 // node - The argument node
3133 // We can always move the GT_PUTARG_* node further ahead as the side-effects
3134 // of these nodes are handled by LSRA. However, the operands of these nodes
3135 // are not always safe to move further ahead; for invariant operands, we
3136 // move them ahead as well to shorten the lifetime of these values.
3138 void Lowering::MoveCFGCallArg(GenTreeCall* call, GenTree* node)
3140 assert(node->OperIsPutArg() || node->OperIsFieldList());
3142 if (node->OperIsFieldList())
3144 JITDUMP("Node is a GT_FIELD_LIST; moving all operands\n");
3145 for (GenTreeFieldList::Use& operand : node->AsFieldList()->Uses())
3147 assert(operand.GetNode()->OperIsPutArg());
3148 MoveCFGCallArg(call, operand.GetNode());
3153 GenTree* operand = node->AsOp()->gtGetOp1();
3154 JITDUMP("Checking if we can move operand of GT_PUTARG_* node:\n");
3156 if (((operand->gtFlags & GTF_ALL_EFFECT) == 0) && IsCFGCallArgInvariantInRange(operand, call))
3158 JITDUMP("...yes, moving to after validator call\n");
3159 BlockRange().Remove(operand);
3160 BlockRange().InsertBefore(call, operand);
3164 JITDUMP("...no, operand has side effects or is not invariant\n");
3168 JITDUMP("Moving\n");
3171 BlockRange().Remove(node);
3172 BlockRange().InsertBefore(call, node);
3175 #ifndef TARGET_64BIT
3176 //------------------------------------------------------------------------
3177 // Lowering::DecomposeLongCompare: Decomposes a TYP_LONG compare node.
3180 // cmp - the compare node
3183 // The next node to lower.
3186 // This is done during lowering because DecomposeLongs handles only nodes
3187 // that produce TYP_LONG values. Compare nodes may consume TYP_LONG values
3188 // but produce TYP_INT values.
3190 GenTree* Lowering::DecomposeLongCompare(GenTree* cmp)
3192 assert(cmp->gtGetOp1()->TypeGet() == TYP_LONG);
3194 GenTree* src1 = cmp->gtGetOp1();
3195 GenTree* src2 = cmp->gtGetOp2();
3196 assert(src1->OperIs(GT_LONG));
3197 assert(src2->OperIs(GT_LONG));
3198 GenTree* loSrc1 = src1->gtGetOp1();
3199 GenTree* hiSrc1 = src1->gtGetOp2();
3200 GenTree* loSrc2 = src2->gtGetOp1();
3201 GenTree* hiSrc2 = src2->gtGetOp2();
3202 BlockRange().Remove(src1);
3203 BlockRange().Remove(src2);
3205 genTreeOps condition = cmp->OperGet();
3209 if (cmp->OperIs(GT_EQ, GT_NE))
3212 // Transform (x EQ|NE y) into (((x.lo XOR y.lo) OR (x.hi XOR y.hi)) EQ|NE 0). If y is 0 then this can
3213 // be reduced to just ((x.lo OR x.hi) EQ|NE 0). The OR is expected to set the condition flags so we
3214 // don't need to generate a redundant compare against 0, we only generate a SETCC|JCC instruction.
3216 // XOR is used rather than SUB because it is commutative and thus allows swapping the operands when
3217 // the first happens to be a constant. Usually only the second compare operand is a constant but it's
3218 // still possible to have a constant on the left side. For example, when src1 is a uint->ulong cast
3219 // then hiSrc1 would be 0.
3222 if (loSrc1->OperIs(GT_CNS_INT))
3224 std::swap(loSrc1, loSrc2);
3227 if (loSrc2->IsIntegralConst(0))
3229 BlockRange().Remove(loSrc2);
3234 loCmp = comp->gtNewOperNode(GT_XOR, TYP_INT, loSrc1, loSrc2);
3235 BlockRange().InsertBefore(cmp, loCmp);
3236 ContainCheckBinary(loCmp->AsOp());
3239 if (hiSrc1->OperIs(GT_CNS_INT))
3241 std::swap(hiSrc1, hiSrc2);
3244 if (hiSrc2->IsIntegralConst(0))
3246 BlockRange().Remove(hiSrc2);
3251 hiCmp = comp->gtNewOperNode(GT_XOR, TYP_INT, hiSrc1, hiSrc2);
3252 BlockRange().InsertBefore(cmp, hiCmp);
3253 ContainCheckBinary(hiCmp->AsOp());
3256 hiCmp = comp->gtNewOperNode(GT_OR, TYP_INT, loCmp, hiCmp);
3257 BlockRange().InsertBefore(cmp, hiCmp);
3258 ContainCheckBinary(hiCmp->AsOp());
3262 assert(cmp->OperIs(GT_LT, GT_LE, GT_GE, GT_GT));
3265 // If the compare is signed then (x LT|GE y) can be transformed into ((x SUB y) LT|GE 0).
3266 // If the compare is unsigned we can still use SUB but we need to check the Carry flag,
3267 // not the actual result. In both cases we can simply check the appropriate condition flags
3268 // and ignore the actual result:
3269 // SUB_LO loSrc1, loSrc2
3270 // SUB_HI hiSrc1, hiSrc2
3271 // SETCC|JCC (signed|unsigned LT|GE)
3272 // If loSrc2 happens to be 0 then the first SUB can be eliminated and the second one can
3273 // be turned into a CMP because the first SUB would have set carry to 0. This effectively
3274 // transforms a long compare against 0 into an int compare of the high part against 0.
3276 // (x LE|GT y) can to be transformed into ((x SUB y) LE|GT 0) but checking that a long value
3277 // is greater than 0 is not so easy. We need to turn this into a positive/negative check
3278 // like the one we get for LT|GE compares, this can be achieved by swapping the compare:
3279 // (x LE|GT y) becomes (y GE|LT x)
3281 // Having to swap operands is problematic when the second operand is a constant. The constant
3282 // moves to the first operand where it cannot be contained and thus needs a register. This can
3283 // be avoided by changing the constant such that LE|GT becomes LT|GE:
3284 // (x LE|GT 41) becomes (x LT|GE 42)
3287 if (cmp->OperIs(GT_LE, GT_GT))
3289 bool mustSwap = true;
3291 if (loSrc2->OperIs(GT_CNS_INT) && hiSrc2->OperIs(GT_CNS_INT))
3293 uint32_t loValue = static_cast<uint32_t>(loSrc2->AsIntCon()->IconValue());
3294 uint32_t hiValue = static_cast<uint32_t>(hiSrc2->AsIntCon()->IconValue());
3295 uint64_t value = static_cast<uint64_t>(loValue) | (static_cast<uint64_t>(hiValue) << 32);
3296 uint64_t maxValue = cmp->IsUnsigned() ? UINT64_MAX : INT64_MAX;
3298 if (value != maxValue)
3301 loValue = value & UINT32_MAX;
3302 hiValue = (value >> 32) & UINT32_MAX;
3303 loSrc2->AsIntCon()->SetIconValue(loValue);
3304 hiSrc2->AsIntCon()->SetIconValue(hiValue);
3306 condition = cmp->OperIs(GT_LE) ? GT_LT : GT_GE;
3313 std::swap(loSrc1, loSrc2);
3314 std::swap(hiSrc1, hiSrc2);
3315 condition = GenTree::SwapRelop(condition);
3319 assert((condition == GT_LT) || (condition == GT_GE));
3321 if (loSrc2->IsIntegralConst(0))
3323 BlockRange().Remove(loSrc2);
3325 // Very conservative dead code removal... but it helps.
3327 if (loSrc1->OperIs(GT_CNS_INT, GT_LCL_VAR, GT_LCL_FLD))
3329 BlockRange().Remove(loSrc1);
3333 loSrc1->SetUnusedValue();
3336 hiCmp = comp->gtNewOperNode(GT_CMP, TYP_VOID, hiSrc1, hiSrc2);
3337 BlockRange().InsertBefore(cmp, hiCmp);
3338 ContainCheckCompare(hiCmp->AsOp());
3342 loCmp = comp->gtNewOperNode(GT_CMP, TYP_VOID, loSrc1, loSrc2);
3343 loCmp->gtFlags |= GTF_SET_FLAGS;
3344 hiCmp = comp->gtNewOperNode(GT_SUB_HI, TYP_INT, hiSrc1, hiSrc2);
3345 BlockRange().InsertBefore(cmp, loCmp, hiCmp);
3346 ContainCheckCompare(loCmp->AsOp());
3347 ContainCheckBinary(hiCmp->AsOp());
3350 // Try to move the first SUB_HI operands right in front of it, this allows using
3351 // a single temporary register instead of 2 (one for CMP and one for SUB_HI). Do
3352 // this only for locals as they won't change condition flags. Note that we could
3353 // move constants (except 0 which generates XOR reg, reg) but it's extremely rare
3354 // to have a constant as the first operand.
3357 if (hiSrc1->OperIs(GT_LCL_VAR, GT_LCL_FLD) && IsInvariantInRange(hiSrc1, hiCmp))
3359 BlockRange().Remove(hiSrc1);
3360 BlockRange().InsertBefore(hiCmp, hiSrc1);
3365 hiCmp->gtFlags |= GTF_SET_FLAGS;
3366 if (hiCmp->IsValue())
3368 hiCmp->SetUnusedValue();
3372 if (BlockRange().TryGetUse(cmp, &cmpUse) && cmpUse.User()->OperIs(GT_JTRUE))
3374 BlockRange().Remove(cmp);
3376 GenTree* jcc = cmpUse.User();
3377 jcc->AsOp()->gtOp1 = nullptr;
3378 jcc->ChangeOper(GT_JCC);
3379 jcc->AsCC()->gtCondition = GenCondition::FromIntegralRelop(condition, cmp->IsUnsigned());
3383 cmp->AsOp()->gtOp1 = nullptr;
3384 cmp->AsOp()->gtOp2 = nullptr;
3385 cmp->ChangeOper(GT_SETCC);
3386 cmp->AsCC()->gtCondition = GenCondition::FromIntegralRelop(condition, cmp->IsUnsigned());
3391 #endif // !TARGET_64BIT
3393 //------------------------------------------------------------------------
3394 // Lowering::OptimizeConstCompare: Performs various "compare with const" optimizations.
3397 // cmp - the compare node
3400 // The original compare node if lowering should proceed as usual or the next node
3401 // to lower if the compare node was changed in such a way that lowering is no
3405 // - Narrow operands to enable memory operand containment (XARCH specific).
3406 // - Transform cmp(and(x, y), 0) into test(x, y) (XARCH/Arm64 specific but could
3407 // be used for ARM as well if support for GT_TEST_EQ/GT_TEST_NE is added).
3408 // - Transform TEST(x, LSH(1, y)) into BT(x, y) (XARCH specific)
3409 // - Transform RELOP(OP, 0) into SETCC(OP) or JCC(OP) if OP can set the
3410 // condition flags appropriately (XARCH/ARM64 specific but could be extended
3411 // to ARM32 as well if ARM32 codegen supports GTF_SET_FLAGS).
3413 GenTree* Lowering::OptimizeConstCompare(GenTree* cmp)
3415 assert(cmp->gtGetOp2()->IsIntegralConst());
3417 GenTree* op1 = cmp->gtGetOp1();
3418 GenTreeIntCon* op2 = cmp->gtGetOp2()->AsIntCon();
3420 #if defined(TARGET_XARCH) || defined(TARGET_ARM64)
3421 ssize_t op2Value = op2->IconValue();
3424 var_types op1Type = op1->TypeGet();
3425 if (IsContainableMemoryOp(op1) && varTypeIsSmall(op1Type) && FitsIn(op1Type, op2Value))
3428 // If op1's type is small then try to narrow op2 so it has the same type as op1.
3429 // Small types are usually used by memory loads and if both compare operands have
3430 // the same type then the memory load can be contained. In certain situations
3431 // (e.g "cmp ubyte, 200") we also get a smaller instruction encoding.
3434 op2->gtType = op1Type;
3438 if (op1->OperIs(GT_CAST) && !op1->gtOverflow())
3440 GenTreeCast* cast = op1->AsCast();
3441 var_types castToType = cast->CastToType();
3442 GenTree* castOp = cast->gtGetOp1();
3444 if (((castToType == TYP_BOOL) || (castToType == TYP_UBYTE)) && FitsIn<UINT8>(op2Value))
3447 // Since we're going to remove the cast we need to be able to narrow the cast operand
3448 // to the cast type. This can be done safely only for certain opers (e.g AND, OR, XOR).
3449 // Some opers just can't be narrowed (e.g DIV, MUL) while other could be narrowed but
3450 // doing so would produce incorrect results (e.g. RSZ, RSH).
3452 // The below list of handled opers is conservative but enough to handle the most common
3457 (op2Value == 0) && cmp->OperIs(GT_EQ, GT_NE, GT_GT) && !castOp->isContained() &&
3459 (castOp->OperIs(GT_LCL_VAR, GT_CALL, GT_OR, GT_XOR, GT_AND)
3461 || IsContainableMemoryOp(castOp)
3467 assert(!castOp->gtOverflowEx()); // Must not be an overflow checking operation
3470 bool cmpEq = cmp->OperIs(GT_EQ);
3472 cmp->SetOperRaw(cmpEq ? GT_TEST_EQ : GT_TEST_NE);
3473 op2->SetIconValue(0xff);
3474 op2->gtType = castOp->gtType;
3476 castOp->gtType = castToType;
3477 op2->gtType = castToType;
3479 // If we have any contained memory ops on castOp, they must now not be contained.
3480 castOp->ClearContained();
3482 if (castOp->OperIs(GT_OR, GT_XOR, GT_AND))
3484 castOp->gtGetOp1()->ClearContained();
3485 castOp->gtGetOp2()->ClearContained();
3486 ContainCheckBinary(castOp->AsOp());
3489 cmp->AsOp()->gtOp1 = castOp;
3491 BlockRange().Remove(cast);
3495 else if (op1->OperIs(GT_AND) && cmp->OperIs(GT_EQ, GT_NE))
3498 // Transform ((x AND y) EQ|NE 0) into (x TEST_EQ|TEST_NE y) when possible.
3501 GenTree* andOp1 = op1->gtGetOp1();
3502 GenTree* andOp2 = op1->gtGetOp2();
3505 // If we don't have a 0 compare we can get one by transforming ((x AND mask) EQ|NE mask)
3506 // into ((x AND mask) NE|EQ 0) when mask is a single bit.
3508 if ((op2Value != 0) && isPow2(static_cast<target_size_t>(op2Value)) && andOp2->IsIntegralConst(op2Value))
3511 op2->SetIconValue(0);
3512 cmp->SetOperRaw(GenTree::ReverseRelop(cmp->OperGet()));
3515 // Optimizes (X & 1) != 0 to (X & 1)
3516 // Optimizes (X & 1) == 0 to ((NOT X) & 1)
3517 // (== 1 or != 1) cases are transformed to (!= 0 or == 0) above
3518 // The compiler requires jumps to have relop operands, so we do not fold that case.
3520 const bool optimizeToAnd = (op2Value == 0) && cmp->OperIs(GT_NE);
3521 const bool optimizeToNotAnd = (op2Value == 0) && cmp->OperIs(GT_EQ);
3523 if ((andOp2->IsIntegralConst(1)) && (genActualType(op1) == cmp->TypeGet()) &&
3524 (optimizeToAnd || optimizeToNotAnd))
3527 if (BlockRange().TryGetUse(cmp, &cmpUse) && !cmpUse.User()->OperIs(GT_JTRUE) &&
3528 !cmpUse.User()->OperIsConditional())
3530 GenTree* next = cmp->gtNext;
3532 if (optimizeToNotAnd)
3534 GenTree* notNode = comp->gtNewOperNode(GT_NOT, andOp1->TypeGet(), andOp1);
3535 op1->AsOp()->gtOp1 = notNode;
3536 BlockRange().InsertAfter(andOp1, notNode);
3539 cmpUse.ReplaceWith(op1);
3541 BlockRange().Remove(cmp->gtGetOp2());
3542 BlockRange().Remove(cmp);
3550 BlockRange().Remove(op1);
3551 BlockRange().Remove(op2);
3553 cmp->SetOperRaw(cmp->OperIs(GT_EQ) ? GT_TEST_EQ : GT_TEST_NE);
3554 cmp->AsOp()->gtOp1 = andOp1;
3555 cmp->AsOp()->gtOp2 = andOp2;
3556 // We will re-evaluate containment below
3557 andOp1->ClearContained();
3558 andOp2->ClearContained();
3561 if (IsContainableMemoryOp(andOp1) && andOp2->IsIntegralConst())
3564 // For "test" we only care about the bits that are set in the second operand (mask).
3565 // If the mask fits in a small type then we can narrow both operands to generate a "test"
3566 // instruction with a smaller encoding ("test" does not have a r/m32, imm8 form) and avoid
3567 // a widening load in some cases.
3569 // For 16 bit operands we narrow only if the memory operand is already 16 bit. This matches
3570 // the behavior of a previous implementation and avoids adding more cases where we generate
3571 // 16 bit instructions that require a length changing prefix (0x66). These suffer from
3572 // significant decoder stalls on Intel CPUs.
3574 // We could also do this for 64 bit masks that fit into 32 bit but it doesn't help.
3575 // In such cases morph narrows down the existing GT_AND by inserting a cast between it and
3576 // the memory operand so we'd need to add more code to recognize and eliminate that cast.
3579 size_t mask = static_cast<size_t>(andOp2->AsIntCon()->IconValue());
3581 if (FitsIn<UINT8>(mask))
3583 andOp1->gtType = TYP_UBYTE;
3584 andOp2->gtType = TYP_UBYTE;
3586 else if (FitsIn<UINT16>(mask) && genTypeSize(andOp1) == 2)
3588 andOp1->gtType = TYP_USHORT;
3589 andOp2->gtType = TYP_USHORT;
3597 if (cmp->OperIs(GT_TEST_EQ, GT_TEST_NE))
3600 // Transform TEST_EQ|NE(x, LSH(1, y)) into BT(x, y) when possible. Using BT
3601 // results in smaller and faster code. It also doesn't have special register
3602 // requirements, unlike LSH that requires the shift count to be in ECX.
3603 // Note that BT has the same behavior as LSH when the bit index exceeds the
3604 // operand bit size - it uses (bit_index MOD bit_size).
3607 GenTree* lsh = cmp->gtGetOp2();
3609 if (lsh->OperIs(GT_LSH) && varTypeIsIntOrI(lsh->TypeGet()) && lsh->gtGetOp1()->IsIntegralConst(1))
3611 cmp->SetOper(cmp->OperIs(GT_TEST_EQ) ? GT_BITTEST_EQ : GT_BITTEST_NE);
3612 cmp->AsOp()->gtOp2 = lsh->gtGetOp2();
3613 cmp->gtGetOp2()->ClearContained();
3615 BlockRange().Remove(lsh->gtGetOp1());
3616 BlockRange().Remove(lsh);
3621 #endif // TARGET_XARCH
3622 #endif // defined(TARGET_XARCH) || defined(TARGET_ARM64)
3624 // Optimize EQ/NE(relop/SETCC, 0) into (maybe reversed) cond.
3625 if (cmp->OperIs(GT_EQ, GT_NE) && op2->IsIntegralConst(0) && (op1->OperIsCompare() || op1->OperIs(GT_SETCC)))
3628 if (BlockRange().TryGetUse(cmp, &use))
3630 if (cmp->OperIs(GT_EQ))
3632 GenTree* reversed = comp->gtReverseCond(op1);
3633 assert(reversed == op1);
3636 // Relops and SETCC can be either TYP_INT or TYP_LONG typed, so we
3637 // may need to retype it.
3638 op1->gtType = cmp->TypeGet();
3640 GenTree* next = cmp->gtNext;
3641 use.ReplaceWith(op1);
3642 BlockRange().Remove(cmp->gtGetOp2());
3643 BlockRange().Remove(cmp);
3651 //------------------------------------------------------------------------
3652 // Lowering::LowerCompare: Lowers a compare node.
3655 // cmp - the compare node
3658 // The next node to lower.
3660 GenTree* Lowering::LowerCompare(GenTree* cmp)
3662 #ifndef TARGET_64BIT
3663 if (cmp->gtGetOp1()->TypeGet() == TYP_LONG)
3665 return DecomposeLongCompare(cmp);
3669 if (cmp->gtGetOp2()->IsIntegralConst() && !comp->opts.MinOpts())
3671 GenTree* next = OptimizeConstCompare(cmp);
3673 // If OptimizeConstCompare return the compare node as "next" then we need to continue lowering.
3681 if (cmp->gtGetOp1()->TypeGet() == cmp->gtGetOp2()->TypeGet())
3683 if (varTypeIsSmall(cmp->gtGetOp1()->TypeGet()) && varTypeIsUnsigned(cmp->gtGetOp1()->TypeGet()))
3686 // If both operands have the same type then codegen will use the common operand type to
3687 // determine the instruction type. For small types this would result in performing a
3688 // signed comparison of two small unsigned values without zero extending them to TYP_INT
3689 // which is incorrect. Note that making the comparison unsigned doesn't imply that codegen
3690 // has to generate a small comparison, it can still correctly generate a TYP_INT comparison.
3693 cmp->gtFlags |= GTF_UNSIGNED;
3696 #endif // TARGET_XARCH
3697 ContainCheckCompare(cmp->AsOp());
3701 #if !defined(TARGET_LOONGARCH64) && !defined(TARGET_RISCV64)
3702 //------------------------------------------------------------------------
3703 // Lowering::LowerJTrue: Lowers a JTRUE node.
3706 // jtrue - the JTRUE node
3709 // The next node to lower (usually nullptr).
3712 // On ARM64 this may remove the JTRUE node and transform its associated
3713 // relop into a JCMP node.
3715 GenTree* Lowering::LowerJTrue(GenTreeOp* jtrue)
3717 GenTree* cond = jtrue->gtGetOp1();
3719 JITDUMP("Lowering JTRUE:\n");
3720 DISPTREERANGE(BlockRange(), jtrue);
3723 #if defined(TARGET_ARM64)
3724 if (cond->OperIsCompare() && cond->gtGetOp2()->IsCnsIntOrI())
3726 GenTree* relopOp1 = cond->gtGetOp1();
3727 GenTree* relopOp2 = cond->gtGetOp2();
3728 genTreeOps newOper = GT_COUNT;
3731 if (cond->OperIs(GT_EQ, GT_NE) && relopOp2->IsIntegralConst(0))
3733 // Codegen will use cbz or cbnz in codegen which do not affect the flag register
3735 cc = GenCondition::FromRelop(cond);
3737 else if (cond->OperIs(GT_LT, GT_GE) && !cond->IsUnsigned() && relopOp2->IsIntegralConst(0))
3739 // Codegen will use tbnz or tbz in codegen which do not affect the flag register
3741 cc = cond->OperIs(GT_LT) ? GenCondition(GenCondition::NE) : GenCondition(GenCondition::EQ);
3742 // x < 0 => (x & signBit) != 0. Update the constant to be the sign bit.
3743 relopOp2->AsIntConCommon()->SetIntegralValue(
3744 (static_cast<INT64>(1) << (8 * genTypeSize(genActualType(relopOp1)) - 1)));
3746 else if (cond->OperIs(GT_TEST_EQ, GT_TEST_NE) && isPow2(relopOp2->AsIntCon()->IconValue()))
3748 // Codegen will use tbz or tbnz in codegen which do not affect the flag register
3750 cc = GenCondition::FromRelop(cond);
3753 if (newOper != GT_COUNT)
3755 jtrue->ChangeOper(newOper);
3756 jtrue->gtOp1 = relopOp1;
3757 jtrue->gtOp2 = relopOp2;
3758 jtrue->AsOpCC()->gtCondition = cc;
3760 relopOp2->SetContained();
3762 BlockRange().Remove(cond);
3763 JITDUMP("Lowered to %s\n", GenTree::OpName(newOper));
3767 #endif // TARGET_ARM64
3769 GenCondition condCode;
3770 if (TryLowerConditionToFlagsNode(jtrue, cond, &condCode))
3772 jtrue->SetOper(GT_JCC);
3773 jtrue->AsCC()->gtCondition = condCode;
3776 JITDUMP("Lowering JTRUE Result:\n");
3777 DISPTREERANGE(BlockRange(), jtrue);
3782 #endif // !TARGET_LOONGARCH64 && !TARGET_RISCV64
3784 //----------------------------------------------------------------------------------------------
3785 // LowerSelect: Lower a GT_SELECT node.
3788 // select - The node
3791 // The next node to lower.
3793 GenTree* Lowering::LowerSelect(GenTreeConditional* select)
3795 GenTree* cond = select->gtCond;
3796 GenTree* trueVal = select->gtOp1;
3797 GenTree* falseVal = select->gtOp2;
3799 // Replace SELECT cond 1/0 0/1 with (perhaps reversed) cond
3800 if (cond->OperIsCompare() && ((trueVal->IsIntegralConst(0) && falseVal->IsIntegralConst(1)) ||
3801 (trueVal->IsIntegralConst(1) && falseVal->IsIntegralConst(0))))
3803 assert(select->TypeIs(TYP_INT, TYP_LONG));
3806 if (BlockRange().TryGetUse(select, &use))
3808 if (trueVal->IsIntegralConst(0))
3810 GenTree* reversed = comp->gtReverseCond(cond);
3811 assert(reversed == cond);
3814 // Codegen supports also TYP_LONG typed compares so we can just
3815 // retype the compare instead of inserting a cast.
3816 cond->gtType = select->TypeGet();
3818 BlockRange().Remove(trueVal);
3819 BlockRange().Remove(falseVal);
3820 BlockRange().Remove(select);
3821 use.ReplaceWith(cond);
3823 return cond->gtNext;
3827 JITDUMP("Lowering select:\n");
3828 DISPTREERANGE(BlockRange(), select);
3831 // Do not transform GT_SELECT with GTF_SET_FLAGS into GT_SELECTCC; this
3832 // node is used by decomposition on x86.
3833 // TODO-CQ: If we allowed multiple nodes to consume the same CPU flags then
3834 // we could do this on x86. We currently disable if-conversion for TYP_LONG
3835 // on 32-bit architectures because of this.
3836 GenCondition selectCond;
3837 GenTreeOpCC* newSelect = nullptr;
3838 if (((select->gtFlags & GTF_SET_FLAGS) == 0) && TryLowerConditionToFlagsNode(select, cond, &selectCond))
3840 select->SetOper(GT_SELECTCC);
3841 newSelect = select->AsOpCC();
3842 newSelect->gtCondition = selectCond;
3843 ContainCheckSelect(newSelect);
3844 JITDUMP("Converted to SELECTCC:\n");
3845 DISPTREERANGE(BlockRange(), newSelect);
3850 ContainCheckSelect(select);
3854 if (trueVal->OperIs(GT_NOT, GT_NEG) || falseVal->OperIs(GT_NOT, GT_NEG))
3856 TryLowerCselToCinvOrCneg(select, cond);
3858 else if (trueVal->IsCnsIntOrI() && falseVal->IsCnsIntOrI())
3860 TryLowerCselToCinc(select, cond);
3864 return newSelect != nullptr ? newSelect->gtNext : select->gtNext;
3867 //----------------------------------------------------------------------------------------------
3868 // TryLowerConditionToFlagsNode: Given a node 'parent' that is able to consume
3869 // conditions from CPU flags, try to transform 'condition' into a node that
3870 // produces CPU flags, and reorder it to happen right before 'parent'.
3873 // parent - The parent node that can consume from CPU flags.
3874 // condition - The condition that to try to transform into something that produces CPU flags.
3875 // code - [out] The condition code that makes the condition true.
3878 // True if relop was transformed and is now right before 'parent'; otherwise false.
3880 bool Lowering::TryLowerConditionToFlagsNode(GenTree* parent, GenTree* condition, GenCondition* cond)
3882 JITDUMP("Lowering condition:\n");
3883 DISPTREERANGE(BlockRange(), condition);
3886 if (condition->OperIsCompare())
3888 if (!IsInvariantInRange(condition, parent))
3893 GenTreeOp* relop = condition->AsOp();
3895 *cond = GenCondition::FromRelop(relop);
3896 bool optimizing = comp->opts.OptimizationEnabled();
3898 GenTree* relopOp1 = relop->gtGetOp1();
3899 GenTree* relopOp2 = relop->gtGetOp2();
3902 // Optimize FP x != x to only check parity flag. This is a common way of
3903 // checking NaN and avoids two branches that we would otherwise emit.
3904 if (optimizing && (cond->GetCode() == GenCondition::FNEU) && relopOp1->OperIsLocal() &&
3905 GenTree::Compare(relopOp1, relopOp2) && IsInvariantInRange(relopOp1, relop) &&
3906 IsInvariantInRange(relopOp2, relop))
3908 *cond = GenCondition(GenCondition::P);
3912 // Optimize EQ/NE(op_that_sets_zf, 0) into op_that_sets_zf with GTF_SET_FLAGS.
3913 if (optimizing && relop->OperIs(GT_EQ, GT_NE) && relopOp2->IsIntegralConst(0) &&
3914 relopOp1->SupportsSettingZeroFlag() && IsInvariantInRange(relopOp1, parent))
3916 relopOp1->gtFlags |= GTF_SET_FLAGS;
3917 relopOp1->SetUnusedValue();
3919 BlockRange().Remove(relopOp1);
3920 BlockRange().InsertBefore(parent, relopOp1);
3921 BlockRange().Remove(relop);
3922 BlockRange().Remove(relopOp2);
3926 relop->gtType = TYP_VOID;
3927 relop->gtFlags |= GTF_SET_FLAGS;
3929 if (relop->OperIs(GT_EQ, GT_NE, GT_LT, GT_LE, GT_GE, GT_GT))
3931 relop->SetOper(GT_CMP);
3933 if (cond->PreferSwap())
3935 std::swap(relop->gtOp1, relop->gtOp2);
3936 *cond = GenCondition::Swap(*cond);
3940 else if (relop->OperIs(GT_BITTEST_EQ, GT_BITTEST_NE))
3942 relop->SetOper(GT_BT);
3947 assert(relop->OperIs(GT_TEST_EQ, GT_TEST_NE));
3948 relop->SetOper(GT_TEST);
3951 if (relop->gtNext != parent)
3953 BlockRange().Remove(relop);
3954 BlockRange().InsertBefore(parent, relop);
3961 if (condition->OperIs(GT_SETCC))
3963 assert((condition->gtPrev->gtFlags & GTF_SET_FLAGS) != 0);
3964 GenTree* flagsDef = condition->gtPrev;
3966 // CCMP is a flag producing node that also consumes flags, so find the
3967 // "root" of the flags producers and move the entire range.
3968 // We limit this to 10 nodes look back to avoid quadratic behavior.
3969 for (int i = 0; i < 10 && flagsDef->OperIs(GT_CCMP); i++)
3971 assert((flagsDef->gtPrev != nullptr) && ((flagsDef->gtPrev->gtFlags & GTF_SET_FLAGS) != 0));
3972 flagsDef = flagsDef->gtPrev;
3975 if (!IsRangeInvariantInRange(flagsDef, condition->gtPrev, parent, condition))
3980 *cond = condition->AsCC()->gtCondition;
3982 LIR::Range range = BlockRange().Remove(flagsDef, condition->gtPrev);
3983 BlockRange().InsertBefore(parent, std::move(range));
3984 BlockRange().Remove(condition);
3991 //----------------------------------------------------------------------------------------------
3992 // LowerNodeCC: Lowers a node that produces a boolean value by setting the condition flags.
3995 // node - The node to lower
3996 // condition - The condition code of the generated SETCC/JCC node
3999 // A SETCC/JCC node or nullptr if `node` is not used.
4002 // This simply replaces `node`'s use with an appropriate SETCC/JCC node,
4003 // `node` is not actually changed, except by having its GTF_SET_FLAGS set.
4004 // It's the caller's responsibility to change `node` such that it only
4005 // sets the condition flags, without producing a boolean value.
4007 GenTreeCC* Lowering::LowerNodeCC(GenTree* node, GenCondition condition)
4009 // Skip over a chain of EQ/NE(x, 0) relops. This may be present either
4010 // because `node` is not a relop and so it cannot be used directly by a
4011 // JTRUE, or because the frontend failed to remove a EQ/NE(x, 0) that's
4012 // used as logical negation.
4014 // Usually there's only one such relop but there's little difference
4015 // between removing one or all so we may as well remove them all.
4017 // We can't allow any other nodes between `node` and its user because we
4018 // have no way of knowing if those nodes change flags or not. So we're looking
4019 // to skip over a sequence of appropriately connected zero and EQ/NE nodes.
4021 // The x in EQ/NE(x, 0)
4022 GenTree* relop = node;
4023 // The first node of the relop sequence
4024 GenTree* first = node->gtNext;
4025 // The node following the relop sequence
4026 GenTree* next = first;
4028 while ((next != nullptr) && next->IsIntegralConst(0) && (next->gtNext != nullptr) &&
4029 next->gtNext->OperIs(GT_EQ, GT_NE) && (next->gtNext->AsOp()->gtGetOp1() == relop) &&
4030 (next->gtNext->AsOp()->gtGetOp2() == next))
4032 relop = next->gtNext;
4033 next = relop->gtNext;
4035 if (relop->OperIs(GT_EQ))
4037 condition = GenCondition::Reverse(condition);
4041 GenTreeCC* cc = nullptr;
4043 // Next may be null if `node` is not used. In that case we don't need to generate a SETCC node.
4044 if (next != nullptr)
4046 if (next->OperIs(GT_JTRUE))
4048 // If the instruction immediately following 'relop', i.e. 'next' is a conditional branch,
4049 // it should always have 'relop' as its 'op1'. If it doesn't, then we have improperly
4050 // constructed IL (the setting of a condition code should always immediately precede its
4051 // use, since the JIT doesn't track dataflow for condition codes). Still, if it happens
4052 // it's not our problem, it simply means that `node` is not used and can be removed.
4053 if (next->AsUnOp()->gtGetOp1() == relop)
4055 assert(relop->OperIsCompare());
4057 next->ChangeOper(GT_JCC);
4059 cc->gtCondition = condition;
4064 // If the node is used by something other than a JTRUE then we need to insert a
4065 // SETCC node to materialize the boolean value.
4068 if (BlockRange().TryGetUse(relop, &use))
4070 cc = comp->gtNewCC(GT_SETCC, TYP_INT, condition);
4071 BlockRange().InsertAfter(node, cc);
4072 use.ReplaceWith(cc);
4079 node->gtFlags |= GTF_SET_FLAGS;
4082 // Remove the chain of EQ/NE(x, 0) relop nodes, if any. Note that if a SETCC was
4083 // inserted after `node`, `first` still points to the node that was initially
4087 BlockRange().Remove(first, relop);
4093 // Lower "jmp <method>" tail call to insert PInvoke method epilog if required.
4094 void Lowering::LowerJmpMethod(GenTree* jmp)
4096 assert(jmp->OperGet() == GT_JMP);
4098 JITDUMP("lowering GT_JMP\n");
4100 JITDUMP("============");
4102 // If PInvokes are in-lined, we have to remember to execute PInvoke method epilog anywhere that
4103 // a method returns.
4104 if (comp->compMethodRequiresPInvokeFrame())
4106 InsertPInvokeMethodEpilog(comp->compCurBB DEBUGARG(jmp));
4110 // Lower GT_RETURN node to insert PInvoke method epilog if required.
4111 void Lowering::LowerRet(GenTreeUnOp* ret)
4113 assert(ret->OperGet() == GT_RETURN);
4115 JITDUMP("lowering GT_RETURN\n");
4117 JITDUMP("============");
4119 GenTree* retVal = ret->gtGetOp1();
4120 // There are two kinds of retyping:
4121 // - A simple bitcast can be inserted when:
4122 // - We're returning a floating type as an integral type or vice-versa, or
4123 // - If we're returning a struct as a primitive type, we change the type of
4124 // 'retval' in 'LowerRetStructLclVar()'
4125 bool needBitcast = (ret->TypeGet() != TYP_VOID) && !varTypeUsesSameRegType(ret, ret->gtGetOp1());
4126 bool doPrimitiveBitcast = false;
4129 doPrimitiveBitcast = (!varTypeIsStruct(ret) && !varTypeIsStruct(retVal));
4132 if (doPrimitiveBitcast)
4134 // Add a simple bitcast when both types are not structs.
4135 // If one type is a struct it will be handled below.
4137 assert(!varTypeIsStruct(ret) && !varTypeIsStruct(retVal));
4140 GenTree* bitcast = comp->gtNewBitCastNode(ret->TypeGet(), retVal);
4141 ret->gtOp1 = bitcast;
4142 BlockRange().InsertBefore(ret, bitcast);
4143 ContainCheckBitCast(bitcast);
4145 else if (ret->TypeGet() != TYP_VOID)
4147 #if FEATURE_MULTIREG_RET
4148 if (comp->compMethodReturnsMultiRegRetType() && retVal->OperIs(GT_LCL_VAR))
4150 CheckMultiRegLclVar(retVal->AsLclVar(), comp->compRetTypeDesc.GetReturnRegCount());
4152 #endif // FEATURE_MULTIREG_RET
4154 if (varTypeIsStruct(ret->TypeGet()) != varTypeIsStruct(retVal->TypeGet()))
4156 if (varTypeIsStruct(ret->TypeGet()))
4158 assert(comp->info.compRetNativeType != TYP_STRUCT);
4160 var_types retActualType = genActualType(comp->info.compRetNativeType);
4161 var_types retValActualType = genActualType(retVal->TypeGet());
4163 bool constStructInit = retVal->IsConstInitVal();
4164 bool implicitCastFromSameOrBiggerSize = (genTypeSize(retActualType) <= genTypeSize(retValActualType));
4166 // This could happen if we have retyped op1 as a primitive type during struct promotion.
4167 bool actualTypesMatch = (retActualType == retValActualType);
4169 assert(actualTypesMatch || constStructInit || implicitCastFromSameOrBiggerSize);
4174 if (varTypeIsStruct(ret))
4176 LowerRetStruct(ret);
4178 else if (!ret->TypeIs(TYP_VOID) && varTypeIsStruct(retVal))
4180 // Return struct as a primitive using Unsafe cast.
4181 assert(retVal->OperIs(GT_LCL_VAR));
4182 LowerRetSingleRegStructLclVar(ret);
4186 // Method doing PInvokes has exactly one return block unless it has tail calls.
4187 if (comp->compMethodRequiresPInvokeFrame() && (comp->compCurBB == comp->genReturnBB))
4189 InsertPInvokeMethodEpilog(comp->compCurBB DEBUGARG(ret));
4191 ContainCheckRet(ret);
4194 //----------------------------------------------------------------------------------------------
4195 // LowerStoreLocCommon: platform idependent part of local var or field store lowering.
4198 // lclStore - The store lcl node to lower.
4200 void Lowering::LowerStoreLocCommon(GenTreeLclVarCommon* lclStore)
4202 assert(lclStore->OperIs(GT_STORE_LCL_FLD, GT_STORE_LCL_VAR));
4203 JITDUMP("lowering store lcl var/field (before):\n");
4204 DISPTREERANGE(BlockRange(), lclStore);
4207 TryRetypingFloatingPointStoreToIntegerStore(lclStore);
4209 GenTree* src = lclStore->gtGetOp1();
4210 LclVarDsc* varDsc = comp->lvaGetDesc(lclStore);
4211 const bool srcIsMultiReg = src->IsMultiRegNode();
4213 if (!srcIsMultiReg && varTypeIsStruct(varDsc))
4215 // TODO-Cleanup: we want to check `varDsc->lvRegStruct` as the last condition instead of `!varDsc->lvPromoted`,
4216 // but we do not set it for `CSE` vars so it is currently failing.
4217 assert(varDsc->CanBeReplacedWithItsField(comp) || varDsc->lvDoNotEnregister || !varDsc->lvPromoted);
4218 if (varDsc->CanBeReplacedWithItsField(comp))
4220 assert(varDsc->lvFieldCnt == 1);
4221 unsigned fldNum = varDsc->lvFieldLclStart;
4222 LclVarDsc* fldDsc = comp->lvaGetDesc(fldNum);
4224 JITDUMP("Replacing an independently promoted local var V%02u with its only field V%02u for the store "
4225 "from a call [%06u]\n",
4226 lclStore->GetLclNum(), fldNum, comp->dspTreeID(lclStore));
4227 lclStore->SetLclNum(fldNum);
4228 lclStore->ChangeType(fldDsc->TypeGet());
4235 CheckMultiRegLclVar(lclStore->AsLclVar(), src->GetMultiRegCount(comp));
4238 const var_types lclRegType = varDsc->GetRegisterType(lclStore);
4240 if ((lclStore->TypeGet() == TYP_STRUCT) && !srcIsMultiReg)
4242 bool convertToStoreObj;
4243 if (lclStore->OperIs(GT_STORE_LCL_FLD))
4245 convertToStoreObj = true;
4247 else if (src->OperGet() == GT_CALL)
4249 GenTreeCall* call = src->AsCall();
4252 const ClassLayout* layout = lclStore->GetLayout(comp);
4253 const unsigned slotCount = layout->GetSlotCount();
4254 #if defined(TARGET_XARCH) && !defined(UNIX_AMD64_ABI)
4255 // Windows x64 doesn't have multireg returns,
4256 // x86 uses it only for long return type, not for structs.
4257 assert(slotCount == 1);
4258 assert(lclRegType != TYP_UNDEF);
4259 #else // !TARGET_XARCH || UNIX_AMD64_ABI
4260 if (!comp->IsHfa(layout->GetClassHandle()))
4264 assert(call->HasMultiRegRetVal());
4268 unsigned size = layout->GetSize();
4269 assert((size <= 8) || (size == 16));
4270 bool isPowerOf2 = (((size - 1) & size) == 0);
4271 bool isTypeDefined = (lclRegType != TYP_UNDEF);
4272 assert(isPowerOf2 == isTypeDefined);
4275 #endif // !TARGET_XARCH || UNIX_AMD64_ABI
4278 #if !defined(WINDOWS_AMD64_ABI)
4279 if (!call->HasMultiRegRetVal() && (lclRegType == TYP_UNDEF))
4281 // If we have a single return register,
4282 // but we can't retype it as a primitive type, we must spill it.
4283 GenTreeLclVar* spilledCall = SpillStructCallResult(call);
4284 lclStore->gtOp1 = spilledCall;
4285 src = lclStore->gtOp1;
4286 JITDUMP("lowering store lcl var/field has to spill call src.\n");
4287 LowerStoreLocCommon(lclStore);
4290 #endif // !WINDOWS_AMD64_ABI
4291 convertToStoreObj = false;
4293 else if (!varDsc->IsEnregisterableType())
4295 convertToStoreObj = true;
4297 else if (src->OperIs(GT_CNS_INT))
4299 assert(src->IsIntegralConst(0) && "expected an INIT_VAL for non-zero init.");
4302 if (varTypeIsSIMD(lclRegType))
4304 GenTree* zeroCon = comp->gtNewZeroConNode(lclRegType);
4306 BlockRange().InsertAfter(src, zeroCon);
4307 BlockRange().Remove(src);
4310 lclStore->gtOp1 = src;
4312 #endif // FEATURE_SIMD
4314 convertToStoreObj = false;
4316 else if (src->OperIs(GT_LCL_VAR))
4318 convertToStoreObj = false;
4320 else if (src->OperIs(GT_IND, GT_BLK, GT_LCL_FLD))
4322 #if !defined(TARGET_ARM64)
4324 if (src->TypeIs(TYP_STRUCT))
4326 src->ChangeType(lclRegType);
4327 if (src->OperIs(GT_IND, GT_BLK))
4329 if (src->OperIs(GT_BLK))
4331 src->SetOper(GT_IND);
4333 // This logic is skipped for struct indir in
4334 // `Lowering::LowerIndir` because we don't know the size.
4336 LowerIndir(src->AsIndir());
4338 #if defined(TARGET_XARCH)
4339 if (varTypeIsSmall(lclRegType))
4341 src->SetDontExtend();
4343 #endif // TARGET_XARCH
4345 convertToStoreObj = false;
4346 #else // TARGET_ARM64
4347 // This optimization on arm64 allows more SIMD16 vars to be enregistered but it could cause
4348 // regressions when there are many calls and before/after each one we have to store/save the upper
4349 // half of these registers. So enable this for arm64 only when LSRA is taught not to allocate registers when
4350 // it would have to spilled too many times.
4351 convertToStoreObj = true;
4352 #endif // TARGET_ARM64
4356 assert(src->OperIsInitVal());
4357 convertToStoreObj = true;
4360 if (convertToStoreObj)
4362 ClassLayout* layout = lclStore->GetLayout(comp);
4363 const unsigned lclNum = lclStore->GetLclNum();
4364 GenTreeLclFld* addr = comp->gtNewLclAddrNode(lclNum, lclStore->GetLclOffs(), TYP_BYREF);
4365 comp->lvaSetVarDoNotEnregister(lclNum DEBUGARG(DoNotEnregisterReason::BlockOp));
4367 addr->gtFlags |= lclStore->gtFlags & (GTF_VAR_DEF | GTF_VAR_USEASG);
4369 // Create the assignment node.
4370 lclStore->ChangeOper(GT_STORE_BLK);
4371 GenTreeBlk* objStore = lclStore->AsBlk();
4372 objStore->gtFlags = GTF_ASG | GTF_IND_NONFAULTING | GTF_IND_TGT_NOT_HEAP;
4373 objStore->Initialize(layout);
4374 objStore->SetAddr(addr);
4375 objStore->SetData(src);
4377 BlockRange().InsertBefore(objStore, addr);
4378 LowerNode(objStore);
4380 JITDUMP("lowering store lcl var/field (after):\n");
4381 DISPTREERANGE(BlockRange(), objStore);
4388 // src and dst can be in registers, check if we need a bitcast.
4389 if (!src->TypeIs(TYP_STRUCT) && !varTypeUsesSameRegType(lclRegType, src))
4391 assert(!srcIsMultiReg);
4392 assert(lclStore->OperIsLocalStore());
4393 assert(lclRegType != TYP_UNDEF);
4395 GenTree* bitcast = comp->gtNewBitCastNode(lclRegType, src);
4396 lclStore->gtOp1 = bitcast;
4397 src = lclStore->gtGetOp1();
4398 BlockRange().InsertBefore(lclStore, bitcast);
4399 ContainCheckBitCast(bitcast);
4402 LowerStoreLoc(lclStore);
4404 JITDUMP("lowering store lcl var/field (after):\n");
4405 DISPTREERANGE(BlockRange(), lclStore);
4409 //----------------------------------------------------------------------------------------------
4410 // LowerRetStructLclVar: Lowers a struct return node.
4413 // node - The return node to lower.
4415 void Lowering::LowerRetStruct(GenTreeUnOp* ret)
4418 if (GlobalJitOptions::compFeatureHfa)
4420 if (varTypeIsSIMD(ret))
4422 if (comp->info.compRetNativeType == TYP_STRUCT)
4424 assert(varTypeIsSIMD(ret->gtGetOp1()));
4425 assert(comp->compMethodReturnsMultiRegRetType());
4426 ret->ChangeType(comp->info.compRetNativeType);
4430 assert(comp->info.compRetNativeType == ret->TypeGet());
4434 #endif // TARGET_ARM64
4436 if (comp->compMethodReturnsMultiRegRetType())
4441 assert(ret->OperIs(GT_RETURN));
4442 assert(varTypeIsStruct(ret));
4444 GenTree* retVal = ret->gtGetOp1();
4445 var_types nativeReturnType = comp->info.compRetNativeType;
4446 // Note: small types are returned as INT.
4447 ret->ChangeType(genActualType(nativeReturnType));
4449 switch (retVal->OperGet())
4453 // When we promote LCL_VAR single fields into return, we could have all types of constants here.
4454 if (varTypeUsesFloatReg(nativeReturnType))
4456 // ZeroObj assertion propagation can create INT zeros for DOUBLE returns.
4457 assert((genTypeSize(retVal) == genTypeSize(nativeReturnType)) || retVal->IsIntegralConst(0));
4458 int64_t value = retVal->AsIntCon()->IconValue();
4460 if (nativeReturnType == TYP_FLOAT)
4462 retVal->BashToConst(*reinterpret_cast<float*>(&value));
4466 retVal->BashToConst(*reinterpret_cast<double*>(&value));
4471 assert(varTypeUsesIntReg(nativeReturnType));
4479 // Spill to a local if sizes don't match so we can avoid the "load more than requested"
4480 // problem, e.g. struct size is 5 and we emit "ldr x0, [x1]"
4481 if (genTypeSize(nativeReturnType) > retVal->AsIndir()->Size())
4483 LIR::Use retValUse(BlockRange(), &ret->gtOp1, ret);
4484 unsigned tmpNum = comp->lvaGrabTemp(true DEBUGARG("mis-sized struct return"));
4485 comp->lvaSetStruct(tmpNum, comp->info.compMethodInfo->args.retTypeClass, false);
4487 ReplaceWithLclVar(retValUse, tmpNum);
4488 LowerRetSingleRegStructLclVar(ret);
4492 retVal->ChangeOper(GT_IND);
4493 retVal->ChangeType(nativeReturnType);
4494 LowerIndir(retVal->AsIndir());
4499 LowerRetSingleRegStructLclVar(ret);
4503 retVal->ChangeType(nativeReturnType);
4507 assert(varTypeIsEnregisterable(retVal));
4508 if (!varTypeUsesSameRegType(ret, retVal))
4510 GenTree* bitcast = comp->gtNewBitCastNode(ret->TypeGet(), retVal);
4511 ret->gtOp1 = bitcast;
4512 BlockRange().InsertBefore(ret, bitcast);
4513 ContainCheckBitCast(bitcast);
4519 //----------------------------------------------------------------------------------------------
4520 // LowerRetSingleRegStructLclVar: Lowers a return node with a struct lclVar as a source.
4523 // node - The return node to lower.
4526 // - the function is only for LclVars that are returned in one register;
4527 // - if LclVar is allocated in memory then read it as return type;
4528 // - if LclVar can be enregistered read it as register type and add a bitcast if necessary;
4530 void Lowering::LowerRetSingleRegStructLclVar(GenTreeUnOp* ret)
4532 assert(!comp->compMethodReturnsMultiRegRetType());
4533 assert(ret->OperIs(GT_RETURN));
4534 GenTreeLclVarCommon* lclVar = ret->gtGetOp1()->AsLclVar();
4535 assert(lclVar->OperIs(GT_LCL_VAR));
4536 unsigned lclNum = lclVar->GetLclNum();
4537 LclVarDsc* varDsc = comp->lvaGetDesc(lclNum);
4539 if (varDsc->lvPromoted)
4541 // TODO-1stClassStructs: We can no longer independently promote
4542 // or enregister this struct, since it is referenced as a whole.
4543 comp->lvaSetVarDoNotEnregister(lclNum DEBUGARG(DoNotEnregisterReason::BlockOpRet));
4546 if (varDsc->lvDoNotEnregister)
4548 lclVar->ChangeOper(GT_LCL_FLD);
4550 // We are returning as a primitive type and the lcl is of struct type.
4551 assert(comp->info.compRetNativeType != TYP_STRUCT);
4552 assert((genTypeSize(comp->info.compRetNativeType) == genTypeSize(ret)) ||
4553 (varTypeIsIntegral(ret) && varTypeIsIntegral(comp->info.compRetNativeType) &&
4554 (genTypeSize(comp->info.compRetNativeType) <= genTypeSize(ret))));
4555 // If the actual return type requires normalization, then make sure we
4556 // do so by using the correct small type for the GT_LCL_FLD. It would
4557 // be conservative to check just compRetNativeType for this since small
4558 // structs are normalized to primitive types when they are returned in
4559 // registers, so we would normalize for them as well.
4560 if (varTypeIsSmall(comp->info.compRetType))
4562 assert(genTypeSize(comp->info.compRetNativeType) == genTypeSize(comp->info.compRetType));
4563 lclVar->ChangeType(comp->info.compRetType);
4567 // Otherwise we don't mind that we leave the upper bits undefined.
4568 lclVar->ChangeType(ret->TypeGet());
4573 const var_types lclVarType = varDsc->GetRegisterType(lclVar);
4574 assert(lclVarType != TYP_UNDEF);
4576 const var_types actualType = genActualType(lclVarType);
4577 lclVar->ChangeType(actualType);
4579 if (!varTypeUsesSameRegType(ret, lclVarType))
4581 GenTree* bitcast = comp->gtNewBitCastNode(ret->TypeGet(), ret->gtOp1);
4582 ret->gtOp1 = bitcast;
4583 BlockRange().InsertBefore(ret, bitcast);
4584 ContainCheckBitCast(bitcast);
4589 //----------------------------------------------------------------------------------------------
4590 // LowerCallStruct: Lowers a call node that returns a struct.
4593 // call - The call node to lower.
4596 // - this handles only single-register returns;
4597 // - it transforms the call's user for `GT_STOREIND`.
4599 void Lowering::LowerCallStruct(GenTreeCall* call)
4601 assert(varTypeIsStruct(call));
4602 if (call->HasMultiRegRetVal())
4607 if (GlobalJitOptions::compFeatureHfa)
4609 if (comp->IsHfa(call->gtRetClsHnd))
4611 #if defined(TARGET_ARM64)
4612 assert(comp->GetHfaCount(call->gtRetClsHnd) == 1);
4613 #elif defined(TARGET_ARM)
4614 // ARM returns double in 2 float registers, but
4615 // `call->HasMultiRegRetVal()` count double registers.
4616 assert(comp->GetHfaCount(call->gtRetClsHnd) <= 2);
4617 #else // !TARGET_ARM64 && !TARGET_ARM
4618 NYI("Unknown architecture");
4619 #endif // !TARGET_ARM64 && !TARGET_ARM
4620 var_types hfaType = comp->GetHfaType(call->gtRetClsHnd);
4621 if (call->TypeIs(hfaType))
4628 CORINFO_CLASS_HANDLE retClsHnd = call->gtRetClsHnd;
4629 Compiler::structPassingKind howToReturnStruct;
4630 var_types returnType = comp->getReturnTypeForStruct(retClsHnd, call->GetUnmanagedCallConv(), &howToReturnStruct);
4631 assert(returnType != TYP_STRUCT && returnType != TYP_UNKNOWN);
4632 var_types origType = call->TypeGet();
4633 call->gtType = genActualType(returnType);
4636 if (BlockRange().TryGetUse(call, &callUse))
4638 GenTree* user = callUse.User();
4639 switch (user->OperGet())
4642 case GT_STORE_LCL_VAR:
4644 // Leave as is, the user will handle it.
4645 assert(user->TypeIs(origType) || varTypeIsSIMD(user->TypeGet()));
4648 case GT_STORE_LCL_FLD:
4649 // The call's type should match the user's type or struct's returnType.
4650 // We leave handling the former case to user's lowering.
4651 assert(user->TypeIs(origType) || (returnType == user->TypeGet()));
4655 // Argument lowering will deal with register file mismatches if needed.
4656 assert(varTypeIsSIMD(origType));
4661 if (varTypeIsSIMD(user))
4663 user->ChangeType(returnType);
4666 #endif // FEATURE_SIMD
4667 // importer has a separate mechanism to retype calls to helpers,
4669 assert(user->TypeIs(TYP_REF) || (user->TypeIs(TYP_I_IMPL) && comp->IsTargetAbi(CORINFO_NATIVEAOT_ABI)));
4670 assert(call->IsHelperCall());
4671 assert(returnType == user->TypeGet());
4674 #ifdef FEATURE_HW_INTRINSICS
4675 case GT_HWINTRINSIC:
4677 if (!varTypeUsesSameRegType(returnType, origType))
4679 GenTree* bitCast = comp->gtNewBitCastNode(origType, call);
4680 BlockRange().InsertAfter(call, bitCast);
4681 callUse.ReplaceWith(bitCast);
4682 ContainCheckBitCast(bitCast);
4686 #endif // FEATURE_HW_INTRINSICS
4694 //----------------------------------------------------------------------------------------------
4695 // LowerStoreSingleRegCallStruct: Lowers a store block where the source is a struct typed call.
4698 // store - The store node to lower.
4701 // - the function is only for calls that return one register;
4702 // - it spills the call's result if it can be retyped as a primitive type;
4704 void Lowering::LowerStoreSingleRegCallStruct(GenTreeBlk* store)
4706 assert(store->Data()->IsCall());
4707 GenTreeCall* call = store->Data()->AsCall();
4708 assert(!call->HasMultiRegRetVal());
4710 const ClassLayout* layout = store->GetLayout();
4711 var_types regType = layout->GetRegisterType();
4713 if (regType != TYP_UNDEF)
4715 #if defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
4716 if (varTypeIsFloating(call->TypeGet()))
4718 regType = call->TypeGet();
4721 store->ChangeType(regType);
4722 store->SetOper(GT_STOREIND);
4723 LowerStoreIndirCommon(store->AsStoreInd());
4728 #if defined(WINDOWS_AMD64_ABI)
4729 // All ABI except Windows x64 supports passing 3 byte structs in registers.
4730 // Other 64 bites ABI-s support passing 5, 6, 7 byte structs.
4732 #else // !WINDOWS_AMD64_ABI
4733 store->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
4734 GenTreeLclVar* spilledCall = SpillStructCallResult(call);
4735 store->SetData(spilledCall);
4736 LowerBlockStoreCommon(store);
4737 #endif // WINDOWS_AMD64_ABI
4741 #if !defined(WINDOWS_AMD64_ABI)
4742 //----------------------------------------------------------------------------------------------
4743 // SpillStructCallResult: Spill call result to memory.
4746 // call - call with 3, 5, 6 or 7 return size that has to be spilled to memory.
4749 // load of the spilled variable.
4751 GenTreeLclVar* Lowering::SpillStructCallResult(GenTreeCall* call) const
4753 // TODO-1stClassStructs: we can support this in codegen for `GT_STORE_BLK` without new temps.
4754 const unsigned spillNum = comp->lvaGrabTemp(true DEBUGARG("Return value temp for an odd struct return size"));
4755 comp->lvaSetVarDoNotEnregister(spillNum DEBUGARG(DoNotEnregisterReason::LocalField));
4756 CORINFO_CLASS_HANDLE retClsHnd = call->gtRetClsHnd;
4757 comp->lvaSetStruct(spillNum, retClsHnd, false);
4758 GenTreeLclFld* spill = comp->gtNewStoreLclFldNode(spillNum, call->TypeGet(), 0, call);
4760 BlockRange().InsertAfter(call, spill);
4761 ContainCheckStoreLoc(spill);
4762 GenTreeLclVar* loadCallResult = comp->gtNewLclvNode(spillNum, TYP_STRUCT)->AsLclVar();
4763 BlockRange().InsertAfter(spill, loadCallResult);
4764 return loadCallResult;
4766 #endif // !WINDOWS_AMD64_ABI
4768 GenTree* Lowering::LowerDirectCall(GenTreeCall* call)
4770 noway_assert(call->gtCallType == CT_USER_FUNC || call->gtCallType == CT_HELPER);
4772 // Non-virtual direct/indirect calls: Work out if the address of the
4773 // call is known at JIT time. If not it is either an indirect call
4774 // or the address must be accessed via an single/double indirection.
4777 InfoAccessType accessType;
4778 CorInfoHelpFunc helperNum = comp->eeGetHelperNum(call->gtCallMethHnd);
4780 #ifdef FEATURE_READYTORUN
4781 if (call->gtEntryPoint.addr != nullptr)
4783 accessType = call->gtEntryPoint.accessType;
4784 addr = call->gtEntryPoint.addr;
4788 if (call->gtCallType == CT_HELPER)
4790 noway_assert(helperNum != CORINFO_HELP_UNDEF);
4792 // the convention on getHelperFtn seems to be (it's not documented)
4793 // that it returns an address or if it returns null, pAddr is set to
4794 // another address, which requires an indirection
4796 addr = comp->info.compCompHnd->getHelperFtn(helperNum, (void**)&pAddr);
4798 if (addr != nullptr)
4800 assert(pAddr == nullptr);
4801 accessType = IAT_VALUE;
4805 accessType = IAT_PVALUE;
4811 noway_assert(helperNum == CORINFO_HELP_UNDEF);
4813 CORINFO_ACCESS_FLAGS aflags = CORINFO_ACCESS_ANY;
4815 if (call->IsSameThis())
4817 aflags = (CORINFO_ACCESS_FLAGS)(aflags | CORINFO_ACCESS_THIS);
4820 if (!call->NeedsNullCheck())
4822 aflags = (CORINFO_ACCESS_FLAGS)(aflags | CORINFO_ACCESS_NONNULL);
4825 CORINFO_CONST_LOOKUP addrInfo;
4826 comp->info.compCompHnd->getFunctionEntryPoint(call->gtCallMethHnd, &addrInfo, aflags);
4828 accessType = addrInfo.accessType;
4829 addr = addrInfo.addr;
4832 GenTree* result = nullptr;
4836 // Non-virtual direct call to known address.
4837 // For JIT helper based tailcall (only used on x86) the target
4838 // address is passed as an arg to the helper so we want a node for
4840 if (!IsCallTargetInRange(addr) || call->IsTailCallViaJitHelper())
4842 result = AddrGen(addr);
4846 // a direct call within range of hardware relative call instruction
4847 // stash the address for codegen
4848 call->gtDirectCallAddress = addr;
4854 // If we are using an indirection cell for a direct call then apply
4855 // an optimization that loads the call target directly from the
4856 // indirection cell, instead of duplicating the tree.
4857 bool hasIndirectionCell = call->GetIndirectionCellArgKind() != WellKnownArg::None;
4859 if (!hasIndirectionCell)
4861 // Non-virtual direct calls to addresses accessed by
4862 // a single indirection.
4863 GenTree* cellAddr = AddrGen(addr);
4865 cellAddr->AsIntCon()->gtTargetHandle = (size_t)call->gtCallMethHnd;
4867 GenTree* indir = Ind(cellAddr);
4874 // Non-virtual direct calls to addresses accessed by
4875 // a double indirection.
4878 // Expanding an IAT_PPVALUE here, will lose the opportunity
4879 // to Hoist/CSE the first indirection as it is an invariant load
4881 assert(!"IAT_PPVALUE case in LowerDirectCall");
4883 noway_assert(helperNum == CORINFO_HELP_UNDEF);
4884 result = AddrGen(addr);
4885 // Double-indirection. Load the address into a register
4886 // and call indirectly through the register
4888 result = Ind(Ind(result));
4893 // Non-virtual direct calls to addresses accessed by
4894 // a single relative indirection.
4895 GenTree* cellAddr = AddrGen(addr);
4896 GenTree* indir = Ind(cellAddr);
4897 result = comp->gtNewOperNode(GT_ADD, TYP_I_IMPL, indir, AddrGen(addr));
4902 noway_assert(!"Bad accessType");
4909 GenTree* Lowering::LowerDelegateInvoke(GenTreeCall* call)
4911 noway_assert(call->gtCallType == CT_USER_FUNC);
4913 assert((comp->info.compCompHnd->getMethodAttribs(call->gtCallMethHnd) &
4914 (CORINFO_FLG_DELEGATE_INVOKE | CORINFO_FLG_FINAL)) == (CORINFO_FLG_DELEGATE_INVOKE | CORINFO_FLG_FINAL));
4916 GenTree* thisArgNode;
4917 if (call->IsTailCallViaJitHelper())
4919 thisArgNode = call->gtArgs.GetArgByIndex(0)->GetNode();
4923 thisArgNode = call->gtArgs.GetThisArg()->GetNode();
4926 assert(thisArgNode != nullptr);
4927 assert(thisArgNode->gtOper == GT_PUTARG_REG);
4928 GenTree* thisExpr = thisArgNode->AsOp()->gtOp1;
4930 // We're going to use the 'this' expression multiple times, so make a local to copy it.
4933 if (thisExpr->OperIs(GT_LCL_VAR))
4935 base = comp->gtNewLclvNode(thisExpr->AsLclVar()->GetLclNum(), thisExpr->TypeGet());
4937 else if (thisExpr->OperIs(GT_LCL_FLD))
4939 base = comp->gtNewLclFldNode(thisExpr->AsLclFld()->GetLclNum(), thisExpr->TypeGet(),
4940 thisExpr->AsLclFld()->GetLclOffs());
4944 unsigned delegateInvokeTmp = comp->lvaGrabTemp(true DEBUGARG("delegate invoke call"));
4945 base = comp->gtNewLclvNode(delegateInvokeTmp, thisExpr->TypeGet());
4947 LIR::Use thisExprUse(BlockRange(), &thisArgNode->AsOp()->gtOp1, thisArgNode);
4948 ReplaceWithLclVar(thisExprUse, delegateInvokeTmp);
4950 thisExpr = thisExprUse.Def(); // it's changed; reload it.
4953 // replace original expression feeding into thisPtr with
4954 // [originalThis + offsetOfDelegateInstance]
4956 GenTree* newThisAddr = new (comp, GT_LEA)
4957 GenTreeAddrMode(TYP_BYREF, thisExpr, nullptr, 0, comp->eeGetEEInfo()->offsetOfDelegateInstance);
4959 GenTree* newThis = comp->gtNewIndir(TYP_REF, newThisAddr);
4961 // Insert the new 'this' arg right before the call to get the correct null
4962 // behavior (the NRE that would logically happen inside Delegate.Invoke
4963 // should happen after all args are evaluated). We must also move the
4964 // PUTARG_REG node ahead.
4965 thisArgNode->AsOp()->gtOp1 = newThis;
4966 BlockRange().Remove(thisArgNode);
4967 BlockRange().InsertBefore(call, newThisAddr, newThis, thisArgNode);
4969 ContainCheckIndir(newThis->AsIndir());
4971 // the control target is
4972 // [originalThis + firstTgtOffs]
4974 unsigned targetOffs = comp->eeGetEEInfo()->offsetOfDelegateFirstTarget;
4975 GenTree* result = new (comp, GT_LEA) GenTreeAddrMode(TYP_REF, base, nullptr, 0, targetOffs);
4976 GenTree* callTarget = Ind(result);
4978 // don't need to sequence and insert this tree, caller will do it
4983 GenTree* Lowering::LowerIndirectNonvirtCall(GenTreeCall* call)
4986 if (call->gtCallCookie != nullptr)
4988 NYI_X86("Morphing indirect non-virtual call with non-standard args");
4992 // Indirect cookie calls gets transformed by fgMorphArgs as indirect call with non-standard args.
4993 // Hence we should never see this type of call in lower.
4995 noway_assert(call->gtCallCookie == nullptr);
5000 //------------------------------------------------------------------------
5001 // CreateReturnTrapSeq: Create a tree to perform a "return trap", used in PInvoke
5002 // epilogs to invoke a GC under a condition. The return trap checks some global
5003 // location (the runtime tells us where that is and how many indirections to make),
5004 // then, based on the result, conditionally calls a GC helper. We use a special node
5005 // for this because at this time (late in the compilation phases), introducing flow
5006 // is tedious/difficult.
5008 // This is used for PInvoke inlining.
5011 // Code tree to perform the action.
5013 GenTree* Lowering::CreateReturnTrapSeq()
5015 // The GT_RETURNTRAP node expands to this:
5016 // if (g_TrapReturningThreads)
5018 // RareDisablePreemptiveGC();
5021 // The only thing to do here is build up the expression that evaluates 'g_TrapReturningThreads'.
5023 void* pAddrOfCaptureThreadGlobal = nullptr;
5024 int32_t* addrOfCaptureThreadGlobal =
5025 comp->info.compCompHnd->getAddrOfCaptureThreadGlobal(&pAddrOfCaptureThreadGlobal);
5028 if (addrOfCaptureThreadGlobal != nullptr)
5030 testTree = AddrGen(addrOfCaptureThreadGlobal);
5034 testTree = Ind(AddrGen(pAddrOfCaptureThreadGlobal));
5036 return comp->gtNewOperNode(GT_RETURNTRAP, TYP_INT, Ind(testTree, TYP_INT));
5039 //------------------------------------------------------------------------
5040 // SetGCState: Create a tree that stores the given constant (0 or 1) into the
5041 // thread's GC state field.
5043 // This is used for PInvoke inlining.
5046 // state - constant (0 or 1) to store into the thread's GC state field.
5049 // Code tree to perform the action.
5051 GenTree* Lowering::SetGCState(int state)
5053 // Thread.offsetOfGcState = 0/1
5055 assert(state == 0 || state == 1);
5057 const CORINFO_EE_INFO* pInfo = comp->eeGetEEInfo();
5059 GenTree* base = new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, TYP_I_IMPL, comp->info.compLvFrameListRoot);
5061 GenTree* stateNode = new (comp, GT_CNS_INT) GenTreeIntCon(TYP_BYTE, state);
5062 GenTree* addr = new (comp, GT_LEA) GenTreeAddrMode(TYP_I_IMPL, base, nullptr, 1, pInfo->offsetOfGCState);
5063 GenTree* storeGcState = new (comp, GT_STOREIND) GenTreeStoreInd(TYP_BYTE, addr, stateNode);
5064 return storeGcState;
5067 //------------------------------------------------------------------------
5068 // CreateFrameLinkUpdate: Create a tree that either links or unlinks the
5069 // locally-allocated InlinedCallFrame from the Frame list.
5071 // This is used for PInvoke inlining.
5074 // action - whether to link (push) or unlink (pop) the Frame
5077 // Code tree to perform the action.
5079 GenTree* Lowering::CreateFrameLinkUpdate(FrameLinkAction action)
5081 const CORINFO_EE_INFO* pInfo = comp->eeGetEEInfo();
5082 const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = pInfo->inlinedCallFrameInfo;
5084 GenTree* TCB = new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, TYP_I_IMPL, comp->info.compLvFrameListRoot);
5087 GenTree* addr = new (comp, GT_LEA) GenTreeAddrMode(TYP_I_IMPL, TCB, nullptr, 1, pInfo->offsetOfThreadFrame);
5089 GenTree* data = nullptr;
5091 if (action == PushFrame)
5093 // Thread->m_pFrame = &inlinedCallFrame;
5094 data = new (comp, GT_LCL_ADDR)
5095 GenTreeLclFld(GT_LCL_ADDR, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar, callFrameInfo.offsetOfFrameVptr);
5099 assert(action == PopFrame);
5100 // Thread->m_pFrame = inlinedCallFrame.m_pNext;
5102 data = new (comp, GT_LCL_FLD) GenTreeLclFld(GT_LCL_FLD, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar,
5103 pInfo->inlinedCallFrameInfo.offsetOfFrameLink);
5105 GenTree* storeInd = new (comp, GT_STOREIND) GenTreeStoreInd(TYP_I_IMPL, addr, data);
5109 //------------------------------------------------------------------------
5110 // InsertPInvokeMethodProlog: Create the code that runs at the start of
5111 // every method that has PInvoke calls.
5113 // Initialize the TCB local and the InlinedCallFrame object. Then link ("push")
5114 // the InlinedCallFrame object on the Frame chain. The layout of InlinedCallFrame
5115 // is defined in vm/frames.h. See also vm/jitinterface.cpp for more information.
5116 // The offsets of these fields is returned by the VM in a call to ICorStaticInfo::getEEInfo().
5118 // The (current) layout is as follows:
5120 // 64-bit 32-bit CORINFO_EE_INFO
5121 // offset offset field name offset when set
5122 // -----------------------------------------------------------------------------------------
5123 // +00h +00h GS cookie offsetOfGSCookie
5124 // +08h +04h vptr for class InlinedCallFrame offsetOfFrameVptr method prolog
5125 // +10h +08h m_Next offsetOfFrameLink method prolog
5126 // +18h +0Ch m_Datum offsetOfCallTarget call site
5127 // +20h n/a m_StubSecretArg not set by JIT
5128 // +28h +10h m_pCallSiteSP offsetOfCallSiteSP x86: call site, and zeroed in method
5130 // non-x86: method prolog (SP remains
5131 // constant in function, after prolog: no
5132 // localloc and PInvoke in same function)
5133 // +30h +14h m_pCallerReturnAddress offsetOfReturnAddress call site
5134 // +38h +18h m_pCalleeSavedFP offsetOfCalleeSavedFP not set by JIT
5136 // +20h m_pSPAfterProlog offsetOfSPAfterProlog arm only
5137 // +20/24h JIT retval spill area (int) before call_gc ???
5138 // +24/28h JIT retval spill area (long) before call_gc ???
5139 // +28/2Ch Saved value of EBP method prolog ???
5141 // Note that in the VM, InlinedCallFrame is a C++ class whose objects have a 'this' pointer that points
5142 // to the InlinedCallFrame vptr (the 2nd field listed above), and the GS cookie is stored *before*
5143 // the object. When we link the InlinedCallFrame onto the Frame chain, we must point at this location,
5144 // and not at the beginning of the InlinedCallFrame local, which is actually the GS cookie.
5149 // See the usages for USE_PER_FRAME_PINVOKE_INIT for more information.
5150 void Lowering::InsertPInvokeMethodProlog()
5152 noway_assert(comp->info.compUnmanagedCallCountWithGCTransition);
5153 noway_assert(comp->lvaInlinedPInvokeFrameVar != BAD_VAR_NUM);
5155 if (comp->opts.ShouldUsePInvokeHelpers())
5160 JITDUMP("======= Inserting PInvoke method prolog\n");
5162 // The first BB must be a scratch BB in order for us to be able to safely insert the P/Invoke prolog.
5163 assert(comp->fgFirstBBisScratch());
5165 LIR::Range& firstBlockRange = LIR::AsRange(comp->fgFirstBB);
5167 const CORINFO_EE_INFO* pInfo = comp->eeGetEEInfo();
5168 const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = pInfo->inlinedCallFrameInfo;
5170 // First arg: &compiler->lvaInlinedPInvokeFrameVar + callFrameInfo.offsetOfFrameVptr
5172 const LclVarDsc* inlinedPInvokeDsc = comp->lvaGetDesc(comp->lvaInlinedPInvokeFrameVar);
5173 assert(inlinedPInvokeDsc->IsAddressExposed());
5175 GenTree* frameAddr = new (comp, GT_LCL_ADDR)
5176 GenTreeLclFld(GT_LCL_ADDR, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar, callFrameInfo.offsetOfFrameVptr);
5178 // Call runtime helper to fill in our InlinedCallFrame and push it on the Frame list:
5179 // TCB = CORINFO_HELP_INIT_PINVOKE_FRAME(&symFrameStart, secretArg);
5180 GenTreeCall* call = comp->gtNewHelperCallNode(CORINFO_HELP_INIT_PINVOKE_FRAME, TYP_I_IMPL);
5182 NewCallArg frameAddrArg = NewCallArg::Primitive(frameAddr).WellKnown(WellKnownArg::PInvokeFrame);
5183 call->gtArgs.PushBack(comp, frameAddrArg);
5184 // for x86/arm32 don't pass the secretArg.
5185 #if !defined(TARGET_X86) && !defined(TARGET_ARM)
5187 if (comp->info.compPublishStubParam)
5189 argNode = comp->gtNewLclvNode(comp->lvaStubArgumentVar, TYP_I_IMPL);
5193 argNode = comp->gtNewIconNode(0, TYP_I_IMPL);
5195 NewCallArg stubParamArg = NewCallArg::Primitive(argNode).WellKnown(WellKnownArg::SecretStubParam);
5196 call->gtArgs.PushBack(comp, stubParamArg);
5199 // some sanity checks on the frame list root vardsc
5200 const unsigned lclNum = comp->info.compLvFrameListRoot;
5201 const LclVarDsc* varDsc = comp->lvaGetDesc(lclNum);
5202 noway_assert(!varDsc->lvIsParam);
5203 noway_assert(varDsc->lvType == TYP_I_IMPL);
5205 GenTree* store = new (comp, GT_STORE_LCL_VAR) GenTreeLclVar(GT_STORE_LCL_VAR, TYP_I_IMPL, lclNum);
5206 store->AsOp()->gtOp1 = call;
5207 store->gtFlags |= GTF_VAR_DEF;
5209 GenTree* const insertionPoint = firstBlockRange.FirstNonCatchArgNode();
5211 comp->fgMorphTree(store);
5212 firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, store));
5213 DISPTREERANGE(firstBlockRange, store);
5215 #if !defined(TARGET_X86) && !defined(TARGET_ARM)
5216 // For x86, this step is done at the call site (due to stack pointer not being static in the function).
5217 // For arm32, CallSiteSP is set up by the call to CORINFO_HELP_INIT_PINVOKE_FRAME.
5219 // --------------------------------------------------------
5220 // InlinedCallFrame.m_pCallSiteSP = @RSP;
5222 GenTree* spValue = PhysReg(REG_SPBASE);
5223 GenTreeLclFld* storeSP = comp->gtNewStoreLclFldNode(comp->lvaInlinedPInvokeFrameVar, TYP_I_IMPL,
5224 callFrameInfo.offsetOfCallSiteSP, spValue);
5225 assert(inlinedPInvokeDsc->lvDoNotEnregister);
5227 firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, storeSP));
5228 DISPTREERANGE(firstBlockRange, storeSP);
5230 #endif // !defined(TARGET_X86) && !defined(TARGET_ARM)
5232 #if !defined(TARGET_ARM)
5233 // For arm32, CalleeSavedFP is set up by the call to CORINFO_HELP_INIT_PINVOKE_FRAME.
5235 // --------------------------------------------------------
5236 // InlinedCallFrame.m_pCalleeSavedEBP = @RBP;
5238 GenTree* fpValue = PhysReg(REG_FPBASE);
5239 GenTreeLclFld* storeFP = comp->gtNewStoreLclFldNode(comp->lvaInlinedPInvokeFrameVar, TYP_I_IMPL,
5240 callFrameInfo.offsetOfCalleeSavedFP, fpValue);
5241 assert(inlinedPInvokeDsc->lvDoNotEnregister);
5243 firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, storeFP));
5244 DISPTREERANGE(firstBlockRange, storeFP);
5245 #endif // !defined(TARGET_ARM)
5247 // --------------------------------------------------------
5248 // On 32-bit targets, CORINFO_HELP_INIT_PINVOKE_FRAME initializes the PInvoke frame and then pushes it onto
5249 // the current thread's Frame stack. On 64-bit targets, it only initializes the PInvoke frame.
5250 // As a result, don't push the frame onto the frame stack here for any 64-bit targets
5251 CLANG_FORMAT_COMMENT_ANCHOR;
5254 #ifdef USE_PER_FRAME_PINVOKE_INIT
5255 // For IL stubs, we push the frame once even when we're doing per-pinvoke init.
5256 if (comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB))
5257 #endif // USE_PER_FRAME_PINVOKE_INIT
5259 // Push a frame. The init routine sets InlinedCallFrame's m_pNext, so we just set the thread's top-of-stack
5260 GenTree* frameUpd = CreateFrameLinkUpdate(PushFrame);
5261 firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, frameUpd));
5262 ContainCheckStoreIndir(frameUpd->AsStoreInd());
5263 DISPTREERANGE(firstBlockRange, frameUpd);
5265 #endif // TARGET_64BIT
5268 //------------------------------------------------------------------------
5269 // InsertPInvokeMethodEpilog: Code that needs to be run when exiting any method
5270 // that has PInvoke inlines. This needs to be inserted any place you can exit the
5271 // function: returns, tailcalls and jmps.
5274 // returnBB - basic block from which a method can return
5275 // lastExpr - GenTree of the last top level stmnt of returnBB (debug only arg)
5278 // Code tree to perform the action.
5280 void Lowering::InsertPInvokeMethodEpilog(BasicBlock* returnBB DEBUGARG(GenTree* lastExpr))
5282 assert(returnBB != nullptr);
5283 assert(comp->info.compUnmanagedCallCountWithGCTransition);
5285 if (comp->opts.ShouldUsePInvokeHelpers())
5290 JITDUMP("======= Inserting PInvoke method epilog\n");
5292 // Method doing PInvoke calls has exactly one return block unless it has "jmp" or tail calls.
5293 assert(((returnBB == comp->genReturnBB) && (returnBB->bbJumpKind == BBJ_RETURN)) ||
5294 returnBB->endsWithTailCallOrJmp(comp));
5296 LIR::Range& returnBlockRange = LIR::AsRange(returnBB);
5298 GenTree* insertionPoint = returnBlockRange.LastNode();
5299 assert(insertionPoint == lastExpr);
5301 // Note: PInvoke Method Epilog (PME) needs to be inserted just before GT_RETURN, GT_JMP or GT_CALL node in execution
5302 // order so that it is guaranteed that there will be no further PInvokes after that point in the method.
5304 // Example1: GT_RETURN(op1) - say execution order is: Op1, GT_RETURN. After inserting PME, execution order would be
5305 // Op1, PME, GT_RETURN
5307 // Example2: GT_CALL(arg side effect computing nodes, Stk Args Setup, Reg Args setup). The execution order would be
5308 // arg side effect computing nodes, Stk Args setup, Reg Args setup, GT_CALL
5309 // After inserting PME execution order would be:
5310 // arg side effect computing nodes, Stk Args setup, Reg Args setup, PME, GT_CALL
5312 // Example3: GT_JMP. After inserting PME execution order would be: PME, GT_JMP
5313 // That is after PME, args for GT_JMP call will be setup.
5315 // Pop the frame if necessary. This always happens in the epilog on 32-bit targets. For 64-bit targets, we only do
5316 // this in the epilog for IL stubs; for non-IL stubs the frame is popped after every PInvoke call.
5317 CLANG_FORMAT_COMMENT_ANCHOR;
5319 #ifdef USE_PER_FRAME_PINVOKE_INIT
5320 // For IL stubs, we push the frame once even when we're doing per-pinvoke init
5321 if (comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB))
5322 #endif // USE_PER_FRAME_PINVOKE_INIT
5324 GenTree* frameUpd = CreateFrameLinkUpdate(PopFrame);
5325 returnBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, frameUpd));
5326 ContainCheckStoreIndir(frameUpd->AsStoreInd());
5330 //------------------------------------------------------------------------
5331 // InsertPInvokeCallProlog: Emit the call-site prolog for direct calls to unmanaged code.
5332 // It does all the necessary call-site setup of the InlinedCallFrame.
5335 // call - the call for which we are inserting the PInvoke prolog.
5340 void Lowering::InsertPInvokeCallProlog(GenTreeCall* call)
5342 JITDUMP("======= Inserting PInvoke call prolog\n");
5344 GenTree* insertBefore = call;
5345 if (call->gtCallType == CT_INDIRECT)
5348 insertBefore = BlockRange().GetTreeRange(call->gtCallAddr, &isClosed).FirstNode();
5352 const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = comp->eeGetEEInfo()->inlinedCallFrameInfo;
5354 gtCallTypes callType = (gtCallTypes)call->gtCallType;
5356 noway_assert(comp->lvaInlinedPInvokeFrameVar != BAD_VAR_NUM);
5358 if (comp->opts.ShouldUsePInvokeHelpers())
5360 // First argument is the address of the frame variable.
5361 GenTree* frameAddr = comp->gtNewLclVarAddrNode(comp->lvaInlinedPInvokeFrameVar, TYP_BYREF);
5363 #if defined(TARGET_X86) && !defined(UNIX_X86_ABI)
5364 // On x86 targets, PInvoke calls need the size of the stack args in InlinedCallFrame.m_Datum.
5365 // This is because the callee pops stack arguments, and we need to keep track of this during stack
5367 const unsigned numStkArgBytes = call->gtArgs.OutgoingArgsStackSize();
5368 GenTree* stackBytes = comp->gtNewIconNode(numStkArgBytes, TYP_INT);
5369 // Insert call to CORINFO_HELP_JIT_PINVOKE_BEGIN
5370 GenTree* helperCall =
5371 comp->gtNewHelperCallNode(CORINFO_HELP_JIT_PINVOKE_BEGIN, TYP_VOID, frameAddr, stackBytes);
5373 GenTree* helperCall = comp->gtNewHelperCallNode(CORINFO_HELP_JIT_PINVOKE_BEGIN, TYP_VOID, frameAddr);
5376 comp->fgMorphTree(helperCall);
5377 BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, helperCall));
5378 LowerNode(helperCall); // helper call is inserted before current node and should be lowered here.
5382 // Emit the following sequence:
5384 // InlinedCallFrame.callTarget = methodHandle // stored in m_Datum
5385 // InlinedCallFrame.m_pCallSiteSP = SP // x86 only
5386 // InlinedCallFrame.m_pCallerReturnAddress = return address
5387 // GT_START_PREEEMPTC
5388 // Thread.gcState = 0
5389 // (non-stub) - update top Frame on TCB // 64-bit targets only
5391 // ----------------------------------------------------------------------------------
5392 // Setup InlinedCallFrame.callSiteTarget (which is how the JIT refers to it).
5393 // The actual field is InlinedCallFrame.m_Datum which has many different uses and meanings.
5395 GenTree* src = nullptr;
5397 if (callType == CT_INDIRECT)
5399 #if !defined(TARGET_64BIT)
5400 // On 32-bit targets, indirect calls need the size of the stack args in InlinedCallFrame.m_Datum.
5401 const unsigned stackByteOffset = call->gtArgs.OutgoingArgsStackSize();
5402 src = comp->gtNewIconNode(stackByteOffset, TYP_INT);
5404 // On 64-bit targets, indirect calls may need the stub parameter value in InlinedCallFrame.m_Datum.
5405 // If the stub parameter value is not needed, m_Datum will be initialized by the VM.
5406 if (comp->info.compPublishStubParam)
5408 src = comp->gtNewLclvNode(comp->lvaStubArgumentVar, TYP_I_IMPL);
5410 #endif // !defined(TARGET_64BIT)
5414 assert(callType == CT_USER_FUNC);
5416 void* pEmbedMethodHandle = nullptr;
5417 CORINFO_METHOD_HANDLE embedMethodHandle =
5418 comp->info.compCompHnd->embedMethodHandle(call->gtCallMethHnd, &pEmbedMethodHandle);
5420 noway_assert((!embedMethodHandle) != (!pEmbedMethodHandle));
5422 if (embedMethodHandle != nullptr)
5424 // InlinedCallFrame.callSiteTarget = methodHandle
5425 src = AddrGen(embedMethodHandle);
5429 // InlinedCallFrame.callSiteTarget = *pEmbedMethodHandle
5430 src = Ind(AddrGen(pEmbedMethodHandle));
5436 // Store into InlinedCallFrame.m_Datum, the offset of which is given by offsetOfCallTarget.
5437 GenTreeLclFld* store = comp->gtNewStoreLclFldNode(comp->lvaInlinedPInvokeFrameVar, TYP_I_IMPL,
5438 callFrameInfo.offsetOfCallTarget, src);
5440 InsertTreeBeforeAndContainCheck(insertBefore, store);
5445 // ----------------------------------------------------------------------------------
5446 // InlinedCallFrame.m_pCallSiteSP = SP
5448 GenTree* callSiteSP = PhysReg(REG_SPBASE);
5449 GenTreeLclFld* storeCallSiteSP = comp->gtNewStoreLclFldNode(comp->lvaInlinedPInvokeFrameVar, TYP_I_IMPL,
5450 callFrameInfo.offsetOfCallSiteSP, callSiteSP);
5452 InsertTreeBeforeAndContainCheck(insertBefore, storeCallSiteSP);
5456 // ----------------------------------------------------------------------------------
5457 // InlinedCallFrame.m_pCallerReturnAddress = &label (the address of the instruction immediately following the call)
5459 GenTree* label = new (comp, GT_LABEL) GenTree(GT_LABEL, TYP_I_IMPL);
5460 GenTreeLclFld* storeLab = comp->gtNewStoreLclFldNode(comp->lvaInlinedPInvokeFrameVar, TYP_I_IMPL,
5461 callFrameInfo.offsetOfReturnAddress, label);
5463 InsertTreeBeforeAndContainCheck(insertBefore, storeLab);
5465 // Push the PInvoke frame if necessary. On 32-bit targets this only happens in the method prolog if a method
5466 // contains PInvokes; on 64-bit targets this is necessary in non-stubs.
5467 CLANG_FORMAT_COMMENT_ANCHOR;
5469 #ifdef USE_PER_FRAME_PINVOKE_INIT
5470 if (!comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB))
5472 // Set the TCB's frame to be the one we just created.
5473 // Note the init routine for the InlinedCallFrame (CORINFO_HELP_INIT_PINVOKE_FRAME)
5474 // has prepended it to the linked list to maintain the stack of Frames.
5476 // Stubs do this once per stub, not once per call.
5477 GenTree* frameUpd = CreateFrameLinkUpdate(PushFrame);
5478 BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, frameUpd));
5479 ContainCheckStoreIndir(frameUpd->AsStoreInd());
5481 #endif // USE_PER_FRAME_PINVOKE_INIT
5483 // IMPORTANT **** This instruction must be the last real instruction ****
5484 // It changes the thread's state to Preemptive mode
5485 // ----------------------------------------------------------------------------------
5486 // [tcb + offsetOfGcState] = 0
5487 GenTree* storeGCState = SetGCState(0);
5488 BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, storeGCState));
5489 ContainCheckStoreIndir(storeGCState->AsStoreInd());
5491 // Indicate that codegen has switched this thread to preemptive GC.
5492 // This tree node doesn't generate any code, but impacts LSRA and gc reporting.
5493 // This tree node is simple so doesn't require sequencing.
5494 GenTree* preemptiveGCNode = new (comp, GT_START_PREEMPTGC) GenTree(GT_START_PREEMPTGC, TYP_VOID);
5495 BlockRange().InsertBefore(insertBefore, preemptiveGCNode);
5498 //------------------------------------------------------------------------
5499 // InsertPInvokeCallEpilog: Insert the code that goes after every inlined pinvoke call.
5502 // call - the call for which we are inserting the PInvoke epilog.
5507 void Lowering::InsertPInvokeCallEpilog(GenTreeCall* call)
5509 JITDUMP("======= Inserting PInvoke call epilog\n");
5511 if (comp->opts.ShouldUsePInvokeHelpers())
5513 noway_assert(comp->lvaInlinedPInvokeFrameVar != BAD_VAR_NUM);
5515 // First argument is the address of the frame variable.
5516 GenTree* frameAddr = comp->gtNewLclVarAddrNode(comp->lvaInlinedPInvokeFrameVar, TYP_BYREF);
5519 const LclVarDsc* inlinedPInvokeDsc = comp->lvaGetDesc(comp->lvaInlinedPInvokeFrameVar);
5520 assert(inlinedPInvokeDsc->IsAddressExposed());
5523 // Insert call to CORINFO_HELP_JIT_PINVOKE_END
5524 GenTreeCall* helperCall = comp->gtNewHelperCallNode(CORINFO_HELP_JIT_PINVOKE_END, TYP_VOID, frameAddr);
5526 comp->fgMorphTree(helperCall);
5527 BlockRange().InsertAfter(call, LIR::SeqTree(comp, helperCall));
5528 ContainCheckCallOperands(helperCall);
5533 GenTree* insertionPoint = call->gtNext;
5535 GenTree* tree = SetGCState(1);
5536 BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree));
5537 ContainCheckStoreIndir(tree->AsStoreInd());
5539 tree = CreateReturnTrapSeq();
5540 BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree));
5541 ContainCheckReturnTrap(tree->AsOp());
5543 // Pop the frame if necessary. On 32-bit targets this only happens in the method epilog; on 64-bit targets
5544 // this happens after every PInvoke call in non-stubs. 32-bit targets instead mark the frame as inactive.
5545 CLANG_FORMAT_COMMENT_ANCHOR;
5547 #ifdef USE_PER_FRAME_PINVOKE_INIT
5548 if (!comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB))
5550 tree = CreateFrameLinkUpdate(PopFrame);
5551 BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree));
5552 ContainCheckStoreIndir(tree->AsStoreInd());
5555 const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = comp->eeGetEEInfo()->inlinedCallFrameInfo;
5557 // ----------------------------------------------------------------------------------
5558 // InlinedCallFrame.m_pCallerReturnAddress = nullptr
5560 GenTreeIntCon* const zero = comp->gtNewIconNode(0, TYP_I_IMPL);
5561 GenTreeLclFld* const storeCallSiteTracker = comp->gtNewStoreLclFldNode(comp->lvaInlinedPInvokeFrameVar, TYP_I_IMPL,
5562 callFrameInfo.offsetOfReturnAddress, zero);
5564 BlockRange().InsertBefore(insertionPoint, zero, storeCallSiteTracker);
5565 ContainCheckStoreLoc(storeCallSiteTracker);
5566 #endif // USE_PER_FRAME_PINVOKE_INIT
5569 //------------------------------------------------------------------------
5570 // LowerNonvirtPinvokeCall: Lower a non-virtual / indirect PInvoke call
5573 // call - The call to lower.
5576 // The lowered call tree.
5578 GenTree* Lowering::LowerNonvirtPinvokeCall(GenTreeCall* call)
5580 // PInvoke lowering varies depending on the flags passed in by the EE. By default,
5581 // GC transitions are generated inline; if CORJIT_FLAG_USE_PINVOKE_HELPERS is specified,
5582 // GC transitions are instead performed using helper calls. Examples of each case are given
5583 // below. Note that the data structure that is used to store information about a call frame
5584 // containing any P/Invoke calls is initialized in the method prolog (see
5585 // InsertPInvokeMethod{Prolog,Epilog} for details).
5587 // Inline transitions:
5588 // InlinedCallFrame inlinedCallFrame;
5592 // // Set up frame information
5593 // inlinedCallFrame.callTarget = methodHandle; // stored in m_Datum
5594 // inlinedCallFrame.m_pCallSiteSP = SP; // x86 only
5595 // inlinedCallFrame.m_pCallerReturnAddress = &label; (the address of the instruction immediately following the
5597 // Thread.m_pFrame = &inlinedCallFrame; (non-IL-stub only)
5599 // // Switch the thread's GC mode to preemptive mode
5600 // thread->m_fPreemptiveGCDisabled = 0;
5602 // // Call the unmanaged method
5605 // // Switch the thread's GC mode back to cooperative mode
5606 // thread->m_fPreemptiveGCDisabled = 1;
5608 // // Rendezvous with a running collection if necessary
5609 // if (g_TrapReturningThreads)
5610 // RareDisablePreemptiveGC();
5612 // Transitions using helpers:
5614 // OpaqueFrame opaqueFrame;
5618 // // Call the JIT_PINVOKE_BEGIN helper
5619 // JIT_PINVOKE_BEGIN(&opaqueFrame);
5621 // // Call the unmanaged method
5624 // // Call the JIT_PINVOKE_END helper
5625 // JIT_PINVOKE_END(&opaqueFrame);
5627 // Note that the JIT_PINVOKE_{BEGIN.END} helpers currently use the default calling convention for the target
5628 // platform. They may be changed in the future such that they preserve all register values.
5630 GenTree* result = nullptr;
5632 // All code generated by this function must not contain the randomly-inserted NOPs
5633 // that we insert to inhibit JIT spraying in partial trust scenarios.
5634 // The PINVOKE_PROLOG op signals this to the code generator/emitter.
5636 GenTree* prolog = new (comp, GT_NOP) GenTree(GT_PINVOKE_PROLOG, TYP_VOID);
5637 BlockRange().InsertBefore(call, prolog);
5639 bool addPInvokePrologEpilog = !call->IsSuppressGCTransition();
5640 if (addPInvokePrologEpilog)
5642 InsertPInvokeCallProlog(call);
5645 if (call->gtCallType != CT_INDIRECT)
5647 noway_assert(call->gtCallType == CT_USER_FUNC);
5648 CORINFO_METHOD_HANDLE methHnd = call->gtCallMethHnd;
5650 CORINFO_CONST_LOOKUP lookup;
5651 comp->info.compCompHnd->getAddressOfPInvokeTarget(methHnd, &lookup);
5653 void* addr = lookup.addr;
5655 switch (lookup.accessType)
5658 // IsCallTargetInRange always return true on x64. It wants to use rip-based addressing
5659 // for this call. Unfortunately, in case of already resolved pinvokes to external libs,
5660 // which are identified via accessType: IAT_VALUE, the relative offset is unlikely to
5661 // fit into int32 and we will have to turn fAllowRel32 off globally. To prevent that
5662 // we'll create a wrapper node and force LSRA to allocate a register so RIP relative
5663 // isn't used and we don't need to pessimize other callsites.
5664 if (!comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT) || !IsCallTargetInRange(addr))
5666 result = AddrGen(addr);
5670 // a direct call within range of hardware relative call instruction
5671 // stash the address for codegen
5672 call->gtDirectCallAddress = addr;
5673 #ifdef FEATURE_READYTORUN
5674 call->gtEntryPoint.addr = nullptr;
5675 call->gtEntryPoint.accessType = IAT_VALUE;
5681 addrTree = AddrGen(addr);
5683 addrTree->AsIntCon()->gtTargetHandle = (size_t)methHnd;
5685 result = Ind(addrTree);
5689 // ToDo: Expanding an IAT_PPVALUE here, loses the opportunity
5690 // to Hoist/CSE the first indirection as it is an invariant load
5692 // This case currently occurs today when we make PInvoke calls in crossgen
5694 // assert(!"IAT_PPVALUE in Lowering::LowerNonvirtPinvokeCall");
5696 addrTree = AddrGen(addr);
5698 addrTree->AsIntCon()->gtTargetHandle = (size_t)methHnd;
5700 // Double-indirection. Load the address into a register
5701 // and call indirectly through the register
5703 result = Ind(Ind(addrTree));
5711 if (addPInvokePrologEpilog)
5713 InsertPInvokeCallEpilog(call);
5719 // Expand the code necessary to calculate the control target.
5720 // Returns: the expression needed to calculate the control target
5721 // May insert embedded statements
5722 GenTree* Lowering::LowerVirtualVtableCall(GenTreeCall* call)
5724 noway_assert(call->gtCallType == CT_USER_FUNC);
5726 GenTree* thisArgNode;
5727 if (call->IsTailCallViaJitHelper())
5729 assert(call->gtArgs.CountArgs() > 0);
5730 thisArgNode = call->gtArgs.GetArgByIndex(0)->GetNode();
5734 assert(call->gtArgs.HasThisPointer());
5735 thisArgNode = call->gtArgs.GetThisArg()->GetNode();
5738 // get a reference to the thisPtr being passed
5739 assert(thisArgNode->OperIs(GT_PUTARG_REG));
5740 GenTree* thisPtr = thisArgNode->AsUnOp()->gtGetOp1();
5742 // If what we are passing as the thisptr is not already a local, make a new local to place it in
5743 // because we will be creating expressions based on it.
5745 if (thisPtr->OperIsLocal())
5747 lclNum = thisPtr->AsLclVarCommon()->GetLclNum();
5751 // Split off the thisPtr and store to a temporary variable.
5752 if (vtableCallTemp == BAD_VAR_NUM)
5754 vtableCallTemp = comp->lvaGrabTemp(true DEBUGARG("virtual vtable call"));
5757 LIR::Use thisPtrUse(BlockRange(), &thisArgNode->AsUnOp()->gtOp1, thisArgNode);
5758 ReplaceWithLclVar(thisPtrUse, vtableCallTemp);
5760 lclNum = vtableCallTemp;
5763 // Get hold of the vtable offset (note: this might be expensive)
5764 unsigned vtabOffsOfIndirection;
5765 unsigned vtabOffsAfterIndirection;
5767 comp->info.compCompHnd->getMethodVTableOffset(call->gtCallMethHnd, &vtabOffsOfIndirection,
5768 &vtabOffsAfterIndirection, &isRelative);
5770 // If the thisPtr is a local field, then construct a local field type node
5772 if (thisPtr->isLclField())
5774 local = new (comp, GT_LCL_FLD)
5775 GenTreeLclFld(GT_LCL_FLD, thisPtr->TypeGet(), lclNum, thisPtr->AsLclFld()->GetLclOffs());
5779 local = new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, thisPtr->TypeGet(), lclNum);
5782 // pointer to virtual table = [REG_CALL_THIS + offs]
5783 GenTree* result = Ind(Offset(local, VPTR_OFFS));
5785 // Get the appropriate vtable chunk
5786 if (vtabOffsOfIndirection != CORINFO_VIRTUALCALL_NO_CHUNK)
5790 // MethodTable offset is a relative pointer.
5792 // Additional temporary variable is used to store virtual table pointer.
5793 // Address of method is obtained by the next computations:
5795 // Save relative offset to tmp (vtab is virtual table pointer, vtabOffsOfIndirection is offset of
5796 // vtable-1st-level-indirection):
5799 // Save address of method to result (vtabOffsAfterIndirection is offset of vtable-2nd-level-indirection):
5800 // result = [tmp + vtabOffsOfIndirection + vtabOffsAfterIndirection + [tmp + vtabOffsOfIndirection]]
5803 // If relative pointers are also in second level indirection, additional temporary is used:
5805 // tmp2 = tmp1 + vtabOffsOfIndirection + vtabOffsAfterIndirection + [tmp1 + vtabOffsOfIndirection]
5806 // result = tmp2 + [tmp2]
5808 unsigned lclNumTmp = comp->lvaGrabTemp(true DEBUGARG("lclNumTmp"));
5809 unsigned lclNumTmp2 = comp->lvaGrabTemp(true DEBUGARG("lclNumTmp2"));
5811 GenTree* lclvNodeStore = comp->gtNewTempStore(lclNumTmp, result);
5813 GenTree* tmpTree = comp->gtNewLclvNode(lclNumTmp, result->TypeGet());
5814 tmpTree = Offset(tmpTree, vtabOffsOfIndirection);
5816 tmpTree = Ind(tmpTree);
5817 GenTree* offs = comp->gtNewIconNode(vtabOffsOfIndirection + vtabOffsAfterIndirection, TYP_INT);
5818 result = comp->gtNewOperNode(GT_ADD, TYP_I_IMPL, comp->gtNewLclvNode(lclNumTmp, result->TypeGet()), offs);
5820 GenTree* base = OffsetByIndexWithScale(result, tmpTree, 1);
5821 GenTree* lclvNodeStore2 = comp->gtNewTempStore(lclNumTmp2, base);
5823 LIR::Range range = LIR::SeqTree(comp, lclvNodeStore);
5824 JITDUMP("result of obtaining pointer to virtual table:\n");
5826 BlockRange().InsertBefore(call, std::move(range));
5828 LIR::Range range2 = LIR::SeqTree(comp, lclvNodeStore2);
5829 ContainCheckIndir(tmpTree->AsIndir());
5830 JITDUMP("result of obtaining pointer to virtual table 2nd level indirection:\n");
5832 BlockRange().InsertAfter(lclvNodeStore, std::move(range2));
5834 result = Ind(comp->gtNewLclvNode(lclNumTmp2, result->TypeGet()));
5836 comp->gtNewOperNode(GT_ADD, TYP_I_IMPL, result, comp->gtNewLclvNode(lclNumTmp2, result->TypeGet()));
5840 // result = [REG_CALL_IND_SCRATCH + vtabOffsOfIndirection]
5841 result = Ind(Offset(result, vtabOffsOfIndirection));
5846 assert(!isRelative);
5849 // Load the function address
5850 // result = [reg+vtabOffs]
5853 result = Ind(Offset(result, vtabOffsAfterIndirection));
5859 // Lower stub dispatched virtual calls.
5860 GenTree* Lowering::LowerVirtualStubCall(GenTreeCall* call)
5862 assert(call->IsVirtualStub());
5864 // An x86 JIT which uses full stub dispatch must generate only
5865 // the following stub dispatch calls:
5867 // (1) isCallRelativeIndirect:
5868 // call dword ptr [rel32] ; FF 15 ---rel32----
5869 // (2) isCallRelative:
5870 // call abc ; E8 ---rel32----
5871 // (3) isCallRegisterIndirect:
5873 // call dword ptr [eax] ; FF 10
5875 // THIS IS VERY TIGHTLY TIED TO THE PREDICATES IN
5876 // vm\i386\cGenCpu.h, esp. isCallRegisterIndirect.
5878 GenTree* result = nullptr;
5880 // This is code to set up an indirect call to a stub address computed
5881 // via dictionary lookup.
5882 if (call->gtCallType == CT_INDIRECT)
5884 // The importer decided we needed a stub call via a computed
5885 // stub dispatch address, i.e. an address which came from a dictionary lookup.
5886 // - The dictionary lookup produces an indirected address, suitable for call
5887 // via "call [VirtualStubParam.reg]"
5889 // This combination will only be generated for shared generic code and when
5890 // stub dispatch is active.
5892 // fgMorphArgs will have created trees to pass the address in VirtualStubParam.reg.
5893 // All we have to do here is add an indirection to generate the actual call target.
5895 GenTree* ind = Ind(call->gtCallAddr);
5896 BlockRange().InsertAfter(call->gtCallAddr, ind);
5897 call->gtCallAddr = ind;
5899 ind->gtFlags |= GTF_IND_REQ_ADDR_IN_REG;
5901 ContainCheckIndir(ind->AsIndir());
5905 // Direct stub call.
5906 // Get stub addr. This will return NULL if virtual call stubs are not active
5907 void* stubAddr = call->gtStubCallStubAddr;
5908 noway_assert(stubAddr != nullptr);
5910 // If not CT_INDIRECT, then it should always be relative indir call.
5911 // This is ensured by VM.
5912 noway_assert(call->IsVirtualStubRelativeIndir());
5914 // Direct stub calls, though the stubAddr itself may still need to be
5915 // accessed via an indirection.
5916 GenTree* addr = AddrGen(stubAddr);
5918 // On x86, for tailcall via helper, the JIT_TailCall helper takes the stubAddr as
5919 // the target address, and we set a flag that it's a VSD call. The helper then
5920 // handles any necessary indirection.
5921 if (call->IsTailCallViaJitHelper())
5927 bool shouldOptimizeVirtualStubCall = false;
5928 #if defined(TARGET_ARMARCH) || defined(TARGET_AMD64)
5929 // Skip inserting the indirection node to load the address that is already
5930 // computed in the VSD stub arg register as a hidden parameter. Instead during the
5931 // codegen, just load the call target from there.
5932 shouldOptimizeVirtualStubCall = true;
5935 if (!shouldOptimizeVirtualStubCall)
5942 // TODO-Cleanup: start emitting random NOPS
5946 //------------------------------------------------------------------------
5947 // Lowering::AreSourcesPossibleModifiedLocals:
5948 // Given two nodes which will be used in an addressing mode (base,
5949 // index), check to see if they are lclVar reads, and if so, walk
5950 // backwards from the use until both reads have been visited to
5951 // determine if they are potentially modified in that range.
5954 // addr - the node that uses the base and index nodes
5955 // base - the base node
5956 // index - the index node
5958 // Returns: true if either the base or index may be modified between the
5961 bool Lowering::AreSourcesPossiblyModifiedLocals(GenTree* addr, GenTree* base, GenTree* index)
5963 assert(addr != nullptr);
5965 SideEffectSet baseSideEffects;
5966 if (base != nullptr)
5968 if (base->OperIsLocalRead())
5970 baseSideEffects.AddNode(comp, base);
5978 SideEffectSet indexSideEffects;
5979 if (index != nullptr)
5981 if (index->OperIsLocalRead())
5983 indexSideEffects.AddNode(comp, index);
5991 for (GenTree* cursor = addr;; cursor = cursor->gtPrev)
5993 assert(cursor != nullptr);
6000 if (cursor == index)
6005 if ((base == nullptr) && (index == nullptr))
6010 m_scratchSideEffects.Clear();
6011 m_scratchSideEffects.AddNode(comp, cursor);
6012 if ((base != nullptr) && m_scratchSideEffects.InterferesWith(baseSideEffects, false))
6017 if ((index != nullptr) && m_scratchSideEffects.InterferesWith(indexSideEffects, false))
6024 //------------------------------------------------------------------------
6025 // TryCreateAddrMode: recognize trees which can be implemented using an
6026 // addressing mode and transform them to a GT_LEA
6029 // addr - the use of the address we want to transform
6030 // isContainable - true if this addressing mode can be contained
6031 // parent - the node that consumes the given addr (most likely it's an IND)
6034 // true if the address node was changed to a LEA, false otherwise.
6036 bool Lowering::TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* parent)
6038 if (!addr->OperIs(GT_ADD) || addr->gtOverflow())
6044 if (parent->OperIsIndir() && parent->AsIndir()->IsVolatile())
6046 // For Arm64 we avoid using LEA for volatile INDs
6047 // because we won't be able to use ldar/star
6052 GenTree* base = nullptr;
6053 GenTree* index = nullptr;
6058 // Find out if an addressing mode can be constructed
6059 bool doAddrMode = comp->codeGen->genCreateAddrMode(addr, // address
6061 &rev, // reverse ops
6063 &index, // index val
6065 &offset); // displacement
6067 var_types targetType = parent->OperIsIndir() ? parent->TypeGet() : TYP_UNDEF;
6069 #ifdef TARGET_ARMARCH
6070 // Multiplier should be a "natural-scale" power of two number which is equal to target's width.
6072 // *(ulong*)(data + index * 8); - can be optimized
6073 // *(ulong*)(data + index * 7); - can not be optimized
6074 // *(int*)(data + index * 2); - can not be optimized
6076 if ((scale > 0) && (genTypeSize(targetType) != scale))
6081 if (((scale | offset) > 0) && parent->OperIsHWIntrinsic())
6083 // For now we only support unscaled indices for SIMD loads
6095 // this is just a reg-const add
6096 if (index == nullptr)
6101 // this is just a reg-reg add
6102 if ((scale == 1) && (offset == 0))
6108 // make sure there are not any side effects between def of leaves and use
6109 if (!doAddrMode || AreSourcesPossiblyModifiedLocals(addr, base, index))
6111 JITDUMP("No addressing mode:\n ");
6116 JITDUMP("Addressing mode:\n");
6117 JITDUMP(" Base\n ");
6119 if (index != nullptr)
6121 JITDUMP(" + Index * %u + %d\n ", scale, offset);
6126 JITDUMP(" + %d\n", offset);
6129 // Save the (potentially) unused operands before changing the address to LEA.
6130 ArrayStack<GenTree*> unusedStack(comp->getAllocator(CMK_ArrayStack));
6131 unusedStack.Push(addr->AsOp()->gtGetOp1());
6132 unusedStack.Push(addr->AsOp()->gtGetOp2());
6134 addr->ChangeOper(GT_LEA);
6135 // Make sure there are no leftover side effects (though the existing ADD we're
6136 // changing shouldn't have any at this point, but sometimes it does).
6137 addr->gtFlags &= ~GTF_ALL_EFFECT;
6139 GenTreeAddrMode* addrMode = addr->AsAddrMode();
6140 addrMode->SetBase(base);
6141 addrMode->SetIndex(index);
6142 addrMode->SetScale(scale);
6143 addrMode->SetOffset(static_cast<int>(offset));
6145 // Neither the base nor the index should now be contained.
6146 if (base != nullptr)
6148 base->ClearContained();
6150 if (index != nullptr)
6152 index->ClearContained();
6155 // Remove all the nodes that are no longer used.
6156 while (!unusedStack.Empty())
6158 GenTree* unused = unusedStack.Pop();
6160 if ((unused != base) && (unused != index))
6162 JITDUMP("Removing unused node:\n ");
6165 BlockRange().Remove(unused);
6167 for (GenTree* operand : unused->Operands())
6169 unusedStack.Push(operand);
6172 DEBUG_DESTROY_NODE(unused);
6178 if (index != nullptr)
6180 if (index->OperIs(GT_CAST) && (scale == 1) && (offset == 0) && varTypeIsByte(targetType))
6182 if (IsInvariantInRange(index, parent))
6184 // Check containment safety against the parent node - this will ensure that LEA with the contained
6185 // index will itself always be contained. We do not support uncontained LEAs with contained indices.
6186 index->AsCast()->CastOp()->ClearContained(); // Uncontain any memory operands.
6187 MakeSrcContained(addrMode, index);
6190 else if (index->OperIs(GT_BFIZ) && index->gtGetOp1()->OperIs(GT_CAST) && index->gtGetOp2()->IsCnsIntOrI() &&
6191 !varTypeIsStruct(targetType))
6193 // Check if we can "contain" LEA(BFIZ) in order to extend 32bit index to 64bit as part of load/store.
6194 // BFIZ node is a binary op where op1 is GT_CAST and op2 is GT_CNS_INT
6195 GenTreeCast* cast = index->gtGetOp1()->AsCast();
6196 assert(cast->isContained());
6198 const unsigned shiftBy = (unsigned)index->gtGetOp2()->AsIntCon()->IconValue();
6200 // 'scale' and 'offset' have to be unset since we're going to use [base + index * SXTW/UXTW scale] form
6201 // where there is no room for additional offsets/scales on ARM64. 'shiftBy' has to match target's width.
6202 if (cast->CastOp()->TypeIs(TYP_INT) && cast->TypeIs(TYP_LONG) &&
6203 (genTypeSize(targetType) == (1U << shiftBy)) && (scale == 1) && (offset == 0))
6205 if (IsInvariantInRange(index, parent))
6207 // Check containment safety against the parent node - this will ensure that LEA with the contained
6208 // index will itself always be contained. We do not support uncontained LEAs with contained indices.
6210 // TODO: Make sure that genCreateAddrMode marks such BFIZ candidates as GTF_DONT_CSE for better CQ.
6211 MakeSrcContained(addrMode, index);
6218 JITDUMP("New addressing mode node:\n ");
6225 //------------------------------------------------------------------------
6226 // LowerAdd: turn this add into a GT_LEA if that would be profitable
6229 // node - the node we care about
6232 // nullptr if no transformation was done, or the next node in the transformed node sequence that
6233 // needs to be lowered.
6235 GenTree* Lowering::LowerAdd(GenTreeOp* node)
6237 if (varTypeIsIntegralOrI(node->TypeGet()))
6239 GenTree* op1 = node->gtGetOp1();
6240 GenTree* op2 = node->gtGetOp2();
6243 // It is not the best place to do such simple arithmetic optimizations,
6244 // but it allows us to avoid `LEA(addr, 0)` nodes and doing that in morph
6245 // requires more changes. Delete that part if we get an expression optimizer.
6246 if (op2->IsIntegralConst(0))
6248 JITDUMP("Lower: optimize val + 0: ");
6250 JITDUMP("Replaced with: ");
6252 if (BlockRange().TryGetUse(node, &use))
6254 use.ReplaceWith(op1);
6258 op1->SetUnusedValue();
6260 GenTree* next = node->gtNext;
6261 BlockRange().Remove(op2);
6262 BlockRange().Remove(node);
6263 JITDUMP("Remove [%06u], [%06u]\n", op2->gtTreeID, node->gtTreeID);
6267 // Fold ADD(CNS1, CNS2). We mainly target a very specific pattern - byref ADD(frozen_handle, cns_offset)
6268 // We could do this folding earlier, but that is not trivial as we'll have to introduce a way to restore
6269 // the original object from a byref constant for optimizations.
6270 if (comp->opts.OptimizationEnabled() && op1->IsCnsIntOrI() && op2->IsCnsIntOrI() && !node->gtOverflow() &&
6271 (op1->IsIconHandle(GTF_ICON_OBJ_HDL) || op2->IsIconHandle(GTF_ICON_OBJ_HDL)) &&
6272 !op1->AsIntCon()->ImmedValNeedsReloc(comp) && !op2->AsIntCon()->ImmedValNeedsReloc(comp))
6274 assert(node->TypeIs(TYP_I_IMPL, TYP_BYREF));
6276 // TODO-CQ: we should allow this for AOT too. For that we need to guarantee that the new constant
6277 // will be lowered as the original handle with offset in a reloc.
6278 BlockRange().Remove(op1);
6279 BlockRange().Remove(op2);
6280 node->BashToConst(op1->AsIntCon()->IconValue() + op2->AsIntCon()->IconValue(), node->TypeGet());
6284 if (BlockRange().TryGetUse(node, &use))
6286 // If this is a child of an indir, let the parent handle it.
6287 // If there is a chain of adds, only look at the topmost one.
6288 GenTree* parent = use.User();
6289 if (!parent->OperIsIndir() && !parent->OperIs(GT_ADD))
6291 TryCreateAddrMode(node, false, parent);
6294 #endif // TARGET_XARCH
6298 if (node->OperIs(GT_ADD))
6300 GenTree* next = LowerAddForPossibleContainment(node);
6301 if (next != nullptr)
6306 #endif // TARGET_ARM64
6308 if (node->OperIs(GT_ADD))
6310 ContainCheckBinary(node);
6316 //------------------------------------------------------------------------
6317 // LowerUnsignedDivOrMod: Lowers a GT_UDIV/GT_UMOD node.
6320 // divMod - pointer to the GT_UDIV/GT_UMOD node to be lowered
6323 // Returns a boolean indicating whether the node was transformed.
6326 // - Transform UDIV/UMOD by power of 2 into RSZ/AND
6327 // - Transform UDIV by constant >= 2^(N-1) into GE
6328 // - Transform UDIV/UMOD by constant >= 3 into "magic division"
6331 bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
6333 assert(divMod->OperIs(GT_UDIV, GT_UMOD));
6335 #if defined(USE_HELPERS_FOR_INT_DIV)
6336 if (!varTypeIsIntegral(divMod->TypeGet()))
6338 assert(!"unreachable: integral GT_UDIV/GT_UMOD should get morphed into helper calls");
6340 assert(varTypeIsFloating(divMod->TypeGet()));
6341 #endif // USE_HELPERS_FOR_INT_DIV
6342 #if defined(TARGET_ARM64)
6343 assert(divMod->OperGet() != GT_UMOD);
6344 #endif // TARGET_ARM64
6346 GenTree* dividend = divMod->gtGetOp1();
6347 GenTree* divisor = divMod->gtGetOp2();
6349 #if !defined(TARGET_64BIT)
6350 if (dividend->OperIs(GT_LONG))
6356 if (!divisor->IsCnsIntOrI())
6361 if (dividend->IsCnsIntOrI())
6363 // We shouldn't see a divmod with constant operands here but if we do then it's likely
6364 // because optimizations are disabled or it's a case that's supposed to throw an exception.
6365 // Don't optimize this.
6369 const var_types type = divMod->TypeGet();
6370 assert((type == TYP_INT) || (type == TYP_I_IMPL));
6372 size_t divisorValue = static_cast<size_t>(divisor->AsIntCon()->IconValue());
6374 if (type == TYP_INT)
6376 // Clear up the upper 32 bits of the value, they may be set to 1 because constants
6377 // are treated as signed and stored in ssize_t which is 64 bit in size on 64 bit targets.
6378 divisorValue &= UINT32_MAX;
6381 if (divisorValue == 0)
6386 const bool isDiv = divMod->OperIs(GT_UDIV);
6388 if (isPow2(divisorValue))
6395 divisorValue = genLog2(divisorValue);
6403 divMod->SetOper(newOper);
6404 divisor->AsIntCon()->SetIconValue(divisorValue);
6405 ContainCheckNode(divMod);
6410 // If the divisor is greater or equal than 2^(N - 1) then the result is 1
6411 // iff the dividend is greater or equal than the divisor.
6412 if (((type == TYP_INT) && (divisorValue > (UINT32_MAX / 2))) ||
6413 ((type == TYP_LONG) && (divisorValue > (UINT64_MAX / 2))))
6415 divMod->SetOper(GT_GE);
6416 divMod->gtFlags |= GTF_UNSIGNED;
6417 ContainCheckNode(divMod);
6422 // TODO-ARM-CQ: Currently there's no GT_MULHI for ARM32
6423 #if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
6424 if (!comp->opts.MinOpts() && (divisorValue >= 3))
6430 bool simpleMul = false;
6432 unsigned bits = type == TYP_INT ? 32 : 64;
6433 // if the dividend operand is AND or RSZ with a constant then the number of input bits can be reduced
6434 if (dividend->OperIs(GT_AND) && dividend->gtGetOp2()->IsCnsIntOrI())
6436 size_t maskCns = static_cast<size_t>(dividend->gtGetOp2()->AsIntCon()->IconValue());
6439 unsigned maskBits = 1;
6440 while (maskCns >>= 1)
6442 if (maskBits < bits)
6446 else if (dividend->OperIs(GT_RSZ) && dividend->gtGetOp2()->IsCnsIntOrI())
6448 size_t shiftCns = static_cast<size_t>(dividend->gtGetOp2()->AsIntCon()->IconValue());
6449 if (shiftCns < bits)
6451 bits -= static_cast<unsigned>(shiftCns);
6455 if (type == TYP_INT)
6457 magic = MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &increment, &preShift,
6461 // avoid inc_saturate/multiple shifts by widening to 32x64 MULHI
6462 if (increment || (preShift
6464 // IMUL reg,reg,imm32 can't be used if magic<0 because of sign-extension
6465 && static_cast<int32_t>(magic) < 0
6469 magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift,
6472 // otherwise just widen to regular multiplication
6483 magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift,
6489 assert(divMod->MarkedDivideByConstOptimized());
6491 const bool requiresDividendMultiuse = !isDiv;
6492 const weight_t curBBWeight = m_block->getBBWeight(comp);
6494 if (requiresDividendMultiuse)
6496 LIR::Use dividendUse(BlockRange(), &divMod->gtOp1, divMod);
6497 dividend = ReplaceWithLclVar(dividendUse);
6500 GenTree* firstNode = nullptr;
6501 GenTree* adjustedDividend = dividend;
6503 #if defined(TARGET_ARM64)
6504 // On ARM64 we will use a 32x32->64 bit multiply instead of a 64x64->64 one.
6505 bool widenToNativeIntForMul = (type != TYP_I_IMPL) && !simpleMul;
6507 CLANG_FORMAT_COMMENT_ANCHOR;
6508 bool widenToNativeIntForMul = (type != TYP_I_IMPL);
6511 // If "increment" flag is returned by GetUnsignedMagic we need to do Saturating Increment first
6514 adjustedDividend = comp->gtNewOperNode(GT_INC_SATURATE, type, adjustedDividend);
6515 BlockRange().InsertBefore(divMod, adjustedDividend);
6516 firstNode = adjustedDividend;
6519 // if "preShift" is required, then do a right shift before
6522 GenTree* preShiftBy = comp->gtNewIconNode(preShift, TYP_INT);
6523 adjustedDividend = comp->gtNewOperNode(GT_RSZ, type, adjustedDividend, preShiftBy);
6524 BlockRange().InsertBefore(divMod, preShiftBy, adjustedDividend);
6525 firstNode = preShiftBy;
6527 else if (widenToNativeIntForMul)
6529 adjustedDividend = comp->gtNewCastNode(TYP_I_IMPL, adjustedDividend, true, TYP_I_IMPL);
6530 BlockRange().InsertBefore(divMod, adjustedDividend);
6531 firstNode = adjustedDividend;
6535 // force input transformation to RAX because the following MULHI will kill RDX:RAX anyway and LSRA often causes
6536 // redundant copies otherwise
6537 if (firstNode && !simpleMul)
6539 adjustedDividend->SetRegNum(REG_RAX);
6543 if (widenToNativeIntForMul)
6545 divisor->gtType = TYP_I_IMPL;
6547 divisor->AsIntCon()->SetIconValue(magic);
6549 if (isDiv && !postShift && (type == TYP_I_IMPL))
6551 divMod->SetOper(GT_MULHI);
6552 divMod->gtOp1 = adjustedDividend;
6553 divMod->SetUnsigned();
6558 // 64-bit MUL is more expensive than UMULL on ARM64.
6559 genTreeOps mulOper = simpleMul ? GT_MUL_LONG : GT_MULHI;
6561 // 64-bit IMUL is less expensive than MUL eax:edx on x64.
6562 genTreeOps mulOper = simpleMul ? GT_MUL : GT_MULHI;
6564 // Insert a new multiplication node before the existing GT_UDIV/GT_UMOD node.
6565 // The existing node will later be transformed into a GT_RSZ/GT_SUB that
6566 // computes the final result. This way don't need to find and change the use
6567 // of the existing node.
6568 GenTree* mulhi = comp->gtNewOperNode(mulOper, TYP_I_IMPL, adjustedDividend, divisor);
6569 mulhi->SetUnsigned();
6570 BlockRange().InsertBefore(divMod, mulhi);
6571 if (firstNode == nullptr)
6578 GenTree* shiftBy = comp->gtNewIconNode(postShift, TYP_INT);
6579 BlockRange().InsertBefore(divMod, shiftBy);
6581 if (isDiv && (type == TYP_I_IMPL))
6583 divMod->SetOper(GT_RSZ);
6584 divMod->gtOp1 = mulhi;
6585 divMod->gtOp2 = shiftBy;
6589 mulhi = comp->gtNewOperNode(GT_RSZ, TYP_I_IMPL, mulhi, shiftBy);
6590 BlockRange().InsertBefore(divMod, mulhi);
6596 // divisor UMOD dividend = dividend SUB (div MUL divisor)
6597 GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
6598 GenTree* mul = comp->gtNewOperNode(GT_MUL, type, mulhi, divisor);
6599 dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
6601 divMod->SetOper(GT_SUB);
6602 divMod->gtOp1 = dividend;
6603 divMod->gtOp2 = mul;
6605 BlockRange().InsertBefore(divMod, divisor, mul, dividend);
6607 else if (type != TYP_I_IMPL)
6609 divMod->SetOper(GT_CAST);
6610 divMod->AsCast()->gtCastType = TYP_INT;
6611 divMod->gtOp1 = mulhi;
6612 divMod->gtOp2 = nullptr;
6616 if (firstNode != nullptr)
6618 ContainCheckRange(firstNode, divMod);
6627 // LowerConstIntDivOrMod: Transform integer GT_DIV/GT_MOD nodes with a power of 2
6628 // const divisor into equivalent but faster sequences.
6631 // node - pointer to the DIV or MOD node
6632 // nextNode - out parameter for the next node in the transformed node sequence that needs to be lowered
6635 // false if no transformation is done, true if a transformation is done
6637 bool Lowering::TryLowerConstIntDivOrMod(GenTree* node, GenTree** nextNode)
6639 assert((node->OperGet() == GT_DIV) || (node->OperGet() == GT_MOD));
6640 assert(nextNode != nullptr);
6642 GenTree* divMod = node;
6643 GenTree* dividend = divMod->gtGetOp1();
6644 GenTree* divisor = divMod->gtGetOp2();
6646 const var_types type = divMod->TypeGet();
6647 assert((type == TYP_INT) || (type == TYP_LONG));
6649 #if defined(USE_HELPERS_FOR_INT_DIV)
6650 assert(!"unreachable: integral GT_DIV/GT_MOD should get morphed into helper calls");
6651 #endif // USE_HELPERS_FOR_INT_DIV
6652 #if defined(TARGET_ARM64)
6653 if (divMod->OperIs(GT_MOD) && divisor->IsIntegralConstPow2())
6656 *nextNode = node->gtNext;
6659 assert(node->OperGet() != GT_MOD);
6660 #endif // TARGET_ARM64
6662 if (!divisor->IsCnsIntOrI())
6664 return false; // no transformations to make
6667 if (dividend->IsCnsIntOrI())
6669 // We shouldn't see a divmod with constant operands here but if we do then it's likely
6670 // because optimizations are disabled or it's a case that's supposed to throw an exception.
6671 // Don't optimize this.
6675 ssize_t divisorValue = divisor->AsIntCon()->IconValue();
6677 if (divisorValue == -1 || divisorValue == 0)
6679 // x / 0 and x % 0 can't be optimized because they are required to throw an exception.
6681 // x / -1 can't be optimized because INT_MIN / -1 is required to throw an exception.
6683 // x % -1 is always 0 and the IL spec says that the rem instruction "can" throw an exception if x is
6684 // the minimum representable integer. However, the C# spec says that an exception "is" thrown in this
6685 // case so optimizing this case would break C# code.
6687 // A runtime check could be used to handle this case but it's probably too rare to matter.
6691 bool isDiv = divMod->OperGet() == GT_DIV;
6695 if ((type == TYP_INT && divisorValue == INT_MIN) || (type == TYP_LONG && divisorValue == INT64_MIN))
6697 // If the divisor is the minimum representable integer value then we can use a compare,
6698 // the result is 1 iff the dividend equals divisor.
6699 divMod->SetOper(GT_EQ);
6705 size_t absDivisorValue =
6706 (divisorValue == SSIZE_T_MIN) ? static_cast<size_t>(divisorValue) : static_cast<size_t>(abs(divisorValue));
6708 if (!isPow2(absDivisorValue))
6710 if (comp->opts.MinOpts())
6715 #if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
6719 if (type == TYP_INT)
6721 magic = MagicDivide::GetSigned32Magic(static_cast<int32_t>(divisorValue), &shift);
6726 magic = MagicDivide::GetSigned64Magic(static_cast<int64_t>(divisorValue), &shift);
6727 #else // !TARGET_64BIT
6729 #endif // !TARGET_64BIT
6732 divisor->AsIntConCommon()->SetIconValue(magic);
6734 // Insert a new GT_MULHI node in front of the existing GT_DIV/GT_MOD node.
6735 // The existing node will later be transformed into a GT_ADD/GT_SUB that
6736 // computes the final result. This way don't need to find and change the
6737 // use of the existing node.
6738 GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, divisor, dividend);
6739 BlockRange().InsertBefore(divMod, mulhi);
6741 // mulhi was the easy part. Now we need to generate different code depending
6742 // on the divisor value:
6744 // div = signbit(mulhi) + mulhi
6746 // div = signbit(mulhi) + sar(mulhi, 1) ; requires shift adjust
6748 // mulhi += dividend ; requires add adjust
6749 // div = signbit(mulhi) + sar(mulhi, 2) ; requires shift adjust
6751 // mulhi -= dividend ; requires sub adjust
6752 // div = signbit(mulhi) + sar(mulhi, 1) ; requires shift adjust
6753 bool requiresAddSubAdjust = signum(divisorValue) != signum(magic);
6754 bool requiresShiftAdjust = shift != 0;
6755 bool requiresDividendMultiuse = requiresAddSubAdjust || !isDiv;
6757 if (requiresDividendMultiuse)
6759 LIR::Use dividendUse(BlockRange(), &mulhi->AsOp()->gtOp2, mulhi);
6760 dividend = ReplaceWithLclVar(dividendUse);
6765 if (requiresAddSubAdjust)
6767 dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
6768 adjusted = comp->gtNewOperNode(divisorValue > 0 ? GT_ADD : GT_SUB, type, mulhi, dividend);
6769 BlockRange().InsertBefore(divMod, dividend, adjusted);
6776 GenTree* shiftBy = comp->gtNewIconNode(genTypeSize(type) * 8 - 1, type);
6777 GenTree* signBit = comp->gtNewOperNode(GT_RSZ, type, adjusted, shiftBy);
6778 BlockRange().InsertBefore(divMod, shiftBy, signBit);
6780 LIR::Use adjustedUse(BlockRange(), &signBit->AsOp()->gtOp1, signBit);
6781 adjusted = ReplaceWithLclVar(adjustedUse);
6782 adjusted = comp->gtNewLclvNode(adjusted->AsLclVar()->GetLclNum(), adjusted->TypeGet());
6783 BlockRange().InsertBefore(divMod, adjusted);
6785 if (requiresShiftAdjust)
6787 shiftBy = comp->gtNewIconNode(shift, TYP_INT);
6788 adjusted = comp->gtNewOperNode(GT_RSH, type, adjusted, shiftBy);
6789 BlockRange().InsertBefore(divMod, shiftBy, adjusted);
6794 divMod->SetOperRaw(GT_ADD);
6795 divMod->AsOp()->gtOp1 = adjusted;
6796 divMod->AsOp()->gtOp2 = signBit;
6800 GenTree* div = comp->gtNewOperNode(GT_ADD, type, adjusted, signBit);
6802 dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
6804 // divisor % dividend = dividend - divisor x div
6805 GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
6806 GenTree* mul = comp->gtNewOperNode(GT_MUL, type, div, divisor);
6807 BlockRange().InsertBefore(divMod, dividend, div, divisor, mul);
6809 divMod->SetOperRaw(GT_SUB);
6810 divMod->AsOp()->gtOp1 = dividend;
6811 divMod->AsOp()->gtOp2 = mul;
6816 #elif defined(TARGET_ARM)
6817 // Currently there's no GT_MULHI for ARM32
6820 #error Unsupported or unset target architecture
6824 // We're committed to the conversion now. Go find the use if any.
6826 if (!BlockRange().TryGetUse(node, &use))
6831 // We need to use the dividend node multiple times so its value needs to be
6832 // computed once and stored in a temp variable.
6833 LIR::Use opDividend(BlockRange(), &divMod->AsOp()->gtOp1, divMod);
6834 dividend = ReplaceWithLclVar(opDividend);
6836 GenTree* adjustment = comp->gtNewOperNode(GT_RSH, type, dividend, comp->gtNewIconNode(type == TYP_INT ? 31 : 63));
6838 if (absDivisorValue == 2)
6840 // If the divisor is +/-2 then we'd end up with a bitwise and between 0/-1 and 1.
6841 // We can get the same result by using GT_RSZ instead of GT_RSH.
6842 adjustment->SetOper(GT_RSZ);
6846 adjustment = comp->gtNewOperNode(GT_AND, type, adjustment, comp->gtNewIconNode(absDivisorValue - 1, type));
6849 GenTree* adjustedDividend =
6850 comp->gtNewOperNode(GT_ADD, type, adjustment,
6851 comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet()));
6857 // perform the division by right shifting the adjusted dividend
6858 divisor->AsIntCon()->SetIconValue(genLog2(absDivisorValue));
6860 newDivMod = comp->gtNewOperNode(GT_RSH, type, adjustedDividend, divisor);
6861 ContainCheckShiftRotate(newDivMod->AsOp());
6863 if (divisorValue < 0)
6865 // negate the result if the divisor is negative
6866 newDivMod = comp->gtNewOperNode(GT_NEG, type, newDivMod);
6867 ContainCheckNode(newDivMod);
6872 // divisor % dividend = dividend - divisor x (dividend / divisor)
6873 // divisor x (dividend / divisor) translates to (dividend >> log2(divisor)) << log2(divisor)
6874 // which simply discards the low log2(divisor) bits, that's just dividend & ~(divisor - 1)
6875 divisor->AsIntCon()->SetIconValue(~(absDivisorValue - 1));
6877 newDivMod = comp->gtNewOperNode(GT_SUB, type,
6878 comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet()),
6879 comp->gtNewOperNode(GT_AND, type, adjustedDividend, divisor));
6882 // Remove the divisor and dividend nodes from the linear order,
6883 // since we have reused them and will resequence the tree
6884 BlockRange().Remove(divisor);
6885 BlockRange().Remove(dividend);
6887 // linearize and insert the new tree before the original divMod node
6888 InsertTreeBeforeAndContainCheck(divMod, newDivMod);
6889 BlockRange().Remove(divMod);
6891 // replace the original divmod node with the new divmod tree
6892 use.ReplaceWith(newDivMod);
6894 *nextNode = newDivMod->gtNext;
6897 //------------------------------------------------------------------------
6898 // LowerSignedDivOrMod: transform integer GT_DIV/GT_MOD nodes with a power of 2
6899 // const divisor into equivalent but faster sequences.
6902 // node - the DIV or MOD node
6905 // The next node to lower.
6907 GenTree* Lowering::LowerSignedDivOrMod(GenTree* node)
6909 assert((node->OperGet() == GT_DIV) || (node->OperGet() == GT_MOD));
6911 if (varTypeIsIntegral(node->TypeGet()))
6913 GenTree* nextNode = nullptr;
6914 if (TryLowerConstIntDivOrMod(node, &nextNode))
6918 assert(nextNode == nullptr);
6920 ContainCheckDivOrMod(node->AsOp());
6922 return node->gtNext;
6925 //------------------------------------------------------------------------
6926 // LowerShift: Lower shift nodes
6929 // shift - the shift node (GT_LSH, GT_RSH or GT_RSZ)
6932 // Remove unnecessary shift count masking, xarch shift instructions
6933 // mask the shift count to 5 bits (or 6 bits for 64 bit operations).
6935 void Lowering::LowerShift(GenTreeOp* shift)
6937 assert(shift->OperIs(GT_LSH, GT_RSH, GT_RSZ));
6941 if (varTypeIsLong(shift->TypeGet()))
6946 assert(!varTypeIsLong(shift->TypeGet()));
6949 for (GenTree* andOp = shift->gtGetOp2(); andOp->OperIs(GT_AND); andOp = andOp->gtGetOp1())
6951 GenTree* maskOp = andOp->gtGetOp2();
6953 if (!maskOp->IsCnsIntOrI())
6958 if ((static_cast<size_t>(maskOp->AsIntCon()->IconValue()) & mask) != mask)
6963 shift->gtOp2 = andOp->gtGetOp1();
6964 BlockRange().Remove(andOp);
6965 BlockRange().Remove(maskOp);
6966 // The parent was replaced, clear contain and regOpt flag.
6967 shift->gtOp2->ClearContained();
6970 ContainCheckShiftRotate(shift);
6973 // Try to recognize ubfiz/sbfiz idiom in LSH(CAST(X), CNS) tree
6974 if (comp->opts.OptimizationEnabled() && shift->OperIs(GT_LSH) && shift->gtGetOp1()->OperIs(GT_CAST) &&
6975 shift->gtGetOp2()->IsCnsIntOrI() && !shift->isContained())
6977 GenTreeIntCon* cns = shift->gtGetOp2()->AsIntCon();
6978 GenTreeCast* cast = shift->gtGetOp1()->AsCast();
6980 if (!cast->isContained() && !cast->IsRegOptional() && !cast->gtOverflow() &&
6981 // Smaller CastOp is most likely an IND(X) node which is lowered to a zero-extend load
6982 cast->CastOp()->TypeIs(TYP_LONG, TYP_INT))
6984 // Cast is either "TYP_LONG <- TYP_INT" or "TYP_INT <- %SMALL_INT% <- TYP_INT" (signed or unsigned)
6985 unsigned dstBits = genTypeSize(cast) * BITS_PER_BYTE;
6986 unsigned srcBits = varTypeIsSmall(cast->CastToType()) ? genTypeSize(cast->CastToType()) * BITS_PER_BYTE
6987 : genTypeSize(cast->CastOp()) * BITS_PER_BYTE;
6989 // It has to be an upcast and CNS must be in [1..srcBits) range
6990 if ((srcBits < dstBits) && (cns->IconValue() > 0) && (cns->IconValue() < srcBits))
6992 JITDUMP("Recognized ubfix/sbfix pattern in LSH(CAST, CNS). Changing op to GT_BFIZ");
6993 shift->ChangeOper(GT_BFIZ);
6994 cast->CastOp()->ClearContained(); // Uncontain any memory operands.
6995 MakeSrcContained(shift, cast);
7002 void Lowering::WidenSIMD12IfNecessary(GenTreeLclVarCommon* node)
7005 if (node->TypeGet() == TYP_SIMD12)
7008 // RyuJit backend depends on the assumption that on 64-Bit targets Vector3 size is rounded off
7009 // to TARGET_POINTER_SIZE and hence Vector3 locals on stack can be treated as TYP_SIMD16 for
7010 // reading and writing purposes.
7013 // RyuJit backend is making another implicit assumption that Vector3 type args when passed in
7014 // registers or on stack, the upper most 4-bytes will be zero.
7016 // For P/Invoke return and Reverse P/Invoke argument passing, native compiler doesn't guarantee
7017 // that upper 4-bytes of a Vector3 type struct is zero initialized and hence assumption 2 is
7020 // RyuJIT x64 Windows: arguments are treated as passed by ref and hence read/written just 12
7021 // bytes. In case of Vector3 returns, Caller allocates a zero initialized Vector3 local and
7022 // passes it retBuf arg and Callee method writes only 12 bytes to retBuf. For this reason,
7023 // there is no need to clear upper 4-bytes of Vector3 type args.
7025 // RyuJIT x64 Unix: arguments are treated as passed by value and read/writen as if TYP_SIMD16.
7026 // Vector3 return values are returned two return registers and Caller assembles them into a
7027 // single xmm reg. Hence RyuJIT explicitly generates code to clears upper 4-bytes of Vector3
7028 // type args in prolog and Vector3 type return value of a call
7030 // RyuJIT x86 Windows: all non-param Vector3 local vars are allocated as 16 bytes. Vector3 arguments
7031 // are pushed as 12 bytes. For return values, a 16-byte local is allocated and the address passed
7032 // as a return buffer pointer. The callee doesn't write the high 4 bytes, and we don't need to clear
7035 LclVarDsc* varDsc = comp->lvaGetDesc(node->AsLclVarCommon());
7037 if (comp->lvaMapSimd12ToSimd16(varDsc))
7039 JITDUMP("Mapping TYP_SIMD12 lclvar node to TYP_SIMD16:\n");
7041 JITDUMP("============");
7043 node->gtType = TYP_SIMD16;
7046 #endif // FEATURE_SIMD
7049 PhaseStatus Lowering::DoPhase()
7051 // If we have any PInvoke calls, insert the one-time prolog code. We'll insert the epilog code in the
7052 // appropriate spots later. NOTE: there is a minor optimization opportunity here, as we still create p/invoke
7053 // data structures and setup/teardown even if we've eliminated all p/invoke calls due to dead code elimination.
7054 if (comp->compMethodRequiresPInvokeFrame())
7056 InsertPInvokeMethodProlog();
7059 #if !defined(TARGET_64BIT)
7060 DecomposeLongs decomp(comp); // Initialize the long decomposition class.
7061 if (comp->compLongUsed)
7063 decomp.PrepareForDecomposition();
7065 #endif // !defined(TARGET_64BIT)
7067 if (!comp->compEnregLocals())
7069 // Lowering is checking if lvDoNotEnregister is already set for contained optimizations.
7070 // If we are running without `CLFLG_REGVAR` flag set (`compEnregLocals() == false`)
7071 // then we already know that we won't enregister any locals and it is better to set
7072 // `lvDoNotEnregister` flag before we start reading it.
7073 // The main reason why this flag is not set is that we are running in minOpts.
7074 comp->lvSetMinOptsDoNotEnreg();
7077 for (BasicBlock* const block : comp->Blocks())
7079 /* Make the block publicly available */
7080 comp->compCurBB = block;
7082 #if !defined(TARGET_64BIT)
7083 if (comp->compLongUsed)
7085 decomp.DecomposeBlock(block);
7087 #endif //! TARGET_64BIT
7093 JITDUMP("Lower has completed modifying nodes.\n");
7096 comp->fgDispBasicBlocks(true);
7100 FinalizeOutgoingArgSpace();
7102 // Recompute local var ref counts before potentially sorting for liveness.
7103 // Note this does minimal work in cases where we are not going to sort.
7104 const bool isRecompute = true;
7105 const bool setSlotNumbers = false;
7106 comp->lvaComputeRefCounts(isRecompute, setSlotNumbers);
7108 comp->fgLocalVarLiveness();
7109 // local var liveness can delete code, which may create empty blocks
7110 if (comp->opts.OptimizationEnabled())
7112 bool modified = comp->fgUpdateFlowGraph();
7113 modified |= comp->fgRemoveDeadBlocks();
7117 JITDUMP("had to run another liveness pass:\n");
7118 comp->fgLocalVarLiveness();
7123 // If we are not optimizing, remove the dead blocks regardless.
7124 comp->fgRemoveDeadBlocks();
7127 // Recompute local var ref counts again after liveness to reflect
7128 // impact of any dead code removal. Note this may leave us with
7129 // tracked vars that have zero refs.
7130 comp->lvaComputeRefCounts(isRecompute, setSlotNumbers);
7132 return PhaseStatus::MODIFIED_EVERYTHING;
7137 //------------------------------------------------------------------------
7138 // Lowering::CheckCallArg: check that a call argument is in an expected
7139 // form after lowering.
7142 // arg - the argument to check.
7144 void Lowering::CheckCallArg(GenTree* arg)
7146 if (!arg->IsValue() && !arg->OperIsPutArgStk())
7148 assert(arg->OperIsStore());
7152 switch (arg->OperGet())
7156 GenTreeFieldList* list = arg->AsFieldList();
7157 assert(list->isContained());
7159 for (GenTreeFieldList::Use& use : list->Uses())
7161 assert(use.GetNode()->OperIsPutArg());
7167 assert(arg->OperIsPutArg());
7172 //------------------------------------------------------------------------
7173 // Lowering::CheckCall: check that a call is in an expected form after
7174 // lowering. Currently this amounts to checking its
7175 // arguments, but could be expanded to verify more
7176 // properties in the future.
7179 // call - the call to check.
7181 void Lowering::CheckCall(GenTreeCall* call)
7183 for (CallArg& arg : call->gtArgs.EarlyArgs())
7185 CheckCallArg(arg.GetEarlyNode());
7188 for (CallArg& arg : call->gtArgs.LateArgs())
7190 CheckCallArg(arg.GetLateNode());
7194 //------------------------------------------------------------------------
7195 // Lowering::CheckNode: check that an LIR node is in an expected form
7199 // compiler - the compiler context.
7200 // node - the node to check.
7202 void Lowering::CheckNode(Compiler* compiler, GenTree* node)
7204 switch (node->OperGet())
7207 CheckCall(node->AsCall());
7211 case GT_HWINTRINSIC:
7212 assert(node->TypeGet() != TYP_SIMD12);
7214 #endif // FEATURE_SIMD
7217 case GT_STORE_LCL_VAR:
7219 const LclVarDsc* varDsc = compiler->lvaGetDesc(node->AsLclVar());
7220 #if defined(FEATURE_SIMD) && defined(TARGET_64BIT)
7221 if (node->TypeIs(TYP_SIMD12))
7223 assert(compiler->lvaIsFieldOfDependentlyPromotedStruct(varDsc) || (varDsc->lvSize() == 12));
7225 #endif // FEATURE_SIMD && TARGET_64BIT
7226 if (varDsc->lvPromoted)
7228 assert(varDsc->lvDoNotEnregister || varDsc->lvIsMultiRegRet);
7235 const GenTreeLclVarCommon* lclVarAddr = node->AsLclVarCommon();
7236 const LclVarDsc* varDsc = compiler->lvaGetDesc(lclVarAddr);
7237 if (((lclVarAddr->gtFlags & GTF_VAR_DEF) != 0) && varDsc->HasGCPtr())
7239 // Emitter does not correctly handle live updates for LCL_ADDR
7240 // when they are not contained, for example, `STOREIND byref(GT_LCL_ADDR not-contained)`
7242 // add r1, sp, 48 // r1 contains address of a lclVar V01.
7243 // str r0, [r1] // a gc ref becomes live in V01, but emitter would not report it.
7244 // Make sure that we use uncontained address nodes only for variables
7245 // that will be marked as mustInit and will be alive throughout the whole block even when tracked.
7246 assert(lclVarAddr->isContained() || !varDsc->lvTracked || varTypeIsStruct(varDsc));
7247 // TODO: support this assert for uses, see https://github.com/dotnet/runtime/issues/51900.
7250 assert(varDsc->lvDoNotEnregister);
7256 assert(!"Should not see phi nodes after rationalize");
7260 case GT_STORE_LCL_FLD:
7262 const LclVarDsc* varDsc = compiler->lvaGetDesc(node->AsLclFld());
7263 assert(varDsc->lvDoNotEnregister);
7272 //------------------------------------------------------------------------
7273 // Lowering::CheckBlock: check that the contents of an LIR block are in an
7274 // expected form after lowering.
7277 // compiler - the compiler context.
7278 // block - the block to check.
7280 bool Lowering::CheckBlock(Compiler* compiler, BasicBlock* block)
7282 assert(block->isEmpty() || block->IsLIR());
7284 LIR::Range& blockRange = LIR::AsRange(block);
7285 for (GenTree* node : blockRange)
7287 CheckNode(compiler, node);
7290 assert(blockRange.CheckLIR(compiler, true));
7295 //------------------------------------------------------------------------
7296 // Lowering::LowerBlock: Lower all the nodes in a BasicBlock
7299 // block - the block to lower.
7301 void Lowering::LowerBlock(BasicBlock* block)
7303 assert(block == comp->compCurBB); // compCurBB must already be set.
7304 assert(block->isEmpty() || block->IsLIR());
7308 // NOTE: some of the lowering methods insert calls before the node being
7309 // lowered (See e.g. InsertPInvoke{Method,Call}{Prolog,Epilog}). In
7310 // general, any code that is inserted before the current node should be
7311 // "pre-lowered" as they won't be subject to further processing.
7312 // Lowering::CheckBlock() runs some extra checks on call arguments in
7313 // order to help catch unlowered nodes.
7315 GenTree* node = BlockRange().FirstNode();
7316 while (node != nullptr)
7318 node = LowerNode(node);
7321 assert(CheckBlock(comp, block));
7324 /** Verifies if both of these trees represent the same indirection.
7325 * Used by Lower to annotate if CodeGen generate an instruction of the
7326 * form *addrMode BinOp= expr
7328 * Preconditions: both trees are children of GT_INDs and their underlying children
7329 * have the same gtOper.
7331 * This is a first iteration to actually recognize trees that can be code-generated
7332 * as a single read-modify-write instruction on AMD64/x86. For now
7333 * this method only supports the recognition of simple addressing modes (through GT_LEA)
7334 * or local var indirections. Local fields, array access and other more complex nodes are
7335 * not yet supported.
7337 * TODO-CQ: Perform tree recognition by using the Value Numbering Package, that way we can recognize
7338 * arbitrary complex trees and support much more addressing patterns.
7340 bool Lowering::IndirsAreEquivalent(GenTree* candidate, GenTree* storeInd)
7342 assert(candidate->OperGet() == GT_IND);
7343 assert(storeInd->OperGet() == GT_STOREIND);
7345 // We should check the size of the indirections. If they are
7346 // different, say because of a cast, then we can't call them equivalent. Doing so could cause us
7348 // Signed-ness difference is okay and expected since a store indirection must always
7349 // be signed based on the CIL spec, but a load could be unsigned.
7350 if (genTypeSize(candidate->gtType) != genTypeSize(storeInd->gtType))
7355 GenTree* pTreeA = candidate->gtGetOp1();
7356 GenTree* pTreeB = storeInd->gtGetOp1();
7358 // This method will be called by codegen (as well as during lowering).
7359 // After register allocation, the sources may have been spilled and reloaded
7360 // to a different register, indicated by an inserted GT_RELOAD node.
7361 pTreeA = pTreeA->gtSkipReloadOrCopy();
7362 pTreeB = pTreeB->gtSkipReloadOrCopy();
7366 if (pTreeA->OperGet() != pTreeB->OperGet())
7371 oper = pTreeA->OperGet();
7375 if (pTreeA->AsLclFld()->GetLclOffs() != 0)
7377 // TODO-CQ: support arbitrary local addresses here.
7384 return NodesAreEquivalentLeaves(pTreeA, pTreeB);
7388 GenTreeAddrMode* gtAddr1 = pTreeA->AsAddrMode();
7389 GenTreeAddrMode* gtAddr2 = pTreeB->AsAddrMode();
7390 return NodesAreEquivalentLeaves(gtAddr1->Base(), gtAddr2->Base()) &&
7391 NodesAreEquivalentLeaves(gtAddr1->Index(), gtAddr2->Index()) &&
7392 (gtAddr1->gtScale == gtAddr2->gtScale) && (gtAddr1->Offset() == gtAddr2->Offset());
7395 // We don't handle anything that is not either a constant,
7396 // a local var or LEA.
7401 //------------------------------------------------------------------------
7402 // NodesAreEquivalentLeaves: Check whether the two given nodes are the same leaves.
7405 // tree1 and tree2 are nodes to be checked.
7407 // Returns true if they are same leaves, false otherwise.
7410 bool Lowering::NodesAreEquivalentLeaves(GenTree* tree1, GenTree* tree2)
7417 if (tree1 == nullptr || tree2 == nullptr)
7422 tree1 = tree1->gtSkipReloadOrCopy();
7423 tree2 = tree2->gtSkipReloadOrCopy();
7425 if (tree1->TypeGet() != tree2->TypeGet())
7430 if (tree1->OperGet() != tree2->OperGet())
7435 if (!tree1->OperIsLeaf() || !tree2->OperIsLeaf())
7440 switch (tree1->OperGet())
7443 return tree1->AsIntCon()->IconValue() == tree2->AsIntCon()->IconValue() &&
7444 tree1->IsIconHandle() == tree2->IsIconHandle();
7446 if (tree1->AsLclFld()->GetLclOffs() != tree2->AsLclFld()->GetLclOffs())
7452 return tree1->AsLclVarCommon()->GetLclNum() == tree2->AsLclVarCommon()->GetLclNum();
7453 case GT_CLS_VAR_ADDR:
7454 return tree1->AsClsVar()->gtClsVarHnd == tree2->AsClsVar()->gtClsVarHnd;
7460 //------------------------------------------------------------------------
7461 // Lowering::CheckMultiRegLclVar: Check whether a MultiReg GT_LCL_VAR node can
7462 // remain a multi-reg.
7465 // lclNode - the GT_LCL_VAR or GT_STORE_LCL_VAR node.
7466 // registerCount - use register count for uses; source register count for stores.
7468 bool Lowering::CheckMultiRegLclVar(GenTreeLclVar* lclNode, int registerCount)
7470 bool canEnregisterAsMultiReg = false;
7471 bool canEnregisterAsSingleReg = false;
7473 #if FEATURE_MULTIREG_RET || defined(FEATURE_HW_INTRINSICS)
7474 LclVarDsc* varDsc = comp->lvaGetDesc(lclNode->GetLclNum());
7475 if (varDsc->lvDoNotEnregister)
7477 assert(!lclNode->IsMultiReg());
7481 if ((comp->lvaEnregMultiRegVars) && varDsc->lvPromoted)
7483 // We can enregister if we have a promoted struct and all the fields' types match the ABI requirements.
7484 // Note that we don't promote structs with explicit layout, so we don't need to check field offsets, and
7485 // if we have multiple types packed into a single register, we won't have matching reg and field counts,
7486 // so we can tolerate mismatches of integer size.
7487 if (comp->lvaGetPromotionType(varDsc) == Compiler::PROMOTION_TYPE_INDEPENDENT)
7489 if (registerCount == varDsc->lvFieldCnt)
7491 canEnregisterAsMultiReg = true;
7494 // TYP_SIMD12 breaks the above invariant that "we won't have
7495 // matching reg and field counts"; for example, consider
7497 // * STORE_LCL_VAR<struct{Vector3, int}>(CALL)
7498 // * RETURN(LCL_VAR<struct{Vector3, int}>)
7500 // These return in two GPR registers, while the fields of the
7501 // local are stored in SIMD and GPR register, so registerCount
7502 // == varDsc->lvFieldCnt == 2. But the backend cannot handle
7505 for (int i = 0; i < varDsc->lvFieldCnt; i++)
7507 if (comp->lvaGetDesc(varDsc->lvFieldLclStart + i)->TypeGet() == TYP_SIMD12)
7509 canEnregisterAsMultiReg = false;
7519 canEnregisterAsSingleReg = varTypeIsSIMD(lclNode);
7521 if (lclNode->OperIs(GT_STORE_LCL_VAR) && varTypeIsStruct(lclNode->Data()) && !lclNode->Data()->OperIs(GT_CALL))
7523 canEnregisterAsSingleReg = false;
7525 #endif // TARGET_XARCH
7528 if (canEnregisterAsSingleReg || canEnregisterAsMultiReg)
7530 if (canEnregisterAsMultiReg)
7532 lclNode->SetMultiReg();
7537 comp->lvaSetVarDoNotEnregister(lclNode->GetLclNum() DEBUGARG(DoNotEnregisterReason::BlockOp));
7539 #endif // FEATURE_MULTIREG_RET || defined(FEATURE_HW_INTRINSICS)
7541 return canEnregisterAsSingleReg || canEnregisterAsMultiReg;
7544 //------------------------------------------------------------------------
7545 // Containment Analysis
7546 //------------------------------------------------------------------------
7547 void Lowering::ContainCheckNode(GenTree* node)
7549 switch (node->gtOper)
7551 case GT_STORE_LCL_VAR:
7552 case GT_STORE_LCL_FLD:
7553 ContainCheckStoreLoc(node->AsLclVarCommon());
7567 ContainCheckCompare(node->AsOp());
7571 ContainCheckSelect(node->AsConditional());
7576 #if !defined(TARGET_64BIT)
7585 ContainCheckBinary(node->AsOp());
7588 #if defined(TARGET_X86)
7593 ContainCheckMul(node->AsOp());
7599 ContainCheckDivOrMod(node->AsOp());
7606 #ifndef TARGET_64BIT
7610 ContainCheckShiftRotate(node->AsOp());
7613 ContainCheckCast(node->AsCast());
7616 ContainCheckBitCast(node);
7619 ContainCheckLclHeap(node->AsOp());
7622 ContainCheckRet(node->AsOp());
7625 ContainCheckReturnTrap(node->AsOp());
7628 ContainCheckStoreIndir(node->AsStoreInd());
7631 ContainCheckIndir(node->AsIndir());
7635 #if FEATURE_ARG_SPLIT
7636 case GT_PUTARG_SPLIT:
7637 #endif // FEATURE_ARG_SPLIT
7638 // The regNum must have been set by the lowering of the call.
7639 assert(node->GetRegNum() != REG_NA);
7643 ContainCheckIntrinsic(node->AsOp());
7645 #endif // TARGET_XARCH
7646 #ifdef FEATURE_HW_INTRINSICS
7647 case GT_HWINTRINSIC:
7648 ContainCheckHWIntrinsic(node->AsHWIntrinsic());
7650 #endif // FEATURE_HW_INTRINSICS
7656 //------------------------------------------------------------------------
7657 // ContainCheckReturnTrap: determine whether the source of a RETURNTRAP should be contained.
7660 // node - pointer to the GT_RETURNTRAP node
7662 void Lowering::ContainCheckReturnTrap(GenTreeOp* node)
7665 assert(node->OperIs(GT_RETURNTRAP));
7666 // This just turns into a compare of its child with an int + a conditional call
7667 if (node->gtOp1->isIndir())
7669 MakeSrcContained(node, node->gtOp1);
7671 #endif // TARGET_XARCH
7674 //------------------------------------------------------------------------
7675 // ContainCheckLclHeap: determine whether the source of a GT_LCLHEAP node should be contained.
7678 // node - pointer to the node
7680 void Lowering::ContainCheckLclHeap(GenTreeOp* node)
7682 assert(node->OperIs(GT_LCLHEAP));
7683 GenTree* size = node->gtOp1;
7684 if (size->IsCnsIntOrI())
7686 MakeSrcContained(node, size);
7690 //------------------------------------------------------------------------
7691 // ContainCheckRet: determine whether the source of a node should be contained.
7694 // node - pointer to the node
7696 void Lowering::ContainCheckRet(GenTreeUnOp* ret)
7698 assert(ret->OperIs(GT_RETURN));
7700 #if !defined(TARGET_64BIT)
7701 if (ret->TypeGet() == TYP_LONG)
7703 GenTree* op1 = ret->gtGetOp1();
7704 noway_assert(op1->OperGet() == GT_LONG);
7705 MakeSrcContained(ret, op1);
7707 #endif // !defined(TARGET_64BIT)
7708 #if FEATURE_MULTIREG_RET
7709 if (ret->TypeIs(TYP_STRUCT))
7711 GenTree* op1 = ret->gtGetOp1();
7712 // op1 must be either a lclvar or a multi-reg returning call
7713 if (op1->OperGet() == GT_LCL_VAR)
7715 const LclVarDsc* varDsc = comp->lvaGetDesc(op1->AsLclVarCommon());
7716 // This must be a multi-reg return or an HFA of a single element.
7717 assert(varDsc->lvIsMultiRegRet || (varDsc->lvIsHfa() && varTypeIsValidHfaType(varDsc->lvType)));
7719 // Mark var as contained if not enregisterable.
7720 if (!varDsc->IsEnregisterableLcl())
7722 if (!op1->IsMultiRegLclVar())
7724 MakeSrcContained(ret, op1);
7729 #endif // FEATURE_MULTIREG_RET
7732 //------------------------------------------------------------------------
7733 // ContainCheckBitCast: determine whether the source of a BITCAST should be contained.
7736 // node - pointer to the node
7738 void Lowering::ContainCheckBitCast(GenTree* node)
7740 GenTree* const op1 = node->AsOp()->gtOp1;
7741 if (op1->OperIs(GT_LCL_VAR) && (genTypeSize(op1) == genTypeSize(node)))
7743 if (IsContainableMemoryOp(op1) && IsSafeToContainMem(node, op1))
7745 MakeSrcContained(node, op1);
7747 else if (IsSafeToMarkRegOptional(node, op1))
7749 MakeSrcRegOptional(node, op1);
7754 //------------------------------------------------------------------------
7755 // LowerStoreIndirCommon: a common logic to lower StoreIndir.
7758 // ind - the store indirection node we are lowering.
7760 void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
7762 assert(ind->TypeGet() != TYP_STRUCT);
7764 TryRetypingFloatingPointStoreToIntegerStore(ind);
7766 #if defined(TARGET_ARM64)
7767 // Verify containment safety before creating an LEA that must be contained.
7769 const bool isContainable = IsInvariantInRange(ind->Addr(), ind);
7771 const bool isContainable = true;
7773 TryCreateAddrMode(ind->Addr(), isContainable, ind);
7775 if (!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind))
7777 #ifndef TARGET_XARCH
7778 if (ind->Data()->IsIconHandle(GTF_ICON_OBJ_HDL))
7780 const ssize_t handle = ind->Data()->AsIntCon()->IconValue();
7781 if (!comp->info.compCompHnd->isObjectImmutable(reinterpret_cast<CORINFO_OBJECT_HANDLE>(handle)))
7783 // On platforms with weaker memory model we need to make sure we use a store with the release semantic
7784 // when we publish a potentially mutable object
7785 // See relevant discussions https://github.com/dotnet/runtime/pull/76135#issuecomment-1257258310 and
7786 // https://github.com/dotnet/runtime/pull/76112#discussion_r980639782
7788 // This can be relaxed to "just make sure to use stlr/memory barrier" if needed
7789 ind->gtFlags |= GTF_IND_VOLATILE;
7794 LowerStoreIndir(ind);
7798 //------------------------------------------------------------------------
7799 // LowerIndir: a common logic to lower IND load or NullCheck.
7802 // ind - the ind node we are lowering.
7804 void Lowering::LowerIndir(GenTreeIndir* ind)
7806 assert(ind->OperIs(GT_IND, GT_NULLCHECK));
7807 // Process struct typed indirs separately unless they are unused;
7808 // they only appear as the source of a block copy operation or a return node.
7809 if (!ind->TypeIs(TYP_STRUCT) || ind->IsUnusedValue())
7811 #ifndef TARGET_XARCH
7812 // On non-xarch, whether or not we can contain an address mode will depend on the access width
7813 // which may be changed when transforming an unused indir, so do that first.
7814 // On xarch, it is the opposite: we transform to indir/nullcheck based on whether we contained the
7815 // address mode, so in that case we must do this transformation last.
7816 if (ind->OperIs(GT_NULLCHECK) || ind->IsUnusedValue())
7818 TransformUnusedIndirection(ind, comp, m_block);
7822 // TODO-Cleanup: We're passing isContainable = true but ContainCheckIndir rejects
7823 // address containment in some cases so we end up creating trivial (reg + offfset)
7824 // or (reg + reg) LEAs that are not necessary.
7825 CLANG_FORMAT_COMMENT_ANCHOR;
7827 #if defined(TARGET_ARM64)
7828 // Verify containment safety before creating an LEA that must be contained.
7830 const bool isContainable = IsInvariantInRange(ind->Addr(), ind);
7832 const bool isContainable = true;
7835 TryCreateAddrMode(ind->Addr(), isContainable, ind);
7836 ContainCheckIndir(ind);
7839 if (ind->OperIs(GT_NULLCHECK) || ind->IsUnusedValue())
7841 TransformUnusedIndirection(ind, comp, m_block);
7847 // If the `ADDR` node under `STORE_BLK(dstAddr, IND(struct(ADDR))`
7848 // is a complex one it could benefit from an `LEA` that is not contained.
7849 const bool isContainable = false;
7850 TryCreateAddrMode(ind->Addr(), isContainable, ind);
7854 //------------------------------------------------------------------------
7855 // TransformUnusedIndirection: change the opcode and the type of the unused indirection.
7858 // ind - Indirection to transform.
7859 // comp - Compiler instance.
7860 // block - Basic block of the indirection.
7862 void Lowering::TransformUnusedIndirection(GenTreeIndir* ind, Compiler* comp, BasicBlock* block)
7864 // A nullcheck is essentially the same as an indirection with no use.
7865 // The difference lies in whether a target register must be allocated.
7866 // On XARCH we can generate a compare with no target register as long as the address
7867 // is not contained.
7868 // On ARM64 we can generate a load to REG_ZR in all cases.
7869 // However, on ARM we must always generate a load to a register.
7870 // In the case where we require a target register, it is better to use GT_IND, since
7871 // GT_NULLCHECK is a non-value node and would therefore require an internal register
7872 // to use as the target. That is non-optimal because it will be modeled as conflicting
7873 // with the source register(s).
7874 // So, to summarize:
7875 // - On ARM64, always use GT_NULLCHECK for a dead indirection.
7876 // - On ARM, always use GT_IND.
7877 // - On XARCH, use GT_IND if we have a contained address, and GT_NULLCHECK otherwise.
7878 // In all cases we try to preserve the original type and never make it wider to avoid AVEs.
7879 // For structs we conservatively lower it to BYTE. For 8-byte primitives we lower it to TYP_INT
7880 // on XARCH as an optimization.
7882 assert(ind->OperIs(GT_NULLCHECK, GT_IND, GT_BLK));
7884 ind->ChangeType(comp->gtTypeForNullCheck(ind));
7886 #if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
7887 bool useNullCheck = true;
7888 #elif defined(TARGET_ARM)
7889 bool useNullCheck = false;
7890 #else // TARGET_XARCH
7891 bool useNullCheck = !ind->Addr()->isContained();
7892 ind->ClearDontExtend();
7893 #endif // !TARGET_XARCH
7895 if (useNullCheck && !ind->OperIs(GT_NULLCHECK))
7897 comp->gtChangeOperToNullCheck(ind, block);
7898 ind->ClearUnusedValue();
7900 else if (!useNullCheck && !ind->OperIs(GT_IND))
7902 ind->ChangeOper(GT_IND);
7903 ind->SetUnusedValue();
7907 //------------------------------------------------------------------------
7908 // LowerLclHeap: a common logic to lower LCLHEAP.
7911 // blkNode - the LCLHEAP node we are lowering.
7913 void Lowering::LowerLclHeap(GenTree* node)
7915 assert(node->OperIs(GT_LCLHEAP));
7917 #if defined(TARGET_XARCH)
7918 if (node->gtGetOp1()->IsCnsIntOrI())
7920 GenTreeIntCon* sizeNode = node->gtGetOp1()->AsIntCon();
7921 ssize_t size = sizeNode->IconValue();
7925 // Replace with null for LCLHEAP(0)
7926 node->BashToZeroConst(TYP_I_IMPL);
7927 BlockRange().Remove(sizeNode);
7931 if (comp->info.compInitMem)
7933 ssize_t alignedSize = ALIGN_UP(size, STACK_ALIGN);
7934 if ((size > UINT_MAX) || (alignedSize > UINT_MAX))
7936 // Size is too big - don't mark sizeNode as contained
7941 if (BlockRange().TryGetUse(node, &use))
7943 // Align LCLHEAP size for more efficient zeroing via BLK
7944 sizeNode->SetIconValue(alignedSize);
7946 // Emit STORE_BLK to zero it
7948 // * STORE_BLK struct<alignedSize> (init) (Unroll)
7949 // +--* LCL_VAR long V01
7950 // \--* CNS_INT int 0
7952 GenTree* heapLcl = comp->gtNewLclvNode(use.ReplaceWithLclVar(comp), TYP_I_IMPL);
7953 GenTree* zero = comp->gtNewIconNode(0);
7954 GenTreeBlk* storeBlk = new (comp, GT_STORE_BLK)
7955 GenTreeBlk(GT_STORE_BLK, TYP_STRUCT, heapLcl, zero, comp->typGetBlkLayout((unsigned)alignedSize));
7956 storeBlk->gtFlags |= (GTF_IND_UNALIGNED | GTF_ASG | GTF_EXCEPT | GTF_GLOB_REF);
7957 BlockRange().InsertAfter(use.Def(), heapLcl, zero, storeBlk);
7958 LowerNode(storeBlk);
7962 // Value is unused and we don't mark the size node as contained
7968 ContainCheckLclHeap(node->AsOp());
7971 //------------------------------------------------------------------------
7972 // LowerBlockStoreCommon: a common logic to lower STORE_BLK/DYN_BLK.
7975 // blkNode - the store blk/obj node we are lowering.
7977 void Lowering::LowerBlockStoreCommon(GenTreeBlk* blkNode)
7979 assert(blkNode->OperIs(GT_STORE_BLK, GT_STORE_DYN_BLK));
7981 // Lose the type information stored in the source - we no longer need it.
7982 if (blkNode->Data()->OperIs(GT_BLK))
7984 blkNode->Data()->SetOper(GT_IND);
7985 LowerIndir(blkNode->Data()->AsIndir());
7988 if (TryTransformStoreObjAsStoreInd(blkNode))
7993 LowerBlockStore(blkNode);
7996 //------------------------------------------------------------------------
7997 // TryTransformStoreObjAsStoreInd: try to replace STORE_BLK as STOREIND.
8000 // blkNode - the store node.
8003 // true if the replacement was made, false otherwise.
8006 // TODO-CQ: this method should do the transformation when possible
8007 // and STOREIND should always generate better or the same code as
8008 // STORE_BLK for the same copy.
8010 bool Lowering::TryTransformStoreObjAsStoreInd(GenTreeBlk* blkNode)
8012 assert(blkNode->OperIs(GT_STORE_BLK, GT_STORE_DYN_BLK));
8013 if (!comp->opts.OptimizationEnabled())
8018 if (blkNode->OperIs(GT_STORE_DYN_BLK))
8023 var_types regType = blkNode->GetLayout()->GetRegisterType();
8024 if (regType == TYP_UNDEF)
8029 GenTree* src = blkNode->Data();
8030 if (varTypeIsSIMD(regType) && src->IsConstInitVal())
8032 // TODO-CQ: support STORE_IND SIMD16(SIMD16, CNT_INT 0).
8036 if (varTypeIsGC(regType))
8038 // TODO-CQ: STOREIND does not try to contain src if we need a barrier,
8039 // STORE_BLK generates better code currently.
8043 if (src->OperIsInitVal() && !src->IsConstInitVal())
8048 JITDUMP("Replacing STORE_BLK with STOREIND for [%06u]\n", blkNode->gtTreeID);
8049 blkNode->ChangeOper(GT_STOREIND);
8050 blkNode->ChangeType(regType);
8052 if (varTypeIsStruct(src))
8054 src->ChangeType(regType);
8055 LowerNode(blkNode->Data());
8057 else if (src->OperIsInitVal())
8059 GenTreeUnOp* initVal = src->AsUnOp();
8060 src = src->gtGetOp1();
8061 assert(src->IsCnsIntOrI());
8062 src->AsIntCon()->FixupInitBlkValue(regType);
8063 blkNode->SetData(src);
8064 BlockRange().Remove(initVal);
8068 assert(src->TypeIs(regType) || src->IsCnsIntOrI() || src->IsCall());
8071 #if defined(TARGET_XARCH)
8072 if (varTypeIsSmall(regType) && src->OperIs(GT_IND, GT_LCL_FLD))
8074 src->SetDontExtend();
8076 #endif // TARGET_XARCH
8078 LowerStoreIndirCommon(blkNode->AsStoreInd());
8082 //------------------------------------------------------------------------
8083 // TryRetypingFloatingPointStoreToIntegerStore: Retype an FP memory store.
8085 // On some targets, integer stores are cheaper and/or smaller than their
8086 // floating-point counterparts, because, e. g., integer immediates can be
8087 // encoded inline while FP ones need to be loaded from the data section.
8090 // store - The store node
8092 void Lowering::TryRetypingFloatingPointStoreToIntegerStore(GenTree* store)
8094 assert(store->OperIsStore());
8096 if (!varTypeIsFloating(store))
8101 // We only want to transform memory stores, not definitions of candidate locals.
8103 if (store->OperIs(GT_STORE_LCL_VAR) && !comp->lvaGetDesc(store->AsLclVar())->lvDoNotEnregister)
8108 GenTree* data = store->Data();
8109 assert(store->TypeGet() == data->TypeGet());
8111 // Optimize *x = DCON to *x = ICON which can be slightly faster and/or smaller.
8113 if (data->IsCnsFltOrDbl())
8115 double dblCns = data->AsDblCon()->DconValue();
8117 var_types type = TYP_UNKNOWN;
8118 // XARCH: we can always contain the immediates.
8119 // ARM64: zero can always be contained, other cases will use immediates from the data
8120 // section and it is not a clear win to switch them to inline integers.
8121 // ARM: FP constants are assembled from integral ones, so it is always profitable
8122 // to directly use the integers as it avoids the int -> float conversion.
8123 CLANG_FORMAT_COMMENT_ANCHOR;
8125 #if defined(TARGET_XARCH) || defined(TARGET_ARM)
8126 bool shouldSwitchToInteger = true;
8127 #else // TARGET_ARM64 || TARGET_LOONGARCH64 || TARGET_RISCV64
8128 bool shouldSwitchToInteger = FloatingPointUtils::isPositiveZero(dblCns);
8131 if (shouldSwitchToInteger)
8133 if (store->TypeIs(TYP_FLOAT))
8135 float fltCns = static_cast<float>(dblCns);
8136 intCns = *reinterpret_cast<INT32*>(&fltCns);
8142 assert(store->TypeIs(TYP_DOUBLE));
8143 intCns = *reinterpret_cast<INT64*>(&dblCns);
8149 if (type != TYP_UNKNOWN)
8151 data->BashToConst(intCns, type);
8153 assert(!store->OperIsLocalStore() || comp->lvaGetDesc(store->AsLclVarCommon())->lvDoNotEnregister);
8154 if (store->OperIs(GT_STORE_LCL_VAR))
8156 store->SetOper(GT_STORE_LCL_FLD);
8158 store->ChangeType(type);
8163 //----------------------------------------------------------------------------------------------
8164 // Lowering::TryLowerAndNegativeOne:
8165 // If safe, lowers a tree AND(X, CNS(-1)) to X.
8168 // node - GT_AND node of integral type
8169 // nextNode - out parameter that represents the 'gtNext' of the given node if the transformation was successful
8172 // Returns the true if the transformation was successful; false if it was not.
8173 bool Lowering::TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode)
8175 assert(node->OperIs(GT_AND));
8176 assert(nextNode != nullptr);
8178 if (!varTypeIsIntegral(node))
8181 if (node->gtSetFlags())
8184 if (node->isContained())
8187 GenTree* op2 = node->gtGetOp2();
8189 if (!op2->IsIntegralConst(-1))
8192 #ifndef TARGET_64BIT
8193 assert(op2->TypeIs(TYP_INT));
8194 #endif // !TARGET_64BIT
8196 GenTree* op1 = node->gtGetOp1();
8199 if (BlockRange().TryGetUse(node, &use))
8201 use.ReplaceWith(op1);
8205 op1->SetUnusedValue();
8208 *nextNode = node->gtNext;
8210 BlockRange().Remove(op2);
8211 BlockRange().Remove(node);
8216 #if defined(FEATURE_HW_INTRINSICS)
8217 //----------------------------------------------------------------------------------------------
8218 // Lowering::InsertNewSimdCreateScalarUnsafeNode: Inserts a new simd CreateScalarUnsafe node
8221 // simdType - The return type of SIMD node being created
8222 // op1 - The value of the lowest element of the simd value
8223 // simdBaseJitType - the base JIT type of SIMD type of the intrinsic
8224 // simdSize - the size of the SIMD type of the intrinsic
8227 // The inserted CreateScalarUnsafe node
8230 // If the created node is a vector constant, op1 will be removed from the block range
8232 GenTree* Lowering::InsertNewSimdCreateScalarUnsafeNode(var_types simdType,
8234 CorInfoType simdBaseJitType,
8237 assert(varTypeIsSIMD(simdType));
8239 GenTree* result = comp->gtNewSimdCreateScalarUnsafeNode(simdType, op1, simdBaseJitType, simdSize);
8240 BlockRange().InsertAfter(op1, result);
8242 if (result->IsVectorConst())
8244 BlockRange().Remove(op1);
8248 #endif // FEATURE_HW_INTRINSICS
8250 //----------------------------------------------------------------------------------------------
8251 // Lowering::RequireOutgoingArgSpace: Record that the compilation will require
8252 // outgoing arg space of at least the specified size.
8255 // node - The node that is the reason for the requirement.
8256 // size - The minimal required size of the outgoing arg space.
8258 void Lowering::RequireOutgoingArgSpace(GenTree* node, unsigned size)
8260 #if FEATURE_FIXED_OUT_ARGS
8261 if (size <= m_outgoingArgSpaceSize)
8266 JITDUMP("Bumping outgoing arg space size from %u to %u for [%06u]\n", m_outgoingArgSpaceSize, size,
8267 Compiler::dspTreeID(node));
8268 m_outgoingArgSpaceSize = size;
8272 //----------------------------------------------------------------------------------------------
8273 // Lowering::FinalizeOutgoingArgSpace: Finalize and allocate the outgoing arg
8276 void Lowering::FinalizeOutgoingArgSpace()
8278 #if FEATURE_FIXED_OUT_ARGS
8279 // Finish computing the outgoing args area size
8281 // Need to make sure the MIN_ARG_AREA_FOR_CALL space is added to the frame if:
8282 // 1. there are calls to THROW_HELPER methods.
8283 // 2. we are generating profiling Enter/Leave/TailCall hooks. This will ensure
8284 // that even methods without any calls will have outgoing arg area space allocated.
8285 // 3. We will be generating calls to PInvoke helpers. TODO: This shouldn't be required because
8286 // if there are any calls to PInvoke methods, there should be a call that we processed
8287 // above. However, we still generate calls to PInvoke prolog helpers even if we have dead code
8288 // eliminated all the calls.
8289 // 4. We will be generating a stack cookie check. In this case we can call a helper to fail fast.
8291 // An example for these two cases is Windows Amd64, where the ABI requires to have 4 slots for
8292 // the outgoing arg space if the method makes any calls.
8293 if (m_outgoingArgSpaceSize < MIN_ARG_AREA_FOR_CALL)
8295 if (comp->compUsesThrowHelper || comp->compIsProfilerHookNeeded() ||
8296 (comp->compMethodRequiresPInvokeFrame() && !comp->opts.ShouldUsePInvokeHelpers()) ||
8297 comp->getNeedsGSSecurityCookie())
8299 m_outgoingArgSpaceSize = MIN_ARG_AREA_FOR_CALL;
8300 JITDUMP("Bumping outgoing arg space size to %u for possible helper or profile hook call",
8301 m_outgoingArgSpaceSize);
8305 // If a function has localloc, we will need to move the outgoing arg space when the
8306 // localloc happens. When we do this, we need to maintain stack alignment. To avoid
8307 // leaving alignment-related holes when doing this move, make sure the outgoing
8308 // argument space size is a multiple of the stack alignment by aligning up to the next
8309 // stack alignment boundary.
8310 if (comp->compLocallocUsed)
8312 m_outgoingArgSpaceSize = roundUp(m_outgoingArgSpaceSize, STACK_ALIGN);
8313 JITDUMP("Bumping outgoing arg space size to %u for localloc", m_outgoingArgSpaceSize);
8316 assert((m_outgoingArgSpaceSize % TARGET_POINTER_SIZE) == 0);
8318 // Publish the final value and mark it as read only so any update
8319 // attempt later will cause an assert.
8320 comp->lvaOutgoingArgSpaceSize = m_outgoingArgSpaceSize;
8321 comp->lvaOutgoingArgSpaceSize.MarkAsReadOnly();
8322 comp->lvaGetDesc(comp->lvaOutgoingArgSpaceVar)->GrowBlockLayout(comp->typGetBlkLayout(m_outgoingArgSpaceSize));