// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                          Lowering for AMD64                               XX
XX                                                                           XX
XX  This encapsulates all the logic for lowering trees for the AMD64         XX
XX  architecture.  For a more detailed view of what is lowering, please      XX
XX  take a look at Lower.cpp                                                 XX
XX                                                                           XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/
#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif

#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator

#ifdef _TARGET_XARCH_ // This file is only used for xarch

#include "jit.h"
#include "sideeffects.h"
#include "lower.h"
// xarch supports both ROL and ROR instructions so no lowering is required.
void Lowering::LowerRotate(GenTree* tree)
{
    ContainCheckShiftRotate(tree->AsOp());
}
//------------------------------------------------------------------------
// LowerStoreLoc: Lower a store of a lclVar
//
// Arguments:
//    storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR)
//
// Notes:
//    This involves:
//    - Handling of contained immediates.
//    - Widening small constant stores to int-sized stores.
//
void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
{
    // Try to widen the ops if they are going into a local var.
    if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (storeLoc->gtOp1->gtOper == GT_CNS_INT))
    {
        GenTreeIntCon* con    = storeLoc->gtOp1->AsIntCon();
        ssize_t        ival   = con->gtIconVal;
        unsigned       varNum = storeLoc->gtLclNum;
        LclVarDsc*     varDsc = comp->lvaTable + varNum;

        if (varDsc->lvIsSIMDType())
        {
            noway_assert(storeLoc->gtType != TYP_STRUCT);
        }
        unsigned size = genTypeSize(storeLoc);
        // If we are storing a constant into a local variable
        // we extend the size of the store here
        if ((size < 4) && !varTypeIsStruct(varDsc))
        {
            if (!varTypeIsUnsigned(varDsc))
            {
                if (genTypeSize(storeLoc) == 1)
                {
                    if ((ival & 0x7f) != ival)
                    {
                        ival = ival | 0xffffff00;
                    }
                }
                else
                {
                    assert(genTypeSize(storeLoc) == 2);
                    if ((ival & 0x7fff) != ival)
                    {
                        ival = ival | 0xffff0000;
                    }
                }
            }

            // A local stack slot is at least 4 bytes in size, regardless of
            // what the local var is typed as, so auto-promote it here
            // unless it is a field of a promoted struct
            // TODO-XArch-CQ: if the field is promoted shouldn't we also be able to do this?
            if (!varDsc->lvIsStructField)
            {
                storeLoc->gtType = TYP_INT;
                con->SetIconValue(ival);
            }
        }
    }
    ContainCheckStoreLoc(storeLoc);
}
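// For illustration (example values, not part of the original source): storing
// the constant 0x80 to a signed byte local takes the widening path above:
//     (0x80 & 0x7f) == 0x00 != 0x80, so ival |= 0xffffff00 -> 0xffffff80
// and the store is retyped to a 4-byte TYP_INT store of -128, i.e. the
// sign-extended value.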
//------------------------------------------------------------------------
// LowerStoreIndir: Determine addressing mode for an indirection, and whether operands are contained.
//
// Arguments:
//    node       - The indirect store node (GT_STORE_IND) of interest
//
// Return Value:
//    None.
//
void Lowering::LowerStoreIndir(GenTreeIndir* node)
{
    // Mark all GT_STOREIND nodes to indicate that it is not known
    // whether it represents a RMW memory op.
    node->AsStoreInd()->SetRMWStatusDefault();

    if (!varTypeIsFloating(node))
    {
        // Perform recognition of trees with the following structure:
        //                StoreInd(addr, BinOp(expr, GT_IND(addr)))
        // to be able to fold this into an instruction of the form
        //                BINOP [addr], register
        // where register is the actual place where 'expr' is computed.
        //
        // SSE2 doesn't support RMW form of instructions.
        if (LowerRMWMemOp(node))
        {
            return;
        }
    }
    ContainCheckStoreIndir(node);
}
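// For illustration (hypothetical example, not part of the original source):
// for source like "*p = *p + x", the recognized shape
//     STOREIND(addr, ADD(IND(addr), x))
// lets codegen fold the load, add and store into a single instruction, e.g.
//     add dword ptr [rax], ecx
// where ecx holds the value of 'x'.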
//------------------------------------------------------------------------
// LowerBlockStore: Set block store type
//
// Arguments:
//    blkNode       - The block store node of interest
//
// Return Value:
//    None.
//
void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
{
    GenTree*  dstAddr       = blkNode->Addr();
    unsigned  size          = blkNode->gtBlkSize;
    GenTree*  source        = blkNode->Data();
    Compiler* compiler      = comp;
    GenTree*  srcAddrOrFill = nullptr;
    bool      isInitBlk     = blkNode->OperIsInitBlkOp();

    if (!isInitBlk)
    {
        // CopyObj or CopyBlk
        if ((blkNode->OperGet() == GT_STORE_OBJ) && ((blkNode->AsObj()->gtGcPtrCount == 0) || blkNode->gtBlkOpGcUnsafe))
        {
            blkNode->SetOper(GT_STORE_BLK);
        }
        if (source->gtOper == GT_IND)
        {
            srcAddrOrFill = blkNode->Data()->gtGetOp1();
        }
    }

    if (isInitBlk)
    {
        GenTree* initVal = source;
        if (initVal->OperIsInitVal())
        {
            initVal->SetContained();
            initVal = initVal->gtGetOp1();
        }
        srcAddrOrFill = initVal;
        // If we have an InitBlk with constant block size we can optimize several ways:
        // a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes
        //    we use rep stosb since this reduces the register pressure in LSRA and we have
        //    roughly the same performance as calling the helper.
        // b) If the size is <= INITBLK_UNROLL_LIMIT bytes and the fill byte is a constant,
        //    we can speed this up by unrolling the loop using SSE2 stores. The reason for
        //    this threshold is that our last investigation (Fall 2013) showed that more than 95%
        //    of initblks in our framework assemblies are <= INITBLK_UNROLL_LIMIT bytes in size,
        //    so this is the preferred code sequence for the vast majority of cases.

        // This threshold will decide between using the helper and letting the JIT inline
        // a code sequence of its choice.
        unsigned helperThreshold = max(INITBLK_STOS_LIMIT, INITBLK_UNROLL_LIMIT);
        // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
        if (size != 0 && size <= helperThreshold)
        {
            // Always favor unrolling vs rep stos.
            if (size <= INITBLK_UNROLL_LIMIT && initVal->IsCnsIntOrI())
            {
                // The fill value of an initblk is interpreted to hold a
                // value of (unsigned int8) however a constant of any size
                // may practically reside on the evaluation stack. So extract
                // the lower byte out of the initVal constant and replicate
                // it to a larger constant whose size is sufficient to support
                // the largest width store of the desired inline expansion.

                ssize_t fill = initVal->gtIntCon.gtIconVal & 0xFF;
#ifdef _TARGET_AMD64_
                if (size < REGSIZE_BYTES)
                {
                    initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
                }
                else
                {
                    initVal->gtIntCon.gtIconVal = 0x0101010101010101LL * fill;
                    initVal->gtType             = TYP_LONG;
                    if ((fill == 0) && ((size & 0xf) == 0))
                    {
                        MakeSrcContained(blkNode, source);
                    }
                }
#else  // !_TARGET_AMD64_
                initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
#endif // !_TARGET_AMD64_
                if ((fill == 0) && ((size & 0xf) == 0))
                {
                    MakeSrcContained(blkNode, source);
                }
                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
            }
            else
            {
                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
            }
        }
        else
        {
#ifdef _TARGET_AMD64_
            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
#else  // !_TARGET_AMD64_
            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
#endif // !_TARGET_AMD64_
        }
    }
    else
    {
        if (blkNode->gtOper == GT_STORE_OBJ)
        {
            // CpObj

            GenTreeObj* cpObjNode = blkNode->AsObj();

            unsigned slots = cpObjNode->gtSlots;

#ifdef DEBUG
            // CpObj must always have at least one GC-Pointer as a member.
            assert(cpObjNode->gtGcPtrCount > 0);

            assert(dstAddr->gtType == TYP_BYREF || dstAddr->gtType == TYP_I_IMPL);

            CORINFO_CLASS_HANDLE clsHnd    = cpObjNode->gtClass;
            size_t               classSize = comp->info.compCompHnd->getClassSize(clsHnd);
            size_t               blkSize   = roundUp(classSize, TARGET_POINTER_SIZE);

            // Currently, the EE always rounds up a class data structure, so we are not
            // handling the case of a struct whose size is not a multiple of the pointer
            // size. This behavior may change in the future, so in order to keep things
            // correct let's assert it just to be safe. Going forward we should simply
            // handle this case.
            assert(classSize == blkSize);
            assert((blkSize / TARGET_POINTER_SIZE) == slots);
            assert(cpObjNode->HasGCPtr());
#endif // DEBUG
            bool IsRepMovsProfitable = false;

            // If the destination is not on the stack, let's find out if we
            // can improve code size by using rep movsq instead of generating
            // sequences of movsq instructions.
            if (!dstAddr->OperIsLocalAddr())
            {
                // Let's inspect the struct/class layout and determine if it's profitable
                // to use rep movsq for copying non-gc memory instead of using single movsq
                // instructions for each memory slot.
                unsigned i      = 0;
                BYTE*    gcPtrs = cpObjNode->gtGcPtrs;

                do
                {
                    unsigned nonGCSlots = 0;
                    // Measure a contiguous non-gc area inside the struct and note the maximum.
                    while (i < slots && gcPtrs[i] == TYPE_GC_NONE)
                    {
                        nonGCSlots++;
                        i++;
                    }

                    while (i < slots && gcPtrs[i] != TYPE_GC_NONE)
                    {
                        i++;
                    }

                    if (nonGCSlots >= CPOBJ_NONGC_SLOTS_LIMIT)
                    {
                        IsRepMovsProfitable = true;
                        break;
                    }
                } while (i < slots);
            }
            else if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
            {
                IsRepMovsProfitable = true;
            }
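            // For illustration (hypothetical layout, not part of the original source):
            // for a struct whose slots are { ref, long, long, long, long, ref }, the
            // scan above finds a contiguous run of 4 non-GC slots; if
            // CPOBJ_NONGC_SLOTS_LIMIT is 4, rep movsq is considered profitable for
            // copying that non-GC portion.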
            // There are two cases in which we need to materialize the
            // struct size:
            // a) When the destination is on the stack we don't need to use the
            //    write barrier, we can just simply call rep movsq and get a win in codesize.
            // b) If we determine we have contiguous non-gc regions in the struct where it's profitable
            //    to use rep movsq instead of a sequence of single movsq instructions. According to the
            //    Intel Manual, the sweet spot for small structs is between 4 to 12 slots of size where
            //    the entire operation takes 20 cycles and encodes in 5 bytes (moving RCX, and calling rep movsq).
            if (IsRepMovsProfitable)
            {
                // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
            }
            else
            {
                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
            }
        }
        else
        {
            assert((blkNode->OperGet() == GT_STORE_BLK) || (blkNode->OperGet() == GT_STORE_DYN_BLK));

            // In case of a CpBlk with a constant size and less than CPBLK_MOVS_LIMIT size
            // we can use rep movs to generate code instead of the helper call.

            // This threshold will decide between using the helper and letting the JIT inline
            // a code sequence of its choice.
            unsigned helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
            // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
            if ((size != 0) && (size <= helperThreshold))
            {
                // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
                // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
                // our framework assemblies, so this is the main code generation scheme we'll use.
                if (size <= CPBLK_UNROLL_LIMIT)
                {
                    blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

                    // If src or dst are on stack, we don't have to generate the address
                    // into a register because it's just some constant+SP.
                    if ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr())
                    {
                        MakeSrcContained(blkNode, srcAddrOrFill);
                    }

                    if (dstAddr->OperIsLocalAddr())
                    {
                        MakeSrcContained(blkNode, dstAddr);
                    }
                }
                else
                {
                    blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
                }
            }
#ifdef _TARGET_AMD64_
            else
            {
                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
            }
#elif defined(_TARGET_X86_)
            else
            {
                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
            }
#endif // _TARGET_X86_
            assert(blkNode->gtBlkOpKind != GenTreeBlk::BlkOpKindInvalid);
        }
        // CopyObj or CopyBlk
        if (source->gtOper == GT_IND)
        {
            // The GT_IND is contained, but the address must be in a register unless it is local.
            MakeSrcContained(blkNode, source);
            GenTree* addr = source->AsIndir()->Addr();
            if (!addr->OperIsLocalAddr())
            {
                addr->ClearContained();
            }
        }
        else if (!source->IsMultiRegCall() && !source->OperIsSIMD() && !source->OperIsSimdHWIntrinsic())
        {
            assert(source->IsLocal());
            MakeSrcContained(blkNode, source);
        }
    }
}
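// For illustration (assembly sketch, not part of the original source): a CpBlk
// lowered to BlkOpKindRepInstr is ultimately emitted along the lines of
//     lea rdi, [dstAddr]
//     lea rsi, [srcAddr]
//     mov ecx, <size in pointer-sized words>
//     rep movsq
// while BlkOpKindUnroll expands into a short straight-line sequence of
// mov/movdqu copies instead.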
//------------------------------------------------------------------------
// LowerPutArgStk: Lower a GT_PUTARG_STK.
//
// Arguments:
//    tree      - The node of interest
//
// Return Value:
//    None.
//
void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
{
#ifdef _TARGET_X86_
    if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
    {
        putArgStk->gtNumberReferenceSlots = 0;
        putArgStk->gtPutArgStkKind        = GenTreePutArgStk::Kind::Invalid;

        GenTreeFieldList* fieldList = putArgStk->gtOp1->AsFieldList();

        // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order
        // of uses is visible to LSRA.
        unsigned          fieldCount = 0;
        GenTreeFieldList* head       = nullptr;
        for (GenTreeFieldList *current = fieldList, *next; current != nullptr; current = next)
        {
            next = current->Rest();

            // First, insert the field node into the sorted list.
            GenTreeFieldList* prev = nullptr;
            for (GenTreeFieldList* cursor = head;; cursor = cursor->Rest())
            {
                // If the offset of the current list node is greater than the offset of the cursor or if we have
                // reached the end of the list, insert the current node before the cursor and terminate.
                if ((cursor == nullptr) || (current->gtFieldOffset > cursor->gtFieldOffset))
                {
                    if (prev == nullptr)
                    {
                        assert(cursor == head);
                        head = current;
                    }
                    else
                    {
                        prev->Rest() = current;
                    }

                    current->Rest() = cursor;
                    break;
                }

                prev = cursor;
            }
            fieldCount++;
        }

        // In theory, the upper bound for the size of a field list is 8: these constructs only appear when passing the
        // collection of lclVars that represent the fields of a promoted struct lclVar, and we do not promote struct
        // lclVars with more than 4 fields. If each of these lclVars is of type long, decomposition will split the
        // corresponding field list nodes in two, giving an upper bound of 8.
        //
        // The reason that this is important is that the algorithm we use above to sort the field list is O(N^2): if
        // the maximum size of a field list grows significantly, we will need to reevaluate it.
        assert(fieldCount <= 8);
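        // For illustration (hypothetical offsets, not part of the original source):
        // a field list with fields at offsets 0, 8 and 4 is reordered by the loop
        // above to 8, 4, 0, i.e. descending by offset. This matches the order in
        // which codegen pushes the fields: highest offset first, since each push
        // grows the stack downward.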
        // The sort above may have changed which node is at the head of the list. Update the PUTARG_STK node if
        // necessary.
        if (head != fieldList)
        {
            head->gtFlags |= GTF_FIELD_LIST_HEAD;
            head->SetContained();

            fieldList->ClearContained();
            fieldList->gtFlags &= ~GTF_FIELD_LIST_HEAD;

#ifdef DEBUG
            head->gtSeqNum = fieldList->gtSeqNum;
#endif // DEBUG

            BlockRange().InsertAfter(fieldList, head);
            BlockRange().Remove(fieldList);

            fieldList         = head;
            putArgStk->gtOp1  = fieldList;
            putArgStk->gtType = fieldList->gtType;
        }
        // Now that the fields have been sorted, determine the kind of code we will generate.
        bool     allFieldsAreSlots = true;
        unsigned prevOffset        = putArgStk->getArgSize();
        for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
        {
            GenTree* const  fieldNode   = current->Current();
            const var_types fieldType   = fieldNode->TypeGet();
            const unsigned  fieldOffset = current->gtFieldOffset;
            assert(fieldType != TYP_LONG);

            // We can treat as a slot any field that is stored at a slot boundary, where the previous
            // field is not in the same slot. (Note that we store the fields in reverse order.)
            const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
            if (!fieldIsSlot)
            {
                allFieldsAreSlots = false;
            }

            if (varTypeIsGC(fieldType))
            {
                putArgStk->gtNumberReferenceSlots++;
            }

            // For x86 we must mark all integral fields as contained or reg-optional, and handle them
            // accordingly in code generation, since we may have up to 8 fields, which cannot all be in
            // registers to be consumed atomically by the call.
            if (varTypeIsIntegralOrI(fieldNode))
            {
                if (fieldNode->OperGet() == GT_LCL_VAR)
                {
                    LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->gtLclNum]);
                    if (!varDsc->lvDoNotEnregister)
                    {
                        fieldNode->SetRegOptional();
                    }
                    else
                    {
                        MakeSrcContained(putArgStk, fieldNode);
                    }
                }
                else if (fieldNode->IsIntCnsFitsInI32())
                {
                    MakeSrcContained(putArgStk, fieldNode);
                }
                else
                {
                    // For the case where we cannot directly push the value, if we run out of registers,
                    // it would be better to defer computation until we are pushing the arguments rather
                    // than spilling, but this situation is not all that common, as most cases of promoted
                    // structs do not have a large number of fields, and of those most are lclVars or
                    // copy-propagated constants.
                    fieldNode->SetRegOptional();
                }
            }

            prevOffset = fieldOffset;
        }

        // Set the copy kind.
        // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should
        // adjust the stack once for those fields. The latter is really best done in code generation, but
        // this tuning should probably be undertaken as a whole.
        // Also, if there are floating point fields, it may be better to use the "Unroll" mode
        // of copying the struct as a whole, if the fields are not register candidates.
        if (allFieldsAreSlots)
        {
            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PushAllSlots;
        }
        else
        {
            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
        }
        return;
    }
#endif // _TARGET_X86_
    GenTree* src = putArgStk->gtOp1;

#ifdef FEATURE_PUT_STRUCT_ARG_STK
    if (src->TypeGet() != TYP_STRUCT)
#endif // FEATURE_PUT_STRUCT_ARG_STK
    {
        // If the child of GT_PUTARG_STK is a constant, we don't need a register to
        // move it to memory (stack location).
        //
        // On AMD64, we don't want to make 0 contained, because we can generate smaller code
        // by zeroing a register and then storing it. E.g.:
        //      xor rdx, rdx
        //      mov gword ptr [rsp+28H], rdx
        // is 2 bytes smaller than:
        //      mov gword ptr [rsp+28H], 0
        //
        // On x86, we push stack arguments; we don't use 'mov'. So:
        //      push 0
        // is 1 byte smaller than:
        //      xor rdx, rdx
        //      push rdx

        if (IsContainableImmed(putArgStk, src)
#if defined(_TARGET_AMD64_)
            && !src->IsIntegralConst(0)
#endif // _TARGET_AMD64_
                )
        {
            MakeSrcContained(putArgStk, src);
        }
        return;
    }
#ifdef FEATURE_PUT_STRUCT_ARG_STK
    GenTree* dst     = putArgStk;
    GenTree* srcAddr = nullptr;

    bool haveLocalAddr = false;
    if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND))
    {
        srcAddr = src->gtOp.gtOp1;
        assert(srcAddr != nullptr);
        haveLocalAddr = srcAddr->OperIsLocalAddr();
    }
    else
    {
        assert(varTypeIsSIMD(putArgStk));
    }

    // In case of a CpBlk we could use a helper call. In case of putarg_stk we
    // can't do that since the helper call could kill some already set up outgoing args.
    // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj.
    // The cpyXXXX code is rather complex and this could cause it to be more complex, but
    // it might be the right thing to do.

    // This threshold will decide between using the helper and letting the JIT inline
    // a code sequence of its choice.
    ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
    ssize_t size            = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
    // TODO-X86-CQ: The helper call either is not supported on x86 or requires more work
    // (I don't know which).

    // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
    // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
    // our framework assemblies, so this is the main code generation scheme we'll use.
    if (size <= CPBLK_UNROLL_LIMIT && putArgStk->gtNumberReferenceSlots == 0)
    {
#ifdef _TARGET_X86_
        if (size < XMM_REGSIZE_BYTES)
        {
            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
        }
        else
#endif // _TARGET_X86_
        {
            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
        }
    }
#ifdef _TARGET_X86_
    else if (putArgStk->gtNumberReferenceSlots != 0)
    {
        // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update
        // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
        putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
    }
#endif // _TARGET_X86_
    else
    {
        putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
    }
    // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
    MakeSrcContained(putArgStk, src);

    // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary
    // copies.
    if (haveLocalAddr)
    {
        MakeSrcContained(putArgStk, srcAddr);
    }
#endif // FEATURE_PUT_STRUCT_ARG_STK
}
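// For illustration (assembly sketch, not part of the original source): on x86 a
// struct argument containing GC references and lowered to Kind::Push becomes a
// sequence along the lines of
//     push dword ptr [esi+8]
//     push dword ptr [esi+4]
//     push dword ptr [esi]
// which lets the emitter record each pushed GC slot in the function's GC info.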
/* Lower GT_CAST(srcType, DstType) nodes.
 *
 * Casts from small int type to float/double are transformed as follows:
 * GT_CAST(byte, float/double)     =   GT_CAST(GT_CAST(byte, int32), float/double)
 * GT_CAST(sbyte, float/double)    =   GT_CAST(GT_CAST(sbyte, int32), float/double)
 * GT_CAST(int16, float/double)    =   GT_CAST(GT_CAST(int16, int32), float/double)
 * GT_CAST(uint16, float/double)   =   GT_CAST(GT_CAST(uint16, int32), float/double)
 *
 * SSE2 conversion instructions operate on signed integers. Casts from uint32/uint64
 * are morphed as follows by front-end and hence should not be seen here.
 * GT_CAST(uint32, float/double)   =   GT_CAST(GT_CAST(uint32, long), float/double)
 * GT_CAST(uint64, float)          =   GT_CAST(GT_CAST(uint64, double), float)
 *
 * Similarly casts from float/double to a smaller int type are transformed as follows:
 * GT_CAST(float/double, byte)     =   GT_CAST(GT_CAST(float/double, int32), byte)
 * GT_CAST(float/double, sbyte)    =   GT_CAST(GT_CAST(float/double, int32), sbyte)
 * GT_CAST(float/double, int16)    =   GT_CAST(GT_CAST(float/double, int32), int16)
 * GT_CAST(float/double, uint16)   =   GT_CAST(GT_CAST(float/double, int32), uint16)
 *
 * SSE2 has instructions to convert a float/double value into a signed 32/64-bit
 * integer.  The above transformations help us to leverage those instructions.
 *
 * Note that for the following conversions we still depend on helper calls and
 * don't expect to see them here.
 *  i) GT_CAST(float/double, uint64)
 * ii) GT_CAST(float/double, int type with overflow detection)
 *
 * TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above.
 * There are hardly any occurrences of this conversion operation in platform
 * assemblies or in CQ perf benchmarks (1 occurrence in mscorlib, microsoft.jscript,
 * 1 occurrence in Roslyn and no occurrences in system, system.core, system.numerics
 * system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that
 * doing this optimization is a win, we should consider generating in-lined code.
 */
void Lowering::LowerCast(GenTree* tree)
{
    assert(tree->OperGet() == GT_CAST);

    GenTree*  castOp     = tree->gtCast.CastOp();
    var_types castToType = tree->CastToType();
    var_types srcType    = castOp->TypeGet();
    var_types tmpType    = TYP_UNDEF;

    // force the srcType to unsigned if GT_UNSIGNED flag is set
    if (tree->gtFlags & GTF_UNSIGNED)
    {
        srcType = genUnsignedType(srcType);
    }

    // We should never see the following casts as they are expected to be lowered
    // appropriately or converted into helper calls by front-end.
    //   srcType = float/double                    castToType = * and overflow detecting cast
    //       Reason: must be converted to a helper call
    //   srcType = float/double,                   castToType = ulong
    //       Reason: must be converted to a helper call
    //   srcType = uint                            castToType = float/double
    //       Reason: uint -> float/double = uint -> long -> float/double
    //   srcType = ulong                           castToType = float
    //       Reason: ulong -> float = ulong -> double -> float
    if (varTypeIsFloating(srcType))
    {
        noway_assert(!tree->gtOverflow());
        noway_assert(castToType != TYP_ULONG);
    }
    else if (srcType == TYP_UINT)
    {
        noway_assert(!varTypeIsFloating(castToType));
    }
    else if (srcType == TYP_ULONG)
    {
        noway_assert(castToType != TYP_FLOAT);
    }

    // Case of src is a small type and dst is a floating point type.
    if (varTypeIsSmall(srcType) && varTypeIsFloating(castToType))
    {
        // These conversions can never be overflow detecting ones.
        noway_assert(!tree->gtOverflow());
        tmpType = TYP_INT;
    }
    // case of src is a floating point type and dst is a small type.
    else if (varTypeIsFloating(srcType) && varTypeIsSmall(castToType))
    {
        tmpType = TYP_INT;
    }

    if (tmpType != TYP_UNDEF)
    {
        GenTree* tmp = comp->gtNewCastNode(tmpType, castOp, tree->IsUnsigned(), tmpType);
        tmp->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT));

        tree->gtFlags &= ~GTF_UNSIGNED;
        tree->gtOp.gtOp1 = tmp;
        BlockRange().InsertAfter(castOp, tmp);
        ContainCheckCast(tmp->AsCast());
    }

    // Now determine if we have operands that should be contained.
    ContainCheckCast(tree->AsCast());
}
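// For illustration (not part of the original source): lowering the cast
//     tree = CAST(int16 <- float)
// inserts an intermediate node, producing
//     tmp  = CAST(int32 <- float)
//     tree = CAST(int16 <- int32), with op1 = tmp
// so that codegen can use the SSE2 cvttss2si instruction for the
// float -> int32 step and a plain integer narrowing for the rest.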
#ifdef FEATURE_SIMD
//----------------------------------------------------------------------------------------------
// Lowering::LowerSIMD: Perform containment analysis for a SIMD intrinsic node.
//
//  Arguments:
//     simdNode - The SIMD intrinsic node.
//
void Lowering::LowerSIMD(GenTreeSIMD* simdNode)
{
    if (simdNode->TypeGet() == TYP_SIMD12)
    {
        // GT_SIMD node requiring to produce TYP_SIMD12 in fact
        // produces a TYP_SIMD16 result
        simdNode->gtType = TYP_SIMD16;
    }

#ifdef _TARGET_XARCH_
    if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem) && (simdNode->gtGetOp1()->OperGet() == GT_IND))
    {
        // If SIMD vector is already in memory, we force its
        // addr to be evaluated into a reg.  This would allow
        // us to generate [regBase] or [regBase+offset] or
        // [regBase+sizeOf(SIMD vector baseType)*regIndex]
        // to access the required SIMD vector element directly
        // from memory.
        //
        // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we
        // might be able update GT_LEA to fold the regIndex
        // or offset in some cases.  Instead with this
        // approach we always evaluate GT_LEA into a reg.
        // Ideally, we should be able to lower GetItem intrinsic
        // into GT_IND(newAddr) where newAddr combines
        // the addr of SIMD vector with the given index.
        simdNode->gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG;
    }
    else if (simdNode->IsSIMDEqualityOrInequality())
    {
        LIR::Use simdUse;

        if (BlockRange().TryGetUse(simdNode, &simdUse))
        {
            //
            // Try to transform JTRUE(EQ|NE(SIMD<OpEquality|OpInEquality>(x, y), 0|1)) into
            // JCC(SIMD<OpEquality|OpInEquality>(x, y)). SIMD<OpEquality|OpInEquality>(x, y)
            // is expected to set the Zero flag appropriately.
            // All the involved nodes must form a continuous range, there's no other way to
            // guarantee that condition flags aren't changed between the SIMD node and the JCC
            // node.
            //

            bool     transformed = false;
            GenTree* simdUser    = simdUse.User();

            if (simdUser->OperIs(GT_EQ, GT_NE) && simdUser->gtGetOp2()->IsCnsIntOrI() &&
                (simdNode->gtNext == simdUser->gtGetOp2()) && (simdUser->gtGetOp2()->gtNext == simdUser))
            {
                ssize_t relopOp2Value = simdUser->gtGetOp2()->AsIntCon()->IconValue();

                if ((relopOp2Value == 0) || (relopOp2Value == 1))
                {
                    GenTree* jtrue = simdUser->gtNext;

                    if ((jtrue != nullptr) && jtrue->OperIs(GT_JTRUE) && (jtrue->gtGetOp1() == simdUser))
                    {
                        if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) != simdUser->OperIs(GT_EQ))
                        {
                            relopOp2Value ^= 1;
                        }

                        jtrue->ChangeOper(GT_JCC);
                        GenTreeCC* jcc = jtrue->AsCC();
                        jcc->gtFlags |= GTF_USE_FLAGS;
                        jcc->gtCondition = (relopOp2Value == 0) ? GT_NE : GT_EQ;

                        BlockRange().Remove(simdUser->gtGetOp2());
                        BlockRange().Remove(simdUser);
                        transformed = true;
                    }
                }
            }

            if (!transformed)
            {
                //
                // The code generated for SIMD SIMD<OpEquality|OpInEquality>(x, y) nodes sets
                // the Zero flag like integer compares do so we can simply use SETCC<EQ|NE>
                // to produce the desired result. This avoids the need for subsequent phases
                // to have to handle 2 cases (set flags/set destination register).
                //

                genTreeOps condition = (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? GT_EQ : GT_NE;
                GenTreeCC* setcc     = new (comp, GT_SETCC) GenTreeCC(GT_SETCC, condition, simdNode->TypeGet());
                setcc->gtFlags |= GTF_USE_FLAGS;
                BlockRange().InsertAfter(simdNode, setcc);
                simdUse.ReplaceWith(comp, setcc);
            }
        }

        simdNode->gtFlags |= GTF_SET_FLAGS;
        simdNode->gtType = TYP_VOID;
    }
#endif // _TARGET_XARCH_

    ContainCheckSIMD(simdNode);
}
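// For illustration (hypothetical LIR, not part of the original source):
//     t1 = SIMD<OpEquality>(x, y)
//     t2 = EQ(t1, 0)
//          JTRUE(t2)
// is rewritten by the transform above into
//     t1 = SIMD<OpEquality>(x, y)   ; sets the Zero flag
//          JCC<NE>                  ; consumes the flags directly
// removing the intermediate compare against the constant.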
#endif // FEATURE_SIMD
#ifdef FEATURE_HW_INTRINSICS
//----------------------------------------------------------------------------------------------
// Lowering::LowerHWIntrinsic: Perform containment analysis for a hardware intrinsic node.
//
//  Arguments:
//     node - The hardware intrinsic node.
//
void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
{
    ContainCheckHWIntrinsic(node);
}
#endif // FEATURE_HW_INTRINSICS
//----------------------------------------------------------------------------------------------
// Lowering::IsRMWIndirCandidate:
//    Returns true if the given operand is a candidate indirection for a read-modify-write
//    operator.
//
//  Arguments:
//     operand - The operand to consider.
//     storeInd - The indirect store that roots the possible RMW operator.
//
bool Lowering::IsRMWIndirCandidate(GenTree* operand, GenTree* storeInd)
{
    // If the operand isn't an indirection, it's trivially not a candidate.
    if (operand->OperGet() != GT_IND)
    {
        return false;
    }

    // If the indirection's source address isn't equivalent to the destination address of the storeIndir, then the
    // indirection is not a candidate.
    GenTree* srcAddr = operand->gtGetOp1();
    GenTree* dstAddr = storeInd->gtGetOp1();
    if ((srcAddr->OperGet() != dstAddr->OperGet()) || !IndirsAreEquivalent(operand, storeInd))
    {
        return false;
    }

    // If it is not safe to contain the entire tree rooted at the indirection, then the indirection is not a
    // candidate. Crawl the IR from the node immediately preceding the storeIndir until the last node in the
    // indirection's tree is visited and check the side effects at each point.
    m_scratchSideEffects.Clear();

    assert((operand->gtLIRFlags & LIR::Flags::Mark) == 0);
    operand->gtLIRFlags |= LIR::Flags::Mark;

    unsigned markCount = 1;
    GenTree* node;
    for (node = storeInd->gtPrev; markCount > 0; node = node->gtPrev)
    {
        assert(node != nullptr);

        if ((node->gtLIRFlags & LIR::Flags::Mark) == 0)
        {
            m_scratchSideEffects.AddNode(comp, node);
        }
        else
        {
            node->gtLIRFlags &= ~LIR::Flags::Mark;
            markCount--;

            if (m_scratchSideEffects.InterferesWith(comp, node, false))
            {
                // The indirection's tree contains some node that can't be moved to the storeInd. The indirection is
                // not a candidate. Clear any leftover mark bits and return.
                for (; markCount > 0; node = node->gtPrev)
                {
                    if ((node->gtLIRFlags & LIR::Flags::Mark) != 0)
                    {
                        node->gtLIRFlags &= ~LIR::Flags::Mark;
                        markCount--;
                    }
                }
                return false;
            }

            node->VisitOperands([&markCount](GenTree* nodeOperand) -> GenTree::VisitResult {
                assert((nodeOperand->gtLIRFlags & LIR::Flags::Mark) == 0);
                nodeOperand->gtLIRFlags |= LIR::Flags::Mark;
                markCount++;
                return GenTree::VisitResult::Continue;
            });
        }
    }

    // At this point we've verified that the operand is an indirection, its address is equivalent to the storeIndir's
    // destination address, and that it and the transitive closure of its operand can be safely contained by the
    // storeIndir. This indirection is therefore a candidate for an RMW op.
    return true;
}
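// For illustration (hypothetical LIR, not part of the original source):
//     t0 = LCL_VAR_ADDR V01
//     t1 = IND t0              ; candidate indirection
//     t2 = CALL Foo            ; may write any memory
//          STOREIND t0, ADD(t1, t2)
// The backward walk from the STOREIND accumulates the call's side effects
// before reaching the indirection's tree, the interference check fails, and
// the candidate is rejected.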
//----------------------------------------------------------------------------------------------
// Returns true if this tree is bin-op of a GT_STOREIND of the following form
//      storeInd(subTreeA, binOp(gtInd(subTreeA), subtreeB)) or
//      storeInd(subTreeA, binOp(subtreeB, gtInd(subTreeA))) in case of commutative bin-ops
//
// The above form for storeInd represents a read-modify-write memory binary operation.
//
// Parameters
//     tree   -   GenTree* of binOp
//
// Return Value
//     True if 'tree' is part of a RMW memory operation pattern
//
976 bool Lowering::IsBinOpInRMWStoreInd(GenTree* tree)
978 // Must be a non floating-point type binary operator since SSE2 doesn't support RMW memory ops
979 assert(!varTypeIsFloating(tree));
980 assert(GenTree::OperIsBinary(tree->OperGet()));
982 // Cheap bail out check before more expensive checks are performed.
983 // RMW memory op pattern requires that one of the operands of binOp to be GT_IND.
984 if (tree->gtGetOp1()->OperGet() != GT_IND && tree->gtGetOp2()->OperGet() != GT_IND)
990 if (!BlockRange().TryGetUse(tree, &use) || use.User()->OperGet() != GT_STOREIND || use.User()->gtGetOp2() != tree)
995 // Since it is not relatively cheap to recognize RMW memory op pattern, we
996 // cache the result in GT_STOREIND node so that while lowering GT_STOREIND
997 // we can use the result.
998 GenTree* indirCandidate = nullptr;
999 GenTree* indirOpSource = nullptr;
1000 return IsRMWMemOpRootedAtStoreInd(use.User(), &indirCandidate, &indirOpSource);
//----------------------------------------------------------------------------------------------
// This method recognizes the case where we have a treeNode with the following structure:
//         storeInd(IndirDst, binOp(gtInd(IndirDst), indirOpSource)) OR
//         storeInd(IndirDst, binOp(indirOpSource, gtInd(IndirDst))) in case of commutative operations OR
//         storeInd(IndirDst, unaryOp(gtInd(IndirDst))) in case of unary operations
//
// Terminology:
//         indirDst = memory write of an addr mode (i.e. storeind destination)
//         indirSrc = value being written to memory (i.e. storeind source which could either be a binary or unary op)
//         indirCandidate = memory read i.e. a gtInd of an addr mode
//         indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
//
// In x86/x64 this storeInd pattern can be effectively encoded in a single instruction of the
// following form in case of integer operations:
//         binOp [addressing mode], RegIndirOpSource
//         binOp [addressing mode], immediateVal
// where RegIndirOpSource is the register where indirOpSource was computed.
//
// Right now, we recognize a few cases:
//     a) The gtInd child is a lea/lclVar/lclVarAddr/clsVarAddr/constant
//     b) BinOp is either add, sub, xor, or, and, shl, rsh, rsz.
//     c) unaryOp is either not/neg
//
// Implementation Note: The following routines need to be in sync for RMW memory op optimization
// to be correct and functional.
//     IndirsAreEquivalent()
//     NodesAreEquivalentLeaves()
//     Codegen of GT_STOREIND and genCodeForShiftRMW()
//
// TODO-CQ: Enable support for more complex indirections (if needed) or use the value numbering
// package to perform more complex tree recognition.
//
// TODO-XArch-CQ: Add support for RMW of lcl fields (e.g. lclfield binop= source)
//
// Parameters:
//     tree               -  GT_STOREIND node
//     outIndirCandidate  -  out param set to indirCandidate as described above
//     outIndirOpSource   -  out param set to indirOpSource as described above
//
// Return value
//     True if there is a RMW memory operation rooted at a GT_STOREIND tree
//     and out params indirCandidate and indirOpSource are set to non-null values.
//     Otherwise, returns false with indirCandidate and indirOpSource set to null.
//     Also updates flags of GT_STOREIND tree with its RMW status.
//
bool Lowering::IsRMWMemOpRootedAtStoreInd(GenTree* tree, GenTree** outIndirCandidate, GenTree** outIndirOpSource)
{
    assert(!varTypeIsFloating(tree));
    assert(outIndirCandidate != nullptr);
    assert(outIndirOpSource != nullptr);

    *outIndirCandidate = nullptr;
    *outIndirOpSource  = nullptr;

    // Early out if storeInd is already known to be a non-RMW memory op
    GenTreeStoreInd* storeInd = tree->AsStoreInd();
    if (storeInd->IsNonRMWMemoryOp())
    {
        return false;
    }

    GenTree*   indirDst = storeInd->gtGetOp1();
    GenTree*   indirSrc = storeInd->gtGetOp2();
    genTreeOps oper     = indirSrc->OperGet();

    // Early out if it is already known to be a RMW memory op
    if (storeInd->IsRMWMemoryOp())
    {
        if (GenTree::OperIsBinary(oper))
        {
            if (storeInd->IsRMWDstOp1())
            {
                *outIndirCandidate = indirSrc->gtGetOp1();
                *outIndirOpSource  = indirSrc->gtGetOp2();
            }
            else
            {
                assert(storeInd->IsRMWDstOp2());
                *outIndirCandidate = indirSrc->gtGetOp2();
                *outIndirOpSource  = indirSrc->gtGetOp1();
            }
            assert(IndirsAreEquivalent(*outIndirCandidate, storeInd));
        }
        else
        {
            assert(GenTree::OperIsUnary(oper));
            assert(IndirsAreEquivalent(indirSrc->gtGetOp1(), storeInd));
            *outIndirCandidate = indirSrc->gtGetOp1();
            *outIndirOpSource  = indirSrc->gtGetOp1();
        }
        return true;
    }
    // If we reached here, we do not yet know the RMW status of the tree rooted at storeInd
    assert(storeInd->IsRMWStatusUnknown());

    // Early out if indirDst is not one of the supported memory operands.
    if (indirDst->OperGet() != GT_LEA && indirDst->OperGet() != GT_LCL_VAR && indirDst->OperGet() != GT_LCL_VAR_ADDR &&
        indirDst->OperGet() != GT_CLS_VAR_ADDR && indirDst->OperGet() != GT_CNS_INT)
    {
        storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
        return false;
    }

    // We can not use Read-Modify-Write instruction forms with overflow checking instructions
    // because we are not allowed to modify the target until after the overflow check.
    if (indirSrc->gtOverflowEx())
    {
        storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
        return false;
    }

    // At this point we can match one of two patterns:
    //
    //     t_ind = indir t_addr_0
    //       ...
    //     t_value = binop t_ind, t_other
    //       ...
    //     storeIndir t_addr_1, t_value
    //
    // or
    //
    //     t_ind = indir t_addr_0
    //       ...
    //     t_value = unop t_ind
    //       ...
    //     storeIndir t_addr_1, t_value
    //
    // In all cases, we will eventually make the binop that produces t_value and the entire dataflow tree rooted at
    // t_ind contained by t_value.
    GenTree*  indirCandidate = nullptr;
    GenTree*  indirOpSource  = nullptr;
    RMWStatus status         = STOREIND_RMW_STATUS_UNKNOWN;
    if (GenTree::OperIsBinary(oper))
    {
        // Return if binary op is not one of the supported operations for RMW of memory.
        if (!GenTree::OperIsRMWMemOp(oper))
        {
            storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
            return false;
        }

        if (GenTree::OperIsShiftOrRotate(oper) && varTypeIsSmall(storeInd))
        {
            // In ldind, integer values smaller than 4 bytes (a boolean or a character) are
            // converted to 4 bytes by sign or zero-extension as appropriate. If we directly shift
            // the short type data using sar, we will lose the sign or zero-extension bits.
            storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_TYPE);
            return false;
        }

        // In the common case, the second operand to the binop will be the indir candidate.
        GenTreeOp* binOp = indirSrc->AsOp();
        if (GenTree::OperIsCommutative(oper) && IsRMWIndirCandidate(binOp->gtOp2, storeInd))
        {
            indirCandidate = binOp->gtOp2;
            indirOpSource  = binOp->gtOp1;
            status         = STOREIND_RMW_DST_IS_OP2;
        }
        else if (IsRMWIndirCandidate(binOp->gtOp1, storeInd))
        {
            indirCandidate = binOp->gtOp1;
            indirOpSource  = binOp->gtOp2;
            status         = STOREIND_RMW_DST_IS_OP1;
        }
        else
        {
            storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
            return false;
        }
    }
    else if (GenTree::OperIsUnary(oper))
    {
        // Nodes other than GT_NOT and GT_NEG are not yet supported.
        if (oper != GT_NOT && oper != GT_NEG)
        {
            storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
            return false;
        }

        if (indirSrc->gtGetOp1()->OperGet() != GT_IND)
        {
            storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
            return false;
        }

        GenTreeUnOp* unOp = indirSrc->AsUnOp();
        if (IsRMWIndirCandidate(unOp->gtOp1, storeInd))
        {
            // src and dest are the same in case of unary ops
            indirCandidate = unOp->gtOp1;
            indirOpSource  = unOp->gtOp1;
            status         = STOREIND_RMW_DST_IS_OP1;
        }
        else
        {
            storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
            return false;
        }
    }
    else
    {
        storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
        return false;
    }

    // By this point we've verified that we have a supported operand with a supported address. Now we need to ensure
    // that we're able to move the destination address for the source indirection forwards.
    if (!IsSafeToContainMem(storeInd, indirDst))
    {
        storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
        return false;
    }

    assert(indirCandidate != nullptr);
    assert(indirOpSource != nullptr);
    assert(status != STOREIND_RMW_STATUS_UNKNOWN);

    *outIndirCandidate = indirCandidate;
    *outIndirOpSource  = indirOpSource;
    storeInd->SetRMWStatus(status);
    return true;
}
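// For illustration (hypothetical LIR, not part of the original source):
//     t_addr = LEA [rbx+8]
//     t_ind  = IND t_addr
//     t_val  = ADD t_ind, 5
//              STOREIND t_addr', t_val    ; t_addr' equivalent to t_addr
// matches the binary pattern with indirCandidate = t_ind and
// indirOpSource = 5, and can later be encoded as "add dword ptr [rbx+8], 5".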
// anything is in range for AMD64
bool Lowering::IsCallTargetInRange(void* addr)
{
    return true;
}

// return true if the immediate can be folded into an instruction, for example small enough and non-relocatable
bool Lowering::IsContainableImmed(GenTree* parentNode, GenTree* childNode)
{
    if (!childNode->IsIntCnsFitsInI32())
    {
        return false;
    }

    // At this point we know that it is an int const that fits within 4 bytes and hence can safely be cast to
    // IntConCommon. Icons that need relocation should never be marked as contained immed.
    if (childNode->AsIntConCommon()->ImmedValNeedsReloc(comp))
    {
        return false;
    }

    return true;
}
//-----------------------------------------------------------------------
// PreferredRegOptionalOperand: returns one of the operands of given
// binary oper that is to be preferred for marking as reg optional.
//
// Since only one of op1 or op2 can be a memory operand on xarch, only
// one of them has to be marked as reg optional. Since Lower doesn't
// know a priori which of op1 or op2 is not likely to get a register, it
// has to make a guess. This routine encapsulates heuristics that
// guess whether it is likely to be beneficial to mark op1 or op2 as
// reg optional.
//
// Arguments:
//     tree  -  a binary-op tree node that is either commutative
//              or a compare oper.
//
// Returns:
//     Returns op1 or op2 of tree node that is preferred for
//     marking as reg optional.
//
// Note: if the tree oper is neither commutative nor a compare oper
// then only op2 can be reg optional on xarch and hence no need to
// call this routine.
GenTree* Lowering::PreferredRegOptionalOperand(GenTree* tree)
{
    assert(GenTree::OperIsBinary(tree->OperGet()));
    assert(tree->OperIsCommutative() || tree->OperIsCompare() || tree->OperIs(GT_CMP));

    GenTree* op1 = tree->gtGetOp1();
    GenTree* op2 = tree->gtGetOp2();
    assert(!op1->IsRegOptional() && !op2->IsRegOptional());

    // We default to op1, as op2 is likely to have the shorter lifetime.
    GenTree* preferredOp = op1;

    // This routine uses the following heuristics:
    //
    // a) If both are register candidates, marking the one with lower weighted
    // ref count as reg-optional would likely be beneficial as it has
    // higher probability of not getting a register. Note that we use !lvDoNotEnregister
    // here because this is being done while we are adding lclVars for Lowering.
    //
    // b) op1 = tracked local and op2 = untracked local: LSRA creates two
    // ref positions for op2: a def and use position. op2's def position
    // requires a reg and it is allocated a reg by spilling another
    // interval (if required) and that could be even op1. For this reason
    // it is beneficial to mark op1 as reg optional.
    //
    // TODO: It is not always mandatory for a def position of an untracked
    // local to be allocated a register if it is on rhs of an assignment
    // and its use position is reg-optional and has not been assigned a
    // register. Reg optional def positions are currently not yet supported.
    //
    // c) op1 = untracked local and op2 = tracked local: marking op1 as
    // reg optional is beneficial, since its use position is less likely
    // to get a register.
    //
    // d) If both are untracked locals (i.e. treated like tree temps by
    // LSRA): though either of them could be marked as reg optional,
    // marking op1 as reg optional is likely to be beneficial because
    // while allocating op2's def position, there is a possibility of
    // spilling op1's def and in which case op1 is treated as contained
    // memory operand rather than requiring to reload.
    //
    // e) If only one of them is a local var, prefer to mark it as
    // reg-optional. This heuristic is based on the results
    // obtained against CQ perf benchmarks.
    //
    // f) If neither of them are local vars (i.e. tree temps), prefer to
    // mark op1 as reg optional for the same reason as mentioned in (d) above.
    if (op1->OperGet() == GT_LCL_VAR && op2->OperGet() == GT_LCL_VAR)
    {
        LclVarDsc* v1 = comp->lvaTable + op1->AsLclVarCommon()->GetLclNum();
        LclVarDsc* v2 = comp->lvaTable + op2->AsLclVarCommon()->GetLclNum();

        bool v1IsRegCandidate = !v1->lvDoNotEnregister;
        bool v2IsRegCandidate = !v2->lvDoNotEnregister;
        if (v1IsRegCandidate && v2IsRegCandidate)
        {
            // Both are enregisterable locals. The one with lower weight is less likely
            // to get a register and hence beneficial to mark the one with lower
            // weight as reg optional.
            // If either is not tracked, it may be that it was introduced after liveness
            // was run, in which case we will always prefer op1 (should we use raw refcnt??).
            if (v1->lvTracked && v2->lvTracked && (v1->lvRefCntWtd >= v2->lvRefCntWtd))
            {
                preferredOp = op2;
            }
        }
    }
    else if (!(op1->OperGet() == GT_LCL_VAR) && (op2->OperGet() == GT_LCL_VAR))
    {
        preferredOp = op2;
    }

    return preferredOp;
}
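// For illustration (not part of the original source): given CMP(t1, V02) where
// t1 is a tree temp and V02 is a lclVar, heuristic (e) applies and op2 (V02)
// is returned, since only one of the operands is a local var; in most other
// cases the default of op1 is kept.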
//------------------------------------------------------------------------
// Containment analysis
//------------------------------------------------------------------------

//------------------------------------------------------------------------
// ContainCheckCallOperands: Determine whether operands of a call should be contained.
//
// Arguments:
//    call       - The call node of interest
//
// Return Value:
//    None.
//
void Lowering::ContainCheckCallOperands(GenTreeCall* call)
{
    GenTree* ctrlExpr = call->gtControlExpr;
    if (call->gtCallType == CT_INDIRECT)
    {
        // either gtControlExpr != null or gtCallAddr != null.
        // Both cannot be non-null at the same time.
        assert(ctrlExpr == nullptr);
        assert(call->gtCallAddr != nullptr);
        ctrlExpr = call->gtCallAddr;

#ifdef _TARGET_X86_
        // Fast tail calls aren't currently supported on x86, but if they ever are, the code
        // below that handles indirect VSD calls will need to be fixed.
        assert(!call->IsFastTailCall() || !call->IsVirtualStub());
#endif // _TARGET_X86_
    }

    // set reg requirements on call target represented as control sequence.
    if (ctrlExpr != nullptr)
    {
        // we should never see a gtControlExpr whose type is void.
        assert(ctrlExpr->TypeGet() != TYP_VOID);

        // In case of fast tail implemented as jmp, make sure that gtControlExpr is
        // computed into a register.
        if (!call->IsFastTailCall())
        {
#ifdef _TARGET_X86_
            // On x86, we need to generate a very specific pattern for indirect VSD calls:
            //
            //    3-byte nop
            //    call dword ptr [eax]
            //
            // Where EAX is also used as an argument to the stub dispatch helper. Make
            // sure that the call target address is computed into EAX in this case.
            if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
            {
                assert(ctrlExpr->isIndir());
                MakeSrcContained(call, ctrlExpr);
            }
            else
#endif // _TARGET_X86_
                if (ctrlExpr->isIndir())
            {
                // We may have cases where we have set a register target on the ctrlExpr, but if it
                // is contained we must clear it.
                ctrlExpr->gtRegNum = REG_NA;
                MakeSrcContained(call, ctrlExpr);
            }
        }
    }

    GenTree* args = call->gtCallArgs;
    while (args)
    {
        GenTree* arg = args->gtOp.gtOp1;
        if (arg->gtOper == GT_PUTARG_STK)
        {
            LowerPutArgStk(arg->AsPutArgStk());
        }
        args = args->gtOp.gtOp2;
    }
    args = call->gtCallLateArgs;
    while (args)
    {
        GenTree* arg = args->gtOp.gtOp1;
        if (arg->gtOper == GT_PUTARG_STK)
        {
            LowerPutArgStk(arg->AsPutArgStk());
        }
        args = args->gtOp.gtOp2;
    }
}
//------------------------------------------------------------------------
// ContainCheckIndir: Determine whether operands of an indir should be contained.
//
// Arguments:
//    node       - The indirection node of interest
//
// Notes:
//    This is called for both store and load indirections. In the former case, it is assumed that
//    LowerStoreIndir() has already been called to check for RMW opportunities.
//
// Return Value:
//    None.
//
void Lowering::ContainCheckIndir(GenTreeIndir* node)
{
    GenTree* addr = node->Addr();

    // If this is the rhs of a block copy it will be handled when we handle the store.
    if (node->TypeGet() == TYP_STRUCT)
    {
        return;
    }

#ifdef FEATURE_SIMD
    // If indirTree is of TYP_SIMD12, don't mark addr as contained
    // so that it always gets computed to a register. This would
    // mean codegen side logic doesn't need to handle all possible
    // addr expressions that could be contained.
    //
    // TODO-XArch-CQ: handle other addr mode expressions that could be marked
    // as contained.
    if (node->TypeGet() == TYP_SIMD12)
    {
        return;
    }
#endif // FEATURE_SIMD

    if ((node->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0)
    {
        // The address of an indirection that requires its address in a reg.
        // Skip any further processing that might otherwise make it contained.
    }
    else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR))
    {
        // These nodes go into an addr mode:
        // - GT_CLS_VAR_ADDR turns into a constant.
        // - GT_LCL_VAR_ADDR is a stack addr mode.

        // make this contained, it turns into a constant that goes into an addr mode
        MakeSrcContained(node, addr);
    }
    else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp))
    {
        // Amd64:
        // We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address.
        // (i.e. those VSD calls for which stub addr is known during JIT compilation time). In this case,
        // VM requires us to pass stub addr in VirtualStubParam.reg - see LowerVirtualStubCall(). For
        // that reason we cannot mark such an addr as contained. Note that this is not an issue for
        // indirect VSD calls since morphArgs() is explicitly materializing hidden param as a non-standard
        // argument.
        //
        // Workaround:
        // Note that LowerVirtualStubCall() sets addr->gtRegNum to VirtualStubParam.reg and Lowering::doPhase()
        // sets destination candidates on such nodes and resets addr->gtRegNum to REG_NA.
        // Ideally we should set a flag on addr nodes that shouldn't be marked as contained
        // (in LowerVirtualStubCall()), but we don't have any GTF_* flags left for that purpose. As a workaround
        // an explicit check is made here.
        //
        // On x86, direct VSD is done via a relative branch, and in fact it MUST be contained.
        MakeSrcContained(node, addr);
    }
    else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(node, addr))
    {
        MakeSrcContained(node, addr);
    }
}
//------------------------------------------------------------------------
// ContainCheckStoreIndir: determine whether the sources of a STOREIND node should be contained.
//
// Arguments:
//    node - pointer to the node
//
void Lowering::ContainCheckStoreIndir(GenTreeIndir* node)
{
    // If the source is a containable immediate, make it contained, unless it is
    // an int-size or larger store of zero to memory, because we can generate smaller code
    // by zeroing a register and then storing it.
    GenTree* src = node->gtOp.gtOp2;
    if (IsContainableImmed(node, src) &&
        (!src->IsIntegralConst(0) || varTypeIsSmall(node) || node->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR))
    {
        MakeSrcContained(node, src);
    }
    ContainCheckIndir(node);
}
//------------------------------------------------------------------------
// ContainCheckMul: determine whether the sources of a MUL node should be contained.
//
// Arguments:
//    node - pointer to the node
//
void Lowering::ContainCheckMul(GenTreeOp* node)
{
#if defined(_TARGET_X86_)
    assert(node->OperIs(GT_MUL, GT_MULHI, GT_MUL_LONG));
#else
    assert(node->OperIs(GT_MUL, GT_MULHI));
#endif
    GenTree* op1 = node->gtOp.gtOp1;
    GenTree* op2 = node->gtOp.gtOp2;

    // Case of float/double mul.
    if (varTypeIsFloating(node->TypeGet()))
    {
        assert(node->OperGet() == GT_MUL);

        if (IsContainableMemoryOp(op2) || op2->IsCnsNonZeroFltOrDbl())
        {
            MakeSrcContained(node, op2);
        }
        else if (op1->IsCnsNonZeroFltOrDbl() || (IsContainableMemoryOp(op1) && IsSafeToContainMem(node, op1)))
        {
            // Since GT_MUL is commutative, we will try to re-order operands if it is safe to
            // generate a more efficient code sequence for the case of GT_MUL(op1=memOp, op2=non-memOp)
            MakeSrcContained(node, op1);
        }
        else
        {
            // If there are no containable operands, we can make an operand reg optional.
            SetRegOptionalForBinOp(node);
        }
        return;
    }
    bool                 isUnsignedMultiply    = ((node->gtFlags & GTF_UNSIGNED) != 0);
    bool                 requiresOverflowCheck = node->gtOverflowEx();
    bool                 useLeaEncoding        = false;
    GenTree*             memOp                 = nullptr;

    bool                 hasImpliedFirstOperand = false;
    GenTreeIntConCommon* imm                    = nullptr;
    GenTree*             other                  = nullptr;

    // Multiply should never be using small types
    assert(!varTypeIsSmall(node->TypeGet()));

    // We do use the widening multiply to implement
    // the overflow checking for unsigned multiply
    //
    if (isUnsignedMultiply && requiresOverflowCheck)
    {
        hasImpliedFirstOperand = true;
    }
    else if (node->OperGet() == GT_MULHI)
    {
        hasImpliedFirstOperand = true;
    }
#if defined(_TARGET_X86_)
    else if (node->OperGet() == GT_MUL_LONG)
    {
        hasImpliedFirstOperand = true;
    }
#endif
    else if (IsContainableImmed(node, op2) || IsContainableImmed(node, op1))
    {
        if (IsContainableImmed(node, op2))
        {
            imm   = op2->AsIntConCommon();
            other = op1;
        }
        else
        {
            imm   = op1->AsIntConCommon();
            other = op2;
        }

        // CQ: We want to rewrite this into a LEA
        ssize_t immVal = imm->AsIntConCommon()->IconValue();
        if (!requiresOverflowCheck && (immVal == 3 || immVal == 5 || immVal == 9))
        {
            useLeaEncoding = true;
        }

        MakeSrcContained(node, imm); // The imm is always contained
        if (IsContainableMemoryOp(other))
        {
            memOp = other; // memOp may be contained below
        }
    }

    // We allow one operand to be a contained memory operand.
    // The memory op type must match with the 'node' type.
    // This is because during codegen we use 'node' type to derive EmitTypeSize.
    // E.g op1 type = byte, op2 type = byte but GT_MUL node type is int.
    //
    if (memOp == nullptr)
    {
        if (IsContainableMemoryOp(op2) && (op2->TypeGet() == node->TypeGet()) && IsSafeToContainMem(node, op2))
        {
            memOp = op2;
        }
        else if (IsContainableMemoryOp(op1) && (op1->TypeGet() == node->TypeGet()) && IsSafeToContainMem(node, op1))
        {
            memOp = op1;
        }
    }
    else
    {
        if ((memOp->TypeGet() != node->TypeGet()) || !IsSafeToContainMem(node, memOp))
        {
            memOp = nullptr;
        }
    }

    // To generate an LEA we need to force memOp into a register
    // so don't allow memOp to be 'contained'
    //
    if (!useLeaEncoding)
    {
        if (memOp != nullptr)
        {
            MakeSrcContained(node, memOp);
        }
        else if (imm != nullptr)
        {
            // Has a contained immediate operand.
            // Only 'other' operand can be marked as reg optional.
            assert(other != nullptr);
            other->SetRegOptional();
        }
        else if (hasImpliedFirstOperand)
        {
            // Only op2 can be marked as reg optional.
            op2->SetRegOptional();
        }
        else
        {
            // If there are no containable operands, we can make either of op1 or op2
            // reg optional.
            SetRegOptionalForBinOp(node);
        }
    }
}
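// For illustration (assembly sketch, not part of the original source): with
// useLeaEncoding, "x * 9" can be emitted as
//     lea rax, [rcx+rcx*8]
// which is why the immediates 3, 5 and 9 (i.e. 1 plus a 2/4/8 scale) are
// special-cased above instead of producing an imul.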
//------------------------------------------------------------------------
// ContainCheckShiftRotate: determine whether the sources of a shift/rotate node should be contained.
//
// Arguments:
//    node - pointer to the node
//
void Lowering::ContainCheckShiftRotate(GenTreeOp* node)
{
#ifdef _TARGET_X86_
    GenTree* source = node->gtOp1;
    if (node->OperIs(GT_LSH_HI, GT_RSH_LO))
    {
        assert(source->OperGet() == GT_LONG);
        MakeSrcContained(node, source);
    }
#endif // _TARGET_X86_

    assert(node->OperIsShiftOrRotate());

    GenTree* shiftBy = node->gtOp2;
    if (IsContainableImmed(node, shiftBy) && (shiftBy->gtIntConCommon.IconValue() <= 255) &&
        (shiftBy->gtIntConCommon.IconValue() >= 0))
    {
        MakeSrcContained(node, shiftBy);
    }
}
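// For illustration (not part of the original source): a contained constant
// shift count lets codegen emit the immediate form, e.g. "shl eax, 5",
// instead of first loading the count into CL for the "shl eax, cl" form.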
//------------------------------------------------------------------------
// ContainCheckStoreLoc: determine whether the source of a STORE_LCL* should be contained.
//
// Arguments:
//    node - pointer to the node
//
void Lowering::ContainCheckStoreLoc(GenTreeLclVarCommon* storeLoc)
{
    assert(storeLoc->OperIsLocalStore());
    GenTree* op1 = storeLoc->gtGetOp1();

#ifdef FEATURE_SIMD
    if (varTypeIsSIMD(storeLoc))
    {
        if (op1->IsCnsIntOrI())
        {
            // For an InitBlk we want op1 to be contained; otherwise we want it to
            // be evaluated into an xmm register.
            MakeSrcContained(storeLoc, op1);
        }
        return;
    }
#endif // FEATURE_SIMD

    // If the source is a containable immediate, make it contained, unless it is
    // an int-size or larger store of zero to memory, because we can generate smaller code
    // by zeroing a register and then storing it.
    if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(storeLoc)))
    {
        MakeSrcContained(storeLoc, op1);
    }
#ifdef _TARGET_X86_
    else if (op1->OperGet() == GT_LONG)
    {
        MakeSrcContained(storeLoc, op1);
    }
#endif // _TARGET_X86_
}
//------------------------------------------------------------------------
// ContainCheckCast: determine whether the source of a CAST node should be contained.
//
// Arguments:
//    node - pointer to the node
//
void Lowering::ContainCheckCast(GenTreeCast* node)
{
    GenTree*  castOp     = node->CastOp();
    var_types castToType = node->CastToType();
    var_types srcType    = castOp->TypeGet();

    // force the srcType to unsigned if GT_UNSIGNED flag is set
    if (node->gtFlags & GTF_UNSIGNED)
    {
        srcType = genUnsignedType(srcType);
    }

    if (!node->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(srcType)))
    {
#ifdef DEBUG
        // If converting to float/double, the operand must be 4 or 8 byte in size.
        if (varTypeIsFloating(castToType))
        {
            unsigned opSize = genTypeSize(srcType);
            assert(opSize == 4 || opSize == 8);
        }
#endif // DEBUG

        // U8 -> R8 conversion requires that the operand be in a register.
        if (srcType != TYP_ULONG)
        {
            if (IsContainableMemoryOp(castOp) || castOp->IsCnsNonZeroFltOrDbl())
            {
                MakeSrcContained(node, castOp);
            }
            else
            {
                // Mark castOp as reg optional to indicate codegen
                // can still generate code if it is on stack.
                castOp->SetRegOptional();
            }
        }
    }

#if !defined(_TARGET_64BIT_)
    if (varTypeIsLong(srcType))
    {
        noway_assert(castOp->OperGet() == GT_LONG);
        castOp->SetContained();
    }
#endif // !defined(_TARGET_64BIT_)
}
1806 //------------------------------------------------------------------------
1807 // ContainCheckCompare: determine whether the sources of a compare node should be contained.
1810 // node - pointer to the node
1812 void Lowering::ContainCheckCompare(GenTreeOp* cmp)
1814 assert(cmp->OperIsCompare() || cmp->OperIs(GT_CMP));
1816 GenTree* op1 = cmp->gtOp.gtOp1;
1817 GenTree* op2 = cmp->gtOp.gtOp2;
1818 var_types op1Type = op1->TypeGet();
1819 var_types op2Type = op2->TypeGet();
// If either op1 or op2 is a floating point value, we need to use
// ucomiss or ucomisd to compare, both of which support the following form:
//     ucomis[s|d] xmm, xmm/mem
// That is, only the second operand can be a memory op.
//
// Note that depending on the comparison operator, the operands of
// ucomis[s|d] may need to be reversed; therefore, either op1 or op2
// can end up as the memory op.
1829 if (varTypeIsFloating(op1Type))
// The types of the operands have to be the same, with no implicit conversions at this stage.
1832 assert(op1Type == op2Type);
1835 if ((cmp->gtFlags & GTF_RELOP_NAN_UN) != 0)
1837 // Unordered comparison case
1838 reverseOps = cmp->OperIs(GT_GT, GT_GE);
1842 reverseOps = cmp->OperIs(GT_LT, GT_LE);
1855 assert(otherOp != nullptr);
1856 if (otherOp->IsCnsNonZeroFltOrDbl())
1858 MakeSrcContained(cmp, otherOp);
1860 else if (IsContainableMemoryOp(otherOp) && ((otherOp == op2) || IsSafeToContainMem(cmp, otherOp)))
1862 MakeSrcContained(cmp, otherOp);
1866 // SSE2 allows only otherOp to be a memory-op. Since otherOp is not
1867 // contained, we can mark it reg-optional.
1868 otherOp->SetRegOptional();
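// As an example of the operand reversal described above (illustrative only):
//
//     bool Less(float* p, float b) { return *p < b; }
//
// ucomiss only allows its second operand in memory, so lowering swaps the
// operands and uses the inverse condition:
//     ucomiss xmm1, dword ptr [rcx]    ; compare b against *p
//     seta al                          ; b > *p  <=>  *p < b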
// TODO-XArch-CQ: factor out the cmp optimization in 'genCondSetFlags' so that
// it can be used here or in the other backend.
1877 if (CheckImmedAndMakeContained(cmp, op2))
1879 // If the types are the same, or if the constant is of the correct size,
1880 // we can treat the MemoryOp as contained.
1881 if (op1Type == op2Type)
1883 if (IsContainableMemoryOp(op1))
1885 MakeSrcContained(cmp, op1);
1889 op1->SetRegOptional();
1893 else if (op1Type == op2Type)
1895 // Note that TEST does not have a r,rm encoding like CMP has but we can still
1896 // contain the second operand because the emitter maps both r,rm and rm,r to
1897 // the same instruction code. This avoids the need to special case TEST here.
1898 if (IsContainableMemoryOp(op2))
1900 MakeSrcContained(cmp, op2);
1902 else if (IsContainableMemoryOp(op1) && IsSafeToContainMem(cmp, op1))
1904 MakeSrcContained(cmp, op1);
1906 else if (op1->IsCnsIntOrI())
1908 op2->SetRegOptional();
1912 // One of op1 or op2 could be marked as reg optional
1913 // to indicate that codegen can still generate code
1914 // if one of them is on stack.
1915 PreferredRegOptionalOperand(cmp)->SetRegOptional();
1920 //------------------------------------------------------------------------
1921 // LowerRMWMemOp: Determine if this is a valid RMW mem op, and if so lower it accordingly
// storeInd - The indirect store node (GT_STOREIND) of interest
1927 // Returns true if 'node' is a valid RMW mem op; false otherwise.
1929 bool Lowering::LowerRMWMemOp(GenTreeIndir* storeInd)
1931 assert(storeInd->OperGet() == GT_STOREIND);
1933 // SSE2 doesn't support RMW on float values
1934 assert(!varTypeIsFloating(storeInd));
1937 // indirDst = memory write of an addr mode (i.e. storeind destination)
// indirSrc = value being written to memory (i.e. the storeind source, which could be a binary/unary op)
1939 // indirCandidate = memory read i.e. a gtInd of an addr mode
1940 // indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
1942 GenTree* indirCandidate = nullptr;
1943 GenTree* indirOpSource = nullptr;
1945 if (!IsRMWMemOpRootedAtStoreInd(storeInd, &indirCandidate, &indirOpSource))
1947 JITDUMP("Lower of StoreInd didn't mark the node as self contained for reason: %d\n",
1948 storeInd->AsStoreInd()->GetRMWStatus());
1949 DISPTREERANGE(BlockRange(), storeInd);
1953 GenTree* indirDst = storeInd->gtGetOp1();
1954 GenTree* indirSrc = storeInd->gtGetOp2();
1955 genTreeOps oper = indirSrc->OperGet();
1957 // At this point we have successfully detected a RMW memory op of one of the following forms
1958 // storeInd(indirDst, indirSrc(indirCandidate, indirOpSource)) OR
// storeInd(indirDst, indirSrc(indirOpSource, indirCandidate)) in case of commutative operations OR
// storeInd(indirDst, indirSrc(indirCandidate)) in case of unary operations
// Here indirSrc = one of the supported binary or unary operations for RMW of memory
1963 // indirCandidate = a GT_IND node
1964 // indirCandidateChild = operand of GT_IND indirCandidate
1966 // The logic below does the following
1967 // Make indirOpSource contained.
1968 // Make indirSrc contained.
1969 // Make indirCandidate contained.
1970 // Make indirCandidateChild contained.
// Make indirDst contained except when it is a GT_LCL_VAR or a GT_CNS_INT that doesn't fit within the addr base.
1975 // We have already done containment analysis on the indirSrc op.
1976 // If any of its operands are marked regOptional, reset that now.
1977 indirSrc->AsOp()->gtOp1->ClearRegOptional();
1978 if (GenTree::OperIsBinary(oper))
// On XArch, RMW operations require the source to be an immediate or in a register.
1981 // Therefore, if we have previously marked the indirOpSource as contained while lowering
1982 // the binary node, we need to reset that now.
1983 if (IsContainableMemoryOp(indirOpSource))
1985 indirOpSource->ClearContained();
1987 indirSrc->AsOp()->gtOp2->ClearRegOptional();
JITDUMP("Lower successfully detected an assignment of the form: *addrMode BinOp= source\n");
1992 assert(GenTree::OperIsUnary(oper));
JITDUMP("Lower successfully detected an assignment of the form: *addrMode = UnaryOp(*addrMode)\n");
1995 DISPTREERANGE(BlockRange(), storeInd);
1997 indirSrc->SetContained();
1998 indirCandidate->SetContained();
2000 GenTree* indirCandidateChild = indirCandidate->gtGetOp1();
2001 indirCandidateChild->SetContained();
2003 if (indirCandidateChild->OperGet() == GT_LEA)
2005 GenTreeAddrMode* addrMode = indirCandidateChild->AsAddrMode();
2007 if (addrMode->HasBase())
2009 assert(addrMode->Base()->OperIsLeaf());
2010 addrMode->Base()->SetContained();
2013 if (addrMode->HasIndex())
2015 assert(addrMode->Index()->OperIsLeaf());
2016 addrMode->Index()->SetContained();
2019 indirDst->SetContained();
2023 assert(indirCandidateChild->OperGet() == GT_LCL_VAR || indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR ||
2024 indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR || indirCandidateChild->OperGet() == GT_CNS_INT);
// If it is a GT_LCL_VAR, it still needs a reg to hold the address.
// We would also need a reg for a GT_CNS_INT that doesn't fit within the addressing mode base.
// For GT_LCL_VAR_ADDR and GT_CLS_VAR_ADDR, we don't need a reg to hold the address,
// because the address value is known at jit time.
2030 if (indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR)
2032 indirDst->SetContained();
2034 else if (indirCandidateChild->IsCnsIntOrI() && indirCandidateChild->AsIntConCommon()->FitsInAddrBase(comp))
2036 indirDst->SetContained();
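// For reference, the shape of code this enables (a sketch, assuming a simple
// address mode):
//
//     void Inc(int* p) { *p = *p + 1; }
//
// lowers to a single RMW instruction
//     add dword ptr [rcx], 1
// instead of a separate load, add, and store.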
2042 //------------------------------------------------------------------------
2043 // ContainCheckBinary: Determine whether a binary op's operands should be contained.
2046 // node - the node we care about
2048 void Lowering::ContainCheckBinary(GenTreeOp* node)
2050 assert(node->OperIsBinary());
2052 if (varTypeIsFloating(node))
2054 assert(node->OperIs(GT_ADD, GT_SUB));
2055 ContainCheckFloatBinary(node);
// We don't mark a constant hanging on the left of an add as containable, so it
// gets assigned to a register, which has a CQ impact.
// TODO-XArch-CQ: Detect this case and generate a single instruction for
// GT_ADD(Constant, SomeTree).
2064 GenTree* op1 = node->gtOp1;
2065 GenTree* op2 = node->gtOp2;
// We can directly encode the second operand if it is either a containable constant or a memory-op.
// A memory-op can be encoded directly provided its type matches the type of 'tree',
// because during codegen the type of 'tree' determines the emitted operand size; if the
// types do not match, the value gets normalized (i.e. sign/zero extended) on load into a register.
2071 bool directlyEncodable = false;
2072 bool binOpInRMW = false;
2073 GenTree* operand = nullptr;
2075 if (IsContainableImmed(node, op2))
2077 directlyEncodable = true;
2082 binOpInRMW = IsBinOpInRMWStoreInd(node);
2085 const unsigned operatorSize = genTypeSize(node->TypeGet());
2086 if (IsContainableMemoryOp(op2) && (genTypeSize(op2->TypeGet()) == operatorSize))
2088 directlyEncodable = true;
2091 else if (node->OperIsCommutative())
2093 if (IsContainableImmed(node, op1) ||
2094 (IsContainableMemoryOp(op1) && (genTypeSize(op1->TypeGet()) == operatorSize) &&
2095 IsSafeToContainMem(node, op1)))
// If it is safe, we can reverse the order of operands of commutative operations for efficient codegen.
2099 directlyEncodable = true;
2106 if (directlyEncodable)
2108 assert(operand != nullptr);
2109 MakeSrcContained(node, operand);
2111 else if (!binOpInRMW)
// If this binary op neither has contained operands, nor is a
// Read-Modify-Write (RMW) operation, we can mark its operands as reg optional.
2116 SetRegOptionalForBinOp(node);
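// An illustrative case for the operand-size check above (hypothetical signature):
//
//     int Sum(int a, int* p) { return a + *p; }    // add eax, dword ptr [rdx]
//
// The 4-byte load matches the 4-byte add, so it can be folded. A 1-byte load
// feeding an int-typed add could not be folded, since it must first be
// normalized (movsx/movzx) into a register.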
2120 //------------------------------------------------------------------------
2121 // ContainCheckBoundsChk: determine whether any source of a bounds check node should be contained.
2124 // node - pointer to the node
2126 void Lowering::ContainCheckBoundsChk(GenTreeBoundsChk* node)
2128 assert(node->OperIsBoundsCheck());
2130 if (CheckImmedAndMakeContained(node, node->gtIndex))
2132 other = node->gtArrLen;
2134 else if (CheckImmedAndMakeContained(node, node->gtArrLen))
2136 other = node->gtIndex;
2138 else if (IsContainableMemoryOp(node->gtIndex))
2140 other = node->gtIndex;
2144 other = node->gtArrLen;
2147 if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet())
2149 if (IsContainableMemoryOp(other))
2151 MakeSrcContained(node, other);
2155 // We can mark 'other' as reg optional, since it is not contained.
2156 other->SetRegOptional();
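// For example, a bounds check whose array-length operand is contained can
// compare the index directly against the length in memory (the registers and
// length offset shown are illustrative):
//     cmp esi, dword ptr [rcx+8]    ; index vs. array length
//     jae ThrowRangeCheckFailed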
2161 //------------------------------------------------------------------------
2162 // ContainCheckIntrinsic: determine whether the source of an INTRINSIC node should be contained.
2165 // node - pointer to the node
2167 void Lowering::ContainCheckIntrinsic(GenTreeOp* node)
2169 assert(node->OperIs(GT_INTRINSIC));
2171 CorInfoIntrinsics intrinsicId = node->gtIntrinsic.gtIntrinsicId;
2173 if (intrinsicId == CORINFO_INTRINSIC_Sqrt || intrinsicId == CORINFO_INTRINSIC_Round ||
2174 intrinsicId == CORINFO_INTRINSIC_Ceiling || intrinsicId == CORINFO_INTRINSIC_Floor)
2176 GenTree* op1 = node->gtGetOp1();
2177 if (IsContainableMemoryOp(op1) || op1->IsCnsNonZeroFltOrDbl())
2179 MakeSrcContained(node, op1);
// Mark the operand as reg optional since codegen can still
// generate code if op1 ends up on the stack.
2185 op1->SetRegOptional();
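// For instance, a Math.Sqrt whose operand is a containable memory op can be
// emitted as (a sketch):
//     sqrtsd xmm0, qword ptr [mem]
// while a reg-optional operand uses the memory form only if the operand was
// not allocated a register.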
2191 //----------------------------------------------------------------------------------------------
2192 // ContainCheckSIMD: Perform containment analysis for a SIMD intrinsic node.
2195 // simdNode - The SIMD intrinsic node.
2197 void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode)
2199 switch (simdNode->gtSIMDIntrinsicID)
2204 case SIMDIntrinsicInit:
2206 op1 = simdNode->gtOp.gtOp1;
2207 #ifndef _TARGET_64BIT_
2208 if (op1->OperGet() == GT_LONG)
2210 MakeSrcContained(simdNode, op1);
2211 GenTree* op1lo = op1->gtGetOp1();
2212 GenTree* op1hi = op1->gtGetOp2();
2214 if ((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
2215 (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)))
2217 MakeSrcContained(op1, op1lo);
2218 MakeSrcContained(op1, op1hi);
2222 #endif // !_TARGET_64BIT_
2223 if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
2224 (varTypeIsIntegral(simdNode->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
2226 MakeSrcContained(simdNode, op1);
2228 else if ((comp->getSIMDSupportLevel() == SIMD_AVX2_Supported) &&
2229 ((simdNode->gtSIMDSize == 16) || (simdNode->gtSIMDSize == 32)))
// op1 is either a float/double constant or a local address.
2232 if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr())
2234 MakeSrcContained(simdNode, op1);
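// With a contained constant or local address, AVX2 Init can broadcast
// directly from memory, e.g. (illustrative):
//     vbroadcastss ymm0, dword ptr [mem]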
2240 case SIMDIntrinsicInitArray:
2241 // We have an array and an index, which may be contained.
2242 CheckImmedAndMakeContained(simdNode, simdNode->gtGetOp2());
2245 case SIMDIntrinsicOpEquality:
2246 case SIMDIntrinsicOpInEquality:
// On SSE4/AVX, we can generate optimal code for (in)equality against zero
// using ptest. We can safely do this optimization for integral vectors but
// not for floating-point vectors, because +0.0 and -0.0 have distinct bit
// patterns yet compare equal.
2251 op2 = simdNode->gtGetOp2();
2252 if ((comp->getSIMDSupportLevel() >= SIMD_SSE4_Supported) && op2->IsIntegralConstVector(0))
2254 MakeSrcContained(simdNode, op2);
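// The contained zero vector then enables the ptest idiom (a sketch for the
// equality case; inequality tests the opposite flag):
//     ptest xmm0, xmm0    ; ZF = 1 iff all bits of xmm0 are zero
//     sete al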
2258 case SIMDIntrinsicGetItem:
// This implements the get_Item method. The sources are:
//  - the source SIMD struct
//  - the index (which element to get)
// The result is the baseType of the SIMD struct.
2264 op1 = simdNode->gtOp.gtOp1;
2265 op2 = simdNode->gtOp.gtOp2;
2267 if (op1->OperGet() == GT_IND)
2269 assert((op1->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0);
2270 op1->AsIndir()->Addr()->ClearContained();
2272 // If the index is a constant, mark it as contained.
2273 CheckImmedAndMakeContained(simdNode, op2);
2275 if (IsContainableMemoryOp(op1))
2277 MakeSrcContained(simdNode, op1);
2278 if (op1->OperGet() == GT_IND)
2280 op1->AsIndir()->Addr()->ClearContained();
2286 case SIMDIntrinsicShuffleSSE2:
2287 // Second operand is an integer constant and marked as contained.
2288 assert(simdNode->gtOp.gtOp2->IsCnsIntOrI());
2289 MakeSrcContained(simdNode, simdNode->gtOp.gtOp2);
2296 #endif // FEATURE_SIMD
2298 #ifdef FEATURE_HW_INTRINSICS
2299 //----------------------------------------------------------------------------------------------
2300 // IsContainableHWIntrinsicOp: Return true if 'node' is a containable HWIntrinsic op.
2303 // containingNode - The hardware intrinsic node which contains 'node'
2304 // node - The node to check
2307 // true if 'node' is a containable hardware intrinsic node; otherwise, false.
2309 bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, GenTree* node)
2311 if (!node->OperIsHWIntrinsic())
// Non-HWIntrinsic nodes are assumed to be unaligned loads, which can only be
// contained (folded into the consuming instruction) under the VEX encoding.
2315 return comp->canUseVexEncoding() && IsContainableMemoryOp(node);
2318 bool isContainable = false;
2320 // TODO-XArch: Update this to be table driven, if possible.
2322 NamedIntrinsic containingIntrinsicID = containingNode->gtHWIntrinsicId;
2323 HWIntrinsicCategory containingCategory = Compiler::categoryOfHWIntrinsic(containingIntrinsicID);
2324 NamedIntrinsic intrinsicID = node->AsHWIntrinsic()->gtHWIntrinsicId;
2326 switch (intrinsicID)
// Non-VEX encoded instructions require aligned memory ops, so we can fold an
// aligned load into them. We cannot do the same under the VEX encoding: the
// folded load would no longer fault on a misaligned address, masking an
// Access Violation that the standalone LoadAligned would otherwise raise.
2331 case NI_SSE_LoadAlignedVector128:
2332 case NI_SSE2_LoadAlignedVector128:
2333 isContainable = (containingCategory == HW_Category_SimpleSIMD) && !comp->canUseVexEncoding();
// Only fold a scalar load into a SIMD scalar intrinsic to ensure the number of bits
// read remains the same. Likewise, we can't fold a larger load into a SIMD scalar
// intrinsic, as that would read fewer bits than requested.
2339 case NI_SSE_LoadScalarVector128:
2340 case NI_SSE2_LoadScalarVector128:
2341 isContainable = (containingCategory == HW_Category_SIMDScalar);
2344 // VEX encoding supports unaligned memory ops, so we can fold them
2345 case NI_SSE_LoadVector128:
2346 case NI_SSE2_LoadVector128:
2347 case NI_AVX_LoadVector256:
2348 case NI_AVX_LoadAlignedVector256:
2349 isContainable = (containingCategory == HW_Category_SimpleSIMD) && comp->canUseVexEncoding();
2356 return isContainable;
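// Concretely, the alignment distinction above (illustrative operands):
//     addps xmm0, xmmword ptr [mem]    ; non-VEX: [mem] must be 16-byte aligned
//     vaddps xmm0, xmm0, [mem]         ; VEX: an unaligned [mem] is allowed
// Folding an unaligned load into the non-VEX form would fault, while folding
// LoadAligned into the VEX form would hide the alignment fault it is required
// to raise.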
2359 //----------------------------------------------------------------------------------------------
2360 // ContainCheckHWIntrinsic: Perform containment analysis for a hardware intrinsic node.
2363 // node - The hardware intrinsic node.
2365 void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
2367 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
2368 HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID);
2369 HWIntrinsicFlag flags = Compiler::flagsOfHWIntrinsic(intrinsicID);
2370 int numArgs = Compiler::numArgsOfHWIntrinsic(node);
2371 GenTree* op1 = node->gtGetOp1();
2372 GenTree* op2 = node->gtGetOp2();
2374 if ((flags & HW_Flag_NoContainment) != 0)
2376 // Exit early if containment isn't supported
2380 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
2386 case HW_Category_SimpleSIMD:
2387 case HW_Category_SIMDScalar:
2388 if (IsContainableHWIntrinsicOp(node, op2))
2390 MakeSrcContained(node, op2);
2392 else if (((flags & HW_Flag_Commutative) != 0) && IsContainableHWIntrinsicOp(node, op1))
2394 MakeSrcContained(node, op1);
2396 // Swap the operands here to make the containment checks in codegen significantly simpler
2400 else if (comp->canUseVexEncoding())
// We can only mark the operand as reg optional when using the VEX encoding,
// since it supports unaligned memory operands and the non-VEX encoding doesn't.
2404 op2->SetRegOptional();
// TODO-XArch-CQ: Assert that this is unreached after we have ensured the relevant node types are handled.
2411 // https://github.com/dotnet/coreclr/issues/16497
2416 if (Compiler::categoryOfHWIntrinsic(intrinsicID) == HW_Category_IMM)
2418 assert(numArgs >= 2);
2419 GenTree* lastOp = Compiler::lastOpOfHWIntrinsic(node, numArgs);
2420 assert(lastOp != nullptr);
2421 if (Compiler::isImmHWIntrinsic(intrinsicID, lastOp))
2423 if (lastOp->IsCnsIntOrI())
2425 MakeSrcContained(node, lastOp);
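// E.g. a shuffle whose control operand is a constant can encode it directly
// as the instruction's imm8 (a sketch):
//     pshufd xmm0, xmm1, 0x1B
// These instruction forms have no register alternative for the immediate operand.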
2430 #endif // FEATURE_HW_INTRINSICS
2432 //------------------------------------------------------------------------
2433 // ContainCheckFloatBinary: determine whether the sources of a floating point binary node should be contained.
2436 // node - pointer to the node
2438 void Lowering::ContainCheckFloatBinary(GenTreeOp* node)
2440 assert(node->OperIsBinary() && varTypeIsFloating(node));
2442 // overflow operations aren't supported on float/double types.
2443 assert(!node->gtOverflow());
2445 GenTree* op1 = node->gtGetOp1();
2446 GenTree* op2 = node->gtGetOp2();
2448 // No implicit conversions at this stage as the expectation is that
2449 // everything is made explicit by adding casts.
2450 assert(op1->TypeGet() == op2->TypeGet());
2452 if (IsContainableMemoryOp(op2) || op2->IsCnsNonZeroFltOrDbl())
2454 MakeSrcContained(node, op2);
2456 else if (node->OperIsCommutative() &&
2457 (op1->IsCnsNonZeroFltOrDbl() || (IsContainableMemoryOp(op1) && IsSafeToContainMem(node, op1))))
// Though we have GT_ADD(op1=memOp, op2=non-memOp), we try to reorder the operands
// as long as it is safe, so that the following efficient code sequence is generated:
//      addss/sd targetReg, memOp    (if op1Reg == targetReg) OR
//      movaps targetReg, op2Reg; addss/sd targetReg, [memOp]
//
// rather than:
//      movss op1Reg, [memOp]; addss/sd targetReg, op2Reg    (if op1Reg == targetReg) OR
//      movss op1Reg, [memOp]; movaps targetReg, op1Reg; addss/sd targetReg, op2Reg
2467 MakeSrcContained(node, op1);
2471 // If there are no containable operands, we can make an operand reg optional.
2472 SetRegOptionalForBinOp(node);
2476 #endif // _TARGET_XARCH_
2478 #endif // !LEGACY_BACKEND