1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
8 XX Lowering for AMD64, x86 XX
10 XX This encapsulates all the logic for lowering trees for the AMD64 XX
11 XX architecture. For a more detailed view of what is lowering, please XX
12 XX take a look at Lower.cpp XX
15 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
16 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
24 #ifdef _TARGET_XARCH_ // This file is only used for xarch
27 #include "sideeffects.h"
30 // xarch supports both ROL and ROR instructions so no lowering is required.
31 void Lowering::LowerRotate(GenTree* tree)
33 ContainCheckShiftRotate(tree->AsOp());
36 //------------------------------------------------------------------------
37 // LowerStoreLoc: Lower a store of a lclVar
40 // storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR)
44 // - Handling of contained immediates.
45 // - Widening operations of unsigneds.
47 void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
49 // Try to widen the ops if they are going into a local var.
50 if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (storeLoc->gtOp1->gtOper == GT_CNS_INT))
52 GenTreeIntCon* con = storeLoc->gtOp1->AsIntCon();
53 ssize_t ival = con->gtIconVal;
55 unsigned varNum = storeLoc->gtLclNum;
56 LclVarDsc* varDsc = comp->lvaTable + varNum;
58 if (varDsc->lvIsSIMDType())
60 noway_assert(storeLoc->gtType != TYP_STRUCT);
62 unsigned size = genTypeSize(storeLoc);
63 // If we are storing a constant into a local variable
64 // we extend the size of the store here
65 if ((size < 4) && !varTypeIsStruct(varDsc))
67 if (!varTypeIsUnsigned(varDsc))
69 if (genTypeSize(storeLoc) == 1)
71 if ((ival & 0x7f) != ival)
73 ival = ival | 0xffffff00;
78 assert(genTypeSize(storeLoc) == 2);
79 if ((ival & 0x7fff) != ival)
81 ival = ival | 0xffff0000;
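// Illustrative example: for a signed byte-sized local, storing the constant 0x80 fails the
// (ival & 0x7f) == ival check above, so ival is widened to 0xffffff80 and the store is then
// promoted below to a TYP_INT store of the sign-extended value.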
86 // A local stack slot is at least 4 bytes in size, regardless of
87 // what the local var is typed as, so auto-promote it here
88 // unless it is a field of a promoted struct
89 // TODO-XArch-CQ: if the field is promoted shouldn't we also be able to do this?
90 if (!varDsc->lvIsStructField)
92 storeLoc->gtType = TYP_INT;
93 con->SetIconValue(ival);
97 if (storeLoc->OperIs(GT_STORE_LCL_FLD))
99 // We should only encounter this for lclVars that are lvDoNotEnregister.
100 verifyLclFldDoNotEnregister(storeLoc->gtLclNum);
102 ContainCheckStoreLoc(storeLoc);
105 //------------------------------------------------------------------------
106 // LowerStoreIndir: Determine addressing mode for an indirection, and whether operands are contained.
109 // node - The indirect store node (GT_STORE_IND) of interest
114 void Lowering::LowerStoreIndir(GenTreeIndir* node)
116 // Mark all GT_STOREIND nodes to indicate that it is not known
// whether they represent an RMW memory op.
118 node->AsStoreInd()->SetRMWStatusDefault();
120 if (!varTypeIsFloating(node))
122 // Perform recognition of trees with the following structure:
123 // StoreInd(addr, BinOp(expr, GT_IND(addr)))
124 // to be able to fold this into an instruction of the form
125 // BINOP [addr], register
126 // where register is the actual place where 'expr' is computed.
128 // SSE2 doesn't support RMW form of instructions.
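// For example (illustrative), a tree of the form
//     StoreInd(addr, Add(Ind(addr), 5))
// can be encoded as
//     add dword ptr [addr], 5
// instead of a separate load, add, and store.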
129 if (LowerRMWMemOp(node))
134 ContainCheckStoreIndir(node);
137 //------------------------------------------------------------------------
138 // LowerBlockStore: Set block store type
141 // blkNode - The block store node of interest
146 void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
148 GenTree* dstAddr = blkNode->Addr();
149 unsigned size = blkNode->gtBlkSize;
150 GenTree* source = blkNode->Data();
151 Compiler* compiler = comp;
152 GenTree* srcAddrOrFill = nullptr;
153 bool isInitBlk = blkNode->OperIsInitBlkOp();
157 // CopyObj or CopyBlk
158 if ((blkNode->OperGet() == GT_STORE_OBJ) && ((blkNode->AsObj()->gtGcPtrCount == 0) || blkNode->gtBlkOpGcUnsafe))
160 blkNode->SetOper(GT_STORE_BLK);
162 if (source->gtOper == GT_IND)
164 srcAddrOrFill = blkNode->Data()->gtGetOp1();
170 GenTree* initVal = source;
171 if (initVal->OperIsInitVal())
173 initVal->SetContained();
174 initVal = initVal->gtGetOp1();
176 srcAddrOrFill = initVal;
177 // If we have an InitBlk with constant block size we can optimize several ways:
178 // a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes
179 // we use rep stosb since this reduces the register pressure in LSRA and we have
180 // roughly the same performance as calling the helper.
181 // b) If the size is <= INITBLK_UNROLL_LIMIT bytes and the fill byte is a constant,
182 // we can speed this up by unrolling the loop using SSE2 stores. The reason for
// this threshold is that, in our last investigation (Fall 2013), more than 95% of initblks
// in our framework assemblies were actually <= INITBLK_UNROLL_LIMIT bytes in size, so this is the
185 // preferred code sequence for the vast majority of cases.
// This threshold decides between using the helper call and letting the JIT inline
188 // a code sequence of its choice.
189 unsigned helperThreshold = max(INITBLK_STOS_LIMIT, INITBLK_UNROLL_LIMIT);
191 // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
192 if (size != 0 && size <= helperThreshold)
194 // Always favor unrolling vs rep stos.
195 if (size <= INITBLK_UNROLL_LIMIT && initVal->IsCnsIntOrI())
197 // The fill value of an initblk is interpreted to hold a
// value of (unsigned int8); however, a constant of any size
199 // may practically reside on the evaluation stack. So extract
200 // the lower byte out of the initVal constant and replicate
201 // it to a larger constant whose size is sufficient to support
202 // the largest width store of the desired inline expansion.
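// For example (illustrative), a fill byte of 0xAB is replicated to 0xABABABAB for 4-byte
// stores, or to 0xABABABABABABABAB for 8-byte stores on AMD64.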
204 ssize_t fill = initVal->gtIntCon.gtIconVal & 0xFF;
205 #ifdef _TARGET_AMD64_
206 if (size < REGSIZE_BYTES)
208 initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
212 initVal->gtIntCon.gtIconVal = 0x0101010101010101LL * fill;
213 initVal->gtType = TYP_LONG;
214 if ((fill == 0) && ((size & 0xf) == 0))
216 MakeSrcContained(blkNode, source);
219 #else // !_TARGET_AMD64_
220 initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
221 #endif // !_TARGET_AMD64_
223 if ((fill == 0) && ((size & 0xf) == 0))
225 MakeSrcContained(blkNode, source);
227 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
231 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
236 #ifdef _TARGET_AMD64_
237 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
238 #else // !_TARGET_AMD64_
239 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
240 #endif // !_TARGET_AMD64_
245 if (blkNode->gtOper == GT_STORE_OBJ)
249 GenTreeObj* cpObjNode = blkNode->AsObj();
251 unsigned slots = cpObjNode->gtSlots;
254 // CpObj must always have at least one GC-Pointer as a member.
255 assert(cpObjNode->gtGcPtrCount > 0);
257 assert(dstAddr->gtType == TYP_BYREF || dstAddr->gtType == TYP_I_IMPL);
259 CORINFO_CLASS_HANDLE clsHnd = cpObjNode->gtClass;
260 size_t classSize = comp->info.compCompHnd->getClassSize(clsHnd);
261 size_t blkSize = roundUp(classSize, TARGET_POINTER_SIZE);
// Currently, the EE always rounds up a class data structure, so
// we are not handling the case where we have a struct whose size is not a multiple of the
// pointer size. This behavior may change in the future, so in order to keep things correct
// let's assert it just to be safe. Going forward we should simply
// handle this case as it comes.
268 assert(classSize == blkSize);
269 assert((blkSize / TARGET_POINTER_SIZE) == slots);
270 assert(cpObjNode->HasGCPtr());
273 bool IsRepMovsProfitable = false;
275 // If the destination is not on the stack, let's find out if we
276 // can improve code size by using rep movsq instead of generating
277 // sequences of movsq instructions.
278 if (!dstAddr->OperIsLocalAddr())
280 // Let's inspect the struct/class layout and determine if it's profitable
281 // to use rep movsq for copying non-gc memory instead of using single movsq
282 // instructions for each memory slot.
284 BYTE* gcPtrs = cpObjNode->gtGcPtrs;
288 unsigned nonGCSlots = 0;
289 // Measure a contiguous non-gc area inside the struct and note the maximum.
290 while (i < slots && gcPtrs[i] == TYPE_GC_NONE)
296 while (i < slots && gcPtrs[i] != TYPE_GC_NONE)
301 if (nonGCSlots >= CPOBJ_NONGC_SLOTS_LIMIT)
303 IsRepMovsProfitable = true;
308 else if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
310 IsRepMovsProfitable = true;
// There are two cases in which we need to materialize the
// struct size:
315 // a) When the destination is on the stack we don't need to use the
316 // write barrier, we can just simply call rep movsq and get a win in codesize.
317 // b) If we determine we have contiguous non-gc regions in the struct where it's profitable
318 // to use rep movsq instead of a sequence of single movsq instructions. According to the
319 // Intel Manual, the sweet spot for small structs is between 4 to 12 slots of size where
320 // the entire operation takes 20 cycles and encodes in 5 bytes (moving RCX, and calling rep movsq).
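// For example (illustrative), a struct whose slots are laid out as
//     [gc, non-gc, non-gc, non-gc, non-gc, non-gc]
// has a contiguous run of 5 non-gc slots; if that run length reaches CPOBJ_NONGC_SLOTS_LIMIT,
// rep movsq is considered profitable for that region.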
321 if (IsRepMovsProfitable)
323 // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
324 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
328 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
333 assert((blkNode->OperGet() == GT_STORE_BLK) || (blkNode->OperGet() == GT_STORE_DYN_BLK));
335 // In case of a CpBlk with a constant size and less than CPBLK_MOVS_LIMIT size
336 // we can use rep movs to generate code instead of the helper call.
// This threshold decides between using the helper call and letting the JIT inline
339 // a code sequence of its choice.
340 unsigned helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
342 // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
343 if ((size != 0) && (size <= helperThreshold))
345 // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
// Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
347 // our framework assemblies, so this is the main code generation scheme we'll use.
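// For example (illustrative), a 32-byte copy with a constant size would be expanded inline
// as a short sequence of XMM-sized loads and stores rather than a helper call.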
348 if (size <= CPBLK_UNROLL_LIMIT)
350 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
352 // If src or dst are on stack, we don't have to generate the address
353 // into a register because it's just some constant+SP.
354 if ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr())
356 MakeSrcContained(blkNode, srcAddrOrFill);
359 if (dstAddr->OperIsLocalAddr())
361 MakeSrcContained(blkNode, dstAddr);
366 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
369 #ifdef _TARGET_AMD64_
372 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
374 #elif defined(_TARGET_X86_)
377 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
379 #endif // _TARGET_X86_
380 assert(blkNode->gtBlkOpKind != GenTreeBlk::BlkOpKindInvalid);
383 // CopyObj or CopyBlk
384 if (source->gtOper == GT_IND)
386 // The GT_IND is contained, but the address must be in a register unless it is local.
387 MakeSrcContained(blkNode, source);
388 GenTree* addr = source->AsIndir()->Addr();
389 if (!addr->OperIsLocalAddr())
391 addr->ClearContained();
394 else if (!source->IsMultiRegCall() && !source->OperIsSimdOrHWintrinsic())
396 assert(source->IsLocal());
397 MakeSrcContained(blkNode, source);
402 //------------------------------------------------------------------------
403 // LowerPutArgStk: Lower a GT_PUTARG_STK.
406 // tree - The node of interest
411 void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
414 if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
416 putArgStk->gtNumberReferenceSlots = 0;
417 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Invalid;
419 GenTreeFieldList* fieldList = putArgStk->gtOp1->AsFieldList();
421 // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order
422 // of uses is visible to LSRA.
423 unsigned fieldCount = 0;
424 GenTreeFieldList* head = nullptr;
425 for (GenTreeFieldList *current = fieldList, *next; current != nullptr; current = next)
427 next = current->Rest();
429 // First, insert the field node into the sorted list.
430 GenTreeFieldList* prev = nullptr;
431 for (GenTreeFieldList* cursor = head;; cursor = cursor->Rest())
433 // If the offset of the current list node is greater than the offset of the cursor or if we have
434 // reached the end of the list, insert the current node before the cursor and terminate.
435 if ((cursor == nullptr) || (current->gtFieldOffset > cursor->gtFieldOffset))
439 assert(cursor == head);
444 prev->Rest() = current;
447 current->Rest() = cursor;
455 // In theory, the upper bound for the size of a field list is 8: these constructs only appear when passing the
456 // collection of lclVars that represent the fields of a promoted struct lclVar, and we do not promote struct
457 // lclVars with more than 4 fields. If each of these lclVars is of type long, decomposition will split the
458 // corresponding field list nodes in two, giving an upper bound of 8.
460 // The reason that this is important is that the algorithm we use above to sort the field list is O(N^2): if
461 // the maximum size of a field list grows significantly, we will need to reevaluate it.
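// For example (illustrative), fields at offsets {0, 8, 4, 12} are reordered to {12, 8, 4, 0},
// matching the decreasing-offset order in which the code generator pushes them.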
462 assert(fieldCount <= 8);
// The sort above may have changed which node is at the head of the list. Update the PUTARG_STK node if
// necessary.
466 if (head != fieldList)
468 head->gtFlags |= GTF_FIELD_LIST_HEAD;
469 head->SetContained();
471 fieldList->ClearContained();
472 fieldList->gtFlags &= ~GTF_FIELD_LIST_HEAD;
475 head->gtSeqNum = fieldList->gtSeqNum;
478 BlockRange().InsertAfter(fieldList, head);
479 BlockRange().Remove(fieldList);
482 putArgStk->gtOp1 = fieldList;
483 putArgStk->gtType = fieldList->gtType;
// Now that the fields have been sorted, determine the kind of code we will generate.
487 bool allFieldsAreSlots = true;
488 unsigned prevOffset = putArgStk->getArgSize();
489 for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
491 GenTree* const fieldNode = current->Current();
492 const var_types fieldType = fieldNode->TypeGet();
493 const unsigned fieldOffset = current->gtFieldOffset;
494 assert(fieldType != TYP_LONG);
496 // We can treat as a slot any field that is stored at a slot boundary, where the previous
497 // field is not in the same slot. (Note that we store the fields in reverse order.)
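// For example (illustrative), with prevOffset == 8, a field at offset 4 is treated as a slot
// (4 % 4 == 0 and 8 - 4 >= 4), while a field at offset 6 is not.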
498 const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
501 allFieldsAreSlots = false;
504 if (varTypeIsGC(fieldType))
506 putArgStk->gtNumberReferenceSlots++;
509 // For x86 we must mark all integral fields as contained or reg-optional, and handle them
510 // accordingly in code generation, since we may have up to 8 fields, which cannot all be in
511 // registers to be consumed atomically by the call.
512 if (varTypeIsIntegralOrI(fieldNode))
514 if (fieldNode->OperGet() == GT_LCL_VAR)
516 LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->gtLclNum]);
517 if (!varDsc->lvDoNotEnregister)
519 fieldNode->SetRegOptional();
523 MakeSrcContained(putArgStk, fieldNode);
526 else if (fieldNode->IsIntCnsFitsInI32())
528 MakeSrcContained(putArgStk, fieldNode);
532 // For the case where we cannot directly push the value, if we run out of registers,
533 // it would be better to defer computation until we are pushing the arguments rather
534 // than spilling, but this situation is not all that common, as most cases of promoted
535 // structs do not have a large number of fields, and of those most are lclVars or
536 // copy-propagated constants.
537 fieldNode->SetRegOptional();
541 prevOffset = fieldOffset;
544 // Set the copy kind.
545 // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should
546 // adjust the stack once for those fields. The latter is really best done in code generation, but
547 // this tuning should probably be undertaken as a whole.
548 // Also, if there are floating point fields, it may be better to use the "Unroll" mode
549 // of copying the struct as a whole, if the fields are not register candidates.
550 if (allFieldsAreSlots)
552 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PushAllSlots;
556 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
560 #endif // _TARGET_X86_
562 GenTree* src = putArgStk->gtOp1;
564 #ifdef FEATURE_PUT_STRUCT_ARG_STK
565 if (src->TypeGet() != TYP_STRUCT)
566 #endif // FEATURE_PUT_STRUCT_ARG_STK
568 // If the child of GT_PUTARG_STK is a constant, we don't need a register to
569 // move it to memory (stack location).
// On AMD64, we don't want to make 0 contained, because we can generate smaller code
// by zeroing a register and then storing it. E.g.:
//     xor rdx, rdx
//     mov gword ptr [rsp+28H], rdx
// is 2 bytes smaller than:
//     mov gword ptr [rsp+28H], 0
//
// On x86, we push stack arguments; we don't use 'mov'. So:
//     push 0
// is 1 byte smaller than:
//     xor rdx, rdx
//     push rdx
584 if (IsContainableImmed(putArgStk, src)
585 #if defined(_TARGET_AMD64_)
586 && !src->IsIntegralConst(0)
587 #endif // _TARGET_AMD64_
590 MakeSrcContained(putArgStk, src);
595 #ifdef FEATURE_PUT_STRUCT_ARG_STK
596 GenTree* dst = putArgStk;
597 GenTree* srcAddr = nullptr;
599 bool haveLocalAddr = false;
600 if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND))
602 srcAddr = src->gtOp.gtOp1;
603 assert(srcAddr != nullptr);
604 haveLocalAddr = srcAddr->OperIsLocalAddr();
608 assert(varTypeIsSIMD(putArgStk));
611 // In case of a CpBlk we could use a helper call. In case of putarg_stk we
612 // can't do that since the helper call could kill some already set up outgoing args.
613 // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj.
614 // The cpyXXXX code is rather complex and this could cause it to be more complex, but
615 // it might be the right thing to do.
// This threshold decides between using the helper call and letting the JIT inline
618 // a code sequence of its choice.
619 ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
620 ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
622 // TODO-X86-CQ: The helper call either is not supported on x86 or required more work
623 // (I don't know which).
625 // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
// Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
627 // our framework assemblies, so this is the main code generation scheme we'll use.
628 if (size <= CPBLK_UNROLL_LIMIT && putArgStk->gtNumberReferenceSlots == 0)
631 if (size < XMM_REGSIZE_BYTES)
633 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
636 #endif // _TARGET_X86_
638 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
642 else if (putArgStk->gtNumberReferenceSlots != 0)
644 // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update
645 // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
646 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
648 #endif // _TARGET_X86_
651 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
653 // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
654 MakeSrcContained(putArgStk, src);
// If the source address is the address of a lclVar, make the source address contained to avoid unnecessary
// copies.
660 MakeSrcContained(putArgStk, srcAddr);
662 #endif // FEATURE_PUT_STRUCT_ARG_STK
665 /* Lower GT_CAST(srcType, DstType) nodes.
667 * Casts from small int type to float/double are transformed as follows:
668 * GT_CAST(byte, float/double) = GT_CAST(GT_CAST(byte, int32), float/double)
669 * GT_CAST(sbyte, float/double) = GT_CAST(GT_CAST(sbyte, int32), float/double)
670 * GT_CAST(int16, float/double) = GT_CAST(GT_CAST(int16, int32), float/double)
671 * GT_CAST(uint16, float/double) = GT_CAST(GT_CAST(uint16, int32), float/double)
673 * SSE2 conversion instructions operate on signed integers. casts from Uint32/Uint64
674 * are morphed as follows by front-end and hence should not be seen here.
675 * GT_CAST(uint32, float/double) = GT_CAST(GT_CAST(uint32, long), float/double)
676 * GT_CAST(uint64, float) = GT_CAST(GT_CAST(uint64, double), float)
679 * Similarly casts from float/double to a smaller int type are transformed as follows:
680 * GT_CAST(float/double, byte) = GT_CAST(GT_CAST(float/double, int32), byte)
681 * GT_CAST(float/double, sbyte) = GT_CAST(GT_CAST(float/double, int32), sbyte)
 * GT_CAST(float/double, int16)  = GT_CAST(GT_CAST(float/double, int32), int16)
 * GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(float/double, int32), uint16)
 * SSE2 has instructions to convert a float/double value into a signed 32/64-bit
686 * integer. The above transformations help us to leverage those instructions.
688 * Note that for the following conversions we still depend on helper calls and
689 * don't expect to see them here.
690 * i) GT_CAST(float/double, uint64)
691 * ii) GT_CAST(float/double, int type with overflow detection)
693 * TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above.
694 * There are hardly any occurrences of this conversion operation in platform
695 * assemblies or in CQ perf benchmarks (1 occurrence in mscorlib, microsoft.jscript,
 * 1 occurrence in Roslyn and no occurrences in system, system.core, system.numerics,
 * system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that
 * doing this optimization is a win, we should consider generating in-lined code.
700 void Lowering::LowerCast(GenTree* tree)
702 assert(tree->OperGet() == GT_CAST);
704 GenTree* castOp = tree->gtCast.CastOp();
705 var_types castToType = tree->CastToType();
706 var_types srcType = castOp->TypeGet();
707 var_types tmpType = TYP_UNDEF;
// force the srcType to unsigned if the GTF_UNSIGNED flag is set
710 if (tree->gtFlags & GTF_UNSIGNED)
712 srcType = genUnsignedType(srcType);
715 // We should never see the following casts as they are expected to be lowered
// appropriately or converted into helper calls by the front-end.
717 // srcType = float/double castToType = * and overflow detecting cast
718 // Reason: must be converted to a helper call
719 // srcType = float/double, castToType = ulong
720 // Reason: must be converted to a helper call
721 // srcType = uint castToType = float/double
722 // Reason: uint -> float/double = uint -> long -> float/double
723 // srcType = ulong castToType = float
724 // Reason: ulong -> float = ulong -> double -> float
725 if (varTypeIsFloating(srcType))
727 noway_assert(!tree->gtOverflow());
728 noway_assert(castToType != TYP_ULONG);
730 else if (srcType == TYP_UINT)
732 noway_assert(!varTypeIsFloating(castToType));
734 else if (srcType == TYP_ULONG)
736 noway_assert(castToType != TYP_FLOAT);
739 // Case of src is a small type and dst is a floating point type.
740 if (varTypeIsSmall(srcType) && varTypeIsFloating(castToType))
742 // These conversions can never be overflow detecting ones.
743 noway_assert(!tree->gtOverflow());
746 // case of src is a floating point type and dst is a small type.
747 else if (varTypeIsFloating(srcType) && varTypeIsSmall(castToType))
752 if (tmpType != TYP_UNDEF)
754 GenTree* tmp = comp->gtNewCastNode(tmpType, castOp, tree->IsUnsigned(), tmpType);
755 tmp->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT));
757 tree->gtFlags &= ~GTF_UNSIGNED;
758 tree->gtOp.gtOp1 = tmp;
759 BlockRange().InsertAfter(castOp, tmp);
760 ContainCheckCast(tmp->AsCast());
763 // Now determine if we have operands that should be contained.
764 ContainCheckCast(tree->AsCast());
768 //----------------------------------------------------------------------------------------------
769 // Lowering::LowerSIMD: Perform containment analysis for a SIMD intrinsic node.
772 // simdNode - The SIMD intrinsic node.
774 void Lowering::LowerSIMD(GenTreeSIMD* simdNode)
776 if (simdNode->TypeGet() == TYP_SIMD12)
778 // GT_SIMD node requiring to produce TYP_SIMD12 in fact
779 // produces a TYP_SIMD16 result
780 simdNode->gtType = TYP_SIMD16;
783 if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInitN)
785 assert(simdNode->gtSIMDBaseType == TYP_FLOAT);
788 int constArgCount = 0;
789 float constArgValues[4]{0, 0, 0, 0};
791 for (GenTreeArgList* list = simdNode->gtGetOp1()->AsArgList(); list != nullptr; list = list->Rest())
793 GenTree* arg = list->Current();
795 assert(arg->TypeGet() == simdNode->gtSIMDBaseType);
796 assert(argCount < _countof(constArgValues));
798 if (arg->IsCnsFltOrDbl())
800 constArgValues[constArgCount] = static_cast<float>(arg->AsDblCon()->gtDconVal);
807 if (constArgCount == argCount)
809 for (GenTreeArgList* list = simdNode->gtGetOp1()->AsArgList(); list != nullptr; list = list->Rest())
811 BlockRange().Remove(list->Current());
814 CORINFO_FIELD_HANDLE hnd =
815 comp->getEmitter()->emitAnyConst(constArgValues, sizeof(constArgValues), emitDataAlignment::Required);
816 GenTree* clsVarAddr = new (comp, GT_CLS_VAR_ADDR) GenTreeClsVar(GT_CLS_VAR_ADDR, TYP_I_IMPL, hnd, nullptr);
817 BlockRange().InsertBefore(simdNode, clsVarAddr);
818 simdNode->ChangeOper(GT_IND);
819 simdNode->gtOp1 = clsVarAddr;
820 ContainCheckIndir(simdNode->AsIndir());
826 #ifdef _TARGET_XARCH_
827 if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem) && (simdNode->gtGetOp1()->OperGet() == GT_IND))
829 // If SIMD vector is already in memory, we force its
830 // addr to be evaluated into a reg. This would allow
831 // us to generate [regBase] or [regBase+offset] or
832 // [regBase+sizeOf(SIMD vector baseType)*regIndex]
// to access the required SIMD vector element directly
// from memory.
836 // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we
// might be able to update GT_LEA to fold the regIndex
838 // or offset in some cases. Instead with this
839 // approach we always evaluate GT_LEA into a reg.
840 // Ideally, we should be able to lower GetItem intrinsic
841 // into GT_IND(newAddr) where newAddr combines
842 // the addr of SIMD vector with the given index.
843 simdNode->gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG;
845 else if (simdNode->IsSIMDEqualityOrInequality())
849 if (BlockRange().TryGetUse(simdNode, &simdUse))
852 // Try to transform JTRUE(EQ|NE(SIMD<OpEquality|OpInEquality>(x, y), 0|1)) into
853 // JCC(SIMD<OpEquality|OpInEquality>(x, y)). SIMD<OpEquality|OpInEquality>(x, y)
854 // is expected to set the Zero flag appropriately.
// All the involved nodes must form a continuous range; there's no other way to
// guarantee that condition flags aren't changed between the SIMD node and the JCC
// node.
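// For example (illustrative), JTRUE(EQ(SIMD<OpEquality>(x, y), 0)) becomes a JCC<NE> that
// consumes the Zero flag set by the SIMD node, and the EQ node and its constant are removed.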
860 bool transformed = false;
861 GenTree* simdUser = simdUse.User();
863 if (simdUser->OperIs(GT_EQ, GT_NE) && simdUser->gtGetOp2()->IsCnsIntOrI() &&
864 (simdNode->gtNext == simdUser->gtGetOp2()) && (simdUser->gtGetOp2()->gtNext == simdUser))
866 ssize_t relopOp2Value = simdUser->gtGetOp2()->AsIntCon()->IconValue();
868 if ((relopOp2Value == 0) || (relopOp2Value == 1))
870 GenTree* jtrue = simdUser->gtNext;
872 if ((jtrue != nullptr) && jtrue->OperIs(GT_JTRUE) && (jtrue->gtGetOp1() == simdUser))
874 if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) != simdUser->OperIs(GT_EQ))
879 jtrue->ChangeOper(GT_JCC);
880 GenTreeCC* jcc = jtrue->AsCC();
881 jcc->gtFlags |= GTF_USE_FLAGS;
882 jcc->gtCondition = (relopOp2Value == 0) ? GenCondition::NE : GenCondition::EQ;
884 BlockRange().Remove(simdUser->gtGetOp2());
885 BlockRange().Remove(simdUser);
// The code generated for SIMD<OpEquality|OpInEquality>(x, y) nodes sets
895 // the Zero flag like integer compares do so we can simply use SETCC<EQ|NE>
896 // to produce the desired result. This avoids the need for subsequent phases
897 // to have to handle 2 cases (set flags/set destination register).
900 GenCondition condition =
901 (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? GenCondition::EQ : GenCondition::NE;
902 GenTreeCC* setcc = new (comp, GT_SETCC) GenTreeCC(GT_SETCC, condition, simdNode->TypeGet());
903 setcc->gtFlags |= GTF_USE_FLAGS;
904 BlockRange().InsertAfter(simdNode, setcc);
905 simdUse.ReplaceWith(comp, setcc);
909 simdNode->gtFlags |= GTF_SET_FLAGS;
910 simdNode->gtType = TYP_VOID;
913 ContainCheckSIMD(simdNode);
915 #endif // FEATURE_SIMD
917 #ifdef FEATURE_HW_INTRINSICS
918 //----------------------------------------------------------------------------------------------
919 // Lowering::LowerHWIntrinsic: Perform containment analysis for a hardware intrinsic node.
922 // node - The hardware intrinsic node.
924 void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
926 ContainCheckHWIntrinsic(node);
928 #endif // FEATURE_HW_INTRINSICS
930 //----------------------------------------------------------------------------------------------
931 // Lowering::IsRMWIndirCandidate:
// Returns true if the given operand is a candidate indirection for a read-modify-write
// operator.
936 // operand - The operand to consider.
937 // storeInd - The indirect store that roots the possible RMW operator.
939 bool Lowering::IsRMWIndirCandidate(GenTree* operand, GenTree* storeInd)
941 // If the operand isn't an indirection, it's trivially not a candidate.
942 if (operand->OperGet() != GT_IND)
947 // If the indirection's source address isn't equivalent to the destination address of the storeIndir, then the
948 // indirection is not a candidate.
949 GenTree* srcAddr = operand->gtGetOp1();
950 GenTree* dstAddr = storeInd->gtGetOp1();
951 if ((srcAddr->OperGet() != dstAddr->OperGet()) || !IndirsAreEquivalent(operand, storeInd))
956 // If it is not safe to contain the entire tree rooted at the indirection, then the indirection is not a
957 // candidate. Crawl the IR from the node immediately preceding the storeIndir until the last node in the
958 // indirection's tree is visited and check the side effects at each point.
960 m_scratchSideEffects.Clear();
962 assert((operand->gtLIRFlags & LIR::Flags::Mark) == 0);
963 operand->gtLIRFlags |= LIR::Flags::Mark;
965 unsigned markCount = 1;
967 for (node = storeInd->gtPrev; markCount > 0; node = node->gtPrev)
969 assert(node != nullptr);
971 if ((node->gtLIRFlags & LIR::Flags::Mark) == 0)
973 m_scratchSideEffects.AddNode(comp, node);
977 node->gtLIRFlags &= ~LIR::Flags::Mark;
980 if (m_scratchSideEffects.InterferesWith(comp, node, false))
// The indirection's tree contains some node that can't be moved to the storeIndir. The indirection is
983 // not a candidate. Clear any leftover mark bits and return.
984 for (; markCount > 0; node = node->gtPrev)
986 if ((node->gtLIRFlags & LIR::Flags::Mark) != 0)
988 node->gtLIRFlags &= ~LIR::Flags::Mark;
995 node->VisitOperands([&markCount](GenTree* nodeOperand) -> GenTree::VisitResult {
996 assert((nodeOperand->gtLIRFlags & LIR::Flags::Mark) == 0);
997 nodeOperand->gtLIRFlags |= LIR::Flags::Mark;
999 return GenTree::VisitResult::Continue;
1004 // At this point we've verified that the operand is an indirection, its address is equivalent to the storeIndir's
1005 // destination address, and that it and the transitive closure of its operand can be safely contained by the
1006 // storeIndir. This indirection is therefore a candidate for an RMW op.
1010 //----------------------------------------------------------------------------------------------
// Returns true if this tree is the binOp of a GT_STOREIND of the following form:
//      storeInd(subTreeA, binOp(gtInd(subTreeA), subtreeB)) or
//      storeInd(subTreeA, binOp(subtreeB, gtInd(subTreeA))) in case of commutative bin-ops
1015 // The above form for storeInd represents a read-modify-write memory binary operation.
1018 // tree - GentreePtr of binOp
1021 // True if 'tree' is part of a RMW memory operation pattern
1023 bool Lowering::IsBinOpInRMWStoreInd(GenTree* tree)
1025 // Must be a non floating-point type binary operator since SSE2 doesn't support RMW memory ops
1026 assert(!varTypeIsFloating(tree));
1027 assert(GenTree::OperIsBinary(tree->OperGet()));
1029 // Cheap bail out check before more expensive checks are performed.
1030 // RMW memory op pattern requires that one of the operands of binOp to be GT_IND.
1031 if (tree->gtGetOp1()->OperGet() != GT_IND && tree->gtGetOp2()->OperGet() != GT_IND)
1037 if (!BlockRange().TryGetUse(tree, &use) || use.User()->OperGet() != GT_STOREIND || use.User()->gtGetOp2() != tree)
// Since recognizing the RMW memory op pattern is not cheap, we
1043 // cache the result in GT_STOREIND node so that while lowering GT_STOREIND
1044 // we can use the result.
1045 GenTree* indirCandidate = nullptr;
1046 GenTree* indirOpSource = nullptr;
1047 return IsRMWMemOpRootedAtStoreInd(use.User(), &indirCandidate, &indirOpSource);
1050 //----------------------------------------------------------------------------------------------
1051 // This method recognizes the case where we have a treeNode with the following structure:
1052 // storeInd(IndirDst, binOp(gtInd(IndirDst), indirOpSource)) OR
//      storeInd(IndirDst, binOp(indirOpSource, gtInd(IndirDst))) in case of commutative operations OR
//      storeInd(IndirDst, unaryOp(gtInd(IndirDst))) in case of unary operations
1057 // indirDst = memory write of an addr mode (i.e. storeind destination)
1058 // indirSrc = value being written to memory (i.e. storeind source which could either be a binary or unary op)
1059 // indirCandidate = memory read i.e. a gtInd of an addr mode
1060 // indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
1062 // In x86/x64 this storeInd pattern can be effectively encoded in a single instruction of the
1063 // following form in case of integer operations:
1064 // binOp [addressing mode], RegIndirOpSource
1065 // binOp [addressing mode], immediateVal
1066 // where RegIndirOpSource is the register where indirOpSource was computed.
// Right now, we recognize a few cases:
1069 // a) The gtInd child is a lea/lclVar/lclVarAddr/clsVarAddr/constant
1070 // b) BinOp is either add, sub, xor, or, and, shl, rsh, rsz.
1071 // c) unaryOp is either not/neg
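// For example (illustrative):
//     storeInd(leaAddr, GT_NOT(gtInd(leaAddr)))
// can be encoded as
//     not [addressing mode]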
1073 // Implementation Note: The following routines need to be in sync for RMW memory op optimization
1074 // to be correct and functional.
1075 // IndirsAreEquivalent()
1076 // NodesAreEquivalentLeaves()
1077 // Codegen of GT_STOREIND and genCodeForShiftRMW()
1080 // TODO-CQ: Enable support for more complex indirections (if needed) or use the value numbering
1081 // package to perform more complex tree recognition.
1083 // TODO-XArch-CQ: Add support for RMW of lcl fields (e.g. lclfield binop= source)
1086 // tree - GT_STOREIND node
1087 // outIndirCandidate - out param set to indirCandidate as described above
// outIndirOpSource - out param set to indirOpSource as described above
1091 // True if there is a RMW memory operation rooted at a GT_STOREIND tree
1092 // and out params indirCandidate and indirOpSource are set to non-null values.
1093 // Otherwise, returns false with indirCandidate and indirOpSource set to null.
1094 // Also updates flags of GT_STOREIND tree with its RMW status.
1096 bool Lowering::IsRMWMemOpRootedAtStoreInd(GenTree* tree, GenTree** outIndirCandidate, GenTree** outIndirOpSource)
1098 assert(!varTypeIsFloating(tree));
1099 assert(outIndirCandidate != nullptr);
1100 assert(outIndirOpSource != nullptr);
1102 *outIndirCandidate = nullptr;
1103 *outIndirOpSource = nullptr;
1105 // Early out if storeInd is already known to be a non-RMW memory op
1106 GenTreeStoreInd* storeInd = tree->AsStoreInd();
1107 if (storeInd->IsNonRMWMemoryOp())
1112 GenTree* indirDst = storeInd->gtGetOp1();
1113 GenTree* indirSrc = storeInd->gtGetOp2();
1114 genTreeOps oper = indirSrc->OperGet();
1116 // Early out if it is already known to be a RMW memory op
1117 if (storeInd->IsRMWMemoryOp())
1119 if (GenTree::OperIsBinary(oper))
1121 if (storeInd->IsRMWDstOp1())
1123 *outIndirCandidate = indirSrc->gtGetOp1();
1124 *outIndirOpSource = indirSrc->gtGetOp2();
1128 assert(storeInd->IsRMWDstOp2());
1129 *outIndirCandidate = indirSrc->gtGetOp2();
1130 *outIndirOpSource = indirSrc->gtGetOp1();
1132 assert(IndirsAreEquivalent(*outIndirCandidate, storeInd));
1136 assert(GenTree::OperIsUnary(oper));
1137 assert(IndirsAreEquivalent(indirSrc->gtGetOp1(), storeInd));
1138 *outIndirCandidate = indirSrc->gtGetOp1();
1139 *outIndirOpSource = indirSrc->gtGetOp1();
// Reaching here means that we do not know the RMW status of the tree rooted at storeInd
1146 assert(storeInd->IsRMWStatusUnknown());
1148 // Early out if indirDst is not one of the supported memory operands.
1149 if (indirDst->OperGet() != GT_LEA && indirDst->OperGet() != GT_LCL_VAR && indirDst->OperGet() != GT_LCL_VAR_ADDR &&
1150 indirDst->OperGet() != GT_CLS_VAR_ADDR && indirDst->OperGet() != GT_CNS_INT)
1152 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
1156 // We can not use Read-Modify-Write instruction forms with overflow checking instructions
1157 // because we are not allowed to modify the target until after the overflow check.
1158 if (indirSrc->gtOverflowEx())
1160 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
1164 // At this point we can match one of two patterns:
1166 // t_ind = indir t_addr_0
1168 // t_value = binop t_ind, t_other
1170 // storeIndir t_addr_1, t_value
1174 // t_ind = indir t_addr_0
1176 // t_value = unop t_ind
1178 // storeIndir t_addr_1, t_value
1180 // In all cases, we will eventually make the binop that produces t_value and the entire dataflow tree rooted at
1181 // t_ind contained by t_value.
1183 GenTree* indirCandidate = nullptr;
1184 GenTree* indirOpSource = nullptr;
1185 RMWStatus status = STOREIND_RMW_STATUS_UNKNOWN;
1186 if (GenTree::OperIsBinary(oper))
1188 // Return if binary op is not one of the supported operations for RMW of memory.
1189 if (!GenTree::OperIsRMWMemOp(oper))
1191 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
1195 if (GenTree::OperIsShiftOrRotate(oper) && varTypeIsSmall(storeInd))
// In ldind, integer values smaller than 4 bytes (a boolean or a character) are converted to 4 bytes
// by sign or zero-extension as appropriate. If we directly shift the short type data using sar, we
// will lose the sign or zero-extension bits.
1200 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_TYPE);
1204 // In the common case, the second operand to the binop will be the indir candidate.
1205 GenTreeOp* binOp = indirSrc->AsOp();
1206 if (GenTree::OperIsCommutative(oper) && IsRMWIndirCandidate(binOp->gtOp2, storeInd))
1208 indirCandidate = binOp->gtOp2;
1209 indirOpSource = binOp->gtOp1;
1210 status = STOREIND_RMW_DST_IS_OP2;
1212 else if (IsRMWIndirCandidate(binOp->gtOp1, storeInd))
1214 indirCandidate = binOp->gtOp1;
1215 indirOpSource = binOp->gtOp2;
1216 status = STOREIND_RMW_DST_IS_OP1;
1220 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
1224 else if (GenTree::OperIsUnary(oper))
1226 // Nodes other than GT_NOT and GT_NEG are not yet supported.
1227 if (oper != GT_NOT && oper != GT_NEG)
1229 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
1233 if (indirSrc->gtGetOp1()->OperGet() != GT_IND)
1235 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
1239 GenTreeUnOp* unOp = indirSrc->AsUnOp();
1240 if (IsRMWIndirCandidate(unOp->gtOp1, storeInd))
1242 // src and dest are the same in case of unary ops
1243 indirCandidate = unOp->gtOp1;
1244 indirOpSource = unOp->gtOp1;
1245 status = STOREIND_RMW_DST_IS_OP1;
1249 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
1255 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER);
1259 // By this point we've verified that we have a supported operand with a supported address. Now we need to ensure
1260 // that we're able to move the destination address for the source indirection forwards.
1261 if (!IsSafeToContainMem(storeInd, indirDst))
1263 storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR);
1267 assert(indirCandidate != nullptr);
1268 assert(indirOpSource != nullptr);
1269 assert(status != STOREIND_RMW_STATUS_UNKNOWN);
1271 *outIndirCandidate = indirCandidate;
1272 *outIndirOpSource = indirOpSource;
1273 storeInd->SetRMWStatus(status);
1277 // anything is in range for AMD64
1278 bool Lowering::IsCallTargetInRange(void* addr)
1283 // return true if the immediate can be folded into an instruction, for example small enough and non-relocatable
1284 bool Lowering::IsContainableImmed(GenTree* parentNode, GenTree* childNode)
1286 if (!childNode->IsIntCnsFitsInI32())
// At this point we know that it is an int const that fits within 4 bytes and hence can safely be cast to IntConCommon.
1292 // Icons that need relocation should never be marked as contained immed
1293 if (childNode->AsIntConCommon()->ImmedValNeedsReloc(comp))
1301 //-----------------------------------------------------------------------
1302 // PreferredRegOptionalOperand: returns one of the operands of given
1303 // binary oper that is to be preferred for marking as reg optional.
1305 // Since only one of op1 or op2 can be a memory operand on xarch, only
// one of them has to be marked as reg optional. Since Lower doesn't
// know a priori which of op1 or op2 is not likely to get a register, it
// has to make a guess. This routine encapsulates the heuristics that
// guess whether it is likely to be beneficial to mark op1 or op2 as
// reg optional.
1314 // tree - a binary-op tree node that is either commutative
1315 // or a compare oper.
1318 // Returns op1 or op2 of tree node that is preferred for
1319 // marking as reg optional.
1321 // Note: if the tree oper is neither commutative nor a compare oper
1322 // then only op2 can be reg optional on xarch and hence no need to
1323 // call this routine.
1324 GenTree* Lowering::PreferredRegOptionalOperand(GenTree* tree)
1326 assert(GenTree::OperIsBinary(tree->OperGet()));
1327 assert(tree->OperIsCommutative() || tree->OperIsCompare() || tree->OperIs(GT_CMP));
1329 GenTree* op1 = tree->gtGetOp1();
1330 GenTree* op2 = tree->gtGetOp2();
1331 assert(!op1->IsRegOptional() && !op2->IsRegOptional());
1333 // We default to op1, as op2 is likely to have the shorter lifetime.
1334 GenTree* preferredOp = op1;
1336 // This routine uses the following heuristics:
1338 // a) If both are register candidates, marking the one with lower weighted
1339 // ref count as reg-optional would likely be beneficial as it has
1340 // higher probability of not getting a register. Note that we use !lvDoNotEnregister
1341 // here because this is being done while we are adding lclVars for Lowering.
1343 // b) op1 = tracked local and op2 = untracked local: LSRA creates two
1344 // ref positions for op2: a def and use position. op2's def position
1345 // requires a reg and it is allocated a reg by spilling another
1346 // interval (if required) and that could be even op1. For this reason
1347 // it is beneficial to mark op1 as reg optional.
1349 // TODO: It is not always mandatory for a def position of an untracked
1350 // local to be allocated a register if it is on rhs of an assignment
1351 // and its use position is reg-optional and has not been assigned a
1352 // register. Reg optional def positions is currently not yet supported.
1354 // c) op1 = untracked local and op2 = tracked local: marking op1 as
1355 // reg optional is beneficial, since its use position is less likely
1356 // to get a register.
1358 // d) If both are untracked locals (i.e. treated like tree temps by
1359 // LSRA): though either of them could be marked as reg optional,
1360 // marking op1 as reg optional is likely to be beneficial because
1361 // while allocating op2's def position, there is a possibility of
1362 // spilling op1's def and in which case op1 is treated as contained
1363 // memory operand rather than requiring to reload.
1365 // e) If only one of them is a local var, prefer to mark it as
// reg-optional. This heuristic is based on the results
1367 // obtained against CQ perf benchmarks.
1369 // f) If neither of them are local vars (i.e. tree temps), prefer to
1370 // mark op1 as reg optional for the same reason as mentioned in (d) above.
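// For example (illustrative), given ADD(lclA, lclB) where both locals are tracked register
// candidates and lclB has the lower weighted ref count, op2 (lclB) is returned as the
// preferred reg-optional operand per heuristic (a).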
1371 if (op1->OperGet() == GT_LCL_VAR && op2->OperGet() == GT_LCL_VAR)
1373 LclVarDsc* v1 = comp->lvaTable + op1->AsLclVarCommon()->GetLclNum();
1374 LclVarDsc* v2 = comp->lvaTable + op2->AsLclVarCommon()->GetLclNum();
1376 bool v1IsRegCandidate = !v1->lvDoNotEnregister;
1377 bool v2IsRegCandidate = !v2->lvDoNotEnregister;
1378 if (v1IsRegCandidate && v2IsRegCandidate)
1380 // Both are enregisterable locals. The one with lower weight is less likely
1381 // to get a register and hence beneficial to mark the one with lower
1382 // weight as reg optional.
1383 // If either is not tracked, it may be that it was introduced after liveness
1384 // was run, in which case we will always prefer op1 (should we use raw refcnt??).
1385 if (v1->lvTracked && v2->lvTracked && (v1->lvRefCntWtd() >= v2->lvRefCntWtd()))
1391 else if (!(op1->OperGet() == GT_LCL_VAR) && (op2->OperGet() == GT_LCL_VAR))
1399 //------------------------------------------------------------------------
1400 // Containment analysis
1401 //------------------------------------------------------------------------
1403 //------------------------------------------------------------------------
1404 // ContainCheckCallOperands: Determine whether operands of a call should be contained.
1407 // call - The call node of interest
1412 void Lowering::ContainCheckCallOperands(GenTreeCall* call)
1414 GenTree* ctrlExpr = call->gtControlExpr;
1415 if (call->gtCallType == CT_INDIRECT)
1417 // either gtControlExpr != null or gtCallAddr != null.
1418 // Both cannot be non-null at the same time.
1419 assert(ctrlExpr == nullptr);
1420 assert(call->gtCallAddr != nullptr);
1421 ctrlExpr = call->gtCallAddr;
1424 // Fast tail calls aren't currently supported on x86, but if they ever are, the code
1425 // below that handles indirect VSD calls will need to be fixed.
1426 assert(!call->IsFastTailCall() || !call->IsVirtualStub());
1427 #endif // _TARGET_X86_
1430 // set reg requirements on call target represented as control sequence.
1431 if (ctrlExpr != nullptr)
1433 // we should never see a gtControlExpr whose type is void.
1434 assert(ctrlExpr->TypeGet() != TYP_VOID);
// In case of a fast tail call implemented as jmp, make sure that gtControlExpr is
1437 // computed into a register.
1438 if (!call->IsFastTailCall())
1441 // On x86, we need to generate a very specific pattern for indirect VSD calls:
1444 // call dword ptr [eax]
1446 // Where EAX is also used as an argument to the stub dispatch helper. Make
1447 // sure that the call target address is computed into EAX in this case.
1448 if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
1450 assert(ctrlExpr->isIndir());
1451 MakeSrcContained(call, ctrlExpr);
1454 #endif // _TARGET_X86_
1455 if (ctrlExpr->isIndir())
1457 // We may have cases where we have set a register target on the ctrlExpr, but if it
// is contained we must clear it.
1459 ctrlExpr->gtRegNum = REG_NA;
1460 MakeSrcContained(call, ctrlExpr);
1465 GenTree* args = call->gtCallArgs;
1468 GenTree* arg = args->gtOp.gtOp1;
1469 if (arg->gtOper == GT_PUTARG_STK)
1471 LowerPutArgStk(arg->AsPutArgStk());
1473 args = args->gtOp.gtOp2;
1475 args = call->gtCallLateArgs;
1478 GenTree* arg = args->gtOp.gtOp1;
1479 if (arg->gtOper == GT_PUTARG_STK)
1481 LowerPutArgStk(arg->AsPutArgStk());
1483 args = args->gtOp.gtOp2;
1487 //------------------------------------------------------------------------
1488 // ContainCheckIndir: Determine whether operands of an indir should be contained.
1491 // node - The indirection node of interest
1494 // This is called for both store and load indirections. In the former case, it is assumed that
1495 // LowerStoreIndir() has already been called to check for RMW opportunities.
1500 void Lowering::ContainCheckIndir(GenTreeIndir* node)
1502 GenTree* addr = node->Addr();
1504 // If this is the rhs of a block copy it will be handled when we handle the store.
1505 if (node->TypeGet() == TYP_STRUCT)
1511 // If indirTree is of TYP_SIMD12, don't mark addr as contained
// so that it always gets computed into a register. This would
1513 // mean codegen side logic doesn't need to handle all possible
1514 // addr expressions that could be contained.
// TODO-XArch-CQ: handle other addr mode expressions that could be marked
// as contained.
1518 if (node->TypeGet() == TYP_SIMD12)
1522 #endif // FEATURE_SIMD
1524 if ((node->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0)
1526 // The address of an indirection that requires its address in a reg.
1527 // Skip any further processing that might otherwise make it contained.
1529 else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR))
1531 // These nodes go into an addr mode:
1532 // - GT_CLS_VAR_ADDR turns into a constant.
1533 // - GT_LCL_VAR_ADDR is a stack addr mode.
1535 // make this contained, it turns into a constant that goes into an addr mode
1536 MakeSrcContained(node, addr);
1538 else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp))
1541 // We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address.
1542 // (i.e. those VSD calls for which stub addr is known during JIT compilation time). In this case,
1543 // VM requires us to pass stub addr in VirtualStubParam.reg - see LowerVirtualStubCall(). For
1544 // that reason we cannot mark such an addr as contained. Note that this is not an issue for
// indirect VSD calls since morphArgs() is explicitly materializing hidden param as a non-standard
// argument.
1549 // Note that LowerVirtualStubCall() sets addr->gtRegNum to VirtualStubParam.reg and Lowering::doPhase()
1550 // sets destination candidates on such nodes and resets addr->gtRegNum to REG_NA.
1551 // Ideally we should set a flag on addr nodes that shouldn't be marked as contained
1552 // (in LowerVirtualStubCall()), but we don't have any GTF_* flags left for that purpose. As a workaround
1553 // an explicit check is made here.
1555 // On x86, direct VSD is done via a relative branch, and in fact it MUST be contained.
1556 MakeSrcContained(node, addr);
1558 else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(node, addr))
1560 MakeSrcContained(node, addr);
1564 //------------------------------------------------------------------------
1565 // ContainCheckStoreIndir: determine whether the sources of a STOREIND node should be contained.
1568 // node - pointer to the node
1570 void Lowering::ContainCheckStoreIndir(GenTreeIndir* node)
1572 // If the source is a containable immediate, make it contained, unless it is
1573 // an int-size or larger store of zero to memory, because we can generate smaller code
1574 // by zeroing a register and then storing it.
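// Illustrative sizes: "xor ecx, ecx" followed by "mov dword ptr [rax], ecx" encodes smaller
// than "mov dword ptr [rax], 0", which is why an int-size store of zero is not contained here.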
1575 GenTree* src = node->gtOp.gtOp2;
1576 if (IsContainableImmed(node, src) &&
1577 (!src->IsIntegralConst(0) || varTypeIsSmall(node) || node->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR))
1579 MakeSrcContained(node, src);
1581 ContainCheckIndir(node);
1584 //------------------------------------------------------------------------
1585 // ContainCheckMul: determine whether the sources of a MUL node should be contained.
1588 // node - pointer to the node
1590 void Lowering::ContainCheckMul(GenTreeOp* node)
1592 #if defined(_TARGET_X86_)
1593 assert(node->OperIs(GT_MUL, GT_MULHI, GT_MUL_LONG));
1595 assert(node->OperIs(GT_MUL, GT_MULHI));
1598 // Case of float/double mul.
1599 if (varTypeIsFloating(node->TypeGet()))
1601 ContainCheckFloatBinary(node);
1605 GenTree* op1 = node->gtOp.gtOp1;
1606 GenTree* op2 = node->gtOp.gtOp2;
1608 bool isSafeToContainOp1 = true;
1609 bool isSafeToContainOp2 = true;
1611 bool isUnsignedMultiply = ((node->gtFlags & GTF_UNSIGNED) != 0);
1612 bool requiresOverflowCheck = node->gtOverflowEx();
1613 bool useLeaEncoding = false;
1614 GenTree* memOp = nullptr;
1616 bool hasImpliedFirstOperand = false;
1617 GenTreeIntConCommon* imm = nullptr;
1618 GenTree* other = nullptr;
1620 // Multiply should never be using small types
1621 assert(!varTypeIsSmall(node->TypeGet()));
1623 // We do use the widening multiply to implement
1624 // the overflow checking for unsigned multiply
1626 if (isUnsignedMultiply && requiresOverflowCheck)
1628 hasImpliedFirstOperand = true;
1630 else if (node->OperGet() == GT_MULHI)
1632 hasImpliedFirstOperand = true;
1634 #if defined(_TARGET_X86_)
1635 else if (node->OperGet() == GT_MUL_LONG)
1637 hasImpliedFirstOperand = true;
1640 else if (IsContainableImmed(node, op2) || IsContainableImmed(node, op1))
1642 if (IsContainableImmed(node, op2))
1644 imm = op2->AsIntConCommon();
1649 imm = op1->AsIntConCommon();
1653 // CQ: We want to rewrite this into a LEA
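// For example (illustrative), x * 5 can be emitted as "lea dst, [x + x*4]" rather than an
// imul against a memory operand, which is why memOp is not contained when useLeaEncoding is set.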
1654 ssize_t immVal = imm->AsIntConCommon()->IconValue();
1655 if (!requiresOverflowCheck && (immVal == 3 || immVal == 5 || immVal == 9))
1657 useLeaEncoding = true;
1660 MakeSrcContained(node, imm); // The imm is always contained
1661 if (IsContainableMemoryOp(other))
1663 memOp = other; // memOp may be contained below
1667 // We allow one operand to be a contained memory operand.
1668 // The memory op type must match with the 'node' type.
1669 // This is because during codegen we use 'node' type to derive EmitTypeSize.
1670 // E.g op1 type = byte, op2 type = byte but GT_MUL node type is int.
1672 if (memOp == nullptr)
1674 if ((op2->TypeGet() == node->TypeGet()) && IsContainableMemoryOp(op2))
1676 isSafeToContainOp2 = IsSafeToContainMem(node, op2);
1677 if (isSafeToContainOp2)
1683 if ((memOp == nullptr) && (op1->TypeGet() == node->TypeGet()) && IsContainableMemoryOp(op1))
1685 isSafeToContainOp1 = IsSafeToContainMem(node, op1);
1686 if (isSafeToContainOp1)
1694 if ((memOp->TypeGet() != node->TypeGet()))
1698 else if (!IsSafeToContainMem(node, memOp))
1702 isSafeToContainOp1 = false;
1706 isSafeToContainOp2 = false;
1711 // To generate an LEA we need to force memOp into a register
1712 // so don't allow memOp to be 'contained'
1714 if (!useLeaEncoding)
1716 if (memOp != nullptr)
1718 MakeSrcContained(node, memOp);
1722 // IsSafeToContainMem is expensive so we call it at most once for each operand
1723 // in this method. If we already called IsSafeToContainMem, it must have returned false;
1724 // otherwise, memOp would be set to the corresponding operand (op1 or op2).
1727 // Has a contained immediate operand.
1728 // Only 'other' operand can be marked as reg optional.
1729 assert(other != nullptr);
1731 isSafeToContainOp1 = ((other == op1) && isSafeToContainOp1 && IsSafeToContainMem(node, op1));
1732 isSafeToContainOp2 = ((other == op2) && isSafeToContainOp2 && IsSafeToContainMem(node, op2));
1734 else if (hasImpliedFirstOperand)
1736 // Only op2 can be marked as reg optional.
1737 isSafeToContainOp1 = false;
1738 isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2);
// If there are no containable operands, we can make either of op1 or op2
// reg optional.
1744 isSafeToContainOp1 = isSafeToContainOp1 && IsSafeToContainMem(node, op1);
1745 isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2);
1747 SetRegOptionalForBinOp(node, isSafeToContainOp1, isSafeToContainOp2);
1752 //------------------------------------------------------------------------
1753 // ContainCheckDivOrMod: determine which operands of a div/mod should be contained.
1756 // node - pointer to the node
1758 void Lowering::ContainCheckDivOrMod(GenTreeOp* node)
1760 assert(node->OperIs(GT_DIV, GT_MOD, GT_UDIV, GT_UMOD));
1762 if (varTypeIsFloating(node->TypeGet()))
1764 ContainCheckFloatBinary(node);
1768 GenTree* dividend = node->gtGetOp1();
1769 GenTree* divisor = node->gtGetOp2();
1771 bool divisorCanBeRegOptional = true;
1773 if (dividend->OperGet() == GT_LONG)
1775 divisorCanBeRegOptional = false;
1776 MakeSrcContained(node, dividend);
1780 // divisor can be an r/m, but the memory indirection must be of the same size as the divide
1781 if (IsContainableMemoryOp(divisor) && (divisor->TypeGet() == node->TypeGet()))
1783 MakeSrcContained(node, divisor);
1785 else if (divisorCanBeRegOptional)
1787 // If there are no containable operands, we can make an operand reg optional.
1788 // Div instruction allows only divisor to be a memory op.
1789 divisor->SetRegOptional();
1793 //------------------------------------------------------------------------
1794 // ContainCheckShiftRotate: determine whether the sources of a shift/rotate node should be contained.
1797 // node - pointer to the node
1799 void Lowering::ContainCheckShiftRotate(GenTreeOp* node)
1801 assert(node->OperIsShiftOrRotate());
1803 GenTree* source = node->gtOp1;
1804 if (node->OperIsShiftLong())
1806 assert(source->OperGet() == GT_LONG);
1807 MakeSrcContained(node, source);
1809 #endif // _TARGET_X86_
1811 GenTree* shiftBy = node->gtOp2;
1812 if (IsContainableImmed(node, shiftBy) && (shiftBy->gtIntConCommon.IconValue() <= 255) &&
1813 (shiftBy->gtIntConCommon.IconValue() >= 0))
1815 MakeSrcContained(node, shiftBy);
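// For example (illustrative; registers are hypothetical): containing a constant count such as
// in "x << 3" allows
//     shl eax, 3
// whereas a variable count must first be moved into CL:
//     mov ecx, edx
//     shl eax, cl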
1819 //------------------------------------------------------------------------
1820 // ContainCheckStoreLoc: determine whether the source of a STORE_LCL* should be contained.
1823 // storeLoc - pointer to the local store node
1825 void Lowering::ContainCheckStoreLoc(GenTreeLclVarCommon* storeLoc)
1827 assert(storeLoc->OperIsLocalStore());
1828 GenTree* op1 = storeLoc->gtGetOp1();
1831 if (varTypeIsSIMD(storeLoc))
1833 if (op1->IsCnsIntOrI())
1835 // For an InitBlk we want op1 to be contained; otherwise we want it to
1836 // be evaluated into an xmm register.
1837 MakeSrcContained(storeLoc, op1);
1841 #endif // FEATURE_SIMD
1843 // If the source is a containable immediate, make it contained, unless it is
1844 // an int-size or larger store of zero to memory, because we can generate smaller code
1845 // by zeroing a register and then storing it.
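// For example (illustrative): "intLocal = 0" is usually better emitted as
//     xor eax, eax
//     mov dword ptr [mem], eax
// than as "mov dword ptr [mem], 0", which encodes a 4-byte immediate; small-typed stores
// keep the contained immediate because zeroing a register saves nothing there.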
1846 if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(storeLoc)))
1848 MakeSrcContained(storeLoc, op1);
1851 else if (op1->OperGet() == GT_LONG)
1853 MakeSrcContained(storeLoc, op1);
1855 #endif // _TARGET_X86_
1858 //------------------------------------------------------------------------
1859 // ContainCheckCast: determine whether the source of a CAST node should be contained.
1862 // node - pointer to the node
1864 void Lowering::ContainCheckCast(GenTreeCast* node)
1866 GenTree* castOp = node->CastOp();
1867 var_types castToType = node->CastToType();
1868 var_types srcType = castOp->TypeGet();
1870 // force the srcType to unsigned if the GTF_UNSIGNED flag is set
1871 if (node->gtFlags & GTF_UNSIGNED)
1873 srcType = genUnsignedType(srcType);
1876 if (!node->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(srcType)))
1879 // If converting to float/double, the operand must be 4 or 8 bytes in size.
1880 if (varTypeIsFloating(castToType))
1882 unsigned opSize = genTypeSize(srcType);
1883 assert(opSize == 4 || opSize == 8);
1887 // U8 -> R8 conversion requires that the operand be in a register.
1888 if (srcType != TYP_ULONG)
1890 if (IsContainableMemoryOp(castOp) || castOp->IsCnsNonZeroFltOrDbl())
1892 MakeSrcContained(node, castOp);
1896 // Mark castOp as reg optional to indicate that codegen
1897 // can still generate code if it is on the stack.
1898 castOp->SetRegOptional();
1902 #if !defined(_TARGET_64BIT_)
1903 if (varTypeIsLong(srcType))
1905 noway_assert(castOp->OperGet() == GT_LONG);
1906 castOp->SetContained();
1908 #endif // !defined(_TARGET_64BIT_)
1911 //------------------------------------------------------------------------
1912 // ContainCheckCompare: determine whether the sources of a compare node should be contained.
1915 // cmp - pointer to the compare node
1917 void Lowering::ContainCheckCompare(GenTreeOp* cmp)
1919 assert(cmp->OperIsCompare() || cmp->OperIs(GT_CMP));
1921 GenTree* op1 = cmp->gtOp.gtOp1;
1922 GenTree* op2 = cmp->gtOp.gtOp2;
1923 var_types op1Type = op1->TypeGet();
1924 var_types op2Type = op2->TypeGet();
1926 // If either op1 or op2 is a floating point value, then we need to use
1927 // ucomiss or ucomisd to compare, both of which support the following form:
1928 //     ucomis[s|d] xmm, xmm/mem
1929 // That is, only the second operand can be a memory op.
1931 // Second operand is a memory Op: Note that depending on comparison operator,
1932 // the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or
1933 // op2 can be a memory op depending on the comparison operator.
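// For example (illustrative): a float compare whose memory operand ends up as the second
// source can be emitted as
//     ucomiss xmm0, dword ptr [mem]
// followed by the appropriate jcc/setcc; depending on the relop, the operands may be swapped
// first so that the memory operand lands in that second slot.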
1934 if (varTypeIsFloating(op1Type))
1936 // The types of the operands have to be the same, with no implicit conversions at this stage.
1937 assert(op1Type == op2Type);
1940 if (GenCondition::FromFloatRelop(cmp).PreferSwap())
1949 assert(otherOp != nullptr);
1950 bool isSafeToContainOtherOp = true;
1951 if (otherOp->IsCnsNonZeroFltOrDbl())
1953 MakeSrcContained(cmp, otherOp);
1955 else if (IsContainableMemoryOp(otherOp))
1957 isSafeToContainOtherOp = IsSafeToContainMem(cmp, otherOp);
1958 if (isSafeToContainOtherOp)
1960 MakeSrcContained(cmp, otherOp);
1964 if (!otherOp->isContained() && isSafeToContainOtherOp && IsSafeToContainMem(cmp, otherOp))
1966 // SSE2 allows only otherOp to be a memory-op. Since otherOp is not
1967 // contained, we can mark it reg-optional.
1968 // IsSafeToContainMem is expensive so we call it at most once for otherOp.
1969 // If we already called IsSafeToContainMem, it must have returned false;
1970 // otherwise, otherOp would be contained.
1971 otherOp->SetRegOptional();
1977 // TODO-XArch-CQ: factor out cmp optimization in 'genCondSetFlags' to be used here
1978 // or in other backends.
1980 if (CheckImmedAndMakeContained(cmp, op2))
1982 // If the types are the same, or if the constant is of the correct size,
1983 // we can treat the MemoryOp as contained.
1984 if (op1Type == op2Type)
1986 if (IsContainableMemoryOp(op1))
1988 MakeSrcContained(cmp, op1);
1992 op1->SetRegOptional();
1996 else if (op1Type == op2Type)
1998 // Note that TEST does not have a r,rm encoding like CMP has but we can still
1999 // contain the second operand because the emitter maps both r,rm and rm,r to
2000 // the same instruction code. This avoids the need to special case TEST here.
2002 bool isSafeToContainOp1 = true;
2003 bool isSafeToContainOp2 = true;
2005 if (IsContainableMemoryOp(op2))
2007 isSafeToContainOp2 = IsSafeToContainMem(cmp, op2);
2008 if (isSafeToContainOp2)
2010 MakeSrcContained(cmp, op2);
2014 if (!op2->isContained() && IsContainableMemoryOp(op1))
2016 isSafeToContainOp1 = IsSafeToContainMem(cmp, op1);
2017 if (isSafeToContainOp1)
2019 MakeSrcContained(cmp, op1);
2023 if (!op1->isContained() && !op2->isContained())
2025 // One of op1 or op2 could be marked as reg optional
2026 // to indicate that codegen can still generate code
2027 // if one of them is on stack.
2028 GenTree* regOptionalCandidate = op1->IsCnsIntOrI() ? op2 : PreferredRegOptionalOperand(cmp);
2030 // IsSafeToContainMem is expensive so we call it at most once for each operand
2031 // in this method. If we already called IsSafeToContainMem, it must have returned false;
2032 // otherwise, the corresponding operand (op1 or op2) would be contained.
2033 bool setRegOptional = (regOptionalCandidate == op1) ? isSafeToContainOp1 && IsSafeToContainMem(cmp, op1)
2034 : isSafeToContainOp2 && IsSafeToContainMem(cmp, op2);
2037 regOptionalCandidate->SetRegOptional();
2043 //------------------------------------------------------------------------
2044 // LowerRMWMemOp: Determine if this is a valid RMW mem op, and if so lower it accordingly
2047 // storeInd - The indirect store node (GT_STORE_IND) of interest
2050 // Returns true if 'storeInd' is a valid RMW mem op; false otherwise.
2052 bool Lowering::LowerRMWMemOp(GenTreeIndir* storeInd)
2054 assert(storeInd->OperGet() == GT_STOREIND);
2056 // SSE2 doesn't support RMW on float values
2057 assert(!varTypeIsFloating(storeInd));
2060 // indirDst = memory write of an addr mode (i.e. storeind destination)
2061 // indirSrc = value being written to memory (i.e. storeind source, which could be a binary/unary op)
2062 // indirCandidate = memory read i.e. a gtInd of an addr mode
2063 // indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
2065 GenTree* indirCandidate = nullptr;
2066 GenTree* indirOpSource = nullptr;
2068 if (!IsRMWMemOpRootedAtStoreInd(storeInd, &indirCandidate, &indirOpSource))
2070 JITDUMP("Lower of StoreInd didn't mark the node as self contained for reason: %d\n",
2071 storeInd->AsStoreInd()->GetRMWStatus());
2072 DISPTREERANGE(BlockRange(), storeInd);
2076 GenTree* indirDst = storeInd->gtGetOp1();
2077 GenTree* indirSrc = storeInd->gtGetOp2();
2078 genTreeOps oper = indirSrc->OperGet();
2080 // At this point we have successfully detected a RMW memory op of one of the following forms
2081 // storeInd(indirDst, indirSrc(indirCandidate, indirOpSource)) OR
2082 // storeInd(indirDst, indirSrc(indirOpSource, indirCandidate) in case of commutative operations OR
2083 // storeInd(indirDst, indirSrc(indirCandidate) in case of unary operations
2085 // Here indirSrc = one of the supported binary or unary operation for RMW of memory
2086 // indirCandidate = a GT_IND node
2087 // indirCandidateChild = operand of GT_IND indirCandidate
2089 // The logic below does the following
2090 // Make indirOpSource contained.
2091 // Make indirSrc contained.
2092 // Make indirCandidate contained.
2093 // Make indirCandidateChild contained.
2094 // Make indirDst contained except when it is a GT_LCL_VAR or GT_CNS_INT that doesn't fit within addr base.
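// For example (illustrative): a statement like "*p += 5" reaches here as
//     storeInd(addr, ADD(IND(addr), 5))
// and, once the nodes listed above are contained, codegen can emit a single RMW instruction
// along the lines of
//     add dword ptr [rax], 5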
2098 // We have already done containment analysis on the indirSrc op.
2099 // If any of its operands are marked regOptional, reset that now.
2100 indirSrc->AsOp()->gtOp1->ClearRegOptional();
2101 if (GenTree::OperIsBinary(oper))
2103 // On Xarch RMW operations require the source to be an immediate or in a register.
2104 // Therefore, if we have previously marked the indirOpSource as contained while lowering
2105 // the binary node, we need to reset that now.
2106 if (IsContainableMemoryOp(indirOpSource))
2108 indirOpSource->ClearContained();
2110 indirSrc->AsOp()->gtOp2->ClearRegOptional();
2111 JITDUMP("Lower successfully detected an assignment of the form: *addrMode BinOp= source\n");
2115 assert(GenTree::OperIsUnary(oper));
2116 JITDUMP("Lower successfully detected an assignment of the form: *addrMode = UnaryOp(*addrMode)\n");
2118 DISPTREERANGE(BlockRange(), storeInd);
2120 indirSrc->SetContained();
2121 indirCandidate->SetContained();
2123 GenTree* indirCandidateChild = indirCandidate->gtGetOp1();
2124 indirCandidateChild->SetContained();
2126 if (indirCandidateChild->OperGet() == GT_LEA)
2128 GenTreeAddrMode* addrMode = indirCandidateChild->AsAddrMode();
2130 if (addrMode->HasBase())
2132 assert(addrMode->Base()->OperIsLeaf());
2133 addrMode->Base()->SetContained();
2136 if (addrMode->HasIndex())
2138 assert(addrMode->Index()->OperIsLeaf());
2139 addrMode->Index()->SetContained();
2142 indirDst->SetContained();
2146 assert(indirCandidateChild->OperGet() == GT_LCL_VAR || indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR ||
2147 indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR || indirCandidateChild->OperGet() == GT_CNS_INT);
2149 // If it is a GT_LCL_VAR, it still needs the reg to hold the address.
2150 // We would still need a reg for GT_CNS_INT if it doesn't fit within addressing mode base.
2151 // For GT_CLS_VAR_ADDR, we don't need a reg to hold the address, because the field address
2152 // is known at jit time.
2153 if (indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR)
2155 indirDst->SetContained();
2157 else if (indirCandidateChild->IsCnsIntOrI() && indirCandidateChild->AsIntConCommon()->FitsInAddrBase(comp))
2159 indirDst->SetContained();
2165 //------------------------------------------------------------------------
2166 // ContainCheckBinary: Determine whether a binary op's operands should be contained.
2169 // node - the node we care about
2171 void Lowering::ContainCheckBinary(GenTreeOp* node)
2173 assert(node->OperIsBinary());
2175 if (varTypeIsFloating(node))
2177 assert(node->OperIs(GT_ADD, GT_SUB));
2178 ContainCheckFloatBinary(node);
2182 GenTree* op1 = node->gtOp1;
2183 GenTree* op2 = node->gtOp2;
2185 // We can directly encode the second operand if it is either a containable constant or a memory-op.
2186 // In case of a memory-op, we can encode it directly provided its type matches the node type.
2187 // This is because during codegen, the node type is used to determine the emit type size. If the types
2188 // do not match, they get normalized (i.e. sign/zero extended) on load into a register.
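// For example (illustrative): for "x = x + y" where 'y' is an int-sized value in memory,
// containing op2 lets codegen fold the load and emit
//     add eax, dword ptr [mem]
// instead of loading 'y' into a register first.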
2189 bool directlyEncodable = false;
2190 bool binOpInRMW = false;
2191 GenTree* operand = nullptr;
2192 bool isSafeToContainOp1 = true;
2193 bool isSafeToContainOp2 = true;
2195 if (IsContainableImmed(node, op2))
2197 directlyEncodable = true;
2202 binOpInRMW = IsBinOpInRMWStoreInd(node);
2205 const unsigned operatorSize = genTypeSize(node->TypeGet());
2206 if ((genTypeSize(op2->TypeGet()) == operatorSize) && IsContainableMemoryOp(op2))
2208 isSafeToContainOp2 = IsSafeToContainMem(node, op2);
2209 if (isSafeToContainOp2)
2211 directlyEncodable = true;
2216 if ((operand == nullptr) && node->OperIsCommutative())
2218 // If it is safe, we can reverse the order of operands of commutative operations for efficient codegen.
2220 if (IsContainableImmed(node, op1))
2222 directlyEncodable = true;
2225 else if ((genTypeSize(op1->TypeGet()) == operatorSize) && IsContainableMemoryOp(op1))
2227 isSafeToContainOp1 = IsSafeToContainMem(node, op1);
2228 if (isSafeToContainOp1)
2230 directlyEncodable = true;
2238 if (directlyEncodable)
2240 assert(operand != nullptr);
2241 MakeSrcContained(node, operand);
2243 else if (!binOpInRMW)
2245 // If this binary op neither has contained operands, nor is a
2246 // Read-Modify-Write (RMW) operation, we can mark its operands as reg optional.
2249 // IsSafeToContainMem is expensive so we call it at most once for each operand
2250 // in this method. If we already called IsSafeToContainMem, it must have returned false;
2251 // otherwise, directlyEncodable would be true.
2252 isSafeToContainOp1 = isSafeToContainOp1 && IsSafeToContainMem(node, op1);
2253 isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2);
2255 SetRegOptionalForBinOp(node, isSafeToContainOp1, isSafeToContainOp2);
2259 //------------------------------------------------------------------------
2260 // ContainCheckBoundsChk: determine whether any source of a bounds check node should be contained.
2263 // node - pointer to the node
2265 void Lowering::ContainCheckBoundsChk(GenTreeBoundsChk* node)
2267 assert(node->OperIsBoundsCheck());
2269 if (CheckImmedAndMakeContained(node, node->gtIndex))
2271 other = node->gtArrLen;
2273 else if (CheckImmedAndMakeContained(node, node->gtArrLen))
2275 other = node->gtIndex;
2277 else if (IsContainableMemoryOp(node->gtIndex))
2279 other = node->gtIndex;
2283 other = node->gtArrLen;
2286 if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet())
2288 if (IsContainableMemoryOp(other))
2290 MakeSrcContained(node, other);
2294 // We can mark 'other' as reg optional, since it is not contained.
2295 other->SetRegOptional();
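// For example (illustrative; registers and labels are hypothetical): a range check comparing
// the index against an in-memory array length can be emitted as
//     cmp esi, dword ptr [lenMem]
//     jae RangeCheckFail
// when the length is contained; otherwise 'other' is only reg optional and may still end up
// in a register.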
2300 //------------------------------------------------------------------------
2301 // ContainCheckIntrinsic: determine whether the source of an INTRINSIC node should be contained.
2304 // node - pointer to the node
2306 void Lowering::ContainCheckIntrinsic(GenTreeOp* node)
2308 assert(node->OperIs(GT_INTRINSIC));
2310 CorInfoIntrinsics intrinsicId = node->gtIntrinsic.gtIntrinsicId;
2312 if (intrinsicId == CORINFO_INTRINSIC_Sqrt || intrinsicId == CORINFO_INTRINSIC_Round ||
2313 intrinsicId == CORINFO_INTRINSIC_Ceiling || intrinsicId == CORINFO_INTRINSIC_Floor)
2315 GenTree* op1 = node->gtGetOp1();
2316 if (IsContainableMemoryOp(op1) || op1->IsCnsNonZeroFltOrDbl())
2318 MakeSrcContained(node, op1);
2322 // Mark the operand as reg optional since codegen can still
2323 // generate code if op1 is on stack.
2324 op1->SetRegOptional();
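// For example (illustrative; the stack slot is hypothetical): Math.Sqrt on a value homed on
// the stack can fold the load as
//     sqrtsd xmm0, qword ptr [stackSlot]
// while marking op1 reg optional merely lets the register allocator choose later.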
2330 //----------------------------------------------------------------------------------------------
2331 // ContainCheckSIMD: Perform containment analysis for a SIMD intrinsic node.
2334 // simdNode - The SIMD intrinsic node.
2336 void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode)
2338 switch (simdNode->gtSIMDIntrinsicID)
2343 case SIMDIntrinsicInit:
2345 op1 = simdNode->gtOp.gtOp1;
2346 #ifndef _TARGET_64BIT_
2347 if (op1->OperGet() == GT_LONG)
2349 MakeSrcContained(simdNode, op1);
2350 GenTree* op1lo = op1->gtGetOp1();
2351 GenTree* op1hi = op1->gtGetOp2();
2353 if ((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
2354 (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)))
2356 MakeSrcContained(op1, op1lo);
2357 MakeSrcContained(op1, op1hi);
2361 #endif // !_TARGET_64BIT_
2362 if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
2363 (varTypeIsIntegral(simdNode->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
2365 MakeSrcContained(simdNode, op1);
2367 else if ((comp->getSIMDSupportLevel() == SIMD_AVX2_Supported) &&
2368 ((simdNode->gtSIMDSize == 16) || (simdNode->gtSIMDSize == 32)))
2370 // Either op1 is a float or double constant or a local address
2371 if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr())
2373 MakeSrcContained(simdNode, op1);
2379 case SIMDIntrinsicInitArray:
2380 // We have an array and an index. The index may be contained if it is an immediate.
2381 CheckImmedAndMakeContained(simdNode, simdNode->gtGetOp2());
2384 case SIMDIntrinsicOpEquality:
2385 case SIMDIntrinsicOpInEquality:
2386 // On SSE4/AVX, we can generate optimal code for (in)equality
2387 // against zero using ptest. We can safely do this optimization
2388 // for integral vectors but not for floating-point vectors, because
2389 // +0.0 and -0.0 compare equal even though their bit patterns differ.
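// For example (illustrative): an integral "v == zero" test can be lowered to
//     ptest xmm0, xmm0
//     sete al
// A floating-point vector must instead use a real element-wise compare, since the all-zero
// bit-pattern test would treat -0.0 as different from +0.0.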
2390 op2 = simdNode->gtGetOp2();
2391 if ((comp->getSIMDSupportLevel() >= SIMD_SSE4_Supported) && op2->IsIntegralConstVector(0))
2393 MakeSrcContained(simdNode, op2);
2397 case SIMDIntrinsicGetItem:
2399 // This implements the get_Item method. The sources are:
2400 //  - the source SIMD struct
2401 //  - index (which element to get)
2402 // The result is the baseType of the SIMD struct.
2403 op1 = simdNode->gtOp.gtOp1;
2404 op2 = simdNode->gtOp.gtOp2;
2406 if (op1->OperGet() == GT_IND)
2408 assert((op1->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0);
2409 op1->AsIndir()->Addr()->ClearContained();
2411 // If the index is a constant, mark it as contained.
2412 CheckImmedAndMakeContained(simdNode, op2);
2414 if (IsContainableMemoryOp(op1))
2416 MakeSrcContained(simdNode, op1);
2417 if (op1->OperGet() == GT_IND)
2419 op1->AsIndir()->Addr()->ClearContained();
2425 case SIMDIntrinsicShuffleSSE2:
2426 // Second operand is an integer constant and marked as contained.
2427 assert(simdNode->gtOp.gtOp2->IsCnsIntOrI());
2428 MakeSrcContained(simdNode, simdNode->gtOp.gtOp2);
2435 #endif // FEATURE_SIMD
2437 #ifdef FEATURE_HW_INTRINSICS
2438 //----------------------------------------------------------------------------------------------
2439 // IsContainableHWIntrinsicOp: Return true if 'node' is a containable HWIntrinsic op.
2442 // containingNode - The hardware intrinsic node which contains 'node'
2443 // node - The node to check
2444 // [Out] supportsRegOptional - On return, this will be true if 'containingNode' supports regOptional operands;
2445 // otherwise, false.
2448 // true if 'node' is a containable hardware intrinsic node; otherwise, false.
2450 bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, GenTree* node, bool* supportsRegOptional)
2452 NamedIntrinsic containingIntrinsicId = containingNode->gtHWIntrinsicId;
2453 HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(containingIntrinsicId);
2455 // We shouldn't have called in here if containingNode doesn't support containment
2456 assert(HWIntrinsicInfo::SupportsContainment(containingIntrinsicId));
2458 // containingNode supports nodes that read from an aligned memory address
2460 // This will generally be an explicit LoadAligned instruction and is generally
2461 // false for machines with VEX support. This is because there is currently no way
2462 // to guarantee that the address read from will always be aligned and we could silently
2463 // change the behavior of the program in the case where an Access Violation would have
2464 // otherwise occurred.
2465 bool supportsAlignedSIMDLoads = false;
2467 // containingNode supports nodes that read from general memory
2469 // We currently have to assume all "general" loads are unaligned. As such, this is
2470 // generally used to determine if we can mark the node as `regOptional` in the case
2471 // where `node` is not containable. However, this can also be used to determine whether
2472 // we can mark other types of reads as contained (such as when directly reading a local).
2473 bool supportsGeneralLoads = false;
2475 // containingNode supports nodes that read from a scalar memory address
2477 // This will generally be an explicit LoadScalar instruction but is also used to determine
2478 // whether we can read an address of type T (we don't support this when the load would
2479 // read more than sizeof(T) bytes).
2480 bool supportsSIMDScalarLoads = false;
2482 // containingNode supports nodes that read from an unaligned memory address
2484 // This will generally be an explicit Load instruction and is generally false for machines
2485 // without VEX support. This is because older hardware required that the SIMD operand always
2486 // be aligned to the 'natural alignment' of the type.
2487 bool supportsUnalignedSIMDLoads = false;
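// For example (illustrative): on a non-VEX path, Sse.Add(v, Sse.LoadAlignedVector128(p))
// can fold to "addps xmm0, xmmword ptr [mem]" and still faults if 'p' is misaligned; with
// VEX, "vaddps xmm0, xmm1, xmmword ptr [mem]" silently tolerates the misalignment, so the
// LoadAligned is kept as a separate (faulting) instruction there.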
2491 case HW_Category_SimpleSIMD:
2493 // These intrinsics only expect 16 or 32-byte nodes for containment
2494 assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32));
2495 assert(supportsSIMDScalarLoads == false);
2497 supportsAlignedSIMDLoads =
2498 !comp->canUseVexEncoding() && (containingIntrinsicId != NI_SSE2_ConvertToVector128Double);
2499 supportsUnalignedSIMDLoads = !supportsAlignedSIMDLoads;
2500 supportsGeneralLoads = supportsUnalignedSIMDLoads;
2505 case HW_Category_IMM:
2507 switch (containingIntrinsicId)
2509 case NI_SSE_Shuffle:
2510 case NI_SSE2_CompareLessThan:
2511 case NI_SSE2_ShiftLeftLogical:
2512 case NI_SSE2_ShiftRightArithmetic:
2513 case NI_SSE2_ShiftRightLogical:
2514 case NI_SSE2_Shuffle:
2515 case NI_SSE2_ShuffleHigh:
2516 case NI_SSE2_ShuffleLow:
2517 case NI_SSSE3_AlignRight:
2518 case NI_SSE41_Blend:
2519 case NI_SSE41_DotProduct:
2520 case NI_SSE41_MultipleSumAbsoluteDifferences:
2521 case NI_AES_KeygenAssist:
2522 case NI_PCLMULQDQ_CarrylessMultiply:
2524 case NI_AVX_Compare:
2525 case NI_AVX_DotProduct:
2526 case NI_AVX_InsertVector128:
2527 case NI_AVX_Permute:
2528 case NI_AVX_Permute2x128:
2530 case NI_AVX2_InsertVector128:
2531 case NI_AVX2_MultipleSumAbsoluteDifferences:
2532 case NI_AVX2_Permute2x128:
2533 case NI_AVX2_Permute4x64:
2534 case NI_AVX2_ShiftLeftLogical:
2535 case NI_AVX2_ShiftRightArithmetic:
2536 case NI_AVX2_ShiftRightLogical:
2537 case NI_AVX2_ShuffleHigh:
2538 case NI_AVX2_ShuffleLow:
2540 // These intrinsics only expect 16 or 32-byte nodes for containment
2541 assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32));
2542 assert(supportsSIMDScalarLoads == false);
2544 supportsAlignedSIMDLoads = !comp->canUseVexEncoding();
2545 supportsUnalignedSIMDLoads = !supportsAlignedSIMDLoads;
2546 supportsGeneralLoads = supportsUnalignedSIMDLoads;
2551 case NI_SSE2_Insert:
2552 case NI_SSE41_Insert:
2553 case NI_SSE41_X64_Insert:
2555 if (containingNode->gtSIMDBaseType == TYP_FLOAT)
2557 assert(containingIntrinsicId == NI_SSE41_Insert);
2558 assert(genTypeSize(node->TypeGet()) == 16);
2560 // Sse41.Insert(V128<float>, V128<float>, byte) is a bit special
2561 // in that it has different behavior depending on whether the
2562 // second operand is coming from a register or memory. When coming
2563 // from a register, all 4 elements of the vector can be used and it
2564 // is effectively a regular `SimpleSIMD` operation; but when loading
2565 // from memory, it only works with the lowest element and is effectively a SIMDScalar operation.
2568 assert(supportsAlignedSIMDLoads == false);
2569 assert(supportsUnalignedSIMDLoads == false);
2570 assert(supportsGeneralLoads == false);
2571 assert(supportsSIMDScalarLoads == false);
2573 GenTree* op1 = containingNode->gtGetOp1();
2574 GenTree* op2 = nullptr;
2575 GenTree* op3 = nullptr;
2577 assert(op1->OperIsList());
2578 assert(containingNode->gtGetOp2() == nullptr);
2580 GenTreeArgList* argList = op1->AsArgList();
2582 op1 = argList->Current();
2583 argList = argList->Rest();
2585 op2 = argList->Current();
2586 argList = argList->Rest();
2588 assert(node == op2);
2590 op3 = argList->Current();
2592 // The upper two bits of the immediate value are ignored if
2593 // op2 comes from memory. In order to support using the upper
2594 // bits, we need to disable containment support if op3 is not
2595 // constant or if the constant is greater than 0x3F (which means
2596 // at least one of the upper two bits is set).
2598 if (op3->IsCnsIntOrI())
2600 ssize_t ival = op3->AsIntCon()->IconValue();
2601 assert((ival >= 0) && (ival <= 255));
2603 supportsSIMDScalarLoads = (ival <= 0x3F);
2604 supportsGeneralLoads = supportsSIMDScalarLoads;
2609 // We should only get here for integral nodes.
2610 assert(varTypeIsIntegral(node->TypeGet()));
2612 assert(supportsAlignedSIMDLoads == false);
2613 assert(supportsUnalignedSIMDLoads == false);
2614 assert(supportsSIMDScalarLoads == false);
2616 const unsigned expectedSize = genTypeSize(containingNode->gtSIMDBaseType);
2617 const unsigned operandSize = genTypeSize(node->TypeGet());
2619 supportsGeneralLoads = (operandSize >= expectedSize);
2623 case NI_AVX_CompareScalar:
2625 // These intrinsics only expect 16 or 32-byte nodes for containment
2626 assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32));
2628 assert(supportsAlignedSIMDLoads == false);
2629 assert(supportsUnalignedSIMDLoads == false);
2631 supportsSIMDScalarLoads = true;
2632 supportsGeneralLoads = supportsSIMDScalarLoads;
2638 assert(supportsAlignedSIMDLoads == false);
2639 assert(supportsGeneralLoads == false);
2640 assert(supportsSIMDScalarLoads == false);
2641 assert(supportsUnalignedSIMDLoads == false);
2648 case HW_Category_SIMDScalar:
2650 assert(supportsAlignedSIMDLoads == false);
2651 assert(supportsUnalignedSIMDLoads == false);
2653 switch (containingIntrinsicId)
2655 case NI_Base_Vector128_CreateScalarUnsafe:
2656 case NI_Base_Vector256_CreateScalarUnsafe:
2658 assert(supportsSIMDScalarLoads == false);
2660 const unsigned expectedSize = genTypeSize(genActualType(containingNode->gtSIMDBaseType));
2661 const unsigned operandSize = genTypeSize(node->TypeGet());
2663 supportsGeneralLoads = (operandSize == expectedSize);
2667 case NI_SSE_ConvertScalarToVector128Single:
2668 case NI_SSE2_ConvertScalarToVector128Double:
2669 case NI_SSE2_ConvertScalarToVector128Int32:
2670 case NI_SSE2_ConvertScalarToVector128UInt32:
2671 case NI_SSE_X64_ConvertScalarToVector128Single:
2672 case NI_SSE2_X64_ConvertScalarToVector128Double:
2673 case NI_SSE2_X64_ConvertScalarToVector128Int64:
2674 case NI_SSE2_X64_ConvertScalarToVector128UInt64:
2676 if (!varTypeIsIntegral(node->TypeGet()))
2678 // The floating-point overload doesn't require any special semantics
2679 assert(containingIntrinsicId == NI_SSE2_ConvertScalarToVector128Double);
2680 supportsSIMDScalarLoads = true;
2681 supportsGeneralLoads = supportsSIMDScalarLoads;
2685 assert(supportsSIMDScalarLoads == false);
2687 const unsigned expectedSize = genTypeSize(genActualType(containingNode->gtSIMDBaseType));
2688 const unsigned operandSize = genTypeSize(node->TypeGet());
2690 supportsGeneralLoads = (operandSize == expectedSize);
2696 // These intrinsics only expect 16 or 32-byte nodes for containment
2697 assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32));
2699 supportsSIMDScalarLoads = true;
2700 supportsGeneralLoads = supportsSIMDScalarLoads;
2707 case HW_Category_Scalar:
2709 // We should only get here for integral nodes.
2710 assert(varTypeIsIntegral(node->TypeGet()));
2712 assert(supportsAlignedSIMDLoads == false);
2713 assert(supportsUnalignedSIMDLoads == false);
2714 assert(supportsSIMDScalarLoads == false);
2716 unsigned expectedSize = genTypeSize(containingNode->TypeGet());
2717 const unsigned operandSize = genTypeSize(node->TypeGet());
2719 // CRC32 codegen depends on its second operand's type.
2720 // Currently, we are using SIMDBaseType to store the op2Type info.
2721 if (containingIntrinsicId == NI_SSE42_Crc32)
2723 var_types op2Type = containingNode->gtSIMDBaseType;
2724 expectedSize = genTypeSize(op2Type);
2727 supportsGeneralLoads = (operandSize >= expectedSize);
2733 assert(supportsAlignedSIMDLoads == false);
2734 assert(supportsGeneralLoads == false);
2735 assert(supportsSIMDScalarLoads == false);
2736 assert(supportsUnalignedSIMDLoads == false);
2741 noway_assert(supportsRegOptional != nullptr);
2742 *supportsRegOptional = supportsGeneralLoads;
2744 if (!node->OperIsHWIntrinsic())
2746 return supportsGeneralLoads && IsContainableMemoryOp(node);
2749 // TODO-XArch: Update this to be table driven, if possible.
2751 NamedIntrinsic intrinsicId = node->AsHWIntrinsic()->gtHWIntrinsicId;
2753 switch (intrinsicId)
2755 case NI_SSE_LoadAlignedVector128:
2756 case NI_SSE2_LoadAlignedVector128:
2757 case NI_AVX_LoadAlignedVector256:
2759 return supportsAlignedSIMDLoads;
2762 case NI_SSE_LoadScalarVector128:
2763 case NI_SSE2_LoadScalarVector128:
2765 return supportsSIMDScalarLoads;
2768 // VEX encoding supports unaligned memory ops, so we can fold them
2769 case NI_SSE_LoadVector128:
2770 case NI_SSE2_LoadVector128:
2771 case NI_AVX_LoadVector256:
2773 return supportsUnalignedSIMDLoads;
2778 assert(!node->isContainableHWIntrinsic());
2784 //----------------------------------------------------------------------------------------------
2785 // ContainCheckHWIntrinsic: Perform containment analysis for a hardware intrinsic node.
2788 // node - The hardware intrinsic node.
2790 void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
2792 NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
2793 HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId);
2794 int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
2795 var_types baseType = node->gtSIMDBaseType;
2797 GenTree* op1 = node->gtGetOp1();
2798 GenTree* op2 = node->gtGetOp2();
2799 GenTree* op3 = nullptr;
2801 if (!HWIntrinsicInfo::SupportsContainment(intrinsicId))
2803 // AVX2 gather intrinsics are not containable and always have a constant IMM argument
2804 if (HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsicId))
2806 GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node);
2807 assert(lastOp != nullptr);
2808 MakeSrcContained(node, lastOp);
2810 // Exit early if containment isn't supported
2814 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
2816 const bool isCommutative = HWIntrinsicInfo::IsCommutative(intrinsicId);
2820 // One argument intrinsics cannot be commutative
2821 assert(!isCommutative);
2823 assert(!op1->OperIsList());
2824 assert(op2 == nullptr);
2828 case HW_Category_SimpleSIMD:
2829 case HW_Category_SIMDScalar:
2830 case HW_Category_Scalar:
2832 switch (intrinsicId)
2834 case NI_SSE_ReciprocalScalar:
2835 case NI_SSE_ReciprocalSqrtScalar:
2836 case NI_SSE_SqrtScalar:
2837 case NI_SSE2_SqrtScalar:
2838 case NI_SSE41_CeilingScalar:
2839 case NI_SSE41_FloorScalar:
2840 case NI_SSE41_RoundCurrentDirectionScalar:
2841 case NI_SSE41_RoundToNearestIntegerScalar:
2842 case NI_SSE41_RoundToNegativeInfinityScalar:
2843 case NI_SSE41_RoundToPositiveInfinityScalar:
2844 case NI_SSE41_RoundToZeroScalar:
2846 // These intrinsics have both 1 and 2-operand overloads.
2848 // The 1-operand overload basically does `intrinsic(op1, op1)`
2850 // Because of this, the operand must be loaded into a register
2851 // and cannot be contained.
2855 case NI_SSE2_ConvertToInt32:
2856 case NI_SSE2_X64_ConvertToInt64:
2857 case NI_SSE2_ConvertToUInt32:
2858 case NI_SSE2_X64_ConvertToUInt64:
2859 case NI_AVX2_ConvertToInt32:
2860 case NI_AVX2_ConvertToUInt32:
2862 if (varTypeIsIntegral(baseType))
2864 // These intrinsics are "ins reg/mem, xmm" and don't
2865 // currently support containment.
2878 bool supportsRegOptional = false;
2880 if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
2882 MakeSrcContained(node, op1);
2884 else if (supportsRegOptional)
2886 op1->SetRegOptional();
2902 assert(!op1->OperIsList());
2903 assert(op2 != nullptr);
2904 assert(!op2->OperIsList());
2908 case HW_Category_SimpleSIMD:
2909 case HW_Category_SIMDScalar:
2910 case HW_Category_Scalar:
2912 if (HWIntrinsicInfo::GeneratesMultipleIns(intrinsicId))
2914 switch (intrinsicId)
2916 case NI_SSE_CompareLessThanOrderedScalar:
2917 case NI_SSE_CompareLessThanUnorderedScalar:
2918 case NI_SSE_CompareLessThanOrEqualOrderedScalar:
2919 case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
2920 case NI_SSE2_CompareLessThanOrderedScalar:
2921 case NI_SSE2_CompareLessThanUnorderedScalar:
2922 case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
2923 case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
2925 // We need to swap the operands for CompareLessThanOrEqual
2934 // TODO-XArch-CQ: The Compare*OrderedScalar and Compare*UnorderedScalar methods
2935 // are commutative if you also inverse the intrinsic.
2941 bool supportsRegOptional = false;
2943 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
2945 MakeSrcContained(node, op2);
2947 else if ((isCommutative || (intrinsicId == NI_BMI2_MultiplyNoFlags) ||
2948 (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) &&
2949 IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
2951 MakeSrcContained(node, op1);
2953 // Swap the operands here to make the containment checks in codegen significantly simpler
2957 else if (supportsRegOptional)
2959 op2->SetRegOptional();
2961 // TODO-XArch-CQ: For commutative nodes, either operand can be reg-optional.
2962 // https://github.com/dotnet/coreclr/issues/6361
2967 case HW_Category_IMM:
2969 // We don't currently have any IMM intrinsics which are also commutative
2970 assert(!isCommutative);
2971 bool supportsRegOptional = false;
2973 switch (intrinsicId)
2975 case NI_SSE2_ShiftLeftLogical:
2976 case NI_SSE2_ShiftRightArithmetic:
2977 case NI_SSE2_ShiftRightLogical:
2978 case NI_AVX2_ShiftLeftLogical:
2979 case NI_AVX2_ShiftRightArithmetic:
2980 case NI_AVX2_ShiftRightLogical:
2982 // These intrinsics can have op2 be imm or reg/mem
2984 if (!HWIntrinsicInfo::isImmOp(intrinsicId, op2))
2986 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
2988 MakeSrcContained(node, op2);
2990 else if (supportsRegOptional)
2992 op2->SetRegOptional();
2998 case NI_SSE2_Shuffle:
2999 case NI_SSE2_ShuffleHigh:
3000 case NI_SSE2_ShuffleLow:
3001 case NI_AVX2_Permute4x64:
3003 // These intrinsics have op2 as an imm and op1 as a reg/mem
3005 if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
3007 MakeSrcContained(node, op1);
3009 else if (supportsRegOptional)
3011 op1->SetRegOptional();
3016 case NI_AVX_Permute:
3018 // These intrinsics can have op2 be imm or reg/mem
3019 // They also can have op1 be reg/mem and op2 be imm
3021 if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
3023 if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
3025 MakeSrcContained(node, op1);
3027 else if (supportsRegOptional)
3029 op1->SetRegOptional();
3032 else if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
3034 MakeSrcContained(node, op2);
3036 else if (supportsRegOptional)
3038 op2->SetRegOptional();
3043 case NI_AES_KeygenAssist:
3045 if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
3047 MakeSrcContained(node, op1);
3049 else if (supportsRegOptional)
3051 op1->SetRegOptional();
3065 case HW_Category_Special:
3067 if (intrinsicId == NI_SSE2_CompareLessThan)
3069 bool supportsRegOptional = false;
3071 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
3073 MakeSrcContained(node, op2);
3075 else if (supportsRegOptional)
3077 op2->SetRegOptional();
3094 else if (numArgs == 3)
3096 // three argument intrinsics should not be marked commutative
3097 assert(!isCommutative);
3099 assert(op1->OperIsList());
3100 assert(op2 == nullptr);
3102 GenTreeArgList* argList = op1->AsArgList();
3103 GenTreeArgList* originalArgList = argList;
3105 op1 = argList->Current();
3106 argList = argList->Rest();
3108 op2 = argList->Current();
3109 argList = argList->Rest();
3111 op3 = argList->Current();
3112 assert(argList->Rest() == nullptr);
3116 case HW_Category_SimpleSIMD:
3117 case HW_Category_SIMDScalar:
3118 case HW_Category_Scalar:
3120 if ((intrinsicId >= NI_FMA_MultiplyAdd) && (intrinsicId <= NI_FMA_MultiplySubtractNegatedScalar))
3122 bool supportsRegOptional = false;
3124 if (IsContainableHWIntrinsicOp(node, op3, &supportsRegOptional))
3126 // 213 form: op1 = (op2 * op1) + [op3]
3127 MakeSrcContained(node, op3);
3129 else if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
3131 // 132 form: op1 = (op1 * op3) + [op2]
3132 MakeSrcContained(node, op2);
3134 else if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
3136 // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
3138 if (!HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
3140 // 231 form: op3 = (op2 * op3) + [op1]
3141 MakeSrcContained(node, op1);
3146 assert(supportsRegOptional);
3148 // TODO-XArch-CQ: Technically any one of the three operands can
3149 // be reg-optional, with the limitation that op1 can only be
3150 // reg-optional if CopyUpperBits is off.
3151 // https://github.com/dotnet/coreclr/issues/6361
3153 // 213 form: op1 = (op2 * op1) + op3
3154 op3->SetRegOptional();
3159 bool supportsRegOptional = false;
3161 switch (intrinsicId)
3163 case NI_SSE41_BlendVariable:
3164 case NI_AVX_BlendVariable:
3165 case NI_AVX2_BlendVariable:
3167 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
3169 MakeSrcContained(node, op2);
3171 else if (supportsRegOptional)
3173 op2->SetRegOptional();
3178 case NI_BMI2_MultiplyNoFlags:
3179 case NI_BMI2_X64_MultiplyNoFlags:
3181 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
3183 MakeSrcContained(node, op2);
3185 else if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
3187 MakeSrcContained(node, op1);
3188 // MultiplyNoFlags is a commutative operation, so swap the first two operands here
3189 // to make the containment checks in codegen significantly simpler
3190 *(originalArgList->pCurrent()) = op2;
3191 *(originalArgList->Rest()->pCurrent()) = op1;
3193 else if (supportsRegOptional)
3195 op2->SetRegOptional();
3209 case HW_Category_IMM:
3211 bool supportsRegOptional = false;
3213 switch (intrinsicId)
3215 case NI_SSE_Shuffle:
3216 case NI_SSE2_Insert:
3217 case NI_SSE2_Shuffle:
3218 case NI_SSSE3_AlignRight:
3219 case NI_SSE41_Blend:
3220 case NI_SSE41_DotProduct:
3221 case NI_SSE41_Insert:
3222 case NI_SSE41_X64_Insert:
3223 case NI_SSE41_MultipleSumAbsoluteDifferences:
3225 case NI_AVX_Compare:
3226 case NI_AVX_CompareScalar:
3227 case NI_AVX_DotProduct:
3228 case NI_AVX_Permute2x128:
3229 case NI_AVX_Shuffle:
3231 case NI_AVX2_MultipleSumAbsoluteDifferences:
3232 case NI_AVX2_Permute2x128:
3233 case NI_PCLMULQDQ_CarrylessMultiply:
3235 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
3237 MakeSrcContained(node, op2);
3239 else if (supportsRegOptional)
3241 op2->SetRegOptional();
3267 if (HWIntrinsicInfo::lookupCategory(intrinsicId) == HW_Category_IMM)
3269 GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node);
3270 assert(lastOp != nullptr);
3272 if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && lastOp->IsCnsIntOrI())
3274 MakeSrcContained(node, lastOp);
3279 #endif // FEATURE_HW_INTRINSICS
3281 //------------------------------------------------------------------------
3282 // ContainCheckFloatBinary: determine whether the sources of a floating point binary node should be contained.
3285 // node - pointer to the node
3287 void Lowering::ContainCheckFloatBinary(GenTreeOp* node)
3289 assert(node->OperIs(GT_ADD, GT_SUB, GT_MUL, GT_DIV) && varTypeIsFloating(node));
3291 // overflow operations aren't supported on float/double types.
3292 assert(!node->gtOverflowEx());
3294 GenTree* op1 = node->gtGetOp1();
3295 GenTree* op2 = node->gtGetOp2();
3297 // No implicit conversions at this stage as the expectation is that
3298 // everything is made explicit by adding casts.
3299 assert(op1->TypeGet() == op2->TypeGet());
3301 bool isSafeToContainOp1 = true;
3302 bool isSafeToContainOp2 = true;
3304 if (op2->IsCnsNonZeroFltOrDbl())
3306 MakeSrcContained(node, op2);
3308 else if (IsContainableMemoryOp(op2))
3310 isSafeToContainOp2 = IsSafeToContainMem(node, op2);
3311 if (isSafeToContainOp2)
3313 MakeSrcContained(node, op2);
3317 if (!op2->isContained() && node->OperIsCommutative())
3319 // Though we have GT_ADD(op1=memOp, op2=non-memOp), we try to reorder the operands
3320 // as long as it is safe so that the following efficient code sequence is generated:
3321 // addss/sd targetReg, memOp (if op1Reg == targetReg) OR
3322 // movaps targetReg, op2Reg; addss/sd targetReg, [memOp]
3325 // Instead of:  movss op1Reg, [memOp]; addss/sd targetReg, Op2Reg   (if op1Reg == targetReg) OR
3326 //              movss op1Reg, [memOp]; movaps targetReg, op1Reg; addss/sd targetReg, Op2Reg
3328 if (op1->IsCnsNonZeroFltOrDbl())
3330 MakeSrcContained(node, op1);
3332 else if (IsContainableMemoryOp(op1))
3334 isSafeToContainOp1 = IsSafeToContainMem(node, op1);
3335 if (isSafeToContainOp1)
3337 MakeSrcContained(node, op1);
3342 if (!op1->isContained() && !op2->isContained())
3344 // If there are no containable operands, we can make an operand reg optional.
3345 // IsSafeToContainMem is expensive so we call it at most once for each operand
3346 // in this method. If we already called IsSafeToContainMem, it must have returned false;
3347 // otherwise, the corresponding operand (op1 or op2) would be contained.
3348 isSafeToContainOp1 = isSafeToContainOp1 && IsSafeToContainMem(node, op1);
3349 isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2);
3350 SetRegOptionalForBinOp(node, isSafeToContainOp1, isSafeToContainOp2);
3354 #endif // _TARGET_XARCH_