1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
8 XX Amd64 SIMD Code Generator XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
18 #ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator.
25 #include "gcinfoencoder.h"
29 // Instruction immediates
32 // - bits 6 and 7 of the immediate indicate which source item to select (0..3)
33 // - bits 4 and 5 of the immediate indicate which target item to insert into (0..3)
34 // - bits 0 to 3 of the immediate indicate which target item to zero
35 #define INSERTPS_SOURCE_SELECT(i) (i<<6)
36 #define INSERTPS_TARGET_SELECT(i) (i<<4)
37 #define INSERTPS_ZERO(i) (1<<i)
39 // getOpForSIMDIntrinsic: return the opcode for the given SIMD Intrinsic
42 // intrinsicId - SIMD intrinsic Id
43 // baseType - Base type of the SIMD vector
44 // immed - Out param. Any immediate byte operand that needs to be passed to SSE2 opcode
48 // Instruction (op) to be used, and immed is set if instruction requires an immediate operand.
51 CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId,
53 unsigned *ival /*=nullptr*/)
55 // Minimal required instruction set is SSE2.
56 assert(compiler->canUseSSE2());
58 instruction result = INS_invalid;
61 case SIMDIntrinsicInit:
62 if (compiler->canUseAVX())
64 // AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory.
65 // AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or mm reg.
66 // If we decide to use AVX2 only, we can remove this assert.
67 if ((compiler->opts.eeFlags & CORJIT_FLG_USE_AVX2) == 0)
69 assert(baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
73 case TYP_FLOAT: result = INS_vbroadcastss; break;
74 case TYP_DOUBLE: result = INS_vbroadcastsd; break;
75 case TYP_ULONG: __fallthrough;
76 case TYP_LONG: result = INS_vpbroadcastq; break;
77 case TYP_UINT: __fallthrough;
78 case TYP_INT: result = INS_vpbroadcastd; break;
79 case TYP_CHAR: __fallthrough;
80 case TYP_SHORT: result = INS_vpbroadcastw; break;
81 case TYP_UBYTE: __fallthrough;
82 case TYP_BYTE: result = INS_vpbroadcastb; break;
87 // For SSE, SIMDIntrinsicInit uses the same instruction as the SIMDIntrinsicShuffleSSE2 intrinsic.
89 case SIMDIntrinsicShuffleSSE2:
90 if (baseType == TYP_FLOAT)
94 else if (baseType == TYP_DOUBLE)
98 else if (baseType == TYP_INT || baseType == TYP_UINT)
102 else if (baseType == TYP_LONG || baseType == TYP_ULONG)
104 // We don't have a seperate SSE2 instruction and will
105 // use the instruction meant for doubles since it is
106 // of the same size as a long.
111 case SIMDIntrinsicSqrt:
112 if (baseType == TYP_FLOAT)
116 else if (baseType == TYP_DOUBLE)
126 case SIMDIntrinsicAdd:
127 if (baseType == TYP_FLOAT)
131 else if (baseType == TYP_DOUBLE)
135 else if (baseType == TYP_INT || baseType == TYP_UINT)
139 else if (baseType == TYP_CHAR || baseType == TYP_SHORT)
143 else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
147 else if (baseType == TYP_LONG || baseType == TYP_ULONG)
153 case SIMDIntrinsicSub:
154 if (baseType == TYP_FLOAT)
158 else if (baseType == TYP_DOUBLE)
162 else if (baseType == TYP_INT || baseType == TYP_UINT)
166 else if (baseType == TYP_CHAR || baseType == TYP_SHORT)
170 else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
174 else if (baseType == TYP_LONG || baseType == TYP_ULONG)
180 case SIMDIntrinsicMul:
181 if (baseType == TYP_FLOAT)
185 else if (baseType == TYP_DOUBLE)
189 else if (baseType == TYP_SHORT)
193 else if (compiler->canUseAVX())
195 if (baseType == TYP_INT)
202 case SIMDIntrinsicDiv:
203 if (baseType == TYP_FLOAT)
207 else if (baseType == TYP_DOUBLE)
217 case SIMDIntrinsicMin:
218 if (baseType == TYP_FLOAT)
222 else if (baseType == TYP_DOUBLE)
226 else if (baseType == TYP_UBYTE)
230 else if (baseType == TYP_SHORT)
240 case SIMDIntrinsicMax:
241 if (baseType == TYP_FLOAT)
245 else if (baseType == TYP_DOUBLE)
249 else if (baseType == TYP_UBYTE)
253 else if (baseType == TYP_SHORT)
263 case SIMDIntrinsicEqual:
264 if (baseType == TYP_FLOAT)
267 assert(ival != nullptr);
270 else if (baseType == TYP_DOUBLE)
273 assert(ival != nullptr);
276 else if (baseType == TYP_INT || baseType == TYP_UINT)
278 result = INS_pcmpeqd;
280 else if (baseType == TYP_CHAR || baseType == TYP_SHORT)
282 result = INS_pcmpeqw;
284 else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
286 result = INS_pcmpeqb;
288 else if (compiler->canUseAVX() && (baseType == TYP_ULONG || baseType == TYP_LONG))
290 result = INS_pcmpeqq;
294 case SIMDIntrinsicLessThan:
295 // Packed integers use > with swapped operands
296 assert(baseType != TYP_INT);
298 if (baseType == TYP_FLOAT)
301 assert(ival != nullptr);
304 else if (baseType == TYP_DOUBLE)
307 assert(ival != nullptr);
312 case SIMDIntrinsicLessThanOrEqual:
313 // Packed integers use (a==b) || ( b > a) in place of a <= b.
314 assert(baseType != TYP_INT);
316 if (baseType == TYP_FLOAT)
319 assert(ival != nullptr);
322 else if (baseType == TYP_DOUBLE)
325 assert(ival != nullptr);
330 case SIMDIntrinsicGreaterThan:
331 // Packed float/double use < with swapped operands
332 assert(!varTypeIsFloating(baseType));
334 // SSE2 supports only signed >
335 if (baseType == TYP_INT)
337 result = INS_pcmpgtd;
339 else if (baseType == TYP_SHORT)
341 result = INS_pcmpgtw;
343 else if (baseType == TYP_BYTE)
345 result = INS_pcmpgtb;
347 else if (compiler->canUseAVX() && (baseType == TYP_LONG))
349 result = INS_pcmpgtq;
353 case SIMDIntrinsicBitwiseAnd:
354 if (baseType == TYP_FLOAT)
358 else if (baseType == TYP_DOUBLE)
362 else if (varTypeIsIntegral(baseType))
368 case SIMDIntrinsicBitwiseAndNot:
369 if (baseType == TYP_FLOAT)
373 else if (baseType == TYP_DOUBLE)
377 else if (baseType == TYP_INT)
381 else if (varTypeIsIntegral(baseType))
387 case SIMDIntrinsicBitwiseOr:
388 if (baseType == TYP_FLOAT)
392 else if (baseType == TYP_DOUBLE)
396 else if (varTypeIsIntegral(baseType))
402 case SIMDIntrinsicBitwiseXor:
403 if (baseType == TYP_FLOAT)
407 else if (baseType == TYP_DOUBLE)
411 else if (varTypeIsIntegral(baseType))
417 case SIMDIntrinsicCast:
421 case SIMDIntrinsicShiftLeftInternal:
422 // base type doesn't matter since the entire vector is shifted left
426 case SIMDIntrinsicShiftRightInternal:
427 // base type doesn't matter since the entire vector is shifted right
431 case SIMDIntrinsicUpperSave:
432 result = INS_vextractf128;
435 case SIMDIntrinsicUpperRestore:
436 result = INS_insertps;
440 assert(!"Unsupported SIMD intrinsic");
444 noway_assert(result != INS_invalid);
448 // genSIMDScalarMove: Generate code to move a value of type "type" from src mm reg
449 // to target mm reg, zeroing out the upper bits if and only if specified.
452 // type the type of value to be moved
453 // targetReg the target reg
454 // srcReg the src reg
455 // moveType action to be performed on target upper bits
461 // This is currently only supported for floating point types.
464 CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
466 var_types targetType = compiler->getSIMDVectorType();
467 assert(varTypeIsFloating(type));
468 #ifdef FEATURE_AVX_SUPPORT
469 if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
473 case SMT_PreserveUpper:
474 if (srcReg != targetReg)
476 instruction ins = ins_Store(type);
477 if (getEmitter()->IsThreeOperandMoveAVXInstruction(ins))
479 // In general, when we use a three-operands move instruction, we want to merge the src with itself.
480 // This is an exception in that we actually want the "merge" behavior, so we must specify it with
482 inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(targetType));
486 inst_RV_RV(ins, targetReg, srcReg, targetType, emitTypeSize(targetType));
491 case SMT_ZeroInitUpper:
493 // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
494 // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose
495 // to zero all but the lower bits.
496 unsigned int insertpsImm = (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
497 inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
501 case SMT_ZeroInitUpper_SrcHasUpperZeros:
502 if (srcReg != targetReg)
504 instruction ins = ins_Copy(type);
505 assert(!getEmitter()->IsThreeOperandMoveAVXInstruction(ins));
506 inst_RV_RV(ins, targetReg, srcReg, targetType, emitTypeSize(targetType));
515 #endif // FEATURE_AVX_SUPPORT
521 case SMT_PreserveUpper:
522 if (srcReg != targetReg)
524 inst_RV_RV(ins_Store(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
528 case SMT_ZeroInitUpper:
529 if (srcReg == targetReg)
531 // There is no guarantee that upper bits of op1Reg are zero.
532 // We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes.
533 instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, type);
534 getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
535 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, type);
536 getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
540 genSIMDZero(targetType, TYP_FLOAT, targetReg);
541 inst_RV_RV(ins_Store(type), targetReg, srcReg);
545 case SMT_ZeroInitUpper_SrcHasUpperZeros:
546 if (srcReg != targetReg)
548 inst_RV_RV(ins_Copy(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
559 CodeGen::genSIMDZero(var_types targetType, var_types baseType, regNumber targetReg)
562 instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, baseType);
563 inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
566 //------------------------------------------------------------------------
567 // genSIMDIntrinsicInit: Generate code for SIMD Intrinsic Initialize.
570 // simdNode - The GT_SIMD node
576 CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
578 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInit);
580 GenTree* op1 = simdNode->gtGetOp1();
581 var_types baseType = simdNode->gtSIMDBaseType;
582 regNumber targetReg = simdNode->gtRegNum;
583 assert(targetReg != REG_NA);
584 var_types targetType = simdNode->TypeGet();
585 InstructionSet iset = compiler->getSIMDInstructionSet();
586 unsigned size = simdNode->gtSIMDSize;
588 // Should never see small int base type vectors except for zero initialization.
589 noway_assert(!varTypeIsSmallInt(baseType) || op1->IsIntegralConst(0));
591 instruction ins = INS_invalid;
592 if (op1->isContained())
594 if (op1->IsIntegralConst(0) || op1->IsFPZero())
596 genSIMDZero(targetType, baseType, targetReg);
598 else if (varTypeIsIntegral(baseType) && op1->IsIntegralConst(-1))
600 // case of initializing elements of vector with all 1's
601 // generate pcmpeqd reg, reg
602 ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
603 inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
605 #ifdef FEATURE_AVX_SUPPORT
608 assert(iset == InstructionSet_AVX);
609 ins = getOpForSIMDIntrinsic(SIMDIntrinsicInit, baseType);
610 if (op1->IsCnsFltOrDbl())
612 getEmitter()->emitInsBinary(ins, emitTypeSize(targetType), simdNode, op1);
614 else if (op1->OperIsLocalAddr())
616 unsigned offset = (op1->OperGet() == GT_LCL_FLD_ADDR) ? op1->gtLclFld.gtLclOffs : 0;
617 getEmitter()->emitIns_R_S(ins, emitTypeSize(targetType), targetReg, op1->gtLclVarCommon.gtLclNum, offset);
624 #endif // FEATURE_AVX_SUPPORT
626 else if (iset == InstructionSet_AVX && ((size == 32) || (size == 16)))
628 regNumber srcReg = genConsumeReg(op1);
629 if (baseType == TYP_INT || baseType == TYP_UINT ||
630 baseType == TYP_LONG || baseType == TYP_ULONG)
632 ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
633 assert(ins != INS_invalid);
634 inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
638 ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
639 getEmitter()->emitIns_R_R(ins, emitActualTypeSize(targetType), targetReg, srcReg);
643 // If we reach here, op1 is not contained and we are using SSE or it is a SubRegisterSIMDType.
644 // In either case we are going to use the SSE2 shuffle instruction.
646 regNumber op1Reg = genConsumeReg(op1);
647 unsigned shuffleControl = 0;
649 if (compiler->isSubRegisterSIMDType(simdNode))
651 assert(baseType == TYP_FLOAT);
653 // We cannot assume that upper bits of op1Reg or targetReg be zero.
654 // Therefore we need to explicitly zero out upper bits. This is
655 // essential for the shuffle operation performed below.
657 // If op1 is a float/double constant, we would have loaded it from
658 // data section using movss/sd. Similarly if op1 is a memory op we
659 // would have loaded it using movss/sd. Movss/sd when loading a xmm reg
660 // from memory would zero-out upper bits. In these cases we can
661 // avoid explicitly zero'ing out targetReg if targetReg and op1Reg are the same or do it more efficiently
662 // if they are not the same.
663 SIMDScalarMoveType moveType = op1->IsCnsFltOrDbl() || op1->isMemoryOp()
664 ? SMT_ZeroInitUpper_SrcHasUpperZeros
667 genSIMDScalarMove(TYP_FLOAT, targetReg, op1Reg, moveType);
671 shuffleControl = 0x50;
675 shuffleControl = 0x40;
679 noway_assert(!"Unexpected size for SIMD type");
684 if (op1Reg != targetReg)
686 if (varTypeIsFloating(baseType))
688 ins = ins_Copy(targetType);
690 else if (baseType == TYP_INT || baseType == TYP_UINT ||
691 baseType == TYP_LONG || baseType == TYP_ULONG)
693 ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
696 assert(ins != INS_invalid);
697 inst_RV_RV(ins, targetReg, op1Reg, baseType, emitTypeSize(baseType));
701 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
702 getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, shuffleControl);
705 genProduceReg(simdNode);
708 //-------------------------------------------------------------------------------------------
709 // genSIMDIntrinsicInitN: Generate code for SIMD Intrinsic Initialize for the form that takes
710 // a number of arguments equal to the length of the Vector.
713 // simdNode - The GT_SIMD node
719 CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
721 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInitN);
723 // Right now this intrinsic is supported only on TYP_FLOAT vectors
724 var_types baseType = simdNode->gtSIMDBaseType;
725 noway_assert(baseType == TYP_FLOAT);
727 regNumber targetReg = simdNode->gtRegNum;
728 assert(targetReg != REG_NA);
730 var_types targetType = simdNode->TypeGet();
732 // Note that we cannot use targetReg before consumed all source operands. Therefore,
733 // Need an internal register to stitch together all the values into a single vector
735 assert(simdNode->gtRsvdRegs != RBM_NONE);
736 assert(genCountBits(simdNode->gtRsvdRegs) == 1);
737 regNumber vectorReg = genRegNumFromMask(simdNode->gtRsvdRegs);
739 // Zero out vectorReg if we are constructing a vector whose size is not equal to targetType vector size.
740 // For example in case of Vector4f we don't need to zero when using SSE2.
741 if (compiler->isSubRegisterSIMDType(simdNode))
743 genSIMDZero(targetType, baseType, vectorReg);
746 unsigned int baseTypeSize = genTypeSize(baseType);
747 instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
749 // We will first consume the list items in execution (left to right) order,
750 // and record the registers.
751 regNumber operandRegs[SIMD_INTRINSIC_MAX_PARAM_COUNT];
752 unsigned initCount = 0;
753 for (GenTree* list = simdNode->gtGetOp1(); list != nullptr; list = list->gtGetOp2())
755 assert(list->OperGet() == GT_LIST);
756 GenTree* listItem = list->gtGetOp1();
757 assert(listItem->TypeGet() == baseType);
758 assert(!listItem->isContained());
759 regNumber operandReg = genConsumeReg(listItem);
760 operandRegs[initCount] = operandReg;
764 unsigned int offset = 0;
765 for (unsigned i = 0; i < initCount; i++)
767 // We will now construct the vector from the list items in reverse order.
768 // This allows us to efficiently stitch together a vector as follows:
769 // vectorReg = (vectorReg << offset)
770 // VectorReg[0] = listItemReg
771 // Use genSIMDScalarMove with SMT_PreserveUpper in order to ensure that the upper
772 // bits of vectorReg are not modified.
774 regNumber operandReg = operandRegs[initCount - i - 1];
777 getEmitter()->emitIns_R_I(insLeftShift, EA_16BYTE, vectorReg, baseTypeSize);
779 genSIMDScalarMove(baseType, vectorReg, operandReg, SMT_PreserveUpper);
781 offset += baseTypeSize;
784 noway_assert(offset == simdNode->gtSIMDSize);
786 // Load the initialized value.
787 if (targetReg != vectorReg)
789 inst_RV_RV(ins_Copy(targetType), targetReg, vectorReg, targetType, emitActualTypeSize(targetType));
791 genProduceReg(simdNode);
794 //----------------------------------------------------------------------------------
795 // genSIMDIntrinsicUnOp: Generate code for SIMD Intrinsic unary operations like sqrt.
798 // simdNode - The GT_SIMD node
804 CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode)
806 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSqrt || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicCast);
808 GenTree* op1 = simdNode->gtGetOp1();
809 var_types baseType = simdNode->gtSIMDBaseType;
810 regNumber targetReg = simdNode->gtRegNum;
811 assert(targetReg != REG_NA);
812 var_types targetType = simdNode->TypeGet();
814 regNumber op1Reg = genConsumeReg(op1);
815 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
816 if (simdNode->gtSIMDIntrinsicID != SIMDIntrinsicCast || targetReg != op1Reg)
818 inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
820 genProduceReg(simdNode);
823 //--------------------------------------------------------------------------------
824 // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations
825 // add, sub, mul, bit-wise And, AndNot and Or.
828 // simdNode - The GT_SIMD node
834 CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
836 assert( simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd ||
837 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub ||
838 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul ||
839 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv ||
840 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd ||
841 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAndNot ||
842 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr ||
843 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseXor ||
844 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMin ||
845 simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMax
848 GenTree* op1 = simdNode->gtGetOp1();
849 GenTree* op2 = simdNode->gtGetOp2();
850 var_types baseType = simdNode->gtSIMDBaseType;
851 regNumber targetReg = simdNode->gtRegNum;
852 assert(targetReg != REG_NA);
853 var_types targetType = simdNode->TypeGet();
854 InstructionSet iset = compiler->getSIMDInstructionSet();
856 genConsumeOperands(simdNode);
857 regNumber op1Reg = op1->gtRegNum;
858 regNumber op2Reg = op2->gtRegNum;
859 regNumber otherReg = op2Reg;
862 // SSE2 doesn't have an instruction to perform this operation directly
863 // whereas SSE4.1 does (pmulld). This is special cased and computed
865 if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul &&
866 baseType == TYP_INT &&
867 iset == InstructionSet_SSE2)
869 // We need a temporary register that is NOT the same as the target,
870 // and we MAY need another.
871 assert(simdNode->gtRsvdRegs != RBM_NONE);
872 assert(genCountBits(simdNode->gtRsvdRegs) == 2);
874 regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
875 regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
876 tmpRegsMask &= ~tmpReg1Mask;
877 regNumber tmpReg = genRegNumFromMask(tmpReg1Mask);
878 regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
879 // The register allocator guarantees the following conditions:
880 // - the only registers that may be the same among op1Reg, op2Reg, tmpReg
881 // and tmpReg2 are op1Reg and op2Reg.
882 // Let's be extra-careful and assert that now.
883 assert((op1Reg != tmpReg) && (op1Reg != tmpReg2) &&
884 (op2Reg != tmpReg) && (op2Reg != tmpReg2) &&
885 (tmpReg != tmpReg2));
887 // We will start by setting things up so that:
888 // - We have op1 in op1Reg and targetReg, and they are different registers.
889 // - We have op2 in op2Reg and tmpReg
890 // - Either we will leave the input registers (the original op1Reg and op2Reg) unmodified,
891 // OR they are the targetReg that will be produced.
892 // (Note that in the code we generate below op1Reg and op2Reg are never written.)
893 // We will copy things as necessary to ensure that this is the case.
894 // Note that we can swap op1 and op2, since multiplication is commutative.
895 // We will not modify the values in op1Reg and op2Reg.
896 // (Though note that if either op1 or op2 is the same as targetReg, we will make
897 // a copy and use that copy as the input register. In that case we WILL modify
898 // the original value in the register, but will wind up with the result in targetReg
899 // in the end, as expected.)
901 // First, we need a tmpReg that is NOT the same as targetReg.
902 // Note that if we have another reg that is the same as targetReg,
903 // we can use tmpReg2 for that case, as we will not have hit this case.
904 if (tmpReg == targetReg)
909 if (op2Reg == targetReg)
911 // We will swap the operands.
912 // Since the code below only deals with registers, this now becomes the case where
913 // op1Reg == targetReg.
917 if (op1Reg == targetReg)
919 // Copy op1, and make tmpReg2 the new op1Reg.
920 // Note that those regs can't be the same, as we asserted above.
921 // Also, we know that tmpReg2 hasn't been used, because we couldn't have hit
922 // the "tmpReg == targetReg" case.
923 inst_RV_RV(INS_movaps, tmpReg2, op1Reg, targetType, emitActualTypeSize(targetType));
925 inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType));
926 // However, we have one more case to worry about: what if op2Reg is also targetReg
927 // (i.e. we have the same operand as op1 and op2)?
928 // In that case we will set op2Reg to the same register as op1Reg.
929 if (op2Reg == targetReg)
936 // Copy op1 to targetReg and op2 to tmpReg.
937 inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
938 inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType));
940 // Let's assert that things are as we expect.
941 // - We have op1 in op1Reg and targetReg, and they are different registers.
942 assert(op1Reg != targetReg);
943 // - We have op2 in op2Reg and tmpReg, and they are different registers.
944 assert(op2Reg != tmpReg);
945 // - Either we are going to leave op1's reg unmodified, or it is the targetReg.
946 assert((op1->gtRegNum == op1Reg) || (op1->gtRegNum == op2Reg) || (op1->gtRegNum == targetReg));
947 // - Similarly, we are going to leave op2's reg unmodified, or it is the targetReg.
948 assert((op2->gtRegNum == op1Reg) || (op2->gtRegNum == op2Reg) || (op2->gtRegNum == targetReg));
950 // Now we can generate the code.
952 // targetReg = op1 >> 4-bytes (op1 is already in targetReg)
953 getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), targetReg, 4);
955 // tmpReg = op2 >> 4-bytes (op2 is already in tmpReg)
956 getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg, 4);
958 // tmp = unsigned double word multiply of targetReg and tmpReg. Essentially
959 // tmpReg[63:0] = op1[1] * op2[1]
960 // tmpReg[127:64] = op1[3] * op2[3]
961 inst_RV_RV(INS_pmuludq, tmpReg, targetReg, targetType, emitActualTypeSize(targetType));
963 // Extract first and third double word results from tmpReg
964 // tmpReg = shuffle(0,0,2,0) of tmpReg
965 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, 0x08);
967 // targetReg[63:0] = op1[0] * op2[0]
968 // targetReg[127:64] = op1[2] * op2[2]
969 inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
970 inst_RV_RV(INS_pmuludq, targetReg, op2Reg, targetType, emitActualTypeSize(targetType));
972 // Extract first and third double word results from targetReg
973 // targetReg = shuffle(0,0,2,0) of targetReg
974 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, 0x08);
976 // pack the results into a single vector
977 inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
981 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
983 //Currently AVX doesn't support integer.
984 //if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX.
985 if (op1Reg != targetReg &&
986 compiler->canUseAVX() &&
987 !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) &&
988 getEmitter()->IsThreeOperandAVXInstruction(ins))
990 inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
994 if (op2Reg == targetReg)
998 else if (op1Reg != targetReg)
1000 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1003 inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
1007 // Vector2/3 div: since the top-most elements will be zero, we end up
1008 // perfoming 0/0 which is a NAN. Therefore, post division we need to set the
1009 // top-most elements to zero. This is achieved by left logical shift followed
1010 // by right logical shift of targetReg.
1011 if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv && (simdNode->gtSIMDSize < 16))
1013 // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length.
1014 unsigned shiftCount = 16 - simdNode->gtSIMDSize;
1015 assert(shiftCount != 0);
1016 instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
1017 getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
1018 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
1019 getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
1022 genProduceReg(simdNode);
1025 //--------------------------------------------------------------------------------
1026 // genSIMDIntrinsicRelOp: Generate code for a SIMD Intrinsic relational operater
1027 // <, <=, >, >= and ==
1030 // simdNode - The GT_SIMD node
1036 CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
1038 GenTree* op1 = simdNode->gtGetOp1();
1039 GenTree* op2 = simdNode->gtGetOp2();
1040 var_types baseType = simdNode->gtSIMDBaseType;
1041 regNumber targetReg = simdNode->gtRegNum;
1042 assert(targetReg != REG_NA);
1043 var_types targetType = simdNode->TypeGet();
1044 InstructionSet iset = compiler->getSIMDInstructionSet();
1046 genConsumeOperands(simdNode);
1047 regNumber op1Reg = op1->gtRegNum;
1048 regNumber op2Reg = op2->gtRegNum;
1049 regNumber otherReg = op2Reg;
1051 switch(simdNode->gtSIMDIntrinsicID)
1053 case SIMDIntrinsicEqual:
1054 case SIMDIntrinsicGreaterThan:
1056 // SSE2: vector<(u)long> relation op should be implemented in terms of TYP_INT comparison operations
1057 assert(((iset == InstructionSet_AVX) || (baseType != TYP_LONG)) &&
1058 (baseType != TYP_ULONG));
1060 // Greater-than: Floating point vectors use "<" with swapped operands
1061 if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGreaterThan)
1063 assert(!varTypeIsFloating(baseType));
1067 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival);
1069 // targetReg = op1reg > op2reg
1070 // Therefore, we can optimize if op1Reg == targetReg
1072 if (op1Reg != targetReg)
1074 if (op2Reg == targetReg)
1076 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicEqual);
1081 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1085 if (varTypeIsFloating(baseType))
1087 getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, otherReg, ival);
1091 inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
1096 case SIMDIntrinsicLessThan:
1097 case SIMDIntrinsicLessThanOrEqual:
1099 // Int vectors use ">" and ">=" with swapped operands
1100 assert(varTypeIsFloating(baseType));
1102 // Get the instruction opcode for compare operation
1104 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival);
1106 // targetReg = op1reg RelOp op2reg
1107 // Thefore, we can optimize if op1Reg == targetReg
1108 if (op1Reg != targetReg)
1110 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1113 getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, op2Reg, ival);
1117 // (In)Equality that produces bool result instead of a bit vector
1118 case SIMDIntrinsicOpEquality:
1119 case SIMDIntrinsicOpInEquality:
1121 assert(genIsValidIntReg(targetReg));
1123 // We need two additional XMM register as scratch
1124 assert(simdNode->gtRsvdRegs != RBM_NONE);
1125 assert(genCountBits(simdNode->gtRsvdRegs) == 2);
1127 regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
1128 regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
1129 tmpRegsMask &= ~tmpReg1Mask;
1130 regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
1131 regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
1132 var_types simdType = op1->TypeGet();
1133 // TODO-1stClassStructs: Temporary to minimize asmDiffs
1134 if (simdType == TYP_DOUBLE)
1135 simdType = TYP_SIMD8;
1137 // Here we should consider TYP_SIMD12 operands as if they were TYP_SIMD16
1138 // since both the operands will be in XMM registers.
1139 if (simdType == TYP_SIMD12)
1141 simdType = TYP_SIMD16;
1144 // tmpReg1 = (op1Reg == op2Reg)
1145 // Call this value of tmpReg1 as 'compResult' for further reference below.
1146 regNumber otherReg = op2Reg;
1147 if (tmpReg1 != op2Reg)
1149 if (tmpReg1 != op1Reg)
1151 inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
1159 // For all integer types we can use TYP_INT comparison.
1161 instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);
1163 if (varTypeIsFloating(baseType))
1165 getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
1169 inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
1172 // If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
1173 if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
1175 // Reduce tmpReg1 from 256-bits to 128-bits bitwise-Anding the lower and uppper 128-bits
1177 // Generated code sequence
1178 // - vextractf128 tmpReg2, tmpReg1, 0x01
1179 // tmpReg2[128..255] <- 0
1180 // tmpReg2[0..127] <- tmpReg1[128..255]
1181 // - vandps tmpReg1, tempReg2
1182 // This will zero-out upper portion of tmpReg1 and
1183 // lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
1184 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
1185 inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
1187 // Next, if we have more than 8 bytes, and the two 8-byte halves to get a 8-byte result.
1188 if (simdType != TYP_SIMD8)
1190 // tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
1191 // Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
1192 getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
1194 // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
1196 // Note that what we have computed is as follows at this point:
1197 // tmpReg1[0] = compResult[0] & compResult[2]
1198 // tmpReg1[1] = compResult[1] & compResult[3]
1199 inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
1201 // At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
1202 // OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.
1204 // tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
1205 // tmpReg2[0] = compResult[1] & compResult[3]
1206 getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
1208 // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
1209 // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
1210 inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
1212 // targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
1213 // (Note that for mov_xmm2i, the int register is always in the reg2 position.
1214 inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
1216 // Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
1218 // cmp targetReg, 0xFFFFFFFF
1220 // movzx targetReg, targetReg
1223 // cmp targetReg, 0xFFFFFFFF
1225 // movzx targetReg, targetReg
1227 getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
1228 inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg, TYP_INT, EA_1BYTE);
1229 assert(simdNode->TypeGet() == TYP_INT);
1230 // Set the higher bytes to 0
1231 inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
1236 noway_assert(!"Unimplemented SIMD relational operation.");
1240 genProduceReg(simdNode);
1244 //--------------------------------------------------------------------------------
1245 // genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product.
1248 // simdNode - The GT_SIMD node
1254 CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
1256 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct);
1258 GenTree* op1 = simdNode->gtGetOp1();
1259 GenTree* op2 = simdNode->gtGetOp2();
1260 var_types baseType = simdNode->gtSIMDBaseType;
1261 var_types simdType = op1->TypeGet();
1262 // TODO-1stClassStructs: Temporary to minimize asmDiffs
1263 if (simdType == TYP_DOUBLE)
1264 simdType = TYP_SIMD8;
1265 var_types simdEvalType = (simdType == TYP_SIMD12) ? TYP_SIMD16 : simdType;
1266 regNumber targetReg = simdNode->gtRegNum;
1267 assert(targetReg != REG_NA);
1269 // DotProduct is only supported on floating point types.
1270 var_types targetType = simdNode->TypeGet();
1271 assert(targetType == baseType);
1272 assert(varTypeIsFloating(baseType));
1274 genConsumeOperands(simdNode);
1275 regNumber op1Reg = op1->gtRegNum;
1276 regNumber op2Reg = op2->gtRegNum;
1278 regNumber tmpReg = REG_NA;
1279 // For SSE, or AVX with 32-byte vectors, we need an additional Xmm register as scratch.
1280 // However, it must be distinct from targetReg, so we request two from the register allocator.
1281 // Note that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
1282 if ((compiler->getSIMDInstructionSet() == InstructionSet_SSE2) || (simdEvalType == TYP_SIMD32))
1284 assert(simdNode->gtRsvdRegs != RBM_NONE);
1285 assert(genCountBits(simdNode->gtRsvdRegs) == 2);
1287 regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
1288 regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
1289 tmpRegsMask &= ~tmpReg1Mask;
1290 regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
1291 regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
1293 // Choose any register different from targetReg as tmpReg
1294 if (tmpReg1 != targetReg)
1300 assert(targetReg != tmpReg2);
1303 assert(tmpReg != REG_NA);
1304 assert(tmpReg != targetReg);
1307 if (compiler->getSIMDInstructionSet() == InstructionSet_SSE2)
1309 // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg
1310 if (op1Reg == targetReg)
1313 // nothing to do, we have registers in the right place
1315 else if (op2Reg == targetReg)
1321 inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
1324 // DotProduct(v1, v2)
1325 // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg
1326 if (baseType == TYP_FLOAT)
1329 // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its position
1330 // tmp = shuffle(tmp, tmp, Shuffle(2,3,0,1)) // tmp = (2, 3, 0, 1)
1331 // v0 = v0 + tmp // v0 = (3+2, 2+3, 1+0, 0+1)
1333 // tmp = shuffle(tmp, tmp, Shuffle(0,1,2,3)) // tmp = (0+1, 1+0, 2+3, 3+2)
1334 // v0 = v0 + tmp // v0 = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3)
1335 // // Essentially horizontal addtion of all elements.
1336 // // We could achieve the same using SSEv3 instruction HADDPS.
1338 inst_RV_RV(INS_mulps, targetReg, op2Reg);
1339 inst_RV_RV(INS_movaps, tmpReg, targetReg);
1340 inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg, tmpReg, 0xb1);
1341 inst_RV_RV(INS_addps, targetReg, tmpReg);
1342 inst_RV_RV(INS_movaps, tmpReg, targetReg);
1343 inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg, tmpReg, 0x1b);
1344 inst_RV_RV(INS_addps, targetReg, tmpReg);
1346 else if (baseType == TYP_DOUBLE)
1349 // tmp = v0 // v0 = (1, 0) - each element is given by its position
1350 // tmp = shuffle(tmp, tmp, Shuffle(0,1)) // tmp = (0, 1)
1351 // v0 = v0 + tmp // v0 = (1+0, 0+1)
1352 inst_RV_RV(INS_mulpd, targetReg, op2Reg);
1353 inst_RV_RV(INS_movaps, tmpReg, targetReg);
1354 inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg, tmpReg, 0x01);
1355 inst_RV_RV(INS_addpd, targetReg, tmpReg);
1364 // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg.
1365 // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually
1366 // use the 3-op form, so that we can avoid these copies.
1367 // TODO-CQ: Add inst_RV_RV_RV_IV().
1368 if (op1Reg == targetReg)
1371 // nothing to do, we have registers in the right place
1373 else if (op2Reg == targetReg)
1379 inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
1382 emitAttr emitSize = emitActualTypeSize(simdEvalType);
1383 if (baseType == TYP_FLOAT)
1385 // dpps computes the dot product of the upper & lower halves of the 32-byte register.
1386 // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
1387 inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, 0xf1);
1388 // If this is TYP_SIMD32, we need to combine the lower & upper results.
1389 if (simdEvalType == TYP_SIMD32)
1391 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, targetReg, 0x01);
1392 inst_RV_RV(INS_addps, targetReg, tmpReg, targetType, emitTypeSize(targetType));
1395 else if (baseType == TYP_DOUBLE)
1397 // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use
1399 assert(simdType == TYP_SIMD32);
1401 // targetReg = targetReg * op2Reg
1402 // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves
1403 // tmpReg = vextractf128(targetReg, 1) ; Moves the upper sum into tempReg
1404 // targetReg = targetReg + tmpReg
1405 inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType));
1406 inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType));
1407 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, targetReg, 0x01);
1408 inst_RV_RV(INS_addpd, targetReg, tmpReg, targetType, emitTypeSize(targetType));
1416 genProduceReg(simdNode);
1419 //------------------------------------------------------------------------------------
1420 // genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i.
1423 // simdNode - The GT_SIMD node
1429 CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
1431 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
1433 GenTree* op1 = simdNode->gtGetOp1();
1434 GenTree* op2 = simdNode->gtGetOp2();
1435 var_types simdType = op1->TypeGet();
1436 assert(varTypeIsSIMD(simdType));
1438 // op1 of TYP_SIMD12 should be considered as TYP_SIMD16,
1439 // since it is in XMM register.
1440 if (simdType == TYP_SIMD12)
1442 simdType = TYP_SIMD16;
1445 var_types baseType = simdNode->gtSIMDBaseType;
1446 regNumber targetReg = simdNode->gtRegNum;
1447 assert(targetReg != REG_NA);
1448 var_types targetType = simdNode->TypeGet();
1449 assert(targetType == genActualType(baseType));
1451 // GetItem has 2 operands:
1452 // - the source of SIMD type (op1)
1453 // - the index of the value to be returned.
1454 genConsumeOperands(simdNode);
1455 regNumber srcReg = op1->gtRegNum;
1457 // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant.
1458 // For the non-constant case, we will use the SIMD temp location to store the vector, and
1459 // the load the desired element.
1460 // The range check will already have been performed, so at this point we know we have an index
1461 // within the bounds of the vector.
1462 if (!op2->IsCnsIntOrI())
1464 unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum;
1465 noway_assert(simdInitTempVarNum != BAD_VAR_NUM);
1467 unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased);
1468 regNumber indexReg = op2->gtRegNum;
1470 // Store the vector to the temp location.
1471 getEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)),
1472 emitTypeSize(simdType), srcReg, simdInitTempVarNum, 0);
1474 // Now, load the desired element.
1475 getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
1476 emitTypeSize(baseType), // Of the vector baseType
1477 targetReg, // To targetReg
1478 (isEBPbased) ? REG_EBP : REG_ESP, // Stack-based
1479 indexReg, // Indexed
1480 genTypeSize(baseType), // by the size of the baseType
1482 genProduceReg(simdNode);
1486 noway_assert(op2->isContained());
1487 unsigned int index = (unsigned int) op2->gtIntCon.gtIconVal;
1488 unsigned int byteShiftCnt = index * genTypeSize(baseType);
1490 // In general we shouldn't have an index greater than or equal to the length of the vector.
1491 // However, if we have an out-of-range access, under minOpts it will not be optimized
1492 // away. The code will throw before we reach this point, but we still need to generate
1493 // code. In that case, we will simply mask off the upper bits.
1494 if (byteShiftCnt >= compiler->getSIMDVectorRegisterByteLength())
1496 byteShiftCnt &= (compiler->getSIMDVectorRegisterByteLength() - 1);
1497 index = byteShiftCnt / genTypeSize(baseType);
1500 regNumber tmpReg = REG_NA;
1501 if (simdNode->gtRsvdRegs != RBM_NONE)
1503 assert(genCountBits(simdNode->gtRsvdRegs) == 1);
1504 tmpReg = genRegNumFromMask(simdNode->gtRsvdRegs);
1508 assert((byteShiftCnt == 0) ||
1509 varTypeIsFloating(baseType) ||
1510 (varTypeIsSmallInt(baseType) && (byteShiftCnt < 16)));
1513 if (byteShiftCnt >= 16)
1515 assert(compiler->getSIMDInstructionSet() == InstructionSet_AVX);
1517 regNumber newSrcReg;
1518 if (varTypeIsFloating(baseType))
1520 newSrcReg = targetReg;
1525 assert(tmpReg != REG_NA);
1528 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, newSrcReg, srcReg, 0x01);
1533 // Generate the following sequence:
1534 // 1) baseType is floating point
1535 // movaps targetReg, srcReg
1536 // psrldq targetReg, byteShiftCnt <-- not generated if accessing zero'th element
1538 // 2) baseType is not floating point
1539 // movaps tmpReg, srcReg <-- not generated if accessing zero'th element
1540 // OR if tmpReg == srcReg
1541 // psrldq tmpReg, byteShiftCnt <-- not generated if accessing zero'th element
1542 // mov_xmm2i targetReg, tmpReg
1543 if (varTypeIsFloating(baseType))
1545 if (targetReg != srcReg)
1547 inst_RV_RV(ins_Copy(simdType), targetReg, srcReg, simdType, emitActualTypeSize(simdType));
1550 if (byteShiftCnt != 0)
1552 instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
1553 getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt);
1558 if (varTypeIsSmallInt(baseType))
1560 // Note that pextrw extracts 16-bit value by index and zero extends it to 32-bits.
1561 // In case of vector<short> we also need to sign extend the 16-bit value in targetReg
1562 // Vector<byte> - index/2 will give the index of the 16-bit value to extract. Shift right
1563 // by 8-bits if index is odd. In case of Vector<sbyte> also sign extend targetReg.
1565 unsigned baseSize = genTypeSize(baseType);
1570 // We actually want index % 8 for the AVX case (for SSE it will never be > 8).
1571 // Note that this doesn't matter functionally, because the instruction uses just the
1572 // low 3 bits of index, but it's better to use the right value.
1575 assert(compiler->getSIMDInstructionSet() == InstructionSet_AVX);
1579 getEmitter()->emitIns_R_R_I(INS_pextrw, emitTypeSize(TYP_INT), targetReg, srcReg, index);
1581 bool ZeroOrSignExtnReqd = true;
1584 if ((op2->gtIntCon.gtIconVal % 2) == 1)
1586 // Right shift extracted word by 8-bits if index is odd if we are extracting a byte sized element.
1587 inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, targetReg, 8);
1589 // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_BYTE
1590 ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
1592 // else - we just need to zero/sign extend the byte since pextrw extracted 16-bits
1596 // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_SHORT
1597 assert(baseSize == 2);
1598 ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
1601 if (ZeroOrSignExtnReqd)
1603 // Zero/sign extend the byte/short to 32-bits
1604 inst_RV_RV(ins_Move_Extend(baseType, false), targetReg, targetReg, baseType, emitTypeSize(baseType));
1609 // We need a temp xmm register if the baseType is not floating point and
1610 // accessing non-zero'th element.
1613 if (byteShiftCnt != 0)
1615 assert(tmpReg != REG_NA);
1617 if (tmpReg != srcReg)
1619 inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType));
1622 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
1623 getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt);
1630 assert(tmpReg != REG_NA);
1631 ins = ins_CopyFloatToInt(TYP_FLOAT, baseType);
1632 // (Note that for mov_xmm2i, the int register is always in the reg2 position.
1633 inst_RV_RV(ins, tmpReg, targetReg, baseType);
1637 genProduceReg(simdNode);
1640 //------------------------------------------------------------------------------------
1641 // genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i.
1644 // simdNode - The GT_SIMD node
1649 // TODO-CQ: Use SIMDIntrinsicShuffleSSE2 for the SSE2 case.
1652 CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode)
1654 // Determine index based on intrinsic ID
1656 switch(simdNode->gtSIMDIntrinsicID)
1658 case SIMDIntrinsicSetX:
1661 case SIMDIntrinsicSetY:
1664 case SIMDIntrinsicSetZ:
1667 case SIMDIntrinsicSetW:
1674 assert(index != -1);
1676 // op1 is the SIMD vector
1677 // op2 is the value to be set
1678 GenTree* op1 = simdNode->gtGetOp1();
1679 GenTree* op2 = simdNode->gtGetOp2();
1681 var_types baseType = simdNode->gtSIMDBaseType;
1682 regNumber targetReg = simdNode->gtRegNum;
1683 assert(targetReg != REG_NA);
1684 var_types targetType = simdNode->TypeGet();
1685 assert(varTypeIsSIMD(targetType));
1687 // the following assert must hold.
1688 // supported only on vector2f/3f/4f right now
1689 noway_assert(baseType == TYP_FLOAT);
1690 assert(op2->TypeGet() == baseType);
1691 assert(simdNode->gtSIMDSize >= ((index + 1) * genTypeSize(baseType)));
1693 genConsumeOperands(simdNode);
1694 regNumber op1Reg = op1->gtRegNum;
1695 regNumber op2Reg = op2->gtRegNum;
1697 // TODO-CQ: For AVX we don't need to do a copy because it supports 3 operands plus immediate.
1698 if (targetReg != op1Reg)
1700 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1703 // Right now this intrinsic is supported only for float base type vectors.
1704 // If in future need to support on other base type vectors, the below
1705 // logic needs modification.
1706 noway_assert(baseType == TYP_FLOAT);
1708 if (compiler->getSIMDInstructionSet() == InstructionSet_SSE2)
1710 // We need one additional int register as scratch
1711 assert(simdNode->gtRsvdRegs != RBM_NONE);
1712 assert(genCountBits(simdNode->gtRsvdRegs) == 1);
1713 regNumber tmpReg = genRegNumFromMask(simdNode->gtRsvdRegs);
1714 assert(genIsValidIntReg(tmpReg));
1716 // Move the value from xmm reg to an int reg
1717 instruction ins = ins_CopyFloatToInt(TYP_FLOAT, TYP_INT);
1718 // (Note that for mov_xmm2i, the int register is always in the reg2 position.
1719 inst_RV_RV(ins, op2Reg, tmpReg, baseType);
1721 // First insert the lower 16-bits of tmpReg in targetReg at 2*index position
1722 // since every float has two 16-bit words.
1723 getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2*index);
1725 // Logical right shift tmpReg by 16-bits and insert in targetReg at 2*index + 1 position
1726 inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, tmpReg, 16);
1727 getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2*index+1);
1731 unsigned int insertpsImm = (INSERTPS_SOURCE_SELECT(0)|INSERTPS_TARGET_SELECT(index));
1732 inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, op2Reg, insertpsImm);
1735 genProduceReg(simdNode);
1738 //------------------------------------------------------------------------
1739 // genSIMDIntrinsicShuffleSSE2: Generate code for SIMD Intrinsic shuffle.
1742 // simdNode - The GT_SIMD node
1748 CodeGen::genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode)
1750 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicShuffleSSE2);
1751 noway_assert(compiler->getSIMDInstructionSet() == InstructionSet_SSE2);
1753 GenTree* op1 = simdNode->gtGetOp1();
1754 GenTree* op2 = simdNode->gtGetOp2();
1755 assert(op2->isContained());
1756 assert(op2->IsCnsIntOrI());
1757 int shuffleControl = (int) op2->AsIntConCommon()->IconValue();
1758 var_types baseType = simdNode->gtSIMDBaseType;
1759 var_types targetType = simdNode->TypeGet();
1760 regNumber targetReg = simdNode->gtRegNum;
1761 assert(targetReg != REG_NA);
1763 regNumber op1Reg = genConsumeReg(op1);
1764 if (targetReg != op1Reg)
1766 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1769 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
1770 getEmitter()->emitIns_R_R_I(ins, emitTypeSize(baseType), targetReg, targetReg, shuffleControl);
1771 genProduceReg(simdNode);
1774 //-----------------------------------------------------------------------------
1775 // genStoreIndTypeSIMD12: store indirect a TYP_SIMD12 (i.e. Vector3) to memory.
1776 // Since Vector3 is not a hardware supported write size, it is performed
1777 // as two writes: 8 byte followed by 4-byte.
1780 // treeNode - tree node that is attempting to store indirect
1787 CodeGen::genStoreIndTypeSIMD12(GenTree* treeNode)
1789 assert(treeNode->OperGet() == GT_STOREIND);
1791 GenTree* addr = treeNode->gtOp.gtOp1;
1792 GenTree* data = treeNode->gtOp.gtOp2;
1794 // addr and data should not be contained.
1795 assert(!data->isContained());
1796 assert(!addr->isContained());
1799 // Should not require a write barrier
1800 GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(treeNode, data);
1801 assert(writeBarrierForm == GCInfo::WBF_NoBarrier);
1804 // Need an addtional Xmm register to extract upper 4 bytes from data.
1805 assert(treeNode->gtRsvdRegs != RBM_NONE);
1806 assert(genCountBits(treeNode->gtRsvdRegs) == 1);
1807 regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
1809 genConsumeOperands(treeNode->AsOp());
1812 getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, data->gtRegNum, addr->gtRegNum, 0);
1814 // Extract upper 4-bytes from data
1815 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, data->gtRegNum, 0x02);
1818 getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, addr->gtRegNum, 8);
1821 //-----------------------------------------------------------------------------
1822 // genLoadIndTypeSIMD12: load indirect a TYP_SIMD12 (i.e. Vector3) value.
1823 // Since Vector3 is not a hardware supported write size, it is performed
1824 // as two loads: 8 byte followed by 4-byte.
1827 // treeNode - tree node of GT_IND
1834 CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
1836 assert(treeNode->OperGet() == GT_IND);
1838 regNumber targetReg = treeNode->gtRegNum;
1839 GenTreePtr op1 = treeNode->gtOp.gtOp1;
1840 assert(!op1->isContained());
1841 regNumber operandReg = genConsumeReg(op1);
1843 // Need an addtional Xmm register to read upper 4 bytes, which is different from targetReg
1844 assert(treeNode->gtRsvdRegs != RBM_NONE);
1845 assert(genCountBits(treeNode->gtRsvdRegs) == 2);
1847 regNumber tmpReg = REG_NA;
1848 regMaskTP tmpRegsMask = treeNode->gtRsvdRegs;
1849 regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
1850 tmpRegsMask &= ~tmpReg1Mask;
1851 regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
1852 regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
1854 // Choose any register different from targetReg as tmpReg
1855 if (tmpReg1 != targetReg)
1861 assert(targetReg != tmpReg2);
1864 assert(tmpReg != REG_NA);
1865 assert(tmpReg != targetReg);
1867 // Load upper 4 bytes in tmpReg
1868 getEmitter()->emitIns_R_AR(ins_Load(TYP_FLOAT), EA_4BYTE, tmpReg, operandReg, 8);
1870 // Load lower 8 bytes in targetReg
1871 getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0);
1873 // combine upper 4 bytes and lower 8 bytes in targetReg
1874 getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
1876 genProduceReg(treeNode);
1879 //-----------------------------------------------------------------------------
1880 // genStoreLclFldTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
1881 // Since Vector3 is not a hardware supported write size, it is performed
1882 // as two stores: 8 byte followed by 4-byte.
1885 // treeNode - tree node that is attempting to store TYP_SIMD12 field
1891 CodeGen::genStoreLclFldTypeSIMD12(GenTree* treeNode)
1893 assert(treeNode->OperGet() == GT_STORE_LCL_FLD);
1895 unsigned offs = treeNode->gtLclFld.gtLclOffs;
1896 unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
1897 assert(varNum < compiler->lvaCount);
1899 GenTreePtr op1 = treeNode->gtOp.gtOp1;
1900 assert(!op1->isContained());
1901 regNumber operandReg = genConsumeReg(op1);
1903 // Need an addtional Xmm register to extract upper 4 bytes from data.
1904 assert(treeNode->gtRsvdRegs != RBM_NONE);
1905 assert(genCountBits(treeNode->gtRsvdRegs) == 1);
1906 regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
1908 // store lower 8 bytes
1909 getEmitter()->emitIns_S_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, varNum, offs);
1911 // Extract upper 4-bytes from operandReg
1912 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
1914 // Store upper 4 bytes
1915 getEmitter()->emitIns_S_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, varNum, offs+8);
1918 //-----------------------------------------------------------------------------
1919 // genLoadLclFldTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
1920 // Since Vector3 is not a hardware supported write size, it is performed
1921 // as two reads: 8 byte followed by 4-byte.
1924 // treeNode - tree node that is attempting to load TYP_SIMD12 field
1930 CodeGen::genLoadLclFldTypeSIMD12(GenTree* treeNode)
1932 assert(treeNode->OperGet() == GT_LCL_FLD);
1934 regNumber targetReg = treeNode->gtRegNum;
1935 unsigned offs = treeNode->gtLclFld.gtLclOffs;
1936 unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
1937 assert(varNum < compiler->lvaCount);
1939 // Need an addtional Xmm register to read upper 4 bytes
1940 assert(treeNode->gtRsvdRegs != RBM_NONE);
1941 assert(genCountBits(treeNode->gtRsvdRegs) == 2);
1943 regNumber tmpReg = REG_NA;
1944 regMaskTP tmpRegsMask = treeNode->gtRsvdRegs;
1945 regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
1946 tmpRegsMask &= ~tmpReg1Mask;
1947 regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
1948 regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
1950 // Choose any register different from targetReg as tmpReg
1951 if (tmpReg1 != targetReg)
1957 assert(targetReg != tmpReg2);
1960 assert(tmpReg != REG_NA);
1961 assert(tmpReg != targetReg);
1963 // Read upper 4 bytes to tmpReg
1964 getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_FLOAT, false), EA_4BYTE, tmpReg, varNum, offs+8);
1966 // Read lower 8 bytes to targetReg
1967 getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs);
1969 // combine upper 4 bytes and lower 8 bytes in targetReg
1970 getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
1972 genProduceReg(treeNode);
1975 //-----------------------------------------------------------------------------
1976 // genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to
1977 // the given register, if any, or to memory.
1980 // simdNode - The GT_SIMD node
1986 // The upper half of all AVX registers is volatile, even the callee-save registers.
1987 // When a 32-byte SIMD value is live across a call, the register allocator will use this intrinsic
1988 // to cause the upper half to be saved. It will first attempt to find another, unused, callee-save
1989 // register. If such a register cannot be found, it will save it to an available caller-save register.
1990 // In that case, this node will be marked GTF_SPILL, which will cause genProduceReg to save the 16 byte
1991 // value to the stack. (Note that if there are no caller-save registers available, the entire 32 byte
1992 // value will be spilled to the stack.)
1995 CodeGen::genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode)
1997 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperSave);
1999 GenTree* op1 = simdNode->gtGetOp1();
2000 assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
2001 regNumber targetReg = simdNode->gtRegNum;
2002 regNumber op1Reg = genConsumeReg(op1);
2003 assert(op1Reg != REG_NA);
2004 assert(targetReg != REG_NA);
2005 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, targetReg, op1Reg, 0x01);
2007 genProduceReg(simdNode);
2010 //-----------------------------------------------------------------------------
2011 // genSIMDIntrinsicUpperRestore: Restore the upper half of a TYP_SIMD32 vector to
2012 // the given register, if any, or to memory.
2015 // simdNode - The GT_SIMD node
2021 // For consistency with genSIMDIntrinsicUpperSave, and to ensure that lclVar nodes always
2022 // have their home register, this node has its targetReg on the lclVar child, and its source
2024 // Regarding spill, please see the note above on genSIMDIntrinsicUpperSave. If we have spilled
2025 // an upper-half to a caller save register, this node will be marked GTF_SPILLED. However, unlike
2026 // most spill scenarios, the saved tree will be different from the restored tree, but the spill
2027 // restore logic, which is triggered by the call to genConsumeReg, requires us to provide the
2028 // spilled tree (saveNode) in order to perform the reload. We can easily find that tree,
2029 // as it is in the spill descriptor for the register from which it was saved.
2032 CodeGen::genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode)
2034 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperRestore);
2036 GenTree* op1 = simdNode->gtGetOp1();
2037 assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
2038 regNumber srcReg = simdNode->gtRegNum;
2039 regNumber lclVarReg = genConsumeReg(op1);
2040 unsigned varNum = op1->AsLclVarCommon()->gtLclNum;
2041 assert(lclVarReg != REG_NA);
2042 assert(srcReg != REG_NA);
2043 if (simdNode->gtFlags & GTF_SPILLED)
2045 GenTree* saveNode = regSet.rsSpillDesc[srcReg]->spillTree;
2046 noway_assert(saveNode != nullptr && (saveNode->gtRegNum == srcReg));
2047 genConsumeReg(saveNode);
2049 getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, srcReg, 0x01);
2052 //------------------------------------------------------------------------
2053 // genSIMDIntrinsic: Generate code for a SIMD Intrinsic. This is the main
2054 // routine which in turn calls apropriate genSIMDIntrinsicXXX() routine.
2057 // simdNode - The GT_SIMD node
2063 // Currently, we only recognize SIMDVector<float> and SIMDVector<int>, and
2064 // a limited set of methods.
2067 CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
2069 // NYI for unsupported base types
2070 if (simdNode->gtSIMDBaseType != TYP_INT &&
2071 simdNode->gtSIMDBaseType != TYP_LONG &&
2072 simdNode->gtSIMDBaseType != TYP_FLOAT &&
2073 simdNode->gtSIMDBaseType != TYP_DOUBLE &&
2074 simdNode->gtSIMDBaseType != TYP_CHAR &&
2075 simdNode->gtSIMDBaseType != TYP_UBYTE &&
2076 simdNode->gtSIMDBaseType != TYP_SHORT &&
2077 simdNode->gtSIMDBaseType != TYP_BYTE &&
2078 simdNode->gtSIMDBaseType != TYP_UINT &&
2079 simdNode->gtSIMDBaseType != TYP_ULONG
2082 noway_assert(!"SIMD intrinsic with unsupported base type.");
2085 switch(simdNode->gtSIMDIntrinsicID)
2087 case SIMDIntrinsicInit:
2088 genSIMDIntrinsicInit(simdNode);
2091 case SIMDIntrinsicInitN:
2092 genSIMDIntrinsicInitN(simdNode);
2095 case SIMDIntrinsicSqrt:
2096 case SIMDIntrinsicCast:
2097 genSIMDIntrinsicUnOp(simdNode);
2100 case SIMDIntrinsicAdd:
2101 case SIMDIntrinsicSub:
2102 case SIMDIntrinsicMul:
2103 case SIMDIntrinsicDiv:
2104 case SIMDIntrinsicBitwiseAnd:
2105 case SIMDIntrinsicBitwiseAndNot:
2106 case SIMDIntrinsicBitwiseOr:
2107 case SIMDIntrinsicBitwiseXor:
2108 case SIMDIntrinsicMin:
2109 case SIMDIntrinsicMax:
2110 genSIMDIntrinsicBinOp(simdNode);
2113 case SIMDIntrinsicOpEquality:
2114 case SIMDIntrinsicOpInEquality:
2115 case SIMDIntrinsicEqual:
2116 case SIMDIntrinsicLessThan:
2117 case SIMDIntrinsicGreaterThan:
2118 case SIMDIntrinsicLessThanOrEqual:
2119 case SIMDIntrinsicGreaterThanOrEqual:
2120 genSIMDIntrinsicRelOp(simdNode);
2123 case SIMDIntrinsicDotProduct:
2124 genSIMDIntrinsicDotProduct(simdNode);
2127 case SIMDIntrinsicGetItem:
2128 genSIMDIntrinsicGetItem(simdNode);
2131 case SIMDIntrinsicShuffleSSE2:
2132 genSIMDIntrinsicShuffleSSE2(simdNode);
2135 case SIMDIntrinsicSetX:
2136 case SIMDIntrinsicSetY:
2137 case SIMDIntrinsicSetZ:
2138 case SIMDIntrinsicSetW:
2139 genSIMDIntrinsicSetItem(simdNode);
2142 case SIMDIntrinsicUpperSave:
2143 genSIMDIntrinsicUpperSave(simdNode);
2145 case SIMDIntrinsicUpperRestore:
2146 genSIMDIntrinsicUpperRestore(simdNode);
2150 noway_assert(!"Unimplemented SIMD intrinsic.");
2155 #endif // FEATURE_SIMD
2156 #endif //_TARGET_AMD64_
2157 #endif // !LEGACY_BACKEND