1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
8 XX Amd64 SIMD Code Generator XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
23 #include "sideeffects.h"
26 #include "gcinfoencoder.h"
28 // Instruction immediates
31 // - bits 6 and 7 of the immediate indicate which source item to select (0..3)
32 // - bits 4 and 5 of the immediate indicate which target item to insert into (0..3)
33 // - bits 0 to 3 of the immediate indicate which target item to zero
// insertps immediate encoding helpers (bit layout documented above).
// The argument is parenthesized so that compound expressions such as
// INSERTPS_SOURCE_SELECT(a | b) expand correctly: without the parentheses,
// `a | b << 6` binds as `a | (b << 6)` because `|` has lower precedence
// than `<<` but the un-parenthesized `i` is substituted textually.
#define INSERTPS_SOURCE_SELECT(i) ((i) << 6)
#define INSERTPS_TARGET_SELECT(i) ((i) << 4)
#define INSERTPS_ZERO(i) (1 << (i))
38 // getOpForSIMDIntrinsic: return the opcode for the given SIMD Intrinsic
41 // intrinsicId - SIMD intrinsic Id
42 // baseType - Base type of the SIMD vector
43 //    ival       - Out param. Any immediate byte operand that needs to be passed to SSE2 opcode
47 //    Instruction (op) to be used, and ival is set if instruction requires an immediate operand.
49 instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_types baseType, unsigned* ival /*=nullptr*/)
51 // Minimal required instruction set is SSE2.
52 assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported);
54 instruction result = INS_invalid;
57 case SIMDIntrinsicInit:
58 if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
60 // AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory.
61 // AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or mm reg.
62 // If we decide to use AVX2 only, we can remove this assert.
63 if (!compiler->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_USE_AVX2))
65 assert(baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
70 result = INS_vbroadcastss;
73 result = INS_vbroadcastsd;
77 // NOTE: for x86, this instruction is valid if the src is xmm2/m64, but NOT if it is supposed
78 // to be TYP_LONG reg.
79 result = INS_vpbroadcastq;
83 result = INS_vpbroadcastd;
87 result = INS_vpbroadcastw;
91 result = INS_vpbroadcastb;
99 // For SSE, SIMDIntrinsicInit uses the same instruction as the SIMDIntrinsicShuffleSSE2 intrinsic.
102 case SIMDIntrinsicShuffleSSE2:
103 if (baseType == TYP_FLOAT)
107 else if (baseType == TYP_DOUBLE)
111 else if (baseType == TYP_INT || baseType == TYP_UINT)
115 else if (baseType == TYP_LONG || baseType == TYP_ULONG)
117 // We don't have a separate SSE2 instruction and will
118 // use the instruction meant for doubles since it is
119 // of the same size as a long.
124 case SIMDIntrinsicSqrt:
125 if (baseType == TYP_FLOAT)
129 else if (baseType == TYP_DOUBLE)
139 case SIMDIntrinsicAdd:
140 if (baseType == TYP_FLOAT)
144 else if (baseType == TYP_DOUBLE)
148 else if (baseType == TYP_INT || baseType == TYP_UINT)
152 else if (baseType == TYP_USHORT || baseType == TYP_SHORT)
156 else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
160 else if (baseType == TYP_LONG || baseType == TYP_ULONG)
166 case SIMDIntrinsicSub:
167 if (baseType == TYP_FLOAT)
171 else if (baseType == TYP_DOUBLE)
175 else if (baseType == TYP_INT || baseType == TYP_UINT)
179 else if (baseType == TYP_USHORT || baseType == TYP_SHORT)
183 else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
187 else if (baseType == TYP_LONG || baseType == TYP_ULONG)
193 case SIMDIntrinsicMul:
194 if (baseType == TYP_FLOAT)
198 else if (baseType == TYP_DOUBLE)
202 else if (baseType == TYP_SHORT)
206 else if ((baseType == TYP_INT) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported))
212 case SIMDIntrinsicDiv:
213 if (baseType == TYP_FLOAT)
217 else if (baseType == TYP_DOUBLE)
227 case SIMDIntrinsicMin:
228 if (baseType == TYP_FLOAT)
232 else if (baseType == TYP_DOUBLE)
236 else if (baseType == TYP_UBYTE)
240 else if (baseType == TYP_SHORT)
244 else if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)
246 if (baseType == TYP_BYTE)
250 else if (baseType == TYP_USHORT)
254 else if (baseType == TYP_INT)
258 else if (baseType == TYP_UINT)
269 case SIMDIntrinsicMax:
270 if (baseType == TYP_FLOAT)
274 else if (baseType == TYP_DOUBLE)
278 else if (baseType == TYP_UBYTE)
282 else if (baseType == TYP_SHORT)
286 else if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)
288 if (baseType == TYP_BYTE)
292 else if (baseType == TYP_USHORT)
296 else if (baseType == TYP_INT)
300 else if (baseType == TYP_UINT)
311 case SIMDIntrinsicAbs:
312 if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)
314 if (baseType == TYP_INT)
318 else if (baseType == TYP_SHORT)
322 else if (baseType == TYP_BYTE)
329 case SIMDIntrinsicEqual:
330 if (baseType == TYP_FLOAT)
333 assert(ival != nullptr);
336 else if (baseType == TYP_DOUBLE)
339 assert(ival != nullptr);
342 else if (baseType == TYP_INT || baseType == TYP_UINT)
344 result = INS_pcmpeqd;
346 else if (baseType == TYP_USHORT || baseType == TYP_SHORT)
348 result = INS_pcmpeqw;
350 else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
352 result = INS_pcmpeqb;
354 else if ((baseType == TYP_ULONG || baseType == TYP_LONG) &&
355 (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported))
357 result = INS_pcmpeqq;
361 case SIMDIntrinsicLessThan:
362 // Packed integers use > with swapped operands
363 assert(baseType != TYP_INT);
365 if (baseType == TYP_FLOAT)
368 assert(ival != nullptr);
371 else if (baseType == TYP_DOUBLE)
374 assert(ival != nullptr);
379 case SIMDIntrinsicLessThanOrEqual:
380 // Packed integers use (a==b) || ( b > a) in place of a <= b.
381 assert(baseType != TYP_INT);
383 if (baseType == TYP_FLOAT)
386 assert(ival != nullptr);
389 else if (baseType == TYP_DOUBLE)
392 assert(ival != nullptr);
397 case SIMDIntrinsicGreaterThan:
398 // Packed float/double use < with swapped operands
399 assert(!varTypeIsFloating(baseType));
401 // SSE2 supports only signed >
402 if (baseType == TYP_INT)
404 result = INS_pcmpgtd;
406 else if (baseType == TYP_SHORT)
408 result = INS_pcmpgtw;
410 else if (baseType == TYP_BYTE)
412 result = INS_pcmpgtb;
414 else if ((baseType == TYP_LONG) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported))
416 result = INS_pcmpgtq;
420 case SIMDIntrinsicBitwiseAnd:
421 if (baseType == TYP_FLOAT)
425 else if (baseType == TYP_DOUBLE)
429 else if (varTypeIsIntegral(baseType))
435 case SIMDIntrinsicBitwiseAndNot:
436 if (baseType == TYP_FLOAT)
440 else if (baseType == TYP_DOUBLE)
444 else if (baseType == TYP_INT)
448 else if (varTypeIsIntegral(baseType))
454 case SIMDIntrinsicBitwiseOr:
455 if (baseType == TYP_FLOAT)
459 else if (baseType == TYP_DOUBLE)
463 else if (varTypeIsIntegral(baseType))
469 case SIMDIntrinsicBitwiseXor:
470 if (baseType == TYP_FLOAT)
474 else if (baseType == TYP_DOUBLE)
478 else if (varTypeIsIntegral(baseType))
484 case SIMDIntrinsicCast:
488 case SIMDIntrinsicConvertToSingle:
489 result = INS_cvtdq2ps;
492 case SIMDIntrinsicConvertToDouble:
493 assert(baseType == TYP_LONG);
494 result = INS_cvtsi2sd;
497 case SIMDIntrinsicConvertToInt32:
498 assert(baseType == TYP_FLOAT);
499 result = INS_cvttps2dq;
502 case SIMDIntrinsicConvertToInt64:
503 assert(baseType == TYP_DOUBLE);
504 result = INS_cvttsd2si;
507 case SIMDIntrinsicNarrow:
508 // Note that for the integer types the caller must zero the upper bits of
509 // each source element, since the instructions saturate.
514 if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)
516 result = INS_packusdw;
520 result = INS_packssdw;
525 result = INS_packuswb;
528 assert(!"Invalid baseType for SIMDIntrinsicNarrow");
529 result = INS_invalid;
534 case SIMDIntrinsicWidenLo:
535 // Some of these have multiple instruction implementations, with one instruction to widen the lo half,
536 // and another to widen the hi half.
540 result = INS_cvtps2pd;
544 result = INS_punpckldq;
548 result = INS_punpcklwd;
552 result = INS_punpcklbw;
555 assert(!"Invalid baseType for SIMDIntrinsicWidenLo");
556 result = INS_invalid;
561 case SIMDIntrinsicWidenHi:
565 // For this case, we actually use the same instruction.
566 result = INS_cvtps2pd;
570 result = INS_punpckhdq;
574 result = INS_punpckhwd;
578 result = INS_punpckhbw;
581 assert(!"Invalid baseType for SIMDIntrinsicWidenHi");
582 result = INS_invalid;
587 case SIMDIntrinsicShiftLeftInternal:
591 // For SSE2, entire vector is shifted, for AVX2, 16-byte chunks are shifted.
603 assert(!"Invalid baseType for SIMDIntrinsicShiftLeftInternal");
604 result = INS_invalid;
609 case SIMDIntrinsicShiftRightInternal:
613 // For SSE2, entire vector is shifted, for AVX2, 16-byte chunks are shifted.
625 assert(!"Invalid baseType for SIMDIntrinsicShiftRightInternal");
626 result = INS_invalid;
631 case SIMDIntrinsicUpperSave:
632 result = INS_vextractf128;
635 case SIMDIntrinsicUpperRestore:
636 result = INS_insertps;
640 assert(!"Unsupported SIMD intrinsic");
644 noway_assert(result != INS_invalid);
648 // genSIMDScalarMove: Generate code to move a value of type "type" from src mm reg
649 // to target mm reg, zeroing out the upper bits if and only if specified.
652 // targetType the target type
653 // baseType the base type of value to be moved
654 // targetReg the target reg
655 // srcReg the src reg
656 // moveType action to be performed on target upper bits
662 // This is currently only supported for floating point types.
664 void CodeGen::genSIMDScalarMove(
665     var_types targetType, var_types baseType, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
// Only float/double scalar moves are supported (asserted below); moveType
// selects how the upper (non-scalar) bits of targetReg are treated.
667     assert(varTypeIsFloating(baseType));
// SMT_PreserveUpper: copy the scalar, leaving targetReg's upper bits intact.
// Nothing to emit when src and target are already the same register.
670         case SMT_PreserveUpper:
671             if (srcReg != targetReg)
673                 instruction ins = ins_Store(baseType);
674                 if (getEmitter()->IsDstSrcSrcAVXInstruction(ins))
676                     // In general, when we use a three-operands move instruction, we want to merge the src with
677                     // itself. This is an exception in that we actually want the "merge" behavior, so we must
678                     // specify it with all 3 operands.
679                     inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType));
683                 inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
// SMT_ZeroInitUpper: copy the scalar and zero targetReg's upper bits.
688         case SMT_ZeroInitUpper:
689             if (compiler->canUseVexEncoding())
691                 // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
692                 // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose
693                 // to zero all but the lower bits.
694                 unsigned int insertpsImm =
695                     (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
696                 inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
700                 if (srcReg == targetReg)
702                     // There is no guarantee that upper bits of op1Reg are zero.
703                     // We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes.
704                     instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
705                     getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
706                     ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
707                     getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
// Different registers: zero the whole target first, then store the scalar in.
711                     genSIMDZero(targetType, TYP_FLOAT, targetReg);
712                     inst_RV_RV(ins_Store(baseType), targetReg, srcReg);
// SMT_ZeroInitUpper_SrcHasUpperZeros: caller guarantees srcReg's upper bits are
// already zero, so a plain full-register copy suffices.
717         case SMT_ZeroInitUpper_SrcHasUpperZeros:
718             if (srcReg != targetReg)
720                 instruction ins = ins_Copy(baseType);
721                 assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins));
722                 inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
// genSIMDZero: emit xorps reg,reg to zero out an entire SIMD register.
// Note that baseType is accepted for signature symmetry with the other
// genSIMD* helpers but is not consulted in this body.
731 void CodeGen::genSIMDZero(var_types targetType, var_types baseType, regNumber targetReg)
733     // We just use `INS_xorps` instead of `getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, baseType)`
734     // since `genSIMDZero` is used for both `System.Numerics.Vectors` and HardwareIntrinsics. Modern
735     // CPUs handle this specially in the renamer and it never hits the execution pipeline, additionally
736     // `INS_xorps` is always available (when using either the legacy or VEX encoding).
737     inst_RV_RV(INS_xorps, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
740 //------------------------------------------------------------------------
741 // genSIMDIntrinsicInit: Generate code for SIMD Intrinsic Initialize.
744 // simdNode - The GT_SIMD node
749 void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
751     assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInit);
753     GenTree* op1 = simdNode->gtGetOp1();
754     var_types baseType = simdNode->gtSIMDBaseType;
755     regNumber targetReg = simdNode->gtRegNum;
756     assert(targetReg != REG_NA);
757     var_types targetType = simdNode->TypeGet();
758     SIMDLevel level = compiler->getSIMDSupportLevel();
759     unsigned size = simdNode->gtSIMDSize;
761     // Should never see small int base type vectors except for zero initialization.
762     noway_assert(!varTypeIsSmallInt(baseType) || op1->IsIntegralConst(0));
764     instruction ins = INS_invalid;
// On 32-bit targets a 64-bit initializer arrives as a GT_LONG pair (lo, hi)
// and must be stitched together in the vector register.
766 #if !defined(_TARGET_64BIT_)
767     if (op1->OperGet() == GT_LONG)
769         assert(varTypeIsLong(baseType));
771         GenTree* op1lo = op1->gtGetOp1();
772         GenTree* op1hi = op1->gtGetOp2();
// Fast paths: all-zero and all-ones initializers need no element stitching.
774         if (op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0))
776             genSIMDZero(targetType, baseType, targetReg);
778         else if (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))
780             // Initialize elements of vector with all 1's: generate pcmpeqd reg, reg.
781             ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
782             inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType))
787             // mov_i2xmm targetReg, op1lo
788             // mov_i2xmm xmmtmp, op1hi
789             // shl xmmtmp, 4 bytes
790             // por targetReg, xmmtmp
791             // Now, targetReg has the long in the low 64 bits. For SSE2, move it to the high 64 bits using:
792             // shufpd targetReg, targetReg, 0 // move the long to all the lanes
793             // For AVX2, move it to all 4 of the 64-bit lanes using:
794             // vpbroadcastq targetReg, targetReg
798             regNumber op1loReg = genConsumeReg(op1lo);
799             ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
800             inst_RV_RV(ins, targetReg, op1loReg, TYP_INT, emitTypeSize(TYP_INT));
802             regNumber tmpReg = simdNode->GetSingleTempReg();
804             regNumber op1hiReg = genConsumeReg(op1hi);
805             ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
806             inst_RV_RV(ins, tmpReg, op1hiReg, TYP_INT, emitTypeSize(TYP_INT));
808             ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
809             getEmitter()->emitIns_R_I(ins, EA_16BYTE, tmpReg, 4); // shift left by 4 bytes
811             ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType);
812             inst_RV_RV(ins, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
814             if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
816                 inst_RV_RV(INS_vpbroadcastq, targetReg, targetReg, TYP_SIMD32, emitTypeSize(TYP_SIMD32));
820                 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
821                 getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, 0);
826 #endif // !defined(_TARGET_64BIT_)
// Contained operands: constants and addressable memory handled without
// first materializing op1 in a register.
827     if (op1->isContained())
829         if (op1->IsIntegralConst(0) || op1->IsFPZero())
831             genSIMDZero(targetType, baseType, targetReg);
833         else if (varTypeIsIntegral(baseType) && op1->IsIntegralConst(-1))
835             // case of initializing elements of vector with all 1's
836             // generate pcmpeqd reg, reg
837             ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
838             inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
// Other contained cases require the AVX2 broadcast forms (asserted below).
842             assert(level == SIMD_AVX2_Supported);
843             ins = getOpForSIMDIntrinsic(SIMDIntrinsicInit, baseType);
844             if (op1->IsCnsFltOrDbl())
846                 getEmitter()->emitInsBinary(ins, emitTypeSize(targetType), simdNode, op1);
848             else if (op1->OperIsLocalAddr())
850                 unsigned offset = (op1->OperGet() == GT_LCL_FLD_ADDR) ? op1->gtLclFld.gtLclOffs : 0;
851                 getEmitter()->emitIns_R_S(ins, emitTypeSize(targetType), targetReg, op1->gtLclVarCommon.gtLclNum,
// AVX2 with a register operand: broadcast from the source register.
860     else if (level == SIMD_AVX2_Supported && ((size == 32) || (size == 16)))
862         regNumber srcReg = genConsumeReg(op1);
863         if (baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG)
865             ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
866             assert(ins != INS_invalid);
867             inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
871         ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
872         getEmitter()->emitIns_R_R(ins, emitActualTypeSize(targetType), targetReg, srcReg);
876         // If we reach here, op1 is not contained and we are using SSE or it is a SubRegisterSIMDType.
877         // In either case we are going to use the SSE2 shuffle instruction.
879         regNumber op1Reg = genConsumeReg(op1);
880         unsigned shuffleControl = 0;
882         if (compiler->isSubRegisterSIMDType(simdNode))
884             assert(baseType == TYP_FLOAT);
886             // We cannot assume that upper bits of op1Reg or targetReg be zero.
887             // Therefore we need to explicitly zero out upper bits. This is
888             // essential for the shuffle operation performed below.
890             // If op1 is a float/double constant, we would have loaded it from
891             // data section using movss/sd. Similarly if op1 is a memory op we
892             // would have loaded it using movss/sd. Movss/sd when loading a xmm reg
893             // from memory would zero-out upper bits. In these cases we can
894             // avoid explicitly zero'ing out targetReg if targetReg and op1Reg are the same or do it more efficiently
895             // if they are not the same.
896             SIMDScalarMoveType moveType =
897                 op1->IsCnsFltOrDbl() || op1->isMemoryOp() ? SMT_ZeroInitUpper_SrcHasUpperZeros : SMT_ZeroInitUpper;
899             genSIMDScalarMove(targetType, TYP_FLOAT, targetReg, op1Reg, moveType);
// Shuffle controls: 0x50 replicates into 2 lanes, 0x40 into 3 lanes
// (sub-register Vector2/Vector3 cases — sizes other than these assert).
903                 shuffleControl = 0x50;
907                 shuffleControl = 0x40;
911                 noway_assert(!"Unexpected size for SIMD type");
916             if (op1Reg != targetReg)
918                 if (varTypeIsFloating(baseType))
920                     ins = ins_Copy(targetType);
922                 else if (baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG)
924                     ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
927                 assert(ins != INS_invalid);
928                 inst_RV_RV(ins, targetReg, op1Reg, baseType, emitTypeSize(baseType));
932         ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
933         getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, shuffleControl);
936     genProduceReg(simdNode);
939 //-------------------------------------------------------------------------------------------
940 // genSIMDIntrinsicInitN: Generate code for SIMD Intrinsic Initialize for the form that takes
941 // a number of arguments equal to the length of the Vector.
944 // simdNode - The GT_SIMD node
949 void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
951     assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInitN);
953     // Right now this intrinsic is supported only on TYP_FLOAT vectors
954     var_types baseType = simdNode->gtSIMDBaseType;
955     noway_assert(baseType == TYP_FLOAT);
957     regNumber targetReg = simdNode->gtRegNum;
958     assert(targetReg != REG_NA);
960     var_types targetType = simdNode->TypeGet();
962     // Note that we cannot use targetReg before consumed all source operands. Therefore,
963     // Need an internal register to stitch together all the values into a single vector
965     regNumber vectorReg = simdNode->GetSingleTempReg();
967     // Zero out vectorReg if we are constructing a vector whose size is not equal to targetType vector size.
968     // For example in case of Vector4f we don't need to zero when using SSE2.
969     if (compiler->isSubRegisterSIMDType(simdNode))
971         genSIMDZero(targetType, baseType, vectorReg);
974     unsigned int baseTypeSize = genTypeSize(baseType);
975     instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
977     // We will first consume the list items in execution (left to right) order,
978     // and record the registers.
979     regNumber operandRegs[SIMD_INTRINSIC_MAX_PARAM_COUNT];
980     unsigned initCount = 0;
// Walk the GT_LIST chain of element operands; each list node's op1 is an
// element value and op2 is the next list node (nullptr terminates).
981     for (GenTree* list = simdNode->gtGetOp1(); list != nullptr; list = list->gtGetOp2())
983         assert(list->OperGet() == GT_LIST);
984         GenTree* listItem = list->gtGetOp1();
985         assert(listItem->TypeGet() == baseType);
986         assert(!listItem->isContained());
987         regNumber operandReg = genConsumeReg(listItem);
988         operandRegs[initCount] = operandReg;
992     unsigned int offset = 0;
993     for (unsigned i = 0; i < initCount; i++)
995         // We will now construct the vector from the list items in reverse order.
996         // This allows us to efficiently stitch together a vector as follows:
997         // vectorReg = (vectorReg << offset)
998         // VectorReg[0] = listItemReg
999         // Use genSIMDScalarMove with SMT_PreserveUpper in order to ensure that the upper
1000         // bits of vectorReg are not modified.
1002         regNumber operandReg = operandRegs[initCount - i - 1];
1005         getEmitter()->emitIns_R_I(insLeftShift, EA_16BYTE, vectorReg, baseTypeSize);
1007         genSIMDScalarMove(targetType, baseType, vectorReg, operandReg, SMT_PreserveUpper);
1009         offset += baseTypeSize;
// Sanity check: element count * element size must exactly fill the vector.
1012     noway_assert(offset == simdNode->gtSIMDSize);
1014     // Load the initialized value.
1015     if (targetReg != vectorReg)
1017         inst_RV_RV(ins_Copy(targetType), targetReg, vectorReg, targetType, emitActualTypeSize(targetType));
1019     genProduceReg(simdNode);
1022 //----------------------------------------------------------------------------------
1023 // genSIMDIntrinsicUnOp: Generate code for SIMD Intrinsic unary operations like sqrt.
1026 // simdNode - The GT_SIMD node
1031 void CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode)
1033     assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSqrt || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicCast ||
1034            simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAbs);
1036     GenTree* op1 = simdNode->gtGetOp1();
1037     var_types baseType = simdNode->gtSIMDBaseType;
1038     regNumber targetReg = simdNode->gtRegNum;
1039     assert(targetReg != REG_NA);
1040     var_types targetType = simdNode->TypeGet();
1042     regNumber op1Reg = genConsumeReg(op1);
1043     instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
// A cast whose source and destination are the same register is a no-op,
// so skip emitting the instruction in that one case.
1044     if (simdNode->gtSIMDIntrinsicID != SIMDIntrinsicCast || targetReg != op1Reg)
1046         inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1048     genProduceReg(simdNode);
1051 //----------------------------------------------------------------------------------
1052 // genSIMDIntrinsic32BitConvert: Generate code for 32-bit SIMD Convert (int/uint <-> float)
1055 // simdNode - The GT_SIMD node
1060 void CodeGen::genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode)
1062     SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID;
1063     assert((intrinsicID == SIMDIntrinsicConvertToSingle) || (intrinsicID == SIMDIntrinsicConvertToInt32));
1065     GenTree* op1 = simdNode->gtGetOp1();
1066     var_types baseType = simdNode->gtSIMDBaseType;
1067     regNumber targetReg = simdNode->gtRegNum;
1068     assert(targetReg != REG_NA);
1069     var_types targetType = simdNode->TypeGet();
1071     regNumber op1Reg = genConsumeReg(op1);
1072     instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
// uint -> float has no direct instruction; split each element into 16-bit
// halves, convert each half, and add. The magic constant 0x53000000 is the
// IEEE-754 single-precision encoding of 2^39: OR-ing a 16-bit value into its
// mantissa and then subtracting the magic yields that value scaled by 2^16,
// which is exactly the contribution of the upper 16 bits.
1073     if (intrinsicID == SIMDIntrinsicConvertToSingle && baseType == TYP_UINT)
1075         regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT);
1076         regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
1077         regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1078         assert(tmpReg != op1Reg && tmpReg2 != op1Reg);
1080         // We will generate the following:
1081         //   vmovdqu  tmpReg2, op1Reg           (copy the src and put it into tmpReg2)
1082         //   vmovdqu  targetReg, op1Reg         (copy the src and put it into targetReg)
1083         //   vpsrld   targetReg, 16             (get upper 16 bits of src and put it into targetReg)
1084         //   vpslld   tmpReg2, 16
1085         //   vpsrld   tmpReg2, 16               (get lower 16 bits of src and put it into tmpReg2)
1086         //   mov      tmpIntReg, 0x5300000053000000
1087         //   vmovd    tmpReg, tmpIntReg
1088         //   vpbroadcastd tmpReg, tmpReg        (build mask for converting upper 16 bits of src)
1089         //   vorps    targetReg, tmpReg
1090         //   vsubps   targetReg, tmpReg         (convert upper 16 bits of src and put it into targetReg)
1091         //   vcvtdq2ps tmpReg2, tmpReg2         (convert lower 16 bits of src and put it into tmpReg2)
1092         //   vaddps   targetReg, tmpReg2        (add upper 16 bits and lower 16 bits)
1093         inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(targetType));
1094         if (targetReg != op1Reg)
1096             inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(targetType));
1099         // prepare upper 16 bits
1100         getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), targetReg, 16);
1102         // prepare lower 16 bits
1103         getEmitter()->emitIns_R_I(INS_pslld, emitActualTypeSize(targetType), tmpReg2, 16);
1104         getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), tmpReg2, 16);
// On AMD64 the 8-byte magic covers two lanes at once; on x86 it is built
// 4 bytes at a time (via pinsrw when AVX2 broadcast is unavailable).
1107 #ifdef _TARGET_AMD64_
1108         getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X5300000053000000);
1109         inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
1111         if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
1113             getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X53000000);
1114             inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
1118             getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X00005300);
1119             inst_RV_RV(INS_pxor, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
1120             getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 1);
1121             getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 3);
// Replicate the magic across all lanes: vpbroadcastd on AVX2, otherwise
// duplicate the low quadword with movlhps.
1124         if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
1126             inst_RV_RV(INS_vpbroadcastd, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
1130             inst_RV_RV(INS_movlhps, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
1133         // convert upper 16 bits
1134         inst_RV_RV(INS_orps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
1135         inst_RV_RV(INS_subps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
1137         // convert lower 16 bits
1138         inst_RV_RV(ins, tmpReg2, tmpReg2, targetType, emitActualTypeSize(targetType));
1140         // add lower 16 bits and upper 16 bits
1141         inst_RV_RV(INS_addps, targetReg, tmpReg2, targetType, emitActualTypeSize(targetType));
// All other (int <-> float) conversions map to a single instruction.
1145         inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1147     genProduceReg(simdNode);
1150 //----------------------------------------------------------------------------------
1151 // genSIMDLo64BitConvert: Generate code to convert lower-most 64-bit item (long <--> double)
1154 // intrinsicID the SIMD intrinsic ID
1155 // simdType the SIMD node type
1156 // baseType the base type of value to be converted
1157 // tmpReg the tmp reg
1158 // tmpIntReg the tmp integer reg
1159 // targetReg the target reg
// Converts only the lowest 64-bit element, bouncing the value through the
// integer register tmpIntReg because there is no packed long<->double
// conversion instruction (see the caller, genSIMDIntrinsic64BitConvert).
1164 void CodeGen::genSIMDLo64BitConvert(SIMDIntrinsicID intrinsicID,
1168                                     regNumber       tmpIntReg,
1169                                     regNumber       targetReg)
1171     instruction ins = getOpForSIMDIntrinsic(intrinsicID, baseType);
1172     if (intrinsicID == SIMDIntrinsicConvertToDouble)
1174         // Note that for mov_xmm2i, the int register is always in the reg2 position
// long -> double: move the long out of tmpReg into tmpIntReg, then
// cvtsi2sd (the `ins` selected above) converts it into targetReg.
1175         inst_RV_RV(INS_mov_xmm2i, tmpReg, tmpIntReg, TYP_LONG);
1176         inst_RV_RV(ins, targetReg, tmpIntReg, baseType, emitActualTypeSize(baseType));
// double -> long: convert into tmpIntReg, then move the result back into
// the low 64 bits of targetReg (mov_i2xmm zeroes the upper bits).
1180         inst_RV_RV(ins, tmpIntReg, tmpReg, baseType, emitActualTypeSize(baseType));
1181         inst_RV_RV(INS_mov_i2xmm, targetReg, tmpIntReg, TYP_LONG);
1185 //----------------------------------------------------------------------------------
1186 // genSIMDIntrinsic64BitConvert: Generate code for 64-bit SIMD Convert (long/ulong <-> double)
1189 // simdNode - The GT_SIMD node
1192 // There are no instructions for converting to/from 64-bit integers, so for these we
1193 // do the conversion an element at a time.
1195 void CodeGen::genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode)
1197 SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID;
1198 assert((intrinsicID == SIMDIntrinsicConvertToDouble) || (intrinsicID == SIMDIntrinsicConvertToInt64));
1200 GenTree* op1 = simdNode->gtGetOp1();
1201 var_types baseType = simdNode->gtSIMDBaseType;
1202 regNumber targetReg = simdNode->gtRegNum;
1203 assert(targetReg != REG_NA);
1204 var_types simdType = simdNode->TypeGet();
1205 regNumber op1Reg = genConsumeReg(op1);
1206 regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT);
1210 SIMDLevel level = compiler->getSIMDSupportLevel();
1213 if (baseType == TYP_LONG)
1215 tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
1216 tmpReg2 = simdNode->ExtractTempReg(RBM_ALLFLOAT);
1217 tmpReg3 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1218 assert(tmpReg != op1Reg && tmpReg2 != op1Reg && tmpReg3 != op1Reg);
1222 if (level == SIMD_AVX2_Supported || (baseType == TYP_ULONG))
1224 tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
1225 tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1227 assert(tmpReg != op1Reg && tmpReg2 != op1Reg);
1231 tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1232 assert(tmpReg != op1Reg);
1237 if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_ULONG))
1239 // We will generate the following
1240 // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2)
1241 // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg)
1242 // vpsrlq targetReg, 32 (get upper 32 bits of src and put it into targetReg)
1243 // vpsllq tmpReg2, 32
1244 // vpsrlq tmpReg2, 32 (get lower 32 bits of src and put it into tmpReg2)
1245 // mov tmpIntReg, 0x4530000000000000
1246 // vmovd tmpReg, tmpIntReg
1247 // vpbroadcastq tmpReg, tmpReg (build mask for upper 32 bits of src)
1248 // vorpd targetReg, tmpReg
1249 // vsubpd targetReg, tmpReg (convert upper 32 bits of src and put it into targetReg)
1250 // mov tmpIntReg, 0x4330000000000000
1251 // vmovd tmpReg, tmpIntReg
1252 // vpbroadcastq tmpReg, tmpReg (build mask for lower 32 bits of src)
1253 // vorpd tmpReg2, tmpReg
1254 // vsubpd tmpReg2, tmpReg (convert lower 32 bits of src and put it into tmpReg2)
1255 // vaddpd targetReg, tmpReg2 (add upper 32 bits and lower 32 bits together)
1256 inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType));
1257 if (targetReg != op1Reg)
1259 inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(simdType));
1262 // prepare upper 32 bits
1263 getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32);
1265 // prepare lower 32 bits
1266 getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32);
1267 getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32);
1269 // prepare mask for converting upper 32 bits
1270 #ifdef _TARGET_AMD64_
1271 getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4530000000000000);
1272 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
1274 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000);
1275 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
1276 getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
1278 if (level == SIMD_AVX2_Supported)
1280 inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1284 inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1287 // convert upper 32 bits
1288 inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
1289 inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
1291 // prepare mask for converting lower 32 bits
1292 #ifdef _TARGET_AMD64_
1293 getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4330000000000000);
1294 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
1296 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000);
1297 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
1298 getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
1300 if (level == SIMD_AVX2_Supported)
1302 inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1306 inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1309 // convert lower 32 bits
1310 inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1311 inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1313 // add lower 32 bits and upper 32 bits
1314 inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType));
1316 else if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_LONG))
1318 #ifdef _TARGET_AMD64_
1319 instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
1320 instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
1322 if (level == SIMD_AVX2_Supported)
1324 // Extract the high 16-bits
1325 getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01);
1327 // Put v[3] (the high-order element) in tmpReg2 and convert it.
1328 inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1329 getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
1330 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2);
1332 // Shift the resulting 64-bits left.
1333 getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
1335 // Convert v[2], in the lo bits of tmpReg.
1336 // For the convert to double, the convert preserves the upper bits in tmpReg2.
1337 // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits.
1338 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg2);
1341 // Put v[1] in tmpReg.
1342 inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType));
1343 getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
1345 // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it.
1346 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
1348 // Shift the resulting 64-bits left.
1349 getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
1351 // Convert the lo 64-bits into targetReg
1352 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, tmpReg);
1354 // Merge or copy the results (only at this point are we done with op1Reg).
1355 if (tmpReg != targetReg)
1357 inst_RV_RV(INS_movaps, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
1360 if (level == SIMD_AVX2_Supported)
1362 getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg2, 0x01);
1365 // get the sign bit and put it in tmpReg3
1366 inst_RV_RV(INS_movdqu, tmpReg3, op1Reg, baseType, emitActualTypeSize(simdType));
1367 getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg3, 63);
1368 getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg3, 63);
1370 // get the absolute value of src and put it into tmpReg2 and targetReg
1371 inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType));
1372 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(simdType), tmpReg, op1Reg, SHUFFLE_WWYY);
1373 getEmitter()->emitIns_R_I(INS_psrad, emitActualTypeSize(simdType), tmpReg, 32);
1374 inst_RV_RV(INS_pxor, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType));
1375 inst_RV_RV(INS_psubq, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType));
1376 inst_RV_RV(INS_movdqu, targetReg, tmpReg2, baseType, emitActualTypeSize(simdType));
1378 // prepare upper 32 bits
1379 getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32);
1381 // prepare lower 32 bits
1382 getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32);
1383 getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32);
1385 // prepare mask for converting upper 32 bits
1386 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000);
1387 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
1388 getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
1390 if (level == SIMD_AVX2_Supported)
1392 inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1396 inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1399 // convert upper 32 bits
1400 inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
1401 inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
1403 // prepare mask for converting lower 32 bits
1404 getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000);
1405 inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
1406 getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
1408 if (level == SIMD_AVX2_Supported)
1410 inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1414 inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
1417 // convert lower 32 bits
1418 inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1419 inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1421 // add lower 32 bits and upper 32 bits
1422 inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType));
1425 inst_RV_RV(INS_por, targetReg, tmpReg3, simdType, emitActualTypeSize(simdType));
1430 instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
1431 instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
1433 if (level == SIMD_AVX2_Supported)
1435 // Extract the high 16-bits
1436 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, op1Reg, 0x01);
1438 // Put v[3] (the high-order element) in tmpReg2 and convert it.
1439 inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1440 getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
1441 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2);
1443 // Shift the resulting 64-bits left.
1444 getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
1446 // Convert v[2], in the lo bits of tmpReg.
1447 // For the convert to double, the convert preserves the upper bits in tmpReg2.
1448 // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits.
1449 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
1450 inst_RV_RV(INS_por, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
1453 // Put v[1] in tmpReg.
1454 inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType));
1455 getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
1457 // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it.
1458 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
1460 // Shift the resulting 64-bits left.
1461 getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
1463 // Convert the lo 64-bits into targetReg
1464 genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, targetReg);
1466 // Merge or copy the results (only at this point are we done with op1Reg).
1467 assert(tmpReg != targetReg);
1468 inst_RV_RV(INS_por, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
1469 if (level == SIMD_AVX2_Supported)
1471 getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, targetReg, tmpReg2, 0x01);
1474 genProduceReg(simdNode);
1477 //--------------------------------------------------------------------------------
1478 // genSIMDExtractUpperHalf: Generate code to extract the upper half of a SIMD register
1481 // simdNode - The GT_SIMD node
1484 // This is used for the WidenHi intrinsic to extract the upper half.
1485 // On SSE*, this is 8 bytes, and on AVX2 it is 16 bytes.
1487 void CodeGen::genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg)
1489 var_types simdType = simdNode->TypeGet();
1490 emitAttr emitSize = emitActualTypeSize(simdType);
1491 if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
1493 instruction extractIns = varTypeIsFloating(simdNode->gtSIMDBaseType) ? INS_vextractf128 : INS_vextracti128;
1494 getEmitter()->emitIns_R_R_I(extractIns, EA_32BYTE, tgtReg, srcReg, 0x01);
1498 instruction shiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
1499 if (tgtReg != srcReg)
1501 inst_RV_RV(ins_Copy(simdType), tgtReg, srcReg, simdType, emitSize);
1503 getEmitter()->emitIns_R_I(shiftIns, emitSize, tgtReg, 8);
1507 //--------------------------------------------------------------------------------
1508 // genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations
1511 // simdNode - The GT_SIMD node
1514 // The Widen intrinsics are broken into separate intrinsics for the two results.
// genSIMDIntrinsicWiden: Generate code for one half of a SIMD Widen operation
// (WidenLo or WidenHi), producing a vector whose elements are twice the width
// of the source's elements. The two halves of the result are produced by two
// separate intrinsics (see the comment above).
void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode)
{
    assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) ||
           (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi));

    GenTree*  op1       = simdNode->gtGetOp1();
    var_types baseType  = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types simdType = simdNode->TypeGet();
    SIMDLevel level    = compiler->getSIMDSupportLevel();

    genConsumeOperands(simdNode);
    regNumber   op1Reg   = op1->gtRegNum;
    regNumber   srcReg   = op1Reg;
    emitAttr    emitSize = emitActualTypeSize(simdType);
    instruction widenIns = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);

    if (baseType == TYP_FLOAT)
    {
        // float -> double widening: for WidenHi, first move the upper half of the
        // source into targetReg, then widen from there.
        if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)
        {
            genSIMDExtractUpperHalf(simdNode, srcReg, targetReg);
            srcReg = targetReg;
        }
        inst_RV_RV(widenIns, targetReg, srcReg, simdType);
    }
    else
    {
        // Integer widening via unpack with a zero (or sign) vector.
        // We will generate the following on AVX:
        // vpermq targetReg, op1Reg, 0xd4|0xe8
        // vpxor tmpReg, tmpReg
        // vpcmpgt[b|w|d] tmpReg, targetReg (if basetype is signed)
        // vpunpck[l|h][bw|wd|dq] targetReg, tmpReg
        regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
        assert(tmpReg != op1Reg);

        if (level == SIMD_AVX2_Supported)
        {
            // permute op1Reg and put it into targetReg
            // (0xd4 positions the low elements for WidenLo; 0xe8 positions the
            // high elements for WidenHi, so the lane-local unpack does the rest)
            unsigned ival = 0xd4;
            if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)
            {
                ival = 0xe8;
            }
            getEmitter()->emitIns_R_R_I(INS_vpermq, emitSize, targetReg, op1Reg, ival);
        }
        else if (targetReg != op1Reg)
        {
            inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize);
        }

        // tmpReg = 0, or for signed types, tmpReg = (0 > targetReg) which yields an
        // all-ones mask per negative element — the sign-extension bits for the unpack.
        genSIMDZero(simdType, baseType, tmpReg);
        if (!varTypeIsUnsigned(baseType))
        {
            instruction compareIns = getOpForSIMDIntrinsic(SIMDIntrinsicGreaterThan, baseType);
            inst_RV_RV(compareIns, tmpReg, targetReg, simdType, emitSize);
        }
        // Interleave the source elements with the zero/sign vector to double their width.
        inst_RV_RV(widenIns, targetReg, tmpReg, simdType);
    }
    genProduceReg(simdNode);
}
1579 //--------------------------------------------------------------------------------
1580 // genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations
1583 // simdNode - The GT_SIMD node
1586 // This intrinsic takes two arguments. The first operand is narrowed to produce the
1587 // lower elements of the results, and the second operand produces the high elements.
1589 void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode)
1591 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow);
1593 GenTree* op1 = simdNode->gtGetOp1();
1594 GenTree* op2 = simdNode->gtGetOp2();
1595 var_types baseType = simdNode->gtSIMDBaseType;
1596 regNumber targetReg = simdNode->gtRegNum;
1597 assert(targetReg != REG_NA);
1598 var_types simdType = simdNode->TypeGet();
1599 emitAttr emitSize = emitTypeSize(simdType);
1600 SIMDLevel level = compiler->getSIMDSupportLevel();
1602 genConsumeOperands(simdNode);
1603 regNumber op1Reg = op1->gtRegNum;
1604 regNumber op2Reg = op2->gtRegNum;
1605 if (baseType == TYP_DOUBLE)
1607 regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1609 inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType);
1610 inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType);
1611 // Now insert the high-order result (in tmpReg) into the upper half of targetReg.
1612 if (level == SIMD_AVX2_Supported)
1614 getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01);
1618 inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, tmpReg, SHUFFLE_YXYX);
1621 else if (varTypeIsLong(baseType))
1623 if (level == SIMD_AVX2_Supported)
1625 // We have 8 long elements, 0-3 in op1Reg, 4-7 in op2Reg.
1626 // We will generate the following:
1627 // vextracti128 tmpReg, op1Reg, 1 (extract elements 2 and 3 into tmpReg)
1628 // vextracti128 tmpReg2, op2Reg, 1 (extract elements 6 and 7 into tmpReg2)
1629 // vinserti128 tmpReg, tmpReg2, 1 (insert elements 6 and 7 into the high half of tmpReg)
1630 // mov tmpReg2, op1Reg
1631 // vinserti128 tmpReg2, op2Reg, 1 (insert elements 4 and 5 into the high half of tmpReg2)
1632 // pshufd tmpReg, tmpReg, XXZX ( - - 7L 6L - - 3L 2L) in tmpReg
1633 // pshufd tgtReg, tmpReg2, XXZX ( - - 5L 4L - - 1L 0L) in tgtReg
1634 // punpcklqdq tgtReg, tmpReg
1635 regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
1636 regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1637 getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01);
1638 getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg2, op2Reg, 0x01);
1639 getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg, tmpReg2, 0x01);
1640 inst_RV_RV(ins_Copy(simdType), tmpReg2, op1Reg, simdType, emitSize);
1641 getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg2, op2Reg, 0x01);
1642 getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, tmpReg, SHUFFLE_XXZX);
1643 getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, tmpReg2, SHUFFLE_XXZX);
1644 inst_RV_RV_RV(INS_punpcklqdq, targetReg, targetReg, tmpReg, emitSize);
1648 // We will generate the following:
1649 // pshufd targetReg, op1Reg, ZXXX (extract the low 32-bits into the upper two 32-bit elements)
1650 // psrldq targetReg, 8 (shift them right to get zeros in the high elements)
1651 // pshufd tmpReg, op2Reg, XXZX (same as above, but extract into the lower two 32-bit elements)
1652 // pslldq tmpReg, 8 (now shift these left to get zeros in the low elements)
1653 // por targetReg, tmpReg
1654 regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1655 instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
1656 instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
1657 emitAttr emitSize = emitTypeSize(simdType);
1659 getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, op1Reg, SHUFFLE_ZXXX);
1660 getEmitter()->emitIns_R_I(shiftRightIns, emitSize, targetReg, 8);
1661 getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, op2Reg, SHUFFLE_XXZX);
1662 getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, 8);
1663 inst_RV_RV(INS_por, targetReg, tmpReg, simdType);
1668 // We will generate the following:
1669 // mov targetReg, op1Reg
1670 // mov tmpReg, op2Reg
1671 // psll? targetReg, shiftCount
1672 // pslr? targetReg, shiftCount
1673 // psll? tmpReg, shiftCount
1674 // pslr? tmpReg, shiftCount
1675 // <pack> targetReg, tmpReg
1676 // Where shiftCount is the size of the target baseType (i.e. half the size of the source baseType),
1677 // and <pack> is the appropriate instruction to pack the result (note that we have to truncate to
1678 // get CLR type semantics; otherwise it will saturate).
1680 int shiftCount = genTypeSize(baseType) * (BITS_IN_BYTE / 2);
1681 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
1682 instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
1683 instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
1685 if (level == SIMD_AVX2_Supported)
1687 regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
1688 regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1690 // The AVX instructions generally operate on "lanes", so we have to permute the
1691 // inputs so that the destination register has the low 128-bit halves of the two
1692 // inputs, and 'tmpReg' has the high 128-bit halves of the two inputs.
1693 getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg2, op1Reg, op2Reg, 0x20);
1694 getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg, op1Reg, op2Reg, 0x31);
1695 getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg2, shiftCount);
1696 getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg2, shiftCount);
1697 getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount);
1698 getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg, shiftCount);
1699 inst_RV_RV_RV(ins, targetReg, tmpReg2, tmpReg, emitActualTypeSize(simdType));
1703 regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
1705 inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize);
1706 inst_RV_RV(ins_Copy(simdType), tmpReg, op2Reg, simdType, emitSize);
1708 instruction tmpShiftRight = shiftRightIns;
1709 if ((baseType == TYP_INT || baseType == TYP_UINT) && level == SIMD_SSE2_Supported)
1711 tmpShiftRight = INS_psrad;
1714 getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, targetReg, shiftCount);
1715 getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, targetReg, shiftCount);
1716 getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount);
1717 getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, tmpReg, shiftCount);
1718 inst_RV_RV(ins, targetReg, tmpReg, simdType);
1721 genProduceReg(simdNode);
1724 //--------------------------------------------------------------------------------
1725 // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations
1726 // add, sub, mul, bit-wise And, AndNot and Or.
1729 // simdNode - The GT_SIMD node
// genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations —
// add, sub, mul, div, bit-wise And/AndNot/Or/Xor, Min and Max.
void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
{
    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAndNot ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseXor || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMin ||
           simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMax);

    GenTree*  op1       = simdNode->gtGetOp1();
    GenTree*  op2       = simdNode->gtGetOp2();
    var_types baseType  = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    assert(targetReg != REG_NA);
    var_types targetType = simdNode->TypeGet();
    SIMDLevel level      = compiler->getSIMDSupportLevel();

    genConsumeOperands(simdNode);
    regNumber op1Reg   = op1->gtRegNum;
    regNumber op2Reg   = op2->gtRegNum;
    regNumber otherReg = op2Reg;

    // Vector<int> multiply:
    // SSE2 doesn't have an instruction to perform this operation directly
    // whereas SSE4.1 does (pmulld). This is special cased and computed
    // using pmuludq on the even/odd elements plus shuffles to repack.
    if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul && baseType == TYP_INT && level == SIMD_SSE2_Supported)
    {
        // We need a temporary register that is NOT the same as the target,
        // and we MAY need another.
        regNumber tmpReg  = simdNode->ExtractTempReg();
        regNumber tmpReg2 = simdNode->GetSingleTempReg();

        // The register allocator guarantees the following conditions:
        // - the only registers that may be the same among op1Reg, op2Reg, tmpReg
        //   and tmpReg2 are op1Reg and op2Reg.
        // Let's be extra-careful and assert that now.
        assert((op1Reg != tmpReg) && (op1Reg != tmpReg2) && (op2Reg != tmpReg) && (op2Reg != tmpReg2) &&
               (tmpReg != tmpReg2));

        // We will start by setting things up so that:
        //  - We have op1 in op1Reg and targetReg, and they are different registers.
        //  - We have op2 in op2Reg and tmpReg
        //  - Either we will leave the input registers (the original op1Reg and op2Reg) unmodified,
        //    OR they are the targetReg that will be produced.
        //    (Note that in the code we generate below op1Reg and op2Reg are never written.)
        // We will copy things as necessary to ensure that this is the case.
        // Note that we can swap op1 and op2, since multiplication is commutative.
        // We will not modify the values in op1Reg and op2Reg.
        // (Though note that if either op1 or op2 is the same as targetReg, we will make
        // a copy and use that copy as the input register. In that case we WILL modify
        // the original value in the register, but will wind up with the result in targetReg
        // in the end, as expected.)

        // First, we need a tmpReg that is NOT the same as targetReg.
        // Note that if we have another reg that is the same as targetReg,
        // we can use tmpReg2 for that case, as we will not have hit this case.
        if (tmpReg == targetReg)
        {
            tmpReg = tmpReg2;
        }

        if (op2Reg == targetReg)
        {
            // We will swap the operands.
            // Since the code below only deals with registers, this now becomes the case where
            // op1Reg == targetReg.
            op2Reg = op1Reg;
            op1Reg = targetReg;
        }
        if (op1Reg == targetReg)
        {
            // Copy op1, and make tmpReg2 the new op1Reg.
            // Note that those regs can't be the same, as we asserted above.
            // Also, we know that tmpReg2 hasn't been used, because we couldn't have hit
            // the "tmpReg == targetReg" case.
            inst_RV_RV(INS_movaps, tmpReg2, op1Reg, targetType, emitActualTypeSize(targetType));
            op1Reg = tmpReg2;
            inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType));
            // However, we have one more case to worry about: what if op2Reg is also targetReg
            // (i.e. we have the same operand as op1 and op2)?
            // In that case we will set op2Reg to the same register as op1Reg.
            if (op2Reg == targetReg)
            {
                op2Reg = op1Reg;
            }
        }
        else
        {
            // Copy op1 to targetReg and op2 to tmpReg.
            inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
            inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType));
        }

        // Let's assert that things are as we expect.
        //  - We have op1 in op1Reg and targetReg, and they are different registers.
        assert(op1Reg != targetReg);
        //  - We have op2 in op2Reg and tmpReg, and they are different registers.
        assert(op2Reg != tmpReg);
        //  - Either we are going to leave op1's reg unmodified, or it is the targetReg.
        assert((op1->gtRegNum == op1Reg) || (op1->gtRegNum == op2Reg) || (op1->gtRegNum == targetReg));
        //  - Similarly, we are going to leave op2's reg unmodified, or it is the targetReg.
        assert((op2->gtRegNum == op1Reg) || (op2->gtRegNum == op2Reg) || (op2->gtRegNum == targetReg));

        // Now we can generate the code.

        // targetReg = op1 >> 4-bytes (op1 is already in targetReg)
        getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), targetReg, 4);

        // tmpReg = op2 >> 4-bytes (op2 is already in tmpReg)
        getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg, 4);

        // tmp = unsigned double word multiply of targetReg and tmpReg. Essentially
        // tmpReg[63:0] = op1[1] * op2[1]
        // tmpReg[127:64] = op1[3] * op2[3]
        inst_RV_RV(INS_pmuludq, tmpReg, targetReg, targetType, emitActualTypeSize(targetType));

        // Extract first and third double word results from tmpReg
        // tmpReg = shuffle(0,0,2,0) of tmpReg
        getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, SHUFFLE_XXZX);

        // targetReg[63:0] = op1[0] * op2[0]
        // targetReg[127:64] = op1[2] * op2[2]
        inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
        inst_RV_RV(INS_pmuludq, targetReg, op2Reg, targetType, emitActualTypeSize(targetType));

        // Extract first and third double word results from targetReg
        // targetReg = shuffle(0,0,2,0) of targetReg
        getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, SHUFFLE_XXZX);

        // pack the results into a single vector
        inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
    }
    else
    {
        instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);

        // Currently AVX doesn't support integer.
        // if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX.
        if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported &&
            !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && getEmitter()->IsThreeOperandAVXInstruction(ins))
        {
            // Three-operand form: no copy needed, sources are preserved.
            inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
        }
        else
        {
            // Two-operand form: get op1 into targetReg, then apply the op with the other source.
            if (op2Reg == targetReg)
            {
                // op2 already occupies targetReg; use op1 as the second source.
                // NOTE(review): this path assumes the operation is commutative or that
                // Lowering arranged the operands accordingly — confirm against Lowering.
                otherReg = op1Reg;
            }
            else if (op1Reg != targetReg)
            {
                inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
            }

            inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
        }
    }

    // Vector2/3 div: since the top-most elements will be zero, we end up
    // performing 0/0 which is a NAN. Therefore, post division we need to set the
    // top-most elements to zero. This is achieved by left logical shift followed
    // by right logical shift of targetReg.
    if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv && (simdNode->gtSIMDSize < 16))
    {
        // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length.
        unsigned shiftCount = 16 - simdNode->gtSIMDSize;
        assert(shiftCount != 0);
        instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
        getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
        ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
        getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
    }

    genProduceReg(simdNode);
}
1911 //--------------------------------------------------------------------------------
// genSIMDIntrinsicRelOp: Generate code for a SIMD Intrinsic relational operator
1913 // <, <=, >, >= and ==
1916 // simdNode - The GT_SIMD node
// genSIMDIntrinsicRelOp: Generate code for a SIMD Intrinsic relational operator —
// <, <=, >, >= and ==, plus the bool-producing OpEquality/OpInEquality forms
// (which only set condition flags; Lowering inserts a SETCC if a value is needed).
void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
{
    GenTree*  op1       = simdNode->gtGetOp1();
    GenTree*  op2       = simdNode->gtGetOp2();
    var_types baseType  = simdNode->gtSIMDBaseType;
    regNumber targetReg = simdNode->gtRegNum;
    var_types targetType = simdNode->TypeGet();
    SIMDLevel level      = compiler->getSIMDSupportLevel();

    genConsumeOperands(simdNode);
    regNumber op1Reg   = op1->gtRegNum;
    regNumber op2Reg   = op2->gtRegNum;
    regNumber otherReg = op2Reg;

    switch (simdNode->gtSIMDIntrinsicID)
    {
        case SIMDIntrinsicEqual:
        case SIMDIntrinsicGreaterThan:
        {
            assert(targetReg != REG_NA);

            // SSE2: vector<(u)long> relational op should be implemented in terms of
            // TYP_INT comparison operations
            if (baseType == TYP_LONG || baseType == TYP_ULONG)
            {
                assert(level >= SIMD_SSE4_Supported);
            }

            // Greater-than: Floating point vectors use "<" with swapped operands
            if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGreaterThan)
            {
                assert(!varTypeIsFloating(baseType));
            }

            unsigned    ival = 0;
            instruction ins  = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival);

            // targetReg = op1reg > op2reg
            // Therefore, we can optimize if op1Reg == targetReg
            if (op1Reg != targetReg)
            {
                if (op2Reg == targetReg)
                {
                    // Only Equal is commutative, so only then can we swap the operands.
                    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicEqual);
                    otherReg = op1Reg;
                }
                else
                {
                    inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
                }
            }

            // Floating-point compares carry an immediate predicate; integer compares don't.
            if (varTypeIsFloating(baseType))
            {
                getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, otherReg, ival);
            }
            else
            {
                inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
            }
        }
        break;

        case SIMDIntrinsicLessThan:
        case SIMDIntrinsicLessThanOrEqual:
        {
            assert(targetReg != REG_NA);

            // Int vectors use ">" and ">=" with swapped operands
            assert(varTypeIsFloating(baseType));

            // Get the instruction opcode for compare operation
            unsigned    ival = 0;
            instruction ins  = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival);

            // targetReg = op1reg RelOp op2reg
            // Therefore, we can optimize if op1Reg == targetReg
            if (op1Reg != targetReg)
            {
                inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
            }

            getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, op2Reg, ival);
        }
        break;

        // (In)Equality that produces bool result instead of a bit vector
        case SIMDIntrinsicOpEquality:
        case SIMDIntrinsicOpInEquality:
        {
            // We're only setting condition flags, if a 0/1 value is desired then Lowering should have inserted a SETCC.
            assert(targetReg == REG_NA);

            var_types simdType = op1->TypeGet();
            // TODO-1stClassStructs: Temporary to minimize asmDiffs
            if (simdType == TYP_DOUBLE)
            {
                simdType = TYP_SIMD8;
            }

            // Here we should consider TYP_SIMD12 operands as if they were TYP_SIMD16
            // since both the operands will be in XMM registers.
            if (simdType == TYP_SIMD12)
            {
                simdType = TYP_SIMD16;
            }

            // On SSE4/AVX, we can generate optimal code for (in)equality against zero using ptest.
            if (op2->isContained())
            {
                assert((compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) && op2->IsIntegralConstVector(0));
                inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType));
            }
            else
            {
                // We need one additional SIMD register to store the result of the SIMD compare.
                regNumber tmpReg1 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);

                // tmpReg1 = (op1Reg == op2Reg)
                // Call this value of tmpReg1 as 'compResult' for further reference below.
                regNumber otherReg = op2Reg;
                if (tmpReg1 != op2Reg)
                {
                    if (tmpReg1 != op1Reg)
                    {
                        inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
                    }
                }
                else
                {
                    otherReg = op1Reg;
                }

                // For all integer types we can use TYP_INT comparison.
                unsigned    ival = 0;
                instruction ins =
                    getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);

                if (varTypeIsFloating(baseType))
                {
                    getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
                }
                else
                {
                    inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
                }

                regNumber intReg = simdNode->GetSingleTempReg(RBM_ALLINT);
                inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType));
                // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare
                // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a
                // subset of each component's ones/zeroes. In the end we need to know if the result is
                // "all ones" where the number of ones is given by the vector byte size, not by the
                // vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and
                // for SSE registers we need to compare to 0x0000FFFF.
                // The SIMD12 case is handled specially, because we can't rely on the upper bytes being
                // zero, so we must compare only the lower 3 floats (hence the byte mask of 0xFFF).
                // Note that -1 is used instead of 0xFFFFFFFF, on x64 emit doesn't correctly recognize
                // that 0xFFFFFFFF can be encoded in a single byte and emits the longer 3DFFFFFFFF
                // encoding instead of 83F8FF.
                int mask;
                if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
                {
                    // Only the lower 12 bytes (3 floats) participate; mask off the rest.
                    mask = 0x00000FFF;
                    getEmitter()->emitIns_R_I(INS_and, EA_4BYTE, intReg, mask);
                }
                else if (emitActualTypeSize(simdType) == 32)
                {
                    // 32-byte (YMM) compare result: all 32 mask bits must be set.
                    mask = -1;
                }
                else
                {
                    // 16-byte (XMM) compare result: the low 16 mask bits must be set.
                    mask = 0x0000FFFF;
                }
                getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, mask);
            }
        }
        break;

        default:
            noway_assert(!"Unimplemented SIMD relational operation.");
    }

    genProduceReg(simdNode);
}
2111 //--------------------------------------------------------------------------------
2112 // genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product.
2115 // simdNode - The GT_SIMD node
2120 void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
2122 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct);
2124 GenTree* op1 = simdNode->gtGetOp1();
2125 GenTree* op2 = simdNode->gtGetOp2();
2126 var_types baseType = simdNode->gtSIMDBaseType;
2127 var_types simdType = op1->TypeGet();
2128 // TODO-1stClassStructs: Temporary to minimize asmDiffs
2129 if (simdType == TYP_DOUBLE)
2131 simdType = TYP_SIMD8;
// A TYP_SIMD12 operand occupies a full XMM register, so evaluate it as TYP_SIMD16.
2133 var_types simdEvalType = (simdType == TYP_SIMD12) ? TYP_SIMD16 : simdType;
2134 regNumber targetReg = simdNode->gtRegNum;
2135 assert(targetReg != REG_NA);
// The result of a dot product is a scalar of the vector's base type.
2137 var_types targetType = simdNode->TypeGet();
2138 assert(targetType == baseType);
2140 genConsumeOperands(simdNode);
2141 regNumber op1Reg = op1->gtRegNum;
2142 regNumber op2Reg = op2->gtRegNum;
2143 regNumber tmpReg1 = REG_NA;
2144 regNumber tmpReg2 = REG_NA;
2146 SIMDLevel level = compiler->getSIMDSupportLevel();
2148 // Dot product intrinsic is supported only on float/double vectors
2149 // and 32-byte int vectors on AVX.
2151 // Float/Double Vectors:
2152 // For SSE, or AVX with 32-byte vectors, we need one additional Xmm register
2153 // different from targetReg as scratch. Note that if this is a TYP_SIMD16 or
2154 // smaller on AVX, then we don't need a tmpReg.
2156 // 32-byte integer vector on AVX: we need two additional Xmm registers
2157 // different from targetReg as scratch.
2159 // 16-byte integer vector on SSE4: we need one additional Xmm register
2160 // different from targetReg as scratch.
2161 if (varTypeIsFloating(baseType))
2163 if ((compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) || (simdEvalType == TYP_SIMD32))
2165 tmpReg1 = simdNode->GetSingleTempReg();
2166 assert(tmpReg1 != targetReg);
2170 assert(simdNode->AvailableTempRegCount() == 0);
2175 assert(baseType == TYP_INT);
2176 assert(level >= SIMD_SSE4_Supported);
2178 if (level == SIMD_SSE4_Supported)
2180 tmpReg1 = simdNode->GetSingleTempReg();
2184 tmpReg1 = simdNode->ExtractTempReg();
2185 tmpReg2 = simdNode->GetSingleTempReg();
// SSE2 path: float/double only. Compute in targetReg via mul + shuffle + add sequences.
2189 if (level == SIMD_SSE2_Supported)
2191 // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg
2192 if (op1Reg == targetReg)
2195 // nothing to do, we have registers in the right place
2197 else if (op2Reg == targetReg)
2203 inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
2206 // DotProduct(v1, v2)
2207 // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg1
2208 if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
2210 assert(baseType == TYP_FLOAT);
// Vector3 (SIMD12): sum only the lower three float lanes.
2212 // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its
2214 // tmp = shuffle(tmp, tmp, SHUFFLE_ZXXY) // tmp = (2, 0, 0, 1) - don't really care what's in upper
2216 // v0 = v0 + tmp // v0 = (3+2, 0+2, 1+0, 0+1)
2217 // tmp = shuffle(tmp, tmp, SHUFFLE_XXWW) // tmp = ( 1, 1, 2, 2)
2218 // v0 = v0 + tmp // v0 = (1+2+3, 0+1+2, 0+1+2, 0+1+2)
2220 inst_RV_RV(INS_mulps, targetReg, op2Reg);
2221 inst_RV_RV(INS_movaps, tmpReg1, targetReg);
2222 inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZXXY);
2223 inst_RV_RV(INS_addps, targetReg, tmpReg1);
2224 inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XXWW);
2225 inst_RV_RV(INS_addps, targetReg, tmpReg1);
2227 else if (baseType == TYP_FLOAT)
2230 // tmp = v0 // v0 = (3, 2, 1, 0) - each element is given by its
2232 // tmp = shuffle(tmp, tmp, SHUFFLE_ZWXY) // tmp = (2, 3, 0, 1)
2233 // v0 = v0 + tmp // v0 = (3+2, 2+3, 1+0, 0+1)
2235 // tmp = shuffle(tmp, tmp, SHUFFLE_XYZW) // tmp = (0+1, 1+0, 2+3, 3+2)
2236 // v0 = v0 + tmp // v0 = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3)
2237 // // Essentially horizontal addition of all elements.
2238 // // We could achieve the same using SSEv3 instruction
2241 inst_RV_RV(INS_mulps, targetReg, op2Reg);
2242 inst_RV_RV(INS_movaps, tmpReg1, targetReg);
2243 inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZWXY);
2244 inst_RV_RV(INS_addps, targetReg, tmpReg1);
2245 inst_RV_RV(INS_movaps, tmpReg1, targetReg);
2246 inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XYZW);
2247 inst_RV_RV(INS_addps, targetReg, tmpReg1);
2251 assert(baseType == TYP_DOUBLE);
2254 // tmp = v0 // v0 = (1, 0) - each element is given by its position
2255 // tmp = shuffle(tmp, tmp, Shuffle(0,1)) // tmp = (0, 1)
2256 // v0 = v0 + tmp // v0 = (1+0, 0+1)
2257 inst_RV_RV(INS_mulpd, targetReg, op2Reg);
2258 inst_RV_RV(INS_movaps, tmpReg1, targetReg);
2259 inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg1, tmpReg1, 0x01);
2260 inst_RV_RV(INS_addpd, targetReg, tmpReg1);
// SSE4/AVX path: use dpps/dppd for float/double, pmulld + phaddd for int.
2265 assert(level >= SIMD_SSE4_Supported);
2267 if (varTypeIsFloating(baseType))
2269 // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg.
2270 // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually
2271 // use the 3-op form, so that we can avoid these copies.
2272 // TODO-CQ: Add inst_RV_RV_RV_IV().
2273 if (op1Reg == targetReg)
2276 // nothing to do, we have registers in the right place
2278 else if (op2Reg == targetReg)
2284 inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
2287 emitAttr emitSize = emitActualTypeSize(simdEvalType);
2288 if (baseType == TYP_FLOAT)
2290 // dpps computes the dot product of the upper & lower halves of the 32-byte register.
2291 // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
// Vector3 uses mask 0x71 (multiply low 3 lanes only); full Vector4 uses 0xf1.
2292 unsigned mask = ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) ? 0x71 : 0xf1;
2293 inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, mask);
2294 // dpps computes the dot product of the upper & lower halves of the 32-byte register.
2295 // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
2296 // If this is TYP_SIMD32, we need to combine the lower & upper results.
2297 if (simdEvalType == TYP_SIMD32)
2299 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01);
2300 inst_RV_RV(INS_addps, targetReg, tmpReg1, targetType, emitTypeSize(targetType));
2303 else if (baseType == TYP_DOUBLE)
2305 if (simdEvalType == TYP_SIMD32)
2307 // targetReg = targetReg * op2Reg
2308 // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves
2309 // tmpReg = vextractf128(targetReg, 1) ; Moves the upper sum into tempReg
2310 // targetReg = targetReg + tmpReg1
2311 inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType));
2312 inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType));
2313 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg1, targetReg, 0x01);
2314 inst_RV_RV(INS_addpd, targetReg, tmpReg1, targetType, emitTypeSize(targetType));
2318 // On AVX, we have no 16-byte vectors of double. Note that, if we did, we could use
2320 assert(level == SIMD_SSE4_Supported);
2321 inst_RV_RV_IV(INS_dppd, emitSize, targetReg, op2Reg, 0x31);
2327 // Dot product of 32-byte int vector on SSE4/AVX.
2328 assert(baseType == TYP_INT);
2329 assert(simdEvalType == TYP_SIMD16 || simdEvalType == TYP_SIMD32);
2332 // SSE4: We need 1 scratch register.
2333 // AVX2: We need 2 scratch registers.
2334 if (simdEvalType == TYP_SIMD16)
2336 assert(tmpReg1 != REG_NA);
2340 assert(tmpReg1 != REG_NA);
2341 assert(tmpReg2 != REG_NA);
2345 // tmpReg1 = op1 * op2
2346 if (level == SIMD_AVX2_Supported)
2348 // On AVX take advantage 3 operand form of pmulld
2349 inst_RV_RV_RV(INS_pmulld, tmpReg1, op1Reg, op2Reg, emitTypeSize(simdEvalType));
2353 inst_RV_RV(ins_Copy(simdEvalType), tmpReg1, op1Reg, simdEvalType);
2354 inst_RV_RV(INS_pmulld, tmpReg1, op2Reg, simdEvalType);
2357 if (simdEvalType == TYP_SIMD32)
2359 // tmpReg2[127..0] = Upper 128-bits of tmpReg1
2360 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
2362 // tmpReg1[127..0] = tmpReg1[127..0] + tmpReg2[127..0]
2363 // This will compute
2364 // tmpReg1[0] = op1[0]*op2[0] + op1[4]*op2[4]
2365 // tmpReg1[1] = op1[1]*op2[1] + op1[5]*op2[5]
2366 // tmpReg1[2] = op1[2]*op2[2] + op1[6]*op2[6]
2367 // tmpReg1[3] = op1[3]*op2[3] + op1[7]*op2[7]
2368 inst_RV_RV(INS_paddd, tmpReg1, tmpReg2, TYP_SIMD16, EA_16BYTE);
2371 // This horizontal add will compute
2374 // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[1]*op2[1]
2375 // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[3]*op2[3]
2378 // tmpReg1[0] = tmpReg1[2] = op1[0]*op2[0] + op1[4]*op2[4] + op1[1]*op2[1] + op1[5]*op2[5]
2379 // tmpReg1[1] = tmpReg1[3] = op1[2]*op2[2] + op1[6]*op2[6] + op1[3]*op2[3] + op1[7]*op2[7]
2380 inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE);
2382 // DotProduct(op1, op2) = tmpReg1[0] = tmpReg1[0] + tmpReg1[1]
2383 inst_RV_RV(INS_phaddd, tmpReg1, tmpReg1, TYP_SIMD16, EA_16BYTE);
2385 // TargetReg = integer result from tmpReg1
2386 // (Note that for mov_xmm2i, the int register is always in the reg2 position)
2387 inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
2391 genProduceReg(simdNode);
2394 //------------------------------------------------------------------------------------
2395 // genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i.
2398 // simdNode - The GT_SIMD node
2403 void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
2405 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
2407 GenTree* op1 = simdNode->gtGetOp1();
2408 GenTree* op2 = simdNode->gtGetOp2();
2409 var_types simdType = op1->TypeGet();
2410 assert(varTypeIsSIMD(simdType));
2412 // op1 of TYP_SIMD12 should be considered as TYP_SIMD16,
2413 // since it is in XMM register.
2414 if (simdType == TYP_SIMD12)
2416 simdType = TYP_SIMD16;
2419 var_types baseType = simdNode->gtSIMDBaseType;
2420 regNumber targetReg = simdNode->gtRegNum;
2421 assert(targetReg != REG_NA);
2422 var_types targetType = simdNode->TypeGet();
2423 assert(targetType == genActualType(baseType));
2425 // GetItem has 2 operands:
2426 // - the source of SIMD type (op1)
2427 // - the index of the value to be returned.
2428 genConsumeOperands(simdNode);
2429 regNumber srcReg = op1->gtRegNum;
2431 // Optimize the case of op1 is in memory and trying to access ith element.
2432 if (!op1->isUsedFromReg())
2434 assert(op1->isContained());
// For a memory operand, compute a base register + scaled index + offset address
// and load the element directly, avoiding materializing the whole vector.
2440 if (op1->OperIsLocal())
2442 // There are three parts to the total offset here:
2443 // {offset of local} + {offset of SIMD Vector field (lclFld only)} + {offset of element within SIMD vector}.
2445 unsigned varNum = op1->gtLclVarCommon.gtLclNum;
2446 offset += compiler->lvaFrameAddress(varNum, &isEBPbased);
2447 if (op1->OperGet() == GT_LCL_FLD)
2449 offset += op1->gtLclFld.gtLclOffs;
2451 baseReg = (isEBPbased) ? REG_EBP : REG_ESP;
2455 // Require GT_IND addr to be not contained.
2456 assert(op1->OperGet() == GT_IND);
2458 GenTree* addr = op1->AsIndir()->Addr();
2459 assert(!addr->isContained());
2460 baseReg = addr->gtRegNum;
// A constant index folds into the displacement; a variable index stays in a register.
2463 if (op2->isContainedIntOrIImmed())
2466 offset += (int)op2->AsIntConCommon()->IconValue() * genTypeSize(baseType);
2470 indexReg = op2->gtRegNum;
2471 assert(genIsValidIntReg(indexReg));
2474 // Now, load the desired element.
2475 getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
2476 emitTypeSize(baseType), // Of the vector baseType
2477 targetReg, // To targetReg
2478 baseReg, // Base Reg
2479 indexReg, // Indexed
2480 genTypeSize(baseType), // by the size of the baseType
2482 genProduceReg(simdNode);
2486 // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant.
2487 // For the non-constant case, we will use the SIMD temp location to store the vector, and
2488 // then load the desired element.
2489 // The range check will already have been performed, so at this point we know we have an index
2490 // within the bounds of the vector.
2491 if (!op2->IsCnsIntOrI())
2493 unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum;
2494 noway_assert(simdInitTempVarNum != BAD_VAR_NUM);
2496 unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased);
2497 regNumber indexReg = op2->gtRegNum;
2499 // Store the vector to the temp location.
2500 getEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)),
2501 emitTypeSize(simdType), srcReg, simdInitTempVarNum, 0);
2503 // Now, load the desired element.
2504 getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
2505 emitTypeSize(baseType), // Of the vector baseType
2506 targetReg, // To targetReg
2507 (isEBPbased) ? REG_EBP : REG_ESP, // Stack-based
2508 indexReg, // Indexed
2509 genTypeSize(baseType), // by the size of the baseType
2511 genProduceReg(simdNode);
// Remaining case: the vector is in a register and the index is a constant.
2515 noway_assert(op2->isContained());
2516 noway_assert(op2->IsCnsIntOrI());
2517 unsigned int index = (unsigned int)op2->gtIntCon.gtIconVal;
2518 unsigned int byteShiftCnt = index * genTypeSize(baseType);
2520 // In general we shouldn't have an index greater than or equal to the length of the vector.
2521 // However, if we have an out-of-range access, under minOpts it will not be optimized
2522 // away. The code will throw before we reach this point, but we still need to generate
2523 // code. In that case, we will simply mask off the upper bits.
2524 if (byteShiftCnt >= compiler->getSIMDVectorRegisterByteLength())
2526 byteShiftCnt &= (compiler->getSIMDVectorRegisterByteLength() - 1);
2527 index = byteShiftCnt / genTypeSize(baseType);
2530 regNumber tmpReg = REG_NA;
2531 if (simdNode->AvailableTempRegCount() != 0)
2533 tmpReg = simdNode->GetSingleTempReg();
// A tmpReg is only reserved when it is actually needed by one of the paths below.
2537 assert((byteShiftCnt == 0) || varTypeIsFloating(baseType) ||
2538 (varTypeIsSmallInt(baseType) && (byteShiftCnt < 16)));
// AVX: element lives in the upper 128 bits; extract that half first.
2541 if (byteShiftCnt >= 16)
2543 assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
2545 regNumber newSrcReg;
2546 if (varTypeIsFloating(baseType))
2548 newSrcReg = targetReg;
2553 assert(tmpReg != REG_NA);
2556 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, newSrcReg, srcReg, 0x01);
2561 // Generate the following sequence:
2562 // 1) baseType is floating point
2563 // movaps targetReg, srcReg
2564 // psrldq targetReg, byteShiftCnt <-- not generated if accessing zero'th element
2566 // 2) baseType is not floating point
2567 // movaps tmpReg, srcReg <-- not generated if accessing zero'th element
2568 // OR if tmpReg == srcReg
2569 // psrldq tmpReg, byteShiftCnt <-- not generated if accessing zero'th element
2570 // mov_xmm2i targetReg, tmpReg
2571 if (varTypeIsFloating(baseType))
2573 if (targetReg != srcReg)
2575 inst_RV_RV(ins_Copy(simdType), targetReg, srcReg, simdType, emitActualTypeSize(simdType));
2578 if (byteShiftCnt != 0)
2580 instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
2581 getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt);
2586 if (varTypeIsSmallInt(baseType))
2588 // Note that pextrw extracts 16-bit value by index and zero extends it to 32-bits.
2589 // In case of vector<short> we also need to sign extend the 16-bit value in targetReg
2590 // Vector<byte> - index/2 will give the index of the 16-bit value to extract. Shift right
2591 // by 8-bits if index is odd. In case of Vector<sbyte> also sign extend targetReg.
2593 unsigned baseSize = genTypeSize(baseType);
2598 // We actually want index % 8 for the AVX case (for SSE it will never be > 8).
2599 // Note that this doesn't matter functionally, because the instruction uses just the
2600 // low 3 bits of index, but it's better to use the right value.
2603 assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
2607 getEmitter()->emitIns_R_R_I(INS_pextrw, emitTypeSize(TYP_INT), targetReg, srcReg, index);
2609 bool ZeroOrSignExtnReqd = true;
2612 if ((op2->gtIntCon.gtIconVal % 2) == 1)
2614 // Right shift extracted word by 8-bits if index is odd if we are extracting a byte sized element.
2615 inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, targetReg, 8);
2617 // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_BYTE
2618 ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
2620 // else - we just need to zero/sign extend the byte since pextrw extracted 16-bits
2624 // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_SHORT
2625 assert(baseSize == 2);
2626 ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
2629 if (ZeroOrSignExtnReqd)
2631 // Zero/sign extend the byte/short to 32-bits
2632 inst_RV_RV(ins_Move_Extend(baseType, false), targetReg, targetReg, baseType, emitTypeSize(baseType));
2637 // We need a temp xmm register if the baseType is not floating point and
2638 // accessing non-zero'th element.
2641 if (byteShiftCnt != 0)
2643 assert(tmpReg != REG_NA);
2645 if (tmpReg != srcReg)
2647 inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType));
2650 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
2651 getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt);
2658 assert(tmpReg != REG_NA);
2659 ins = ins_CopyFloatToInt(TYP_FLOAT, baseType);
2660 // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
2661 inst_RV_RV(ins, tmpReg, targetReg, baseType);
2665 genProduceReg(simdNode);
2668 //------------------------------------------------------------------------------------
2669 // genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i.
2672 // simdNode - The GT_SIMD node
2677 // TODO-CQ: Use SIMDIntrinsicShuffleSSE2 for the SSE2 case.
2679 void CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode)
2681 // Determine index based on intrinsic ID
2683 switch (simdNode->gtSIMDIntrinsicID)
2685 case SIMDIntrinsicSetX:
2688 case SIMDIntrinsicSetY:
2691 case SIMDIntrinsicSetZ:
2694 case SIMDIntrinsicSetW:
2701 assert(index != -1);
2703 // op1 is the SIMD vector
2704 // op2 is the value to be set
2705 GenTree* op1 = simdNode->gtGetOp1();
2706 GenTree* op2 = simdNode->gtGetOp2();
2708 var_types baseType = simdNode->gtSIMDBaseType;
2709 regNumber targetReg = simdNode->gtRegNum;
2710 assert(targetReg != REG_NA);
2711 var_types targetType = simdNode->TypeGet();
2712 assert(varTypeIsSIMD(targetType));
2714 // the following assert must hold.
2715 // supported only on vector2f/3f/4f right now
2716 noway_assert(baseType == TYP_FLOAT);
2717 assert(op2->TypeGet() == baseType);
// The vector must be large enough to contain the element being set.
2718 assert(simdNode->gtSIMDSize >= ((index + 1) * genTypeSize(baseType)));
2720 genConsumeOperands(simdNode);
2721 regNumber op1Reg = op1->gtRegNum;
2722 regNumber op2Reg = op2->gtRegNum;
// Result is produced in targetReg: copy the source vector there first if needed.
2724 // TODO-CQ: For AVX we don't need to do a copy because it supports 3 operands plus immediate.
2725 if (targetReg != op1Reg)
2727 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
2730 // Right now this intrinsic is supported only for float base type vectors.
2731 // If in future need to support on other base type vectors, the below
2732 // logic needs modification.
2733 noway_assert(baseType == TYP_FLOAT);
2735 if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
2737 // We need one additional int register as scratch
2738 regNumber tmpReg = simdNode->GetSingleTempReg();
2739 assert(genIsValidIntReg(tmpReg));
2741 // Move the value from xmm reg to an int reg
2742 instruction ins = ins_CopyFloatToInt(TYP_FLOAT, TYP_INT);
2743 // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
2744 inst_RV_RV(ins, op2Reg, tmpReg, baseType);
2746 // First insert the lower 16-bits of tmpReg in targetReg at 2*index position
2747 // since every float has two 16-bit words.
2748 getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index);
2750 // Logical right shift tmpReg by 16-bits and insert in targetReg at 2*index + 1 position
2751 inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, tmpReg, 16);
2752 getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index + 1);
// SSE4.1+: insertps places op2's low float directly into lane 'index' of targetReg.
2756 unsigned int insertpsImm = (INSERTPS_SOURCE_SELECT(0) | INSERTPS_TARGET_SELECT(index));
2757 inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, op2Reg, insertpsImm);
2760 genProduceReg(simdNode);
2763 //------------------------------------------------------------------------
2764 // genSIMDIntrinsicShuffleSSE2: Generate code for SIMD Intrinsic shuffle.
2767 // simdNode - The GT_SIMD node
2772 void CodeGen::genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode)
2774 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicShuffleSSE2);
2775 noway_assert(compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported);
2777 GenTree* op1 = simdNode->gtGetOp1();
2778 GenTree* op2 = simdNode->gtGetOp2();
2779 assert(op2->isContained());
2780 assert(op2->IsCnsIntOrI());
2781 int shuffleControl = (int)op2->AsIntConCommon()->IconValue();
2782 var_types baseType = simdNode->gtSIMDBaseType;
2783 var_types targetType = simdNode->TypeGet();
2784 regNumber targetReg = simdNode->gtRegNum;
2785 assert(targetReg != REG_NA);
2787 regNumber op1Reg = genConsumeReg(op1);
2788 if (targetReg != op1Reg)
2790 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
2793 instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
2794 getEmitter()->emitIns_R_R_I(ins, emitTypeSize(baseType), targetReg, targetReg, shuffleControl);
2795 genProduceReg(simdNode);
2798 //-----------------------------------------------------------------------------
2799 // genStoreIndTypeSIMD12: store indirect a TYP_SIMD12 (i.e. Vector3) to memory.
2800 // Since Vector3 is not a hardware supported write size, it is performed
2801 // as two writes: 8 byte followed by 4-byte.
2804 // treeNode - tree node that is attempting to store indirect
2810 void CodeGen::genStoreIndTypeSIMD12(GenTree* treeNode)
2812 assert(treeNode->OperGet() == GT_STOREIND);
2814 GenTree* addr = treeNode->gtOp.gtOp1;
2815 GenTree* data = treeNode->gtOp.gtOp2;
2817 // addr and data should not be contained.
2818 assert(!data->isContained());
2819 assert(!addr->isContained());
2822 // Should not require a write barrier
2823 GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(treeNode, data);
2824 assert(writeBarrierForm == GCInfo::WBF_NoBarrier);
2827 // Need an addtional Xmm register to extract upper 4 bytes from data.
2828 regNumber tmpReg = treeNode->GetSingleTempReg();
2830 genConsumeOperands(treeNode->AsOp());
2833 getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, data->gtRegNum, addr->gtRegNum, 0);
2835 // Extract upper 4-bytes from data
2836 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, data->gtRegNum, 0x02);
2839 getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, addr->gtRegNum, 8);
2842 //-----------------------------------------------------------------------------
2843 // genLoadIndTypeSIMD12: load indirect a TYP_SIMD12 (i.e. Vector3) value.
2844 // Since Vector3 is not a hardware supported write size, it is performed
2845 // as two loads: 8 byte followed by 4-byte.
2848 // treeNode - tree node of GT_IND
2854 void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
2856 assert(treeNode->OperGet() == GT_IND);
2858 regNumber targetReg = treeNode->gtRegNum;
2859 GenTree* op1 = treeNode->gtOp.gtOp1;
2860 assert(!op1->isContained());
2861 regNumber operandReg = genConsumeReg(op1);
2863 // Need an addtional Xmm register to read upper 4 bytes, which is different from targetReg
2864 regNumber tmpReg = treeNode->GetSingleTempReg();
2865 assert(tmpReg != targetReg);
2867 // Load upper 4 bytes in tmpReg
2868 getEmitter()->emitIns_R_AR(ins_Load(TYP_FLOAT), EA_4BYTE, tmpReg, operandReg, 8);
2870 // Load lower 8 bytes in targetReg
2871 getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0);
2873 // combine upper 4 bytes and lower 8 bytes in targetReg
2874 getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
2876 genProduceReg(treeNode);
2879 //-----------------------------------------------------------------------------
2880 // genStoreLclTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
2881 // Since Vector3 is not a hardware supported write size, it is performed
2882 // as two stores: 8 byte followed by 4-byte.
2885 // treeNode - tree node that is attempting to store TYP_SIMD12 field
2890 void CodeGen::genStoreLclTypeSIMD12(GenTree* treeNode)
2892 assert((treeNode->OperGet() == GT_STORE_LCL_FLD) || (treeNode->OperGet() == GT_STORE_LCL_VAR));
2895 unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
2896 assert(varNum < compiler->lvaCount);
2898 if (treeNode->OperGet() == GT_STORE_LCL_FLD)
2900 offs = treeNode->gtLclFld.gtLclOffs;
2903 GenTree* op1 = treeNode->gtOp.gtOp1;
2904 assert(!op1->isContained());
2905 regNumber operandReg = genConsumeReg(op1);
2907 // Need an addtional Xmm register to extract upper 4 bytes from data.
2908 regNumber tmpReg = treeNode->GetSingleTempReg();
2910 // store lower 8 bytes
2911 getEmitter()->emitIns_S_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, varNum, offs);
2913 // Extract upper 4-bytes from operandReg
2914 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
2916 // Store upper 4 bytes
2917 getEmitter()->emitIns_S_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, varNum, offs + 8);
2920 //-----------------------------------------------------------------------------
2921 // genLoadLclTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
2922 // Since Vector3 is not a hardware supported read size, it is performed
2923 // as two reads: 4 byte followed by 8 byte.
2926 // treeNode - tree node that is attempting to load TYP_SIMD12 field
2931 void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode)
2933 assert((treeNode->OperGet() == GT_LCL_FLD) || (treeNode->OperGet() == GT_LCL_VAR));
2935 regNumber targetReg = treeNode->gtRegNum;
2937 unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
2938 assert(varNum < compiler->lvaCount);
2940 if (treeNode->OperGet() == GT_LCL_FLD)
2942 offs = treeNode->gtLclFld.gtLclOffs;
2945 // Need an additional Xmm register that is different from targetReg to read upper 4 bytes.
2946 regNumber tmpReg = treeNode->GetSingleTempReg();
2947 assert(tmpReg != targetReg);
2949 // Read upper 4 bytes to tmpReg
2950 getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_FLOAT, false), EA_4BYTE, tmpReg, varNum, offs + 8);
2952 // Read lower 8 bytes to targetReg
2953 getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs);
2955 // combine upper 4 bytes and lower 8 bytes in targetReg
2956 getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
2958 genProduceReg(treeNode);
2963 //-----------------------------------------------------------------------------
2964 // genStoreSIMD12ToStack: store a TYP_SIMD12 (i.e. Vector3) type field to the stack.
2965 // Since Vector3 is not a hardware supported write size, it is performed
2966 // as two stores: 8 byte followed by 4-byte. The stack is assumed to have
2967 // already been adjusted.
2970 // operandReg - the xmm register containing the SIMD12 to store.
2971 // tmpReg - an xmm register that can be used as a temporary for the operation.
2976 void CodeGen::genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg)
2978 assert(genIsValidFloatReg(operandReg));
2979 assert(genIsValidFloatReg(tmpReg));
2982 getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);
2984 // Extract upper 4-bytes from data
2985 getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
2988 getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
2991 //-----------------------------------------------------------------------------
2992 // genPutArgStkSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
2993 // Since Vector3 is not a hardware supported write size, it is performed
2994 // as two stores: 8 byte followed by 4-byte. The stack is assumed to have
2995 // already been adjusted.
2998 // treeNode - tree node that is attempting to store TYP_SIMD12 field
3003 void CodeGen::genPutArgStkSIMD12(GenTree* treeNode)
3005 assert(treeNode->OperGet() == GT_PUTARG_STK);
3007 GenTree* op1 = treeNode->gtOp.gtOp1;
3008 assert(!op1->isContained());
3009 regNumber operandReg = genConsumeReg(op1);
3011 // Need an addtional Xmm register to extract upper 4 bytes from data.
3012 regNumber tmpReg = treeNode->GetSingleTempReg();
3014 genStoreSIMD12ToStack(operandReg, tmpReg);
3017 #endif // _TARGET_X86_
3019 //-----------------------------------------------------------------------------
3020 // genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to
3021 // the given register, if any, or to memory.
3024 // simdNode - The GT_SIMD node
3030 // The upper half of all AVX registers is volatile, even the callee-save registers.
3031 // When a 32-byte SIMD value is live across a call, the register allocator will use this intrinsic
3032 // to cause the upper half to be saved. It will first attempt to find another, unused, callee-save
3033 // register. If such a register cannot be found, it will save it to an available caller-save register.
3034 // In that case, this node will be marked GTF_SPILL, which will cause genProduceReg to save the 16 byte
3035 // value to the stack. (Note that if there are no caller-save registers available, the entire 32 byte
3036 // value will be spilled to the stack.)
3038 void CodeGen::genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode)
3040 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperSave);
3042 GenTree* op1 = simdNode->gtGetOp1();
3043 assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
3044 regNumber targetReg = simdNode->gtRegNum;
3045 regNumber op1Reg = genConsumeReg(op1);
3046 assert(op1Reg != REG_NA);
3047 assert(targetReg != REG_NA);
3048 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, targetReg, op1Reg, 0x01);
3050 genProduceReg(simdNode);
3053 //-----------------------------------------------------------------------------
3054 // genSIMDIntrinsicUpperRestore: Restore the upper half of a TYP_SIMD32 vector from
3055 // the given register, if any, or from memory.
3058 // simdNode - The GT_SIMD node
3064 // For consistency with genSIMDIntrinsicUpperSave, and to ensure that lclVar nodes always
3065 // have their home register, this node has its targetReg on the lclVar child, and its source
3067 // Regarding spill, please see the note above on genSIMDIntrinsicUpperSave. If we have spilled
3068 // an upper-half to a caller save register, this node will be marked GTF_SPILLED. However, unlike
3069 // most spill scenarios, the saved tree will be different from the restored tree, but the spill
3070 // restore logic, which is triggered by the call to genConsumeReg, requires us to provide the
3071 // spilled tree (saveNode) in order to perform the reload. We can easily find that tree,
3072 // as it is in the spill descriptor for the register from which it was saved.
3074 void CodeGen::genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode)
3076 assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperRestore);
3078 GenTree* op1 = simdNode->gtGetOp1();
3079 assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
3080 regNumber srcReg = simdNode->gtRegNum;
3081 regNumber lclVarReg = genConsumeReg(op1);
3082 assert(lclVarReg != REG_NA);
3083 assert(srcReg != REG_NA);
3084 if (simdNode->gtFlags & GTF_SPILLED)
3086 GenTree* saveNode = regSet.rsSpillDesc[srcReg]->spillTree;
3087 noway_assert(saveNode != nullptr && (saveNode->gtRegNum == srcReg));
3088 genConsumeReg(saveNode);
3090 getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, srcReg, 0x01);
3093 //------------------------------------------------------------------------
3094 // genSIMDIntrinsic: Generate code for a SIMD Intrinsic. This is the main
3095 // routine which in turn calls appropriate genSIMDIntrinsicXXX() routine.
3098 // simdNode - The GT_SIMD node
3104 // Currently, we only recognize SIMDVector<float> and SIMDVector<int>, and
3105 // a limited set of methods.
3107 void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
3109 // NYI for unsupported base types
3110 if (simdNode->gtSIMDBaseType != TYP_INT && simdNode->gtSIMDBaseType != TYP_LONG &&
3111 simdNode->gtSIMDBaseType != TYP_FLOAT && simdNode->gtSIMDBaseType != TYP_DOUBLE &&
3112 simdNode->gtSIMDBaseType != TYP_USHORT && simdNode->gtSIMDBaseType != TYP_UBYTE &&
3113 simdNode->gtSIMDBaseType != TYP_SHORT && simdNode->gtSIMDBaseType != TYP_BYTE &&
3114 simdNode->gtSIMDBaseType != TYP_UINT && simdNode->gtSIMDBaseType != TYP_ULONG)
3116 noway_assert(!"SIMD intrinsic with unsupported base type.");
3119 switch (simdNode->gtSIMDIntrinsicID)
3121 case SIMDIntrinsicInit:
3122 genSIMDIntrinsicInit(simdNode);
3125 case SIMDIntrinsicInitN:
3126 genSIMDIntrinsicInitN(simdNode);
3129 case SIMDIntrinsicSqrt:
3130 case SIMDIntrinsicCast:
3131 case SIMDIntrinsicAbs:
3132 genSIMDIntrinsicUnOp(simdNode);
3135 case SIMDIntrinsicConvertToSingle:
3136 case SIMDIntrinsicConvertToInt32:
3137 genSIMDIntrinsic32BitConvert(simdNode);
3140 case SIMDIntrinsicConvertToDouble:
3141 case SIMDIntrinsicConvertToInt64:
3142 genSIMDIntrinsic64BitConvert(simdNode);
3145 case SIMDIntrinsicWidenLo:
3146 case SIMDIntrinsicWidenHi:
3147 genSIMDIntrinsicWiden(simdNode);
3150 case SIMDIntrinsicNarrow:
3151 genSIMDIntrinsicNarrow(simdNode);
3154 case SIMDIntrinsicAdd:
3155 case SIMDIntrinsicSub:
3156 case SIMDIntrinsicMul:
3157 case SIMDIntrinsicDiv:
3158 case SIMDIntrinsicBitwiseAnd:
3159 case SIMDIntrinsicBitwiseAndNot:
3160 case SIMDIntrinsicBitwiseOr:
3161 case SIMDIntrinsicBitwiseXor:
3162 case SIMDIntrinsicMin:
3163 case SIMDIntrinsicMax:
3164 genSIMDIntrinsicBinOp(simdNode);
3167 case SIMDIntrinsicOpEquality:
3168 case SIMDIntrinsicOpInEquality:
3169 case SIMDIntrinsicEqual:
3170 case SIMDIntrinsicLessThan:
3171 case SIMDIntrinsicGreaterThan:
3172 case SIMDIntrinsicLessThanOrEqual:
3173 case SIMDIntrinsicGreaterThanOrEqual:
3174 genSIMDIntrinsicRelOp(simdNode);
3177 case SIMDIntrinsicDotProduct:
3178 genSIMDIntrinsicDotProduct(simdNode);
3181 case SIMDIntrinsicGetItem:
3182 genSIMDIntrinsicGetItem(simdNode);
3185 case SIMDIntrinsicShuffleSSE2:
3186 genSIMDIntrinsicShuffleSSE2(simdNode);
3189 case SIMDIntrinsicSetX:
3190 case SIMDIntrinsicSetY:
3191 case SIMDIntrinsicSetZ:
3192 case SIMDIntrinsicSetW:
3193 genSIMDIntrinsicSetItem(simdNode);
3196 case SIMDIntrinsicUpperSave:
3197 genSIMDIntrinsicUpperSave(simdNode);
3199 case SIMDIntrinsicUpperRestore:
3200 genSIMDIntrinsicUpperRestore(simdNode);
3204 noway_assert(!"Unimplemented SIMD intrinsic.");
3209 #endif // FEATURE_SIMD
3210 #endif //_TARGET_XARCH_