INST3( movaps, "movaps" , 0, IUM_WR, 0, 0, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28))
INST3( movupd, "movupd" , 0, IUM_WR, 0, 0, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10))
INST3( movups, "movups" , 0, IUM_WR, 0, 0, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10))
+INST3( movlhps, "movlhps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x16)) // Move low 64 bits of source into high 64 bits of destination
INST3( shufps, "shufps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0xC6))
INST3( shufpd, "shufpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xC6))
-
+
+INST3( punpckhdq, "punpckhdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6A))
+
// SSE 2 arith
INST3( addps, "addps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x58)) // Add packed singles
INST3( addss, "addss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEFLT(0x58)) // Add scalar singles
INST3( pandn, "pandn" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDF)) // Packed bit-wise AND NOT of two xmm regs
INST3( por, "por" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xEB)) // Packed bit-wise OR of two xmm regs
INST3( pxor, "pxor" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xEF)) // Packed bit-wise XOR of two xmm regs
+
+// Note that the shift immediates share the same encoding between left and right-shift, and are distinguished by the Reg/Opcode,
+// which is handled in emitxarch.cpp.
INST3( psrldq, "psrldq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Shift right logical of xmm reg by given number of bytes
INST3( pslldq, "pslldq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Shift left logical of xmm reg by given number of bytes
+INST3( psllq, "psllq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Packed shift left logical of 64-bit integers
+INST3( psrlq, "psrlq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Packed shift right logical of 64-bit integers
+INST3( pslld, "pslld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift left logical of 32-bit integers
+INST3( psrld, "psrld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift right logical of 32-bit integers
+INST3( psllw, "psllw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), BAD_CODE ) // Packed shift left logical of 16-bit integers
+INST3( psrlw, "psrlw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), BAD_CODE ) // Packed shift right logical of 16-bit integers
+INST3( psrad, "psrad" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift right arithmetic of 32-bit integers
+
INST3( pmaxub, "pmaxub" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDE)) // packed maximum unsigned bytes
INST3( pminub, "pminub" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDA)) // packed minimum unsigned bytes
INST3( pmaxsw, "pmaxsw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xEE)) // packed maximum signed words
INST3( pextrw, "pextrw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xC5)) // Extract 16-bit value into a r32 with zero extended to 32-bits
INST3( pinsrw, "pinsrw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xC4)) // packed insert word
+INST3( punpckhbw, "punpckhbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x68)) // Packed logical (unsigned) widen ubyte to ushort (hi)
+INST3( punpcklbw, "punpcklbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x60)) // Packed logical (unsigned) widen ubyte to ushort (lo)
+INST3( punpckhqdq, "punpckhqdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6D)) // Packed interleave of high quadwords of source and destination
+INST3( punpcklqdq, "punpcklqdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6C)) // Packed interleave of low quadwords of source and destination
+INST3( punpckhwd, "punpckhwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x69)) // Packed logical (unsigned) widen ushort to uint (hi)
+INST3( punpcklwd, "punpcklwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x61)) // Packed logical (unsigned) widen ushort to uint (lo)
+
+INST3( packssdw, "packssdw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6B)) // Pack (narrow) int to short with saturation
+INST3( packsswb, "packsswb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x63)) // Pack (narrow) short to byte with saturation
+INST3( packuswb, "packuswb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x67)) // Pack (narrow) short to unsigned byte with saturation
#endif // !LEGACY_BACKEND
INST3(LAST_SSE2_INSTRUCTION, "LAST_SSE2_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
#ifndef LEGACY_BACKEND
INST3(FIRST_SSE4_INSTRUCTION, "FIRST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
// enum name FP updmode rf wf MR MI RM
-INST3( dpps, "dpps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x40)) // Packed bit-wise AND NOT of two xmm regs
-INST3( dppd, "dppd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x41)) // Packed bit-wise AND NOT of two xmm regs
+INST3( dpps, "dpps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x40)) // Packed dot product of two float vector regs
+INST3( dppd, "dppd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x41)) // Packed dot product of two double vector regs
INST3( insertps, "insertps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x21)) // Insert packed single precision float value
INST3( pcmpeqq, "pcmpeqq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x29)) // Packed compare 64-bit integers for equality
INST3( pcmpgtq, "pcmpgtq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x37)) // Packed compare 64-bit signed integers for greater than
INST3( pmaxsd, "pmaxsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3D)) // packed maximum 32-bit signed integers
INST3( pmaxuw, "pmaxuw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3E)) // packed maximum 16-bit unsigned integers
INST3( pmaxud, "pmaxud" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3F)) // packed maximum 32-bit unsigned integers
+INST3( pmovsxbw, "pmovsxbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x20)) // Packed sign extend byte to short
+INST3( pmovsxwd, "pmovsxwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x23)) // Packed sign extend short to int
+INST3( pmovsxdq, "pmovsxdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x25)) // Packed sign extend int to long
+INST3( packusdw, "packusdw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2B)) // Pack (narrow) int to unsigned short with saturation
+
INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
INST3( vpbroadcastd, "pbroadcastd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x58)) // Broadcast int32 value from reg/memory to entire ymm register
INST3( vpbroadcastq, "pbroadcastq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x59)) // Broadcast int64 value from reg/memory to entire ymm register
INST3( vextractf128, "extractf128" , 0, IUM_WR, 0, 0, SSE3A(0x19), BAD_CODE, BAD_CODE) // Extract 128-bit packed floating point values
+INST3( vextracti128, "extracti128" , 0, IUM_WR, 0, 0, SSE3A(0x39), BAD_CODE, BAD_CODE) // Extract 128-bit packed integer values
INST3( vinsertf128, "insertf128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x18)) // Insert 128-bit packed floating point values
+INST3( vinserti128, "inserti128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x38)) // Insert 128-bit packed integer values
INST3( vzeroupper, "zeroupper" , 0, IUM_WR, 0, 0, 0xC577F8, BAD_CODE, BAD_CODE) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix)
-
+INST3( vperm2i128, "perm2i128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x46)) // Permute 128-bit halves of input register
+INST3( vpermq, "permq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x00)) // Permute 64-bit chunks of input register
INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
#endif // !LEGACY_BACKEND
// enum name FP updmode rf wf R/M,R/M[reg] R/M,icon
result = INS_movaps;
break;
+ case SIMDIntrinsicConvertToSingle:
+ result = INS_cvtdq2ps;
+ break;
+
+ case SIMDIntrinsicConvertToDouble:
+ assert(baseType == TYP_LONG);
+ // Scalar convert; genSIMDLo64BitConvert applies it one element at a time.
+ result = INS_cvtsi2sd;
+ break;
+
+ case SIMDIntrinsicConvertToInt32:
+ case SIMDIntrinsicConvertToUInt32:
+ assert(baseType == TYP_FLOAT);
+ result = INS_cvttps2dq;
+ break;
+
+ case SIMDIntrinsicConvertToInt64:
+ case SIMDIntrinsicConvertToUInt64:
+ assert(baseType == TYP_DOUBLE);
+ // Scalar truncating convert; applied per element by genSIMDLo64BitConvert.
+ result = INS_cvttsd2si;
+ break;
+
+ case SIMDIntrinsicNarrow:
+ // Note that for the integer types the caller must zero the upper bits of
+ // each source element, since the instructions saturate.
+ switch (baseType)
+ {
+ case TYP_INT:
+ case TYP_UINT:
+ if (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4)
+ {
+ result = INS_packusdw;
+ }
+ else
+ {
+ result = INS_packssdw;
+ }
+ break;
+ case TYP_SHORT:
+ case TYP_CHAR:
+ result = INS_packuswb;
+ break;
+ default:
+ assert(!"Invalid baseType for SIMDIntrinsicNarrow");
+ result = INS_invalid;
+ break;
+ }
+ break;
+
+ case SIMDIntrinsicWidenLo:
+ // Some of these have multiple instruction implementations, with one instruction to widen the lo half,
+ // and another to widen the hi half.
+ // For the integer types these are unpack instructions; genSIMDIntrinsicWiden pairs the
+ // source with a zero or sign vector to produce the widened result.
+ switch (baseType)
+ {
+ case TYP_FLOAT:
+ result = INS_cvtps2pd;
+ break;
+ case TYP_INT:
+ case TYP_UINT:
+ result = INS_punpckldq;
+ break;
+ case TYP_SHORT:
+ case TYP_CHAR:
+ result = INS_punpcklwd;
+ break;
+ case TYP_BYTE:
+ case TYP_UBYTE:
+ result = INS_punpcklbw;
+ break;
+ default:
+ assert(!"Invalid baseType for SIMDIntrinsicWidenLo");
+ result = INS_invalid;
+ break;
+ }
+ break;
+
+ case SIMDIntrinsicWidenHi:
+ switch (baseType)
+ {
+ case TYP_FLOAT:
+ // For this case, we actually use the same instruction.
+ result = INS_cvtps2pd;
+ break;
+ case TYP_INT:
+ case TYP_UINT:
+ result = INS_punpckhdq;
+ break;
+ case TYP_SHORT:
+ case TYP_CHAR:
+ result = INS_punpckhwd;
+ break;
+ case TYP_BYTE:
+ case TYP_UBYTE:
+ result = INS_punpckhbw;
+ break;
+ default:
+ assert(!"Invalid baseType for SIMDIntrinsicWidenHi");
+ result = INS_invalid;
+ break;
+ }
+ break;
+
case SIMDIntrinsicShiftLeftInternal:
- // base type doesn't matter since the entire vector is shifted left
- result = INS_pslldq;
+ // TYP_SIMD16 selects the whole-register byte shift; integer base types select per-element bit shifts.
+ switch (baseType)
+ {
+ case TYP_SIMD16:
+ // For SSE2, entire vector is shifted, for AVX2, 16-byte chunks are shifted.
+ result = INS_pslldq;
+ break;
+ case TYP_UINT:
+ case TYP_INT:
+ result = INS_pslld;
+ break;
+ case TYP_SHORT:
+ case TYP_CHAR:
+ case TYP_USHORT:
+ result = INS_psllw;
+ break;
+ default:
+ assert(!"Invalid baseType for SIMDIntrinsicShiftLeftInternal");
+ result = INS_invalid;
+ break;
+ }
break;
case SIMDIntrinsicShiftRightInternal:
- // base type doesn't matter since the entire vector is shifted right
- result = INS_psrldq;
+ // Mirrors the left-shift case above: byte shift for TYP_SIMD16, bit shifts per element otherwise.
+ switch (baseType)
+ {
+ case TYP_SIMD16:
+ // For SSE2, entire vector is shifted, for AVX2, 16-byte chunks are shifted.
+ result = INS_psrldq;
+ break;
+ case TYP_UINT:
+ case TYP_INT:
+ result = INS_psrld;
+ break;
+ case TYP_SHORT:
+ case TYP_CHAR:
+ case TYP_USHORT:
+ result = INS_psrlw;
+ break;
+ default:
+ assert(!"Invalid baseType for SIMDIntrinsicShiftRightInternal");
+ result = INS_invalid;
+ break;
+ }
break;
case SIMDIntrinsicUpperSave:
{
// There is no guarantee that upper bits of op1Reg are zero.
// We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes.
+ // Pass TYP_SIMD16 so we get the whole-register byte shifts (pslldq/psrldq).
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
- ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
}
else
ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
inst_RV_RV(ins, tmpReg, op1hiReg, TYP_INT, emitTypeSize(TYP_INT));
- ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, EA_16BYTE, tmpReg, 4); // shift left by 4 bytes
ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType);
}
unsigned int baseTypeSize = genTypeSize(baseType);
- instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
// We will first consume the list items in execution (left to right) order,
// and record the registers.
genProduceReg(simdNode);
}
+//----------------------------------------------------------------------------------
+// genSIMDIntrinsic32BitConvert: Generate code for 32-bit SIMD Convert (int/uint <-> float)
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Return Value:
+// None.
+//
+void CodeGen::genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode)
+{
+ SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID;
+ assert((intrinsicID == SIMDIntrinsicConvertToSingle) || (intrinsicID == SIMDIntrinsicConvertToInt32) ||
+ (intrinsicID == SIMDIntrinsicConvertToUInt32));
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types targetType = simdNode->TypeGet();
+
+ regNumber op1Reg = genConsumeReg(op1);
+ instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+ if (intrinsicID == SIMDIntrinsicConvertToSingle && baseType == TYP_UINT)
+ {
+ // uint -> float has no direct instruction: split each element into its upper and lower
+ // 16 bits, convert each half exactly, and add the two partial results.
+ regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT);
+ regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+ regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ assert(tmpReg != op1Reg && tmpReg2 != op1Reg);
+
+ // We will generate the following:
+ // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2)
+ // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg)
+ // vpsrld targetReg, 16 (get upper 16 bits of src and put it into targetReg)
+ // vpslld tmpReg2, 16
+ // vpsrld tmpReg2, 16 (get lower 16 bits of src and put it into tmpReg2)
+ // mov tmpIntReg, 0x5300000053000000
+ // vmovd tmpReg, tmpIntReg
+ // vpbroadcastd tmpReg, tmpReg (build mask for converting upper 16 bits of src)
+ // vorps targetReg, tmpReg
+ // vsubps targetReg, tmpReg (convert upper 16 bits of src and put it into targetReg)
+ // vcvtdq2ps tmpReg2, tmpReg2 (convert lower 16 bits of src and put it into tmpReg2)
+ // vaddps targetReg, tmpReg2 (add upper 16 bits and lower 16 bits)
+ inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(targetType));
+ if (targetReg != op1Reg)
+ {
+ inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(targetType));
+ }
+
+ // prepare upper 16 bits
+ getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), targetReg, 16);
+
+ // prepare lower 16 bits
+ getEmitter()->emitIns_R_I(INS_pslld, emitActualTypeSize(targetType), tmpReg2, 16);
+ getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), tmpReg2, 16);
+
+// prepare mask
+// 0x53000000 is 2^39 as an IEEE-754 float; or-ing the (< 2^16) upper halfword into the mantissa
+// yields 2^39 + hi*2^16, so the subps below leaves exactly hi*2^16 in each lane.
+#ifdef _TARGET_AMD64_
+ getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X5300000053000000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
+#else
+ if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
+ {
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X53000000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+ }
+ else
+ {
+ // No 64-bit mov on x86 SSE2: zero tmpReg and insert the 16-bit constant into words 1 and 3,
+ // building the 0x53000000 pattern in both low dwords.
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X00005300);
+ inst_RV_RV(INS_pxor, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
+ getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 1);
+ getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 3);
+ }
+#endif
+ if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
+ {
+ inst_RV_RV(INS_vpbroadcastd, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
+ }
+ else
+ {
+ // movlhps replicates the low 64 bits (which already hold the mask in both dwords) to the high half.
+ inst_RV_RV(INS_movlhps, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
+ }
+
+ // convert upper 16 bits
+ inst_RV_RV(INS_orps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
+ inst_RV_RV(INS_subps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
+
+ // convert lower 16 bits
+ inst_RV_RV(ins, tmpReg2, tmpReg2, targetType, emitActualTypeSize(targetType));
+
+ // add lower 16 bits and upper 16 bits
+ inst_RV_RV(INS_addps, targetReg, tmpReg2, targetType, emitActualTypeSize(targetType));
+ }
+ else
+ {
+ // Direct conversion (cvtdq2ps / cvttps2dq) handles the remaining cases.
+ inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
+ }
+ genProduceReg(simdNode);
+}
+
+//----------------------------------------------------------------------------------
+// genSIMDLo64BitConvert: Generate code to convert lower-most 64-bit item (long <--> double)
+//
+// Arguments:
+// intrinsicID the SIMD intrinsic ID
+// simdType the SIMD node type
+// baseType the base type of value to be converted
+// tmpReg the tmp reg
+// tmpIntReg the tmp integer reg
+// targetReg the target reg
+//
+// Return Value:
+// None.
+//
+void CodeGen::genSIMDLo64BitConvert(SIMDIntrinsicID intrinsicID,
+ var_types simdType,
+ var_types baseType,
+ regNumber tmpReg,
+ regNumber tmpIntReg,
+ regNumber targetReg)
+{
+ instruction ins = getOpForSIMDIntrinsic(intrinsicID, baseType);
+ if (intrinsicID == SIMDIntrinsicConvertToDouble)
+ {
+ // Move the low 64-bit integer out of tmpReg into tmpIntReg, then use the scalar
+ // convert (cvtsi2sd) to produce the double in the low bits of targetReg.
+ // Note that for mov_xmm2i, the int register is always in the reg2 position
+ inst_RV_RV(INS_mov_xmm2i, tmpReg, tmpIntReg, TYP_LONG);
+ inst_RV_RV(ins, targetReg, tmpIntReg, baseType, emitActualTypeSize(baseType));
+ }
+ else
+ {
+ // Scalar truncating convert (cvttsd2si) of the low double into tmpIntReg, then
+ // move the integer result back into the low 64 bits of targetReg.
+ inst_RV_RV(ins, tmpIntReg, tmpReg, baseType, emitActualTypeSize(baseType));
+ inst_RV_RV(INS_mov_i2xmm, targetReg, tmpIntReg, TYP_LONG);
+ }
+}
+
+//----------------------------------------------------------------------------------
+// genSIMDIntrinsic64BitConvert: Generate code for 64-bit SIMD Convert (long/ulong <-> double)
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Notes:
+// There are no instructions for converting to/from 64-bit integers, so for these we
+// do the conversion an element at a time.
+//
+void CodeGen::genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode)
+{
+ SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID;
+ assert((intrinsicID == SIMDIntrinsicConvertToDouble) || (intrinsicID == SIMDIntrinsicConvertToInt64) ||
+ (intrinsicID == SIMDIntrinsicConvertToUInt64));
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types simdType = simdNode->TypeGet();
+ regNumber op1Reg = genConsumeReg(op1);
+ regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT);
+ regNumber tmpReg;
+ regNumber tmpReg2;
+ regNumber tmpReg3;
+ InstructionSet iset = compiler->getSIMDInstructionSet();
+
+// Temp register needs vary by path: the x86 long->double path needs three float temps,
+// the AVX and ulong paths need two, and the remaining SSE2 paths need one.
+#ifdef _TARGET_X86_
+ if (baseType == TYP_LONG)
+ {
+ tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+ tmpReg2 = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+ tmpReg3 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ assert(tmpReg != op1Reg && tmpReg2 != op1Reg && tmpReg3 != op1Reg);
+ }
+ else
+#endif
+ if (iset == InstructionSet_AVX || (baseType == TYP_ULONG))
+ {
+ tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+ tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ tmpReg3 = REG_NA;
+ assert(tmpReg != op1Reg && tmpReg2 != op1Reg);
+ }
+ else
+ {
+ tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ assert(tmpReg != op1Reg);
+ tmpReg2 = REG_NA;
+ tmpReg3 = REG_NA;
+ }
+
+ if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_ULONG))
+ {
+ // We will generate the following
+ // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2)
+ // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg)
+ // vpsrlq targetReg, 32 (get upper 32 bits of src and put it into targetReg)
+ // vpsllq tmpReg2, 32
+ // vpsrlq tmpReg2, 32 (get lower 32 bits of src and put it into tmpReg2)
+ // mov tmpIntReg, 0x4530000000000000
+ // vmovd tmpReg, tmpIntReg
+ // vpbroadcastq tmpReg, tmpReg (build mask for upper 32 bits of src)
+ // vorpd targetReg, tmpReg
+ // vsubpd targetReg, tmpReg (convert upper 32 bits of src and put it into targetReg)
+ // mov tmpIntReg, 0x4330000000000000
+ // vmovd tmpReg, tmpIntReg
+ // vpbroadcastq tmpReg, tmpReg (build mask for lower 32 bits of src)
+ // vorpd tmpReg2, tmpReg
+ // vsubpd tmpReg2, tmpReg (convert lower 32 bits of src and put it into tmpReg2)
+ // vaddpd targetReg, tmpReg2 (add upper 32 bits and lower 32 bits together)
+ inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType));
+ if (targetReg != op1Reg)
+ {
+ inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(simdType));
+ }
+
+ // prepare upper 32 bits
+ getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32);
+
+ // prepare lower 32 bits
+ getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32);
+ getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32);
+
+// prepare mask for converting upper 32 bits
+// 0x4530000000000000 is 2^84 as an IEEE-754 double; the or/sub pair below converts
+// the (shifted) upper 32 bits of each element exactly.
+#ifdef _TARGET_AMD64_
+ getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4530000000000000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
+#else
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+ getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
+#endif
+ if (iset == InstructionSet_AVX)
+ {
+ inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+ else
+ {
+ inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ // convert upper 32 bits
+ inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+
+// prepare mask for converting lower 32 bits
+// 0x4330000000000000 is 2^52; or-ing a 32-bit value into the mantissa and subtracting
+// 2^52 yields that value exactly as a double.
+#ifdef _TARGET_AMD64_
+ getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4330000000000000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
+#else
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+ getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
+#endif
+ if (iset == InstructionSet_AVX)
+ {
+ inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+ else
+ {
+ inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ // convert lower 32 bits
+ inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+
+ // add lower 32 bits and upper 32 bits
+ inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType));
+ }
+ else if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_LONG))
+ {
+#ifdef _TARGET_AMD64_
+ instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
+ instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
+
+ if (iset == InstructionSet_AVX)
+ {
+ // Extract the high 128 bits of the YMM source (vextracti128 with imm 0x01)
+ getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01);
+
+ // Put v[3] (the high-order element) in tmpReg2 and convert it.
+ inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2);
+
+ // Shift the resulting 64-bits left.
+ getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
+
+ // Convert v[2], in the lo bits of tmpReg.
+ // For the convert to double, the convert preserves the upper bits in tmpReg2.
+ // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits.
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg2);
+ }
+
+ // Put v[1] in tmpReg.
+ inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
+
+ // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it.
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
+
+ // Shift the resulting 64-bits left.
+ getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
+
+ // Convert the lo 64-bits into targetReg
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, tmpReg);
+
+ // Merge or copy the results (only at this point are we done with op1Reg).
+ if (tmpReg != targetReg)
+ {
+ inst_RV_RV(INS_movaps, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ if (iset == InstructionSet_AVX)
+ {
+ getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg2, 0x01);
+ }
+#else
+ // x86 has no 64-bit GPR, so instead: strip the sign, convert the magnitude via the
+ // 2^52/2^84 or/sub trick (as in the ulong path), then re-apply the sign bit.
+ // get the sign bit and put it in tmpReg3
+ inst_RV_RV(INS_movdqu, tmpReg3, op1Reg, baseType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg3, 63);
+ getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg3, 63);
+
+ // get the absolute value of src and put it into tmpReg2 and targetReg
+ // psrad with a count >= 32 saturates: each dword becomes a copy of its sign bit,
+ // so tmpReg holds the per-element sign mask for the xor/sub negation below.
+ inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(simdType), tmpReg, op1Reg, SHUFFLE_WWYY);
+ getEmitter()->emitIns_R_I(INS_psrad, emitActualTypeSize(simdType), tmpReg, 32);
+ inst_RV_RV(INS_pxor, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_psubq, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_movdqu, targetReg, tmpReg2, baseType, emitActualTypeSize(simdType));
+
+ // prepare upper 32 bits
+ getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32);
+
+ // prepare lower 32 bits
+ getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32);
+ getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32);
+
+ // prepare mask for converting upper 32 bits
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+ getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
+
+ if (iset == InstructionSet_AVX)
+ {
+ inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+ else
+ {
+ inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ // convert upper 32 bits
+ inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+
+ // prepare mask for converting lower 32 bits
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000);
+ inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+ getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
+
+ if (iset == InstructionSet_AVX)
+ {
+ inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+ else
+ {
+ inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ // convert lower 32 bits
+ inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+ inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+
+ // add lower 32 bits and upper 32 bits
+ inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType));
+
+ // add sign bit
+ inst_RV_RV(INS_por, targetReg, tmpReg3, simdType, emitActualTypeSize(simdType));
+#endif
+ }
+ else
+ {
+ instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
+ instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
+
+ if (iset == InstructionSet_AVX)
+ {
+ // Extract the high 128 bits of the YMM source (vextractf128 with imm 0x01)
+ getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, op1Reg, 0x01);
+
+ // Put v[3] (the high-order element) in tmpReg2 and convert it.
+ inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2);
+
+ // Shift the resulting 64-bits left.
+ getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
+
+ // Convert v[2], in the lo bits of tmpReg.
+ // For the convert to double, the convert preserves the upper bits in tmpReg2.
+ // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits.
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
+ inst_RV_RV(INS_por, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+ }
+
+ // Put v[1] in tmpReg.
+ inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType));
+ getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
+
+ // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it.
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
+
+ // Shift the resulting 64-bits left.
+ getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
+
+ // Convert the lo 64-bits into targetReg
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, targetReg);
+
+ // Merge or copy the results (only at this point are we done with op1Reg).
+ assert(tmpReg != targetReg);
+ inst_RV_RV(INS_por, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+ if (iset == InstructionSet_AVX)
+ {
+ getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, targetReg, tmpReg2, 0x01);
+ }
+ }
+ genProduceReg(simdNode);
+}
+
+//--------------------------------------------------------------------------------
+// genSIMDExtractUpperHalf: Generate code to extract the upper half of a SIMD register
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Notes:
+// This is used for the WidenHi intrinsic to extract the upper half.
+// On SSE*, this is 8 bytes, and on AVX2 it is 16 bytes.
+//
+void CodeGen::genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg)
+{
+ var_types simdType = simdNode->TypeGet();
+ emitAttr emitSize = emitActualTypeSize(simdType);
+ if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
+ {
+ // vextractf128/vextracti128 with imm 0x01 extracts the upper 128-bit lane.
+ instruction extractIns = varTypeIsFloating(simdNode->gtSIMDBaseType) ? INS_vextractf128 : INS_vextracti128;
+ getEmitter()->emitIns_R_R_I(extractIns, EA_32BYTE, tgtReg, srcReg, 0x01);
+ }
+ else
+ {
+ // psrldq by 8 bytes brings the upper half down to the low 64 bits of tgtReg.
+ instruction shiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
+ if (tgtReg != srcReg)
+ {
+ inst_RV_RV(ins_Copy(simdType), tgtReg, srcReg, simdType, emitSize);
+ }
+ getEmitter()->emitIns_R_I(shiftIns, emitSize, tgtReg, 8);
+ }
+}
+
+//--------------------------------------------------------------------------------
+// genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Notes:
+// The Widen intrinsics are broken into separate intrinsics for the two results.
+//
+void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode)
+{
+ assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) ||
+ (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi));
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types simdType = simdNode->TypeGet();
+ InstructionSet iset = compiler->getSIMDInstructionSet();
+
+ genConsumeOperands(simdNode);
+ regNumber op1Reg = op1->gtRegNum;
+ regNumber srcReg = op1Reg;
+ emitAttr emitSize = emitActualTypeSize(simdType);
+ instruction widenIns = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+
+ if (baseType == TYP_FLOAT)
+ {
+ // Float widens via cvtps2pd on the appropriate half; WidenHi first moves
+ // the upper half into position so the same instruction can be used.
+ if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)
+ {
+ genSIMDExtractUpperHalf(simdNode, srcReg, targetReg);
+ srcReg = targetReg;
+ }
+ inst_RV_RV(widenIns, targetReg, srcReg, simdType);
+ }
+ else
+ {
+ // We will generate the following on AVX:
+ // vpermq targetReg, op1Reg, 0xd4|0xe8
+ // vpxor tmpReg, tmpReg
+ // vpcmpgt[b|w|d] tmpReg, targetReg (if basetype is signed)
+ // vpunpck[l|h][bw|wd|dq] targetReg, tmpReg
+ regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ assert(tmpReg != op1Reg);
+
+ if (iset == InstructionSet_AVX)
+ {
+ // permute op1Reg and put it into targetReg
+ // NOTE(review): 0xd4/0xe8 rearrange the source qwords so the in-lane unpack
+ // below sees the right elements in each 128-bit lane — verify against the
+ // VPERMQ imm8 encoding.
+ unsigned ival = 0xd4;
+ if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)
+ {
+ ival = 0xe8;
+ }
+ getEmitter()->emitIns_R_R_I(INS_vpermq, emitSize, targetReg, op1Reg, ival);
+ }
+ else if (targetReg != op1Reg)
+ {
+ inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize);
+ }
+
+ // Interleave with zero (unsigned) or with the sign mask produced by pcmpgt (signed).
+ genSIMDZero(simdType, baseType, tmpReg);
+ if (!varTypeIsUnsigned(baseType))
+ {
+ instruction compareIns = getOpForSIMDIntrinsic(SIMDIntrinsicGreaterThan, baseType);
+ inst_RV_RV(compareIns, tmpReg, targetReg, simdType, emitSize);
+ }
+ inst_RV_RV(widenIns, targetReg, tmpReg, simdType);
+ }
+ genProduceReg(simdNode);
+}
+
+//--------------------------------------------------------------------------------
+// genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations
+//
+// Arguments:
+// simdNode - The GT_SIMD node
+//
+// Notes:
+// This intrinsic takes two arguments. The first operand is narrowed to produce the
+// lower elements of the results, and the second operand produces the high elements.
+//
+void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode)
+{
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow);
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ GenTree* op2 = simdNode->gtGetOp2();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ regNumber targetReg = simdNode->gtRegNum;
+ assert(targetReg != REG_NA);
+ var_types simdType = simdNode->TypeGet();
+ emitAttr emitSize = emitTypeSize(simdType);
+ InstructionSet iset = compiler->getSIMDInstructionSet();
+
+ genConsumeOperands(simdNode);
+ regNumber op1Reg = op1->gtRegNum;
+ regNumber op2Reg = op2->gtRegNum;
+ if (baseType == TYP_DOUBLE)
+ {
+ // double -> float: convert each source, then combine the two results.
+ regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+
+ inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType);
+ inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType);
+ // Now insert the high-order result (in tmpReg) into the upper half of targetReg.
+ if (compiler->canUseAVX())
+ {
+ getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01);
+ }
+ else
+ {
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, tmpReg, SHUFFLE_YXYX);
+ }
+ }
+ else if (varTypeIsLong(baseType))
+ {
+ if (iset == InstructionSet_AVX)
+ {
+ // We have 8 long elements, 0-3 in op1Reg, 4-7 in op2Reg.
+ // We will generate the following:
+ // vextracti128 tmpReg, op1Reg, 1 (extract elements 2 and 3 into tmpReg)
+ // vextracti128 tmpReg2, op2Reg, 1 (extract elements 6 and 7 into tmpReg2)
+ // vinserti128 tmpReg, tmpReg2, 1 (insert elements 6 and 7 into the high half of tmpReg)
+ // mov tmpReg2, op1Reg
+ // vinserti128 tmpReg2, op2Reg, 1 (insert elements 4 and 5 into the high half of tmpReg2)
+ // pshufd tmpReg, tmpReg, XXZX ( - - 7L 6L - - 3L 2L) in tmpReg
+ // pshufd tgtReg, tmpReg2, XXZX ( - - 5L 4L - - 1L 0L) in tgtReg
+ // punpcklqdq tgtReg, tmpReg
+ regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+ regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01);
+ getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg2, op2Reg, 0x01);
+ getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg, tmpReg2, 0x01);
+ inst_RV_RV(ins_Copy(simdType), tmpReg2, op1Reg, simdType, emitSize);
+ getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg2, op2Reg, 0x01);
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, tmpReg, SHUFFLE_XXZX);
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, tmpReg2, SHUFFLE_XXZX);
+ inst_RV_RV_RV(INS_punpcklqdq, targetReg, targetReg, tmpReg, emitSize);
+ }
+ else
+ {
+ // We will generate the following:
+ // pshufd targetReg, op1Reg, ZXXX (extract the low 32-bits into the upper two 32-bit elements)
+ // psrldq targetReg, 8 (shift them right to get zeros in the high elements)
+ // pshufd tmpReg, op2Reg, XXZX (same as above, but extract into the lower two 32-bit elements)
+ // pslldq tmpReg, 8 (now shift these left to get zeros in the low elements)
+ // por targetReg, tmpReg
+ regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+ instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
+ instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
+ emitAttr emitSize = emitTypeSize(simdType);
+
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, op1Reg, SHUFFLE_ZXXX);
+ getEmitter()->emitIns_R_I(shiftRightIns, emitSize, targetReg, 8);
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, op2Reg, SHUFFLE_XXZX);
+ getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, 8);
+ inst_RV_RV(INS_por, targetReg, tmpReg, simdType);
+ }
+ }
+ else
+ {
+ // We will generate the following:
+ // mov targetReg, op1Reg
+ // mov tmpReg, op2Reg
+ // psll? targetReg, shiftCount
+ // psrl? targetReg, shiftCount
+ // psll? tmpReg, shiftCount
+ // psrl? tmpReg, shiftCount
+ // <pack> targetReg, tmpReg
+ // Where shiftCount is the size of the target baseType (i.e. half the size of the source baseType),
+ // and <pack> is the appropriate instruction to pack the result (note that we have to truncate to
+ // get CLR type semantics; otherwise it will saturate).
+ //
+ int shiftCount = genTypeSize(baseType) * (BITS_IN_BYTE / 2);
+ instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+ instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+
+ if (iset == InstructionSet_AVX)
+ {
+ regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+ regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+
+ // The AVX instructions generally operate on "lanes", so we have to permute the
+ // inputs so that the destination register has the low 128-bit halves of the two
+ // inputs, and 'tmpReg' has the high 128-bit halves of the two inputs.
+ getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg2, op1Reg, op2Reg, 0x20);
+ getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg, op1Reg, op2Reg, 0x31);
+ getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg2, shiftCount);
+ getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg2, shiftCount);
+ getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount);
+ getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg, shiftCount);
+ inst_RV_RV_RV(ins, targetReg, tmpReg2, tmpReg, emitActualTypeSize(simdType));
+ }
+ else
+ {
+ regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+
+ inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize);
+ inst_RV_RV(ins_Copy(simdType), tmpReg, op2Reg, simdType, emitSize);
+
+ // There is no logical (zeroing) right-shift for 32-bit elements available
+ // here on SSE2, so use the arithmetic shift instead; the subsequent pack
+ // truncates, so the difference in the high bits does not affect the result.
+ // NOTE(review): presumably that is the rationale for psrad — confirm.
+ instruction tmpShiftRight = shiftRightIns;
+ if ((baseType == TYP_INT || baseType == TYP_UINT) && iset == InstructionSet_SSE2)
+ {
+ tmpShiftRight = INS_psrad;
+ }
+
+ getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, targetReg, shiftCount);
+ getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, targetReg, shiftCount);
+ getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount);
+ getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, tmpReg, shiftCount);
+ inst_RV_RV(ins, targetReg, tmpReg, simdType);
+ }
+ }
+ genProduceReg(simdNode);
+}
+
//--------------------------------------------------------------------------------
// genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations
// add, sub, mul, bit-wise And, AndNot and Or.
// Extract first and third double word results from tmpReg
// tmpReg = shuffle(0,0,2,0) of tmpReg
- getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, 0x08);
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, SHUFFLE_XXZX);
// targetReg[63:0] = op1[0] * op2[0]
// targetReg[127:64] = op1[2] * op2[2]
// Extract first and third double word results from targetReg
// targetReg = shuffle(0,0,2,0) of targetReg
- getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, 0x08);
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, SHUFFLE_XXZX);
// pack the results into a single vector
inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
// These are 16 byte operations, so we subtract from 16 bytes, not the vector register length.
unsigned shiftCount = 16 - simdNode->gtSIMDSize;
assert(shiftCount != 0);
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
- ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
}
if (byteShiftCnt != 0)
{
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+ instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt);
}
}
inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType));
}
- ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt);
}
else
genSIMDIntrinsicUnOp(simdNode);
break;
+ case SIMDIntrinsicConvertToSingle:
+ case SIMDIntrinsicConvertToInt32:
+ case SIMDIntrinsicConvertToUInt32:
+ genSIMDIntrinsic32BitConvert(simdNode);
+ break;
+
+ case SIMDIntrinsicConvertToDouble:
+ case SIMDIntrinsicConvertToInt64:
+ case SIMDIntrinsicConvertToUInt64:
+ genSIMDIntrinsic64BitConvert(simdNode);
+ break;
+
+ case SIMDIntrinsicWidenLo:
+ case SIMDIntrinsicWidenHi:
+ genSIMDIntrinsicWiden(simdNode);
+ break;
+
+ case SIMDIntrinsicNarrow:
+ genSIMDIntrinsicNarrow(simdNode);
+ break;
+
case SIMDIntrinsicAdd:
case SIMDIntrinsicSub:
case SIMDIntrinsicMul: