From c767b4d9b330c9e8a9c51e92aa01f2fceedec181 Mon Sep 17 00:00:00 2001 From: Fei Peng Date: Thu, 1 Mar 2018 13:45:45 -0800 Subject: [PATCH] Implement SSE4.1 insert and extract --- src/jit/emitxarch.cpp | 39 +++++++++++++++++++++++++++++---- src/jit/hwintrinsiccodegenxarch.cpp | 43 +++++++++++++++++++++++++++++++++++++ src/jit/hwintrinsiclistxarch.h | 22 +++++++++++-------- src/jit/hwintrinsicxarch.cpp | 33 +++++++++++++++++++--------- src/jit/instrsxarch.h | 7 ++++++ src/jit/lsraxarch.cpp | 14 ++++++++++++ src/jit/namedintrinsiclist.h | 2 +- 7 files changed, 136 insertions(+), 24 deletions(-) diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index 64a7635..4ffe342 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -155,7 +155,10 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins) case INS_phsubd: case INS_phsubsw: case INS_phsubw: + case INS_pinsrb: case INS_pinsrw: + case INS_pinsrd: + case INS_pinsrq: case INS_pmaddubsw: case INS_pmaddwd: case INS_pmaxsb: @@ -383,9 +386,16 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr) // size specification (128 vs. 256 bits) and the operand size specification (32 vs. 64 bits), where both are // required, the instruction must be created with the register size attribute (EA_16BYTE or EA_32BYTE), // and here we must special case these by the opcode. - if (ins == INS_vpermq || ins == INS_vpsrlvq || ins == INS_vpsllvq) + switch (ins) { - return true; + case INS_vpermq: + case INS_vpsrlvq: + case INS_vpsllvq: + case INS_pinsrq: + case INS_pextrq: + return true; + default: + break; } #endif // !LEGACY_BACKEND #ifdef _TARGET_AMD64_ @@ -1084,7 +1094,7 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id) if (!IsSSEOrAVXInstruction(ins) || ins == INS_mov_xmm2i || ins == INS_cvttsd2si #ifndef LEGACY_BACKEND || ins == INS_cvttss2si || ins == INS_cvtsd2si || ins == INS_cvtss2si || ins == INS_pmovmskb || - ins == INS_pextrw + ins == INS_pextrw || ins == INS_pextrb || ins == INS_pextrd || ins == INS_pextrq || ins == INS_extractps #endif // !LEGACY_BACKEND ) { @@ -4034,6 +4044,12 @@ void emitter::emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regN sz += emitGetRexPrefixSize(ins); } + if ((ins == INS_pextrq || ins == INS_pinsrq) && !UseVEXEncoding()) + { + assert(UseSSE4()); + sz += 1; + } + id->idIns(ins); id->idInsFmt(IF_RRW_RRW_CNS); id->idReg1(reg1); @@ -5519,11 +5535,26 @@ static bool isSseShift(instruction ins) } } +static bool isSSEExtract(instruction ins) +{ + switch (ins) + { + case INS_pextrb: + case INS_pextrw: + case INS_pextrd: + case INS_pextrq: + case INS_extractps: + return true; + default: + return false; + } +} + void emitter::emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int ival) { // TODO-XARCH refactoring emitIns_R_R_I to handle SSE2/AVX2 shift as well as emitIns_R_I bool isShift = isSseShift(ins); - if (UseVEXEncoding() && !isShift) + if (isSSEExtract(ins) || (UseVEXEncoding() && !isShift)) { emitIns_R_R_I(ins, attr, reg, reg1, ival); } diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp index b38c2bf..0864552 100644 --- a/src/jit/hwintrinsiccodegenxarch.cpp +++ b/src/jit/hwintrinsiccodegenxarch.cpp @@ -119,6 +119,11 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) } else if (Compiler::isImmHWIntrinsic(intrinsicID, op2)) { + if (intrinsicID == NI_SSE2_Extract) + { + // extract instructions return to GP-registers, so it needs int size as the emitsize + simdSize = emitTypeSize(TYP_INT); + } auto emitSwCase = [&](unsigned i) { emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, (int)i); }; @@ -1116,6 +1121,44 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node) break; } + case NI_SSE41_Extract: + { + regNumber tmpTargetReg = REG_NA; + instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType); + if (baseType == TYP_FLOAT) + { + tmpTargetReg = node->ExtractTempReg(); + } + auto emitSwCase = [&](unsigned i) { + if (baseType == TYP_FLOAT) + { + // extract instructions return to GP-registers, so it needs int size as the emitsize + emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), op1Reg, tmpTargetReg, (int)i); + emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg); + } + else + { + emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), targetReg, op1Reg, (int)i); + } + }; + + if (op2->IsCnsIntOrI()) + { + ssize_t ival = op2->AsIntCon()->IconValue(); + emitSwCase((unsigned)ival); + } + else + { + // We emit a fallback case for the scenario when the imm-op is not a constant. This should + // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it + // can also occur if the consumer calls it directly and just doesn't pass a constant value. + regNumber baseReg = node->ExtractTempReg(); + regNumber offsReg = node->GetSingleTempReg(); + genHWIntrinsicJumpTableFallback(intrinsicID, op2->gtRegNum, baseReg, offsReg, emitSwCase); + } + break; + } + default: unreached(); break; diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index e01a00c..4335799 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -138,7 +138,7 @@ HARDWARE_INTRINSIC(SSE2_CompareEqualOrderedScalar, "CompareEqu HARDWARE_INTRINSIC(SSE2_CompareEqualScalar, "CompareEqualScalar", SSE2, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE2_CompareEqualUnorderedScalar, "CompareEqualUnorderedScalar", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_CompareGreaterThan, "CompareGreaterThan", SSE2, 6, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrderedScalar, "CompareGreaterThanOrderedScalar", SSE2, -1, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantic +HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrderedScalar, "CompareGreaterThanOrderedScalar", SSE2, -1, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_CompareGreaterThanScalar, "CompareGreaterThanScalar", SSE2, 6, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE2_CompareGreaterThanUnorderedScalar, "CompareGreaterThanUnorderedScalar", SSE2, -1, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrEqual, "CompareGreaterThanOrEqual", SSE2, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -179,8 +179,8 @@ HARDWARE_INTRINSIC(SSE2_ConvertToUInt64, "ConvertToU HARDWARE_INTRINSIC(SSE2_ConvertToVector128Double, "ConvertToVector128Double", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2pd, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2pd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128Double, "ConvertScalarToVector128Double", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd, INS_invalid, INS_cvtsi2sd, INS_invalid, INS_cvtss2sd, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SecondArgMaybe64Bit|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_ConvertToVector128Int32, "ConvertToVector128Int32", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_cvtpd2dq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128Int32, "ConvertScalarToVector128Int32", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mov_i2xmm, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantic|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_ConvertToVector128Int32WithTruncation, "ConvertToVector128Int32WithTruncation", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttps2dq, INS_cvttpd2dq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantic +HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128Int32, "ConvertScalarToVector128Int32", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mov_i2xmm, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_ConvertToVector128Int32WithTruncation, "ConvertToVector128Int32WithTruncation", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttps2dq, INS_cvttpd2dq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128Int64, "ConvertScalarToVector128Int64", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mov_i2xmm, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_64BitOnly|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_ConvertToVector128Single, "ConvertToVector128Single", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2ps, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128Single, "ConvertScalarToVector128Single", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsd2ss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) @@ -188,6 +188,8 @@ HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128UInt32, "ConvertSca HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128UInt64, "ConvertScalarToVector128UInt64", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mov_i2xmm, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_64BitOnly|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_Divide, "Divide", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2_DivideScalar, "DivideScalar", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2_Extract, "Extract", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_pextrw, INS_pextrw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_Insert, "Insert", SSE2, -1, 16, 3, {INS_invalid, INS_invalid, INS_pinsrw, INS_pinsrw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(SSE2_LoadAlignedVector128, "LoadAlignedVector128", SSE2, -1, 16, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_invalid, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_LoadFence, "LoadFence", SSE2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_LoadScalarVector128, "LoadScalarVector128", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_movd, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) @@ -272,12 +274,14 @@ HARDWARE_INTRINSIC(SSE41_BlendVariable, "BlendVaria HARDWARE_INTRINSIC(SSE41_Ceiling, "Ceiling", SSE41, 10, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE41_CeilingScalar, "CeilingScalar", SSE41, 10, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE41_CompareEqual, "CompareEqual", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int16, "ConvertToVector128Int16", SSE41, -1, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int32, "ConvertToVector128Int32", SSE41, -1, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int64, "ConvertToVector128Int64", SSE41, -1, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int16, "ConvertToVector128Int16", SSE41, -1, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int32, "ConvertToVector128Int32", SSE41, -1, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int64, "ConvertToVector128Int64", SSE41, -1, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE41_DotProduct, "DotProduct", SSE41, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(SSE41_Extract, "Extract", SSE41, -1, 16, 2, {INS_pextrb, INS_pextrb, INS_invalid, INS_invalid, INS_pextrd, INS_pextrd, INS_pextrq, INS_pextrq, INS_extractps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MultiIns|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE41_Floor, "Floor", SSE41, 9, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE41_FloorScalar, "FloorScalar", SSE41, 9, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE41_Insert, "Insert", SSE41, -1, 16, 3, {INS_pinsrb, INS_pinsrb, INS_invalid, INS_invalid, INS_pinsrd, INS_pinsrd, INS_pinsrq, INS_pinsrq, INS_insertps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_SecondArgMaybe64Bit) HARDWARE_INTRINSIC(SSE41_LoadAlignedVector128NonTemporal, "LoadAlignedVector128NonTemporal", SSE41, -1, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE41_Max, "Max", SSE41, -1, 16, 2, {INS_pmaxsb, INS_invalid, INS_invalid, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE41_Min, "Min", SSE41, -1, 16, 2, {INS_pminsb, INS_invalid, INS_invalid, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -351,9 +355,9 @@ HARDWARE_INTRINSIC(AVX_Store, "Store", HARDWARE_INTRINSIC(AVX_StoreAligned, "StoreAligned", AVX, -1, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", AVX, -1, 32, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_Subtract, "Subtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_subpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX_TestC, "TestC", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg) -HARDWARE_INTRINSIC(AVX_TestNotZAndNotC, "TestNotZAndNotC", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg) -HARDWARE_INTRINSIC(AVX_TestZ, "TestZ", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg) +HARDWARE_INTRINSIC(AVX_TestC, "TestC", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(AVX_TestNotZAndNotC, "TestNotZAndNotC", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(AVX_TestZ, "TestZ", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(AVX_UnpackHigh, "UnpackHigh", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_UnpackLow, "UnpackLow", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_Xor, "Xor", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp index d34345e..86a9489 100644 --- a/src/jit/hwintrinsicxarch.cpp +++ b/src/jit/hwintrinsicxarch.cpp @@ -200,12 +200,22 @@ unsigned Compiler::simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_I return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize; } - CORINFO_CLASS_HANDLE typeHnd = - ((flags & HW_Flag_BaseTypeFromArg) == 0) ? sig->retTypeSigClass : info.compCompHnd->getArgClass(sig, sig->args); + CORINFO_CLASS_HANDLE typeHnd = nullptr; - int simdSize = getSIMDTypeSizeInBytes(typeHnd); - assert(simdSize > 0); - return (unsigned)simdSize; + if (JITtype2varType(sig->retType) == TYP_STRUCT) + { + typeHnd = sig->retTypeSigClass; + } + else + { + assert((flags & HW_Flag_BaseTypeFromFirstArg) != 0); + typeHnd = info.compCompHnd->getArgClass(sig, sig->args); + } + + unsigned simdSize = 0; + var_types baseType = getBaseTypeAndSizeOfSIMDType(typeHnd, &simdSize); + assert(simdSize > 0 && baseType != TYP_UNKNOWN); + return simdSize; } //------------------------------------------------------------------------ @@ -486,7 +496,6 @@ bool Compiler::isFullyImplmentedISAClass(InstructionSet isa) switch (isa) { case InstructionSet_SSE2: - case InstructionSet_SSE41: case InstructionSet_SSE42: case InstructionSet_AVX: case InstructionSet_AVX2: @@ -500,6 +509,7 @@ bool Compiler::isFullyImplmentedISAClass(InstructionSet isa) case InstructionSet_SSE: case InstructionSet_SSE3: case InstructionSet_SSSE3: + case InstructionSet_SSE41: case InstructionSet_LZCNT: case InstructionSet_POPCNT: return true; @@ -560,13 +570,16 @@ bool Compiler::compSupportsHWIntrinsic(InstructionSet isa) // flags - flags of the intrinsics // // Return Value: -// On 32-bit platforms, return true if the intrinsic does not use any 64-bit registers (r64) +// Returns true iff the given type signature is supported +// Notes: +// - This is only used on 32-bit systems to determine whether the signature uses no 64-bit registers. +// - The `retType` is passed to avoid another call to the type system, as it has already been retrieved. bool Compiler::hwIntrinsicSignatureTypeSupported(var_types retType, CORINFO_SIG_INFO* sig, HWIntrinsicFlag flags) { #ifdef _TARGET_X86_ CORINFO_CLASS_HANDLE argClass; - if (((flags & HW_Flag_64BitOnly) != 0)) + if ((flags & HW_Flag_64BitOnly) != 0) { return false; } @@ -681,7 +694,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, bool isTableDriven = impIsTableDrivenHWIntrinsic(category, flags); - if (isTableDriven && (category == HW_Category_MemoryStore || + if (isTableDriven && ((category == HW_Category_MemoryStore) || ((flags & (HW_Flag_BaseTypeFromFirstArg | HW_Flag_BaseTypeFromSecondArg)) != 0))) { if ((flags & HW_Flag_BaseTypeFromFirstArg) != 0) @@ -690,7 +703,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, } else { - assert(category == HW_Category_MemoryStore || ((flags & HW_Flag_BaseTypeFromSecondArg) != 0)); + assert((category == HW_Category_MemoryStore) || ((flags & HW_Flag_BaseTypeFromSecondArg) != 0)); CORINFO_ARG_LIST_HANDLE secondArg = info.compCompHnd->getArgNext(sig->args); CORINFO_CLASS_HANDLE secondArgClass = info.compCompHnd->getArgClass(sig, secondArg); baseType = getBaseTypeOfSIMDType(secondArgClass); diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index 7dcaf93..3a89bdd 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -451,6 +451,13 @@ INST3( movsldup, "movsldup" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( movshdup, "movshdup" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEFLT(0x16)) // Replicate odd-indexed Single FP Values INST3( phminposuw, "phminposuw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x41)) // Packed Horizontal Word Minimum INST3( mpsadbw, "mpsadbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x42)) // Compute Multiple Packed Sums of Absolute Difference +INST3( pinsrb, "pinsrb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x20)) // Insert Byte +INST3( pinsrd, "pinsrd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x22)) // Insert Dword +INST3( pinsrq, "pinsrq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x22)) // Insert Qword +INST3( pextrb, "pextrb" , 0, IUM_WR, 0, 0, SSE3A(0x14), BAD_CODE, BAD_CODE) // Extract Byte +INST3( pextrd, "pextrd" , 0, IUM_WR, 0, 0, SSE3A(0x16), BAD_CODE, BAD_CODE) // Extract Dword +INST3( pextrq, "pextrq" , 0, IUM_WR, 0, 0, SSE3A(0x16), BAD_CODE, BAD_CODE) // Extract Qword +INST3( extractps, "extractps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x17)) // Extract Packed Floating-Point Values INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index c2c1992..d158676 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -2255,6 +2255,7 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) { TreeNodeInfo* info = currentNodeInfo; NamedIntrinsic intrinsicID = intrinsicTree->gtHWIntrinsicId; + var_types baseType = intrinsicTree->gtSIMDBaseType; InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID); HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID); HWIntrinsicFlag flags = Compiler::flagsOfHWIntrinsic(intrinsicID); @@ -2382,6 +2383,19 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) break; } + case NI_SSE41_Extract: + if (baseType == TYP_FLOAT) + { + info->internalIntCount += 1; + } +#ifdef _TARGET_X86_ + else if (varTypeIsByte(baseType)) + { + info->setDstCandidates(this, RBM_BYTE_REGS); + } +#endif + break; + #ifdef _TARGET_X86_ case NI_SSE42_Crc32: { diff --git a/src/jit/namedintrinsiclist.h b/src/jit/namedintrinsiclist.h index 96c6b86..91c9720 100644 --- a/src/jit/namedintrinsiclist.h +++ b/src/jit/namedintrinsiclist.h @@ -103,7 +103,7 @@ enum HWIntrinsicFlag : unsigned int HW_Flag_SpecialCodeGen = 0x20000, // No Read/Modify/Write Semantics - // the intrinsic does not have read/modify/write semantics and doesn't need + // the intrinsic doesn't have read/modify/write semantics in two/three-operand form. HW_Flag_NoRMWSemantics = 0x40000, }; -- 2.7.4