From b5913d8fcd8ed136736a7e1d73ea539b2c87005d Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Thu, 12 Jul 2018 07:40:30 -0700
Subject: [PATCH] Implementing the Avx2 AlignRight, Blend, and ConvertTo* hwintrinsics.

Commit migrated from https://github.com/dotnet/coreclr/commit/71a9d8bbb0d80bb9b998c9df2ab2170027b796e3
---
 src/coreclr/src/jit/emitxarch.cpp               |  1 +
 src/coreclr/src/jit/hwintrinsiccodegenxarch.cpp | 27 ++++++++++++++++++++-----
 src/coreclr/src/jit/hwintrinsiclistxarch.h      |  5 +++++
 src/coreclr/src/jit/instrsxarch.h               |  1 +
 src/coreclr/src/jit/lowerxarch.cpp              |  4 ++++
 src/coreclr/src/jit/lsraxarch.cpp               |  1 +
 6 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/src/coreclr/src/jit/emitxarch.cpp b/src/coreclr/src/jit/emitxarch.cpp
index bdbf335..17aae89 100644
--- a/src/coreclr/src/jit/emitxarch.cpp
+++ b/src/coreclr/src/jit/emitxarch.cpp
@@ -360,6 +360,7 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
         case INS_vinserti128:
         case INS_vmaskmovps:
         case INS_vmaskmovpd:
+        case INS_vpblendd:
         case INS_vperm2i128:
         case INS_vperm2f128:
         case INS_vpermilpsvar:
diff --git a/src/coreclr/src/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/src/jit/hwintrinsiccodegenxarch.cpp
index 7e5a981..dafb45a 100644
--- a/src/coreclr/src/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/coreclr/src/jit/hwintrinsiccodegenxarch.cpp
@@ -1887,11 +1887,33 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
 
     if ((op1 != nullptr) && !op1->OperIsList())
     {
+        op1Reg = op1->gtRegNum;
         genConsumeOperands(node);
     }
 
     switch (intrinsicId)
     {
+        case NI_AVX2_ConvertToDouble:
+        {
+            assert(op2 == nullptr);
+            if (op1Reg != targetReg)
+            {
+                instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
+                emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
+            }
+            break;
+        }
+
+        case NI_AVX2_ConvertToInt32:
+        case NI_AVX2_ConvertToUInt32:
+        {
+            assert(op2 == nullptr);
+            assert((baseType == TYP_INT) || (baseType == TYP_UINT));
+            instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
+            emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
+            break;
+        }
+
         case NI_AVX_SetZeroVector256:
         {
             assert(op1 == nullptr);
@@ -1913,7 +1935,6 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
         {
             assert(op1 != nullptr);
             assert(op2 == nullptr);
-            op1Reg = op1->gtRegNum;
             if (varTypeIsIntegral(baseType))
             {
                 // If the argument is a integer, it needs to be moved into a XMM register
@@ -1977,7 +1998,6 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
             // to ensure that Bits MAXVL-1:128 are zeroed.
 
             assert(op2 == nullptr);
-            regNumber op1Reg = op1->gtRegNum;
             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg);
             break;
         }
@@ -1985,8 +2005,6 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
         case NI_AVX_GetLowerHalf:
         {
             assert(op2 == nullptr);
-            regNumber op1Reg = op1->gtRegNum;
-
             if (op1Reg != targetReg)
             {
                 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD32), targetReg, op1Reg);
@@ -2027,7 +2045,6 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
             if (numArgs == 2)
             {
                 assert(intrinsicId == NI_AVX_ExtractVector128 || NI_AVX_ExtractVector128);
-                op1Reg = op1->gtRegNum;
                 op2Reg = op2->gtRegNum;
                 lastOp = op2;
             }
diff --git a/src/coreclr/src/jit/hwintrinsiclistxarch.h b/src/coreclr/src/jit/hwintrinsiclistxarch.h
index 5c76d89..bfb9cec 100644
--- a/src/coreclr/src/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/src/jit/hwintrinsiclistxarch.h
@@ -408,9 +408,11 @@ HARDWARE_INTRINSIC(AVX2_IsSupported, "get_IsSuppo
 HARDWARE_INTRINSIC(AVX2_Abs, "Abs", AVX2, -1, 32, 1, {INS_pabsb, INS_pabsb, INS_pabsw, INS_pabsw, INS_pabsd, INS_pabsd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX2_Add, "Add", AVX2, -1, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2_AddSaturate, "AddSaturate", AVX2, -1, 32, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX2_AlignRight, "AlignRight", AVX2, -1, 32, 3, {INS_palignr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2_And, "And", AVX2, -1, 32, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2_AndNot, "AndNot", AVX2, -1, 32, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_Average, "Average", AVX2, -1, 32, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX2_Blend, "Blend", AVX2, -1, 0, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_UnfixedSIMDSize|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2_BlendVariable, "BlendVariable", AVX2, -1, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector128, "BroadcastScalarToVector128", AVX2, -1, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_OneTypeGeneric)
 HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector256, "BroadcastScalarToVector256", AVX2, -1, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_OneTypeGeneric)
@@ -418,6 +420,9 @@ HARDWARE_INTRINSIC(AVX2_BroadcastVector128ToVector256, "BroadcastVe
 HARDWARE_INTRINSIC(AVX2_CompareEqual, "CompareEqual", AVX2, -1, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2_CompareGreaterThan, "CompareGreaterThan", AVX2, -1, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_ExtractVector128, "ExtractVector128", AVX2, -1, 32, -1, {INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2_ConvertToDouble, "ConvertToDouble", AVX2, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsdsse2}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX2_ConvertToInt32, "ConvertToInt32", AVX2, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mov_xmm2i, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX2_ConvertToUInt32, "ConvertToUInt32", AVX2, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mov_xmm2i, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int16, "ConvertToVector256Int16", AVX2, -1, 32, 1, {INS_pmovsxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt16, "ConvertToVector256UInt16", AVX2, -1, 32, 1, {INS_invalid, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int32, "ConvertToVector256Int32", AVX2, -1, 32, 1, {INS_pmovsxbd, INS_invalid, INS_pmovsxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
diff --git a/src/coreclr/src/jit/instrsxarch.h b/src/coreclr/src/jit/instrsxarch.h
index 995cb4c..87fd76a 100644
--- a/src/coreclr/src/jit/instrsxarch.h
+++ b/src/coreclr/src/jit/instrsxarch.h
@@ -480,6 +480,7 @@ INST3( vinserti128, "inserti128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS
 INST3( vzeroupper, "zeroupper" , 0, IUM_WR, 0, 0, 0xC577F8, BAD_CODE, BAD_CODE) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix)
 INST3( vperm2i128, "perm2i128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x46)) // Permute 128-bit halves of input register
 INST3( vpermq, "permq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x00)) // Permute 64-bit of input register
+INST3( vpblendd, "pblendd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x02)) // Blend Packed DWORDs
 INST3( vblendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4A)) // Variable Blend Packed Singles
 INST3( vblendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4B)) // Variable Blend Packed Doubles
 INST3( vpblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4C)) // Variable Blend Packed Bytes
diff --git a/src/coreclr/src/jit/lowerxarch.cpp b/src/coreclr/src/jit/lowerxarch.cpp
index 4787e45..938ddb5 100644
--- a/src/coreclr/src/jit/lowerxarch.cpp
+++ b/src/coreclr/src/jit/lowerxarch.cpp
@@ -2386,6 +2386,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge
         case NI_AVX_InsertVector128:
         case NI_AVX_Permute:
         case NI_AVX_Permute2x128:
+        case NI_AVX2_Blend:
         case NI_AVX2_InsertVector128:
         case NI_AVX2_Permute2x128:
         case NI_AVX2_ShiftLeftLogical:
@@ -2629,6 +2630,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
         case NI_SSE2_ConvertToInt64:
         case NI_SSE2_ConvertToUInt32:
         case NI_SSE2_ConvertToUInt64:
+        case NI_AVX2_ConvertToInt32:
+        case NI_AVX2_ConvertToUInt32:
         {
             if (varTypeIsIntegral(baseType))
             {
@@ -2958,6 +2961,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
         case NI_AVX_Insert:
         case NI_AVX_Permute2x128:
         case NI_AVX_Shuffle:
+        case NI_AVX2_Blend:
         case NI_AVX2_Permute2x128:
         {
             if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
diff --git a/src/coreclr/src/jit/lsraxarch.cpp b/src/coreclr/src/jit/lsraxarch.cpp
index 089e80f..f5c4a03 100644
--- a/src/coreclr/src/jit/lsraxarch.cpp
+++ b/src/coreclr/src/jit/lsraxarch.cpp
@@ -2387,6 +2387,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
         case NI_SSE2_ConvertToDouble:
         case NI_AVX_ExtendToVector256:
         case NI_AVX_GetLowerHalf:
+        case NI_AVX2_ConvertToDouble:
        {
            assert(numArgs == 1);
            assert(!isRMW);
-- 
2.7.4
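For reference, the operations these new table entries and codegen paths wire up can be sketched with the C/C++ compiler intrinsics from <immintrin.h>. The sketch below only illustrates the underlying instruction semantics (vpalignr for AlignRight, vpblendd for the int/uint Blend forms, and the lowest-element reads behind ConvertToDouble/ConvertToInt32/ConvertToUInt32); it is not the managed System.Runtime.Intrinsics.X86.Avx2 surface, and the immediates, file name, and variable names are chosen purely for the example.

// Illustrative sketch only; assumes a C++17 compiler and an AVX2-capable CPU.
// Build e.g. with: g++ -mavx2 -O2 avx2_sketch.cpp
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
    __m256i a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
    __m256i b = _mm256_set_epi32(80, 70, 60, 50, 40, 30, 20, 10);

    // AlignRight -> palignr/vpalignr: within each 128-bit lane, concatenate a:b into a
    // 32-byte value, shift right by the immediate byte count (4 here), keep the low 16 bytes.
    __m256i aligned = _mm256_alignr_epi8(a, b, 4);

    // Blend with an int/uint base type -> vpblendd: each dword comes from b when the
    // corresponding immediate bit is 1, otherwise from a (0xAA = 0b10101010).
    __m256i blended = _mm256_blend_epi32(a, b, 0xAA);

    // ConvertToInt32/ConvertToUInt32 -> a movd-style xmm-to-gpr move: element 0 of the vector.
    int32_t lowInt = _mm_cvtsi128_si32(_mm256_castsi256_si128(a));

    // ConvertToDouble on a 256-bit double vector: element 0 of the low 128-bit half; the patch
    // emits it as a simple reg-to-reg move when source and destination registers differ.
    __m256d d = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
    double lowDouble = _mm_cvtsd_f64(_mm256_castpd256_pd128(d));

    alignas(32) int32_t out[8];
    _mm256_store_si256(reinterpret_cast<__m256i*>(out), blended);
    std::printf("blend: %d %d %d %d ..., low int: %d, low double: %.1f\n",
                out[0], out[1], out[2], out[3], lowInt, lowDouble);

    (void)aligned; // the alignr result is not printed in this sketch
    return 0;
}

Note that the patch also adds vpblendd to IsDstDstSrcAVXInstruction in emitxarch.cpp, since under VEX encoding the instruction takes a destination register plus two source registers.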