From f98a1f23c28073e2f3509b7fdd71f2c69b6dac39 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 28 Feb 2018 00:35:36 -0800 Subject: [PATCH] Adding support for additional AVX hardware intrinsics --- src/jit/hwintrinsiccodegenxarch.cpp | 25 +++++++++++++++++++ src/jit/hwintrinsiclistxarch.h | 16 +++++++++++-- src/jit/hwintrinsicxarch.cpp | 48 +++++++++++++++++++++---------------- src/jit/instrsxarch.h | 3 ++- 4 files changed, 69 insertions(+), 23 deletions(-) diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp index 24829ea..7fda952 100644 --- a/src/jit/hwintrinsiccodegenxarch.cpp +++ b/src/jit/hwintrinsiccodegenxarch.cpp @@ -1226,6 +1226,31 @@ void CodeGen::genAVXIntrinsic(GenTreeHWIntrinsic* node) } break; } + + case NI_AVX_TestC: + { + emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg); + emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum); + emit->emitIns_R(INS_setb, EA_1BYTE, targetReg); + break; + } + + case NI_AVX_TestNotZAndNotC: + { + emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg); + emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum); + emit->emitIns_R(INS_seta, EA_1BYTE, targetReg); + break; + } + + case NI_AVX_TestZ: + { + emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg); + emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum); + emit->emitIns_R(INS_sete, EA_1BYTE, targetReg); + break; + } + default: unreached(); break; diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index cc4710a..e14bbd1 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -313,18 +313,22 @@ HARDWARE_INTRINSIC(SSE42_CompareGreaterThan, "CompareGre // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************ // AVX Intrinsics -HARDWARE_INTRINSIC(AVX_IsSupported, "get_IsSupported", AVX, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_IsSupported, "get_IsSupported", AVX, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_Add, "Add", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX_AddSubtract, "AddSubtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_And, "And", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX_AndNot, "AndNot", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX_AddSubtract, "AddSubtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_Blend, "Blend", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX_BlendVariable, "BlendVariable", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_Ceiling, "Ceiling", AVX, 10, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_Compare, "Compare", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_IMM, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_CompareScalar, "CompareScalar", AVX, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX_ConvertToSingle, "ConvertToSingle", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_Divide, "Divide", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_divpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_DotProduct, "DotProduct", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX_DuplicateEvenIndexed, "DuplicateEvenIndexed", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_DuplicateOddIndexed, "DuplicateOddIndexed", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX_Floor, "Floor", AVX, 9, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_HorizontalAdd, "HorizontalAdd", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_HorizontalSubtract, "HorizontalSubtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_LoadAlignedVector256, "LoadAlignedVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) @@ -336,12 +340,20 @@ HARDWARE_INTRINSIC(AVX_Multiply, "Multiply", HARDWARE_INTRINSIC(AVX_Or, "Or", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX_Reciprocal, "Reciprocal", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_ReciprocalSqrt, "ReciprocalSqrt", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX_RoundCurrentDirection, "RoundCurrentDirection", AVX, 4, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX_RoundToNearestInteger, "RoundToNearestInteger", AVX, 8, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX_RoundToNegativeInfinity, "RoundToNegativeInfinity", AVX, 9, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX_RoundToPositiveInfinity, "RoundToPositiveInfinity", AVX, 10, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX_RoundToZero, "RoundToZero", AVX, 11, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_SetZeroVector256, "SetZeroVector256", AVX, -1, 32, 0, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_xorps, INS_xorpd}, HW_Category_Helper, HW_Flag_OneTypeGeneric|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_Sqrt, "Sqrt", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_Store, "Store", AVX, -1, 32, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_StoreAligned, "StoreAligned", AVX, -1, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", AVX, -1, 32, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX_Subtract, "Subtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_subpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_TestC, "TestC", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg) +HARDWARE_INTRINSIC(AVX_TestNotZAndNotC, "TestNotZAndNotC", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg) +HARDWARE_INTRINSIC(AVX_TestZ, "TestZ", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_OneTypeGeneric|HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg) HARDWARE_INTRINSIC(AVX_UnpackHigh, "UnpackHigh", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_UnpackLow, "UnpackLow", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_Xor, "Xor", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp index c4b4823..36f48ba 100644 --- a/src/jit/hwintrinsicxarch.cpp +++ b/src/jit/hwintrinsicxarch.cpp @@ -192,12 +192,18 @@ int Compiler::ivalOfHWIntrinsic(NamedIntrinsic intrinsic) unsigned Compiler::simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig) { assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END); - if ((Compiler::flagsOfHWIntrinsic(intrinsic) & HW_Flag_UnfixedSIMDSize) == 0) + + HWIntrinsicFlag flags = flagsOfHWIntrinsic(intrinsic); + + if ((flags & HW_Flag_UnfixedSIMDSize) == 0) { return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize; } - int simdSize = getSIMDTypeSizeInBytes(sig->retTypeSigClass); + CORINFO_CLASS_HANDLE typeHnd = + ((flags & HW_Flag_BaseTypeFromArg) == 0) ? sig->retTypeSigClass : info.compCompHnd->getArgClass(sig, sig->args); + + int simdSize = getSIMDTypeSizeInBytes(typeHnd); assert(simdSize > 0); return (unsigned)simdSize; } @@ -592,7 +598,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, var_types retType = JITtype2varType(sig->retType); var_types baseType = TYP_UNKNOWN; - if (retType == TYP_STRUCT && featureSIMD) + if ((retType == TYP_STRUCT) && featureSIMD) { unsigned int sizeBytes; baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes); @@ -648,6 +654,24 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, } } + bool isTableDriven = impIsTableDrivenHWIntrinsic(category, flags); + + if (isTableDriven && (!varTypeIsSIMD(retType) || ((flags & HW_Flag_BaseTypeFromArg) != 0))) + { + if (retType != TYP_VOID) + { + baseType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)); + } + else + { + assert(category == HW_Category_MemoryStore); + baseType = + getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, info.compCompHnd->getArgNext(sig->args))); + } + + assert(baseType != TYP_UNKNOWN); + } + if ((flags & (HW_Flag_OneTypeGeneric | HW_Flag_TwoTypeGeneric)) != 0) { if (!varTypeIsArithmetic(baseType)) @@ -675,24 +699,8 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, } // table-driven importer of simple intrinsics - if (impIsTableDrivenHWIntrinsic(category, flags)) + if (isTableDriven) { - if (!varTypeIsSIMD(retType) || ((flags & HW_Flag_BaseTypeFromArg) != 0)) - { - if (retType != TYP_VOID) - { - baseType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)); - } - else - { - assert(category == HW_Category_MemoryStore); - baseType = - getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, info.compCompHnd->getArgNext(sig->args))); - } - - assert(baseType != TYP_UNKNOWN); - } - unsigned simdSize = simdSizeOfHWIntrinsic(intrinsic, sig); CORINFO_ARG_LIST_HANDLE argList = sig->args; CORINFO_CLASS_HANDLE argClass; diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index ae8adff..7dcaf93 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -472,7 +472,8 @@ INST3( vpermq, "permq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( vblendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4A)) // Variable Blend Packed Singles INST3( vblendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4B)) // Variable Blend Packed Doubles INST3( vpblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4C)) // Variable Blend Packed Bytes - +INST3( vtestps, "testps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x0E)) // Packed Bit Test +INST3( vtestpd, "testpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x0F)) // Packed Bit Test INST3( vpsrlvd, "psrlvd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x45)) // Variable Bit Shift Right Logical INST3( vpsrlvq, "psrlvq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x45)) // Variable Bit Shift Right Logical INST3( vpsravd, "psravd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x46)) // Variable Bit Shift Right Arithmetic -- 2.7.4