From d256114fb726dac0b4fbfd4574a0449012feee6d Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Sat, 3 Feb 2018 09:53:54 -0800
Subject: [PATCH] Adding support for the SSE2 LoadVector128,
 LoadAlignedVector128, and LoadScalarVector128 intrinsics.

---
 src/jit/hwintrinsiclistxarch.h                                  |  5 ++++-
 src/jit/instrsxarch.h                                           |  1 +
 src/jit/lowerxarch.cpp                                          |  3 +++
 .../System/Runtime/Intrinsics/X86/Sse2.PlatformNotSupported.cs  | 10 ++++++----
 src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse2.cs          | 10 ++++++----
 5 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h
index 18b0bc9..6238838 100644
--- a/src/jit/hwintrinsiclistxarch.h
+++ b/src/jit/hwintrinsiclistxarch.h
@@ -121,7 +121,7 @@ HARDWARE_INTRINSIC(SSE_Xor, "Xor",
 // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
 // ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 // SSE2 Intrinsics
-HARDWARE_INTRINSIC(SSE2_IsSupported, "get_IsSupported", SSE2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_IsSupported, "get_IsSupported", SSE2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_Add, "Add", SSE2, -1, 16, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2_AddSaturate, "AddSaturate", SSE2, -1, 16, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2_And, "And", SSE2, -1, 16, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
@@ -144,6 +144,9 @@ HARDWARE_INTRINSIC(SSE2_ConvertToVector128Int32, "ConvertToV
 HARDWARE_INTRINSIC(SSE2_ConvertToVector128Int32WithTruncation, "ConvertToVector128Int32WithTruncation", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttps2dq, INS_cvttpd2dq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromArg)
 HARDWARE_INTRINSIC(SSE2_ConvertToVector128Single, "ConvertToVector128Single", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2ps, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromArg)
 HARDWARE_INTRINSIC(SSE2_Divide, "Divide", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_LoadAlignedVector128, "LoadAlignedVector128", SSE2, -1, 16, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_invalid, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_LoadScalarVector128, "LoadScalarVector128", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_movd, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_LoadVector128, "LoadVector128", SSE2, -1, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_invalid, INS_movupd}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_Max, "Max", SSE2, -1, 16, 2, {INS_invalid, INS_pmaxub, INS_pmaxsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2_Min, "Min", SSE2, -1, 16, 2, {INS_invalid, INS_pminub, INS_pminsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2_MoveMask, "MoveMask", SSE2, -1, 16, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskpd}, HW_Category_Special, HW_Flag_NoFlag)
diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h
index ca61a66..8a016d3 100644
--- a/src/jit/instrsxarch.h
+++ b/src/jit/instrsxarch.h
@@ -184,6 +184,7 @@ INST3( mov_i2xmm, "movd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCK
 INST3( mov_xmm2i, "movd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x7E)) // Move xmm reg to an int reg. reg1=xmm reg, reg2=int reg
 INST3( pmovmskb, "pmovmskb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xD7)) // Move the MSB bits of all bytes in a xmm reg to an int reg
 INST3( movmskpd, "movmskpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x50)) // Extract 2-bit sign mask from xmm and store in reg. The upper bits of r32 or r64 are filled with zeros.
+INST3( movd, "movd" , 0, IUM_WR, 0, 0, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E))
 INST3( movq, "movq" , 0, IUM_WR, 0, 0, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E))
 INST3( movsdsse2, "movsd" , 0, IUM_WR, 0, 0, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10))
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 559d0c7..28ce7a8 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -2329,6 +2329,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge
             // However, we cannot do the same for the VEX-encoding as it changes an observable
             // side-effect and may mask an Access Violation that would otherwise occur.
             case NI_SSE_LoadAlignedVector128:
+            case NI_SSE2_LoadAlignedVector128:
                 isContainable = (containingCategory == HW_Category_SimpleSIMD) && !comp->canUseVexEncoding();
                 break;

@@ -2336,11 +2337,13 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge
             // read remains the same. Likewise, we can't fold a larger load into a SIMD scalar
             // intrinsic as that would read fewer bits that requested.
             case NI_SSE_LoadScalarVector128:
+            case NI_SSE2_LoadScalarVector128:
                 isContainable = (containingCategory == HW_Category_SIMDScalar);
                 break;

             // VEX encoding supports unaligned memory ops, so we can fold them
             case NI_SSE_LoadVector128:
+            case NI_SSE2_LoadVector128:
                 isContainable = (containingCategory == HW_Category_SimpleSIMD) && comp->canUseVexEncoding();
                 break;

diff --git a/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse2.PlatformNotSupported.cs b/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse2.PlatformNotSupported.cs
index 7e9eb89..fe86fd1 100644
--- a/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse2.PlatformNotSupported.cs
+++ b/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse2.PlatformNotSupported.cs
@@ -768,13 +768,15 @@ namespace System.Runtime.Intrinsics.X86
         /// </summary>
         public static unsafe Vector128<ushort> LoadScalarVector128(ushort* address) { throw new PlatformNotSupportedException(); }
         /// <summary>
-        /// __m128i _mm_loadl_epi64 (__m128i const* mem_addr)
-        /// MOVQ xmm, reg/m64
+        /// __m128i _mm_loadl_epi32 (__m128i const* mem_addr)
+        /// MOVD xmm, reg/m32
+        /// The above native signature does not exist. We provide this additional overload for completeness.
         /// </summary>
         public static unsafe Vector128<int> LoadScalarVector128(int* address) { throw new PlatformNotSupportedException(); }
         /// <summary>
-        /// __m128i _mm_loadl_epi64 (__m128i const* mem_addr)
-        /// MOVQ xmm, reg/m64
+        /// __m128i _mm_loadl_epi32 (__m128i const* mem_addr)
+        /// MOVD xmm, reg/m32
+        /// The above native signature does not exist. We provide this additional overload for completeness.
         /// </summary>
         public static unsafe Vector128<uint> LoadScalarVector128(uint* address) { throw new PlatformNotSupportedException(); }
         /// <summary>
diff --git a/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse2.cs b/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse2.cs
index f43d96e..c9b645a 100644
--- a/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse2.cs
+++ b/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse2.cs
@@ -768,13 +768,15 @@ namespace System.Runtime.Intrinsics.X86
         /// </summary>
         public static unsafe Vector128<ushort> LoadScalarVector128(ushort* address) => LoadScalarVector128(address);
         /// <summary>
-        /// __m128i _mm_loadl_epi64 (__m128i const* mem_addr)
-        /// MOVQ xmm, reg/m64
+        /// __m128i _mm_loadl_epi32 (__m128i const* mem_addr)
+        /// MOVD xmm, reg/m32
+        /// The above native signature does not exist. We provide this additional overload for completeness.
         /// </summary>
         public static unsafe Vector128<int> LoadScalarVector128(int* address) => LoadScalarVector128(address);
         /// <summary>
-        /// __m128i _mm_loadl_epi64 (__m128i const* mem_addr)
-        /// MOVQ xmm, reg/m64
+        /// __m128i _mm_loadl_epi32 (__m128i const* mem_addr)
+        /// MOVD xmm, reg/m32
+        /// The above native signature does not exist. We provide this additional overload for completeness.
         /// </summary>
         public static unsafe Vector128<uint> LoadScalarVector128(uint* address) => LoadScalarVector128(address);
         /// <summary>
-- 
2.7.4
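
Usage sketch (editor's note, not part of the patch): the snippet below is a minimal, hypothetical example of how the SSE2 loads added here could be consumed from managed code. It is written against the intrinsics API as it eventually shipped (Vector128<double>.Zero, Sse2.Add, Sse2.Store are assumed to be available from the same surface); the helper name SumDoubles, the stackalloc spill, and the array argument are illustrative only.

using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class Sse2LoadExample
{
    // Sums a double[] two lanes at a time using the unaligned LoadVector128
    // (MOVUPD) added by this patch. LoadAlignedVector128 (MOVAPD) could be
    // substituted when the buffer is known to be 16-byte aligned, and
    // LoadScalarVector128 fills only the low element, zeroing the upper one.
    public static unsafe double SumDoubles(double[] values)
    {
        if (!Sse2.IsSupported)
        {
            // Scalar fallback for platforms without SSE2 support.
            double scalar = 0;
            foreach (double v in values)
            {
                scalar += v;
            }
            return scalar;
        }

        Vector128<double> acc = Vector128<double>.Zero;
        int i = 0;

        fixed (double* p = values)
        {
            // Two doubles per iteration; no alignment requirement on p + i.
            for (; i + 2 <= values.Length; i += 2)
            {
                acc = Sse2.Add(acc, Sse2.LoadVector128(p + i));
            }
        }

        // Spill the accumulator, add its two lanes, then fold in any tail element.
        double* tmp = stackalloc double[2];
        Sse2.Store(tmp, acc);

        double sum = tmp[0] + tmp[1];
        for (; i < values.Length; i++)
        {
            sum += values[i];
        }
        return sum;
    }
}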