Improve Ascii (and Utf8) encoding (#85266)
author Daniel Svensson <daniel.svensson@hotmail.se>
Fri, 12 May 2023 10:43:40 +0000 (12:43 +0200)
committer GitHub <noreply@github.com>
Fri, 12 May 2023 10:43:40 +0000 (12:43 +0200)
* Improve writing of lower vector part in ASCII conversion

* from 10/17 instructions to 1 instruction for 64/32-bit x86

* Add [MethodImpl(MethodImplOptions.AggressiveInlining)] to NarrowUtf16ToAscii_Intrinsified

* rewrite StoreLower without Sse2.StoreScalar

* move helper to Vector128 and call in case conversion

* remove unused helpers

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs
src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs

index d1c7d5e..31aac45 100644 (file)
@@ -2718,6 +2718,24 @@ namespace System.Runtime.Intrinsics
         public static unsafe void StoreAlignedNonTemporal<T>(this Vector128<T> source, T* destination)
             where T : unmanaged => source.StoreAligned(destination);
 
+        /// <summary>
+        /// Stores the lower 64 bits of <paramref name="source"/> to the memory location at <paramref name="destination"/>[<paramref name="elementOffset"/>].
+        /// </summary>
+        /// <typeparam name="T">The type of the elements in the vector.</typeparam>
+        /// <param name="source">The vector that will be stored.</param>
+        /// <param name="destination">The destination to which <paramref name="elementOffset" /> will be added before the vector will be stored.</param>
+        /// <param name="elementOffset">The element offset from <paramref name="destination" /> from which the vector will be stored.</param>
+        /// <remarks>
+        /// Uses double instead of long to get a single instruction instead of storing temporaries in a general-purpose register (or on the stack).
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void StoreLowerUnsafe<T>(this Vector128<T> source, ref T destination, nuint elementOffset = 0)
+            where T : struct
+        {
+            ref byte address = ref Unsafe.As<T, byte>(ref Unsafe.Add(ref destination, elementOffset));
+            Unsafe.WriteUnaligned<double>(ref address, source.AsDouble().ToScalar());
+        }
+
         /// <summary>Stores a vector at the given destination.</summary>
         /// <typeparam name="T">The type of the elements in the vector.</typeparam>
         /// <param name="source">The vector that will be stored.</param>
index c226161..a9cdc30 100644 (file)
@@ -464,41 +464,6 @@ namespace System.Text
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe void Widen8To16AndAndWriteTo(Vector128<byte> narrowVector, char* pDest, nuint destOffset)
-        {
-            if (Vector256.IsHardwareAccelerated)
-            {
-                Vector256<ushort> wide = Vector256.WidenLower(narrowVector.ToVector256Unsafe());
-                wide.StoreUnsafe(ref *(ushort*)pDest, destOffset);
-            }
-            else
-            {
-                Vector128.WidenLower(narrowVector).StoreUnsafe(ref *(ushort*)pDest, destOffset);
-                Vector128.WidenUpper(narrowVector).StoreUnsafe(ref *(ushort*)pDest, destOffset + 8);
-            }
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe void Narrow16To8AndAndWriteTo(Vector128<ushort> wideVector, byte* pDest, nuint destOffset)
-        {
-            Vector128<byte> narrow = Vector128.Narrow(wideVector, wideVector);
-
-            if (Sse2.IsSupported)
-            {
-                // MOVQ is supported even on x86, unaligned accesses allowed
-                Sse2.StoreScalar((ulong*)(pDest + destOffset), narrow.AsUInt64());
-            }
-            else if (Vector64.IsHardwareAccelerated)
-            {
-                narrow.GetLower().StoreUnsafe(ref *pDest, destOffset);
-            }
-            else
-            {
-                Unsafe.WriteUnaligned<ulong>(pDest + destOffset, narrow.AsUInt64().ToScalar());
-            }
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static unsafe void ChangeWidthAndWriteTo<TFrom, TTo>(Vector128<TFrom> vector, TTo* pDest, nuint elementOffset)
             where TFrom : unmanaged
             where TTo : unmanaged
@@ -524,12 +489,9 @@ namespace System.Text
             }
             else if (sizeof(TFrom) == 2 && sizeof(TTo) == 1)
             {
-                // narrowing operation required
-                // since we know data is all-ASCII, special-case SSE2 to avoid unneeded PAND in Narrow call
-                Vector128<byte> narrow = (Sse2.IsSupported)
-                    ? Sse2.PackUnsignedSaturate(vector.AsInt16(), vector.AsInt16())
-                    : Vector128.Narrow(vector.AsUInt16(), vector.AsUInt16());
-                narrow.GetLower().StoreUnsafe(ref *(byte*)pDest, elementOffset);
+                // narrowing operation required, we know data is all-ASCII so use extract helper
+                Vector128<byte> narrow = ExtractAsciiVector(vector.AsUInt16(), vector.AsUInt16());
+                narrow.StoreLowerUnsafe(ref *(byte*)pDest, elementOffset);
             }
             else
             {
@@ -556,25 +518,6 @@ namespace System.Text
             }
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<TTo> NarrowOrWidenLowerVectorUnsigned<TFrom, TTo>(Vector128<TFrom> vector)
-            where TFrom : unmanaged
-            where TTo : unmanaged
-        {
-            if (sizeof(TFrom) == 1 && sizeof(TTo) == 2)
-            {
-                return Vector128.WidenLower(vector.AsByte()).As<ushort, TTo>();
-            }
-            else if (sizeof(TFrom) == 2 && sizeof(TTo) == 1)
-            {
-                return Vector128.Narrow(vector.AsUInt16(), vector.AsUInt16()).As<byte, TTo>();
-            }
-            else
-            {
-                throw new NotSupportedException();
-            }
-        }
-
         private struct ToUpperConversion { }
         private struct ToLowerConversion { }
     }
index c30fd05..2537f9c 100644 (file)
@@ -1518,6 +1518,7 @@ namespace System.Text
             }
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
         {
             // This method contains logic optimized using vector instructions for both x64 and Arm64.
@@ -1550,7 +1551,7 @@ namespace System.Text
 
             ref byte asciiBuffer = ref *pAsciiBuffer;
             Vector128<byte> asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
-            asciiVector.GetLower().StoreUnsafe(ref asciiBuffer);
+            asciiVector.StoreLowerUnsafe(ref asciiBuffer, 0);
             nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far
 
             // We're going to get the best performance when we have aligned writes, so we'll take the
@@ -1577,7 +1578,7 @@ namespace System.Text
 
                 // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
                 asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
-                asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
+                asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
             }
 
             // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
@@ -1630,7 +1631,7 @@ namespace System.Text
 
             Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
             asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
-            asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
+            asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
             currentOffsetInElements += SizeOfVector128 / 2;
 
             goto Finish;