From 9d88a948ba3c18a9bea2217ad51828913701c2eb Mon Sep 17 00:00:00 2001
From: Carlos Sanchez Lopez <1175054+carlossanlop@users.noreply.github.com>
Date: Mon, 20 Jul 2020 15:26:22 -0700
Subject: [PATCH] AdvSimd support for
 System.Text.Unicode.Utf8Utility.GetPointerToFirstInvalidByte (#38653)

* AdvSimd support for System.Text.Unicode.Utf8Utility.GetPointerToFirstInvalidByte

* Move comment to the top, add shims.

* Little endian checks

* Use custom MoveMask method for AdvSimd

* Address suggestions to improve the AdvSimdMoveMask method

* Define initialMask outside MoveMask method

* UInt64 in Arm64MoveMask

* Add unit test case to verify intrinsics improvement

* Avoid casting to smaller integer type

* Typo and comment

* Use ShiftRightArithmetic instead of CompareEqual + And.
Remove test case causing other unit tests to fail.

* Use AddPairwise version of GetNonAsciiBytes

* Add missing shims causing Linux build to fail

* Simplify GetNonAsciiBytes to only one AddPairwise call, shorter bitmask

* Respect data type returned by masking method

* Address suggestions - assert trailingzerocount and bring back uint mask

* Trailing zero count in AdvSimd needs to be divided by 4, and the resulting count must be less than 16

* Avoid declaring static field which causes PNSE in Utf8String.Experimental (S.P.Corelib code is used for being NetStandard)

* Prefer using nuint for BitOperations.TrailingZeroCount
---
 .../System/Text/Unicode/Utf8Utility.Validation.cs  | 64 ++++++++++++++++------
 1 file changed, 47 insertions(+), 17 deletions(-)
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index 33e5181..7730708 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -4,6 +4,8 @@
 using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.X86;
 
 #if SYSTEM_PRIVATE_CORELIB
@@ -117,22 +119,35 @@ namespace System.Text.Unicode
                         // the alignment check consumes at most a single DWORD.)
 
                         byte* pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here
-                        uint mask;
+                        nuint trailingZeroCount;
+
+                        Vector128<byte> bitMask128 = BitConverter.IsLittleEndian ?
+                            Vector128.Create((ushort)0x1001).AsByte() :
+                            Vector128.Create((ushort)0x0110).AsByte();
 
                         do
                         {
-                            if (Sse2.IsSupported)
+                            // pInputBuffer is 32-bit aligned but not necessarily 128-bit aligned, so we're
+                            // going to perform an unaligned load. We don't necessarily care about aligning
+                            // this because we pessimistically assume we'll encounter non-ASCII data at some
+                            // point in the not-too-distant future (otherwise we would've stayed entirely
+                            // within the all-ASCII vectorized code at the entry to this method).
+                            if (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)
                             {
-                                // pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're
-                                // going to perform an unaligned load. We don't necessarily care about aligning
-                                // this because we pessimistically assume we'll encounter non-ASCII data at some
-                                // point in the not-too-distant future (otherwise we would've stayed entirely
-                                // within the all-ASCII vectorized code at the entry to this method).
-
-                                mask = (uint)Sse2.MoveMask(Sse2.LoadVector128((byte*)pInputBuffer));
+                                ulong mask = GetNonAsciiBytes(AdvSimd.LoadVector128(pInputBuffer), bitMask128);
+                                if (mask != 0)
+                                {
+                                    trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask) >> 2;
+                                    goto LoopTerminatedEarlyDueToNonAsciiData;
+                                }
+                            }
+                            else if (Sse2.IsSupported)
+                            {
+                                uint mask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pInputBuffer));
                                 if (mask != 0)
                                 {
-                                    goto Sse2LoopTerminatedEarlyDueToNonAsciiData;
+                                    trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask);
+                                    goto LoopTerminatedEarlyDueToNonAsciiData;
                                 }
                             }
                             else
@@ -153,19 +168,20 @@ namespace System.Text.Unicode
 
                         continue; // need to perform a bounds check because we might be running out of data
 
-                    Sse2LoopTerminatedEarlyDueToNonAsciiData:
+                    LoopTerminatedEarlyDueToNonAsciiData:
+                        // x86 is always little endian, while ARM can be big or little endian, so if we
+                        // reached this label, one of the two supported combinations below must hold.
+                        Debug.Assert((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported);
 
-                        Debug.Assert(BitConverter.IsLittleEndian);
-                        Debug.Assert(Sse2.IsSupported);
 
                         // The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit
-                        // for each non-ASCII byte we saw. We can count the number of ASCII bytes,
+                        // for each non-ASCII byte we saw. trailingZeroCount will count the number of ASCII bytes,
                         // bump our input counter by that amount, and resume processing from the
                         // "the first byte is no longer ASCII" portion of the main loop.
+                        // The mask was non-zero, so one of the 16 loaded bytes is non-ASCII and the count must be less than 16.
+                        Debug.Assert(trailingZeroCount < 16);
 
-                        Debug.Assert(mask != 0);
-
-                        pInputBuffer += BitOperations.TrailingZeroCount(mask);
+                        pInputBuffer += trailingZeroCount;
                         if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
                         {
                             goto ProcessRemainingBytesSlow;
@@ -719,5 +735,19 @@ namespace System.Text.Unicode
             scalarCountAdjustment = tempScalarCountAdjustment;
             return pInputBuffer;
         }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)] // Called from the vectorized validation loop; ask the JIT to inline it.
+        private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bitMask128) // Returns a 64-bit mask that is non-zero iff 'value' holds a byte with its high bit set; with the caller's alternating 0x01/0x10 mask, each input byte maps to one 4-bit group.
+        {
+            if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian) // This helper is only valid on little-endian ARM64 with AdvSimd.
+            {
+                throw new PlatformNotSupportedException(); // Any other platform must not reach this helper.
+            }
+
+            Vector128<byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte(); // Arithmetic shift by 7 smears the sign bit: 0xFF for non-ASCII bytes, 0x00 for ASCII bytes.
+            Vector128<byte> extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitMask128); // Keep only the marker bit that the caller-supplied mask assigns to each byte position.
+            extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); // Sum adjacent byte pairs so the markers for all 16 bytes fold into the low 8 bytes.
+            return extractedBits.AsUInt64().ToScalar(); // Element 0 (the low 64 bits) now carries the markers for all 16 input bytes.
+        }
     }
 }
-- 
2.7.4