Remove SSE2-specific logic from UTF-16 validation code (#52349)

author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>

Thu, 6 May 2021 22:36:31 +0000 (15:36 -0700)

committer GitHub <noreply@github.com>

Thu, 6 May 2021 22:36:31 +0000 (15:36 -0700)
author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
Thu, 6 May 2021 22:36:31 +0000 (15:36 -0700)
committer GitHub <noreply@github.com>
Thu, 6 May 2021 22:36:31 +0000 (15:36 -0700)
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs

index 35a29e6..c5b27ed 100644 (file)
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs
@@ -66,7 +66,7 @@ namespace System.Text.Unicode
              // ARM64-intrinsicified code paths. ARM64 platforms may still use the vectorized
              // non-intrinsicified 'else' block below.
  
-            if (/* (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || */ Sse2.IsSupported)
+            if (/* (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || */ Sse41.IsSupported)
              {
                  if (inputLength >= Vector128<ushort>.Count)
                  {
@@ -96,25 +96,16 @@ namespace System.Text.Unicode
                          pInputBuffer += Vector128<ushort>.Count; // eagerly bump this now in preparation for next loop, will adjust later if necessary
                          Vector128<ushort> charIsNonAscii;
  
+                        // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
+                        // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
+
                          if (AdvSimd.Arm64.IsSupported)
                          {
-                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
-                            // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                              charIsNonAscii = AdvSimd.Min(utf16Data, vector0080);
                          }
-                        else if (Sse41.IsSupported)
-                        {
-                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
-                            // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
-                            charIsNonAscii = Sse41.Min(utf16Data, vector0080);
-                        }
                          else
                          {
-                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
-                            // input was 0x0080 <= [value] <= 0x7FFF. The case where 0x8000 <= [value] will
-                            // be handled in a few lines.
-
-                            charIsNonAscii = Sse2.AndNot(Sse2.CompareGreaterThan(vector0080.AsInt16(), utf16Data.AsInt16()).AsUInt16(), vector0080);
+                            charIsNonAscii = Sse41.Min(utf16Data, vector0080);
                          }
  
  #if DEBUG
@@ -139,9 +130,8 @@ namespace System.Text.Unicode
  
                          // Since 3-byte elements have a value >= 0x0800, we'll perform a saturating add of 0x7800 in order to
                          // get all 3-byte elements to have their 0x8000 bits set. A saturating add will not set the 0x8000
-                        // bit for 1-byte or 2-byte elements. 2-byte and 3-byte elements MAY have their 0x0080 bits set,
-                        // but this is not required, and the 0x0080 bits (see 'charIsNonAscii' above) will be explicitly
-                        // set for such elements anyway by the immediately following OR instruction.
+                        // bit for 1-byte or 2-byte elements. The 0x0080 bit will already have been set for non-ASCII (2-byte
+                        // and 3-byte) elements.
  
                          if (AdvSimd.IsSupported)
                          {
diff --git a/src/libraries/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs b/src/libraries/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs

index 19c0d24..d9c0716 100644 (file)
--- a/src/libraries/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs
+++ b/src/libraries/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs
@@ -165,11 +165,11 @@ namespace System.Text.Unicode.Tests
              processedInput[0] = '\u0080'; // 2-byte UTF-8 sequence
              processedInput[1] = '\u0800'; // 3-byte UTF-8 sequence
              processedInput[2] = '\u0080'; // 2-byte UTF-8 sequence
-            processedInput[3] = '\u0800'; // 3-byte UTF-8 sequence
+            processedInput[3] = '\u8000'; // 3-byte UTF-8 sequence (a negative number, when signed)
              processedInput[4] = '\u0080'; // 2-byte UTF-8 sequence
              processedInput[5] = '\u0800'; // 3-byte UTF-8 sequence
              processedInput[6] = '\u0080'; // 2-byte UTF-8 sequence
-            processedInput[7] = '\u0800'; // 3-byte UTF-8 sequence
+            processedInput[7] = '\u8880'; // 3-byte UTF-8 sequence (a negative number, when signed, with 0x0800 and 0x0080 marker bits also set)
  
              expectedUtf8ByteCount += 12;
author	Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
	Thu, 6 May 2021 22:36:31 +0000 (15:36 -0700)
committer	GitHub <noreply@github.com>
	Thu, 6 May 2021 22:36:31 +0000 (15:36 -0700)
src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs		patch | blob | history
src/libraries/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs		patch | blob | history